From c6a33e2464edd87f8c12cc2d11369a5b44c65b77 Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Wed, 12 Oct 2005 15:12:26 +0800
Subject: [PATCH] libata CHS: LBA28/LBA48 optimization (revise #6)

     - add lba_28_ok() and lba_48_ok() to ata.h.
     - check ending block number instead of staring block number.
     - use lba_28_ok() for CHS range check
     - LBA28/LBA48 optimization

Suggested by Mark Lord and Alan Cox.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>

=====
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 include/linux/ata.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 630908c9378b..b7e7e1cb2633 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -291,4 +291,16 @@ static inline int ata_ok(u8 status)
 			== ATA_DRDY);
 }
 
+static inline int lba_28_ok(u64 block, u32 n_block)
+{
+	/* check the ending block number */
+	return ((block + n_block - 1) < ((u64)1 << 28)) && (n_block <= 256);
+}
+
+static inline int lba_48_ok(u64 block, u32 n_block)
+{
+	/* check the ending block number */
+	return ((block + n_block - 1) < ((u64)1 << 48)) && (n_block <= 65536);
+}
+
 #endif /* __LINUX_ATA_H__ */
-- 
cgit v1.2.3


From f03aa2d89ad600a1ed21a223f196776f217cfe00 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 14 Jan 2006 03:10:22 +0100
Subject: [PATCH] drivers/net/arcnet/: possible cleanups

This patch contains the following possible cleanups:
- make needlessly global code static
- arcnet.c: remove the unneeded EXPORT_SYMBOL(arc_proto_null)
- arcnet.c: remove the unneeded EXPORT_SYMBOL(arcnet_dump_packet)

To make Jeff happy, arcnet.c still prints
  arcnet: v3.93 BETA 2000/04/29 - by Avery Pennarun et al.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/net/arcnet/arc-rawmode.c |  2 +-
 drivers/net/arcnet/arcnet.c      | 17 ++++++++++-------
 drivers/net/arcnet/rfc1051.c     |  2 +-
 drivers/net/arcnet/rfc1201.c     |  2 +-
 include/linux/arcdevice.h        |  9 ---------
 5 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/arcnet/arc-rawmode.c b/drivers/net/arcnet/arc-rawmode.c
index e1ea29b0cd14..e7555d4e6ff1 100644
--- a/drivers/net/arcnet/arc-rawmode.c
+++ b/drivers/net/arcnet/arc-rawmode.c
@@ -42,7 +42,7 @@ static int build_header(struct sk_buff *skb, struct net_device *dev,
 static int prepare_tx(struct net_device *dev, struct archdr *pkt, int length,
 		      int bufnum);
 
-struct ArcProto rawmode_proto =
+static struct ArcProto rawmode_proto =
 {
 	.suffix		= 'r',
 	.mtu		= XMTU,
diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c
index 409d61d8d083..64e2caf3083d 100644
--- a/drivers/net/arcnet/arcnet.c
+++ b/drivers/net/arcnet/arcnet.c
@@ -62,6 +62,7 @@ static int null_build_header(struct sk_buff *skb, struct net_device *dev,
 static int null_prepare_tx(struct net_device *dev, struct archdr *pkt,
 			   int length, int bufnum);
 
+static void arcnet_rx(struct net_device *dev, int bufnum);
 
 /*
  * one ArcProto per possible proto ID.  None of the elements of
@@ -72,7 +73,7 @@ static int null_prepare_tx(struct net_device *dev, struct archdr *pkt,
  struct ArcProto *arc_proto_map[256], *arc_proto_default,
    *arc_bcast_proto, *arc_raw_proto;
 
-struct ArcProto arc_proto_null =
+static struct ArcProto arc_proto_null =
 {
 	.suffix		= '?',
 	.mtu		= XMTU,
@@ -91,7 +92,6 @@ EXPORT_SYMBOL(arc_proto_map);
 EXPORT_SYMBOL(arc_proto_default);
 EXPORT_SYMBOL(arc_bcast_proto);
 EXPORT_SYMBOL(arc_raw_proto);
-EXPORT_SYMBOL(arc_proto_null);
 EXPORT_SYMBOL(arcnet_unregister_proto);
 EXPORT_SYMBOL(arcnet_debug);
 EXPORT_SYMBOL(alloc_arcdev);
@@ -119,7 +119,7 @@ static int __init arcnet_init(void)
 
 	arcnet_debug = debug;
 
-	printk(VERSION);
+	printk("arcnet loaded.\n");
 
 #ifdef ALPHA_WARNING
 	BUGLVL(D_EXTRA) {
@@ -179,8 +179,8 @@ EXPORT_SYMBOL(arcnet_dump_skb);
  * Dump the contents of an ARCnet buffer
  */
 #if (ARCNET_DEBUG_MAX & (D_RX | D_TX))
-void arcnet_dump_packet(struct net_device *dev, int bufnum, char *desc,
-			int take_arcnet_lock)
+static void arcnet_dump_packet(struct net_device *dev, int bufnum,
+			       char *desc, int take_arcnet_lock)
 {
 	struct arcnet_local *lp = dev->priv;
 	int i, length;
@@ -209,7 +209,10 @@ void arcnet_dump_packet(struct net_device *dev, int bufnum, char *desc,
 
 }
 
-EXPORT_SYMBOL(arcnet_dump_packet);
+#else
+
+#define arcnet_dump_packet(dev, bufnum, desc,take_arcnet_lock) do { } while (0)
+
 #endif
 
 
@@ -997,7 +1000,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  * This is a generic packet receiver that calls arcnet??_rx depending on the
  * protocol ID found.
  */
-void arcnet_rx(struct net_device *dev, int bufnum)
+static void arcnet_rx(struct net_device *dev, int bufnum)
 {
 	struct arcnet_local *lp = dev->priv;
 	struct archdr pkt;
diff --git a/drivers/net/arcnet/rfc1051.c b/drivers/net/arcnet/rfc1051.c
index 6d7913704fb5..6d6c69f036ef 100644
--- a/drivers/net/arcnet/rfc1051.c
+++ b/drivers/net/arcnet/rfc1051.c
@@ -43,7 +43,7 @@ static int prepare_tx(struct net_device *dev, struct archdr *pkt, int length,
 		      int bufnum);
 
 
-struct ArcProto rfc1051_proto =
+static struct ArcProto rfc1051_proto =
 {
 	.suffix		= 's',
 	.mtu		= XMTU - RFC1051_HDR_SIZE,
diff --git a/drivers/net/arcnet/rfc1201.c b/drivers/net/arcnet/rfc1201.c
index 6b6ae4bf3d39..bee34226abfa 100644
--- a/drivers/net/arcnet/rfc1201.c
+++ b/drivers/net/arcnet/rfc1201.c
@@ -43,7 +43,7 @@ static int prepare_tx(struct net_device *dev, struct archdr *pkt, int length,
 		      int bufnum);
 static int continue_tx(struct net_device *dev, int bufnum);
 
-struct ArcProto rfc1201_proto =
+static struct ArcProto rfc1201_proto =
 {
 	.suffix		= 'a',
 	.mtu		= 1500,	/* could be more, but some receivers can't handle it... */
diff --git a/include/linux/arcdevice.h b/include/linux/arcdevice.h
index 7198f129e135..231ba090ae34 100644
--- a/include/linux/arcdevice.h
+++ b/include/linux/arcdevice.h
@@ -206,7 +206,6 @@ struct ArcProto {
 
 extern struct ArcProto *arc_proto_map[256], *arc_proto_default,
 	*arc_bcast_proto, *arc_raw_proto;
-extern struct ArcProto arc_proto_null;
 
 
 /*
@@ -334,17 +333,9 @@ void arcnet_dump_skb(struct net_device *dev, struct sk_buff *skb, char *desc);
 #define arcnet_dump_skb(dev,skb,desc) ;
 #endif
 
-#if (ARCNET_DEBUG_MAX & D_RX) || (ARCNET_DEBUG_MAX & D_TX)
-void arcnet_dump_packet(struct net_device *dev, int bufnum, char *desc,
-			int take_arcnet_lock);
-#else
-#define arcnet_dump_packet(dev, bufnum, desc,take_arcnet_lock) ;
-#endif
-
 void arcnet_unregister_proto(struct ArcProto *proto);
 irqreturn_t arcnet_interrupt(int irq, void *dev_id, struct pt_regs *regs);
 struct net_device *alloc_arcdev(char *name);
-void arcnet_rx(struct net_device *dev, int bufnum);
 
 #endif				/* __KERNEL__ */
 #endif				/* _LINUX_ARCDEVICE_H */
-- 
cgit v1.2.3


From bfd6057959ecd3ff779a373a4d07cda2c2d0eec1 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy_d_dunlap@linux.intel.com>
Date: Tue, 17 Jan 2006 19:34:42 -0500
Subject: From: Borislav Petkov <petkov@uni-muenster.de>

libata new debugging macro definitions

Signed-off-by: Borislav Petkov <petkov@uni-muenster.de>
Signed-off-by: Randy Dunlap <randy_d_dunlap@linux.intel.com>
---
 include/linux/libata.h | 52 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index a43c95f8f968..339f7e75cb60 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -35,7 +35,8 @@
 #include <linux/workqueue.h>
 
 /*
- * compile-time options
+ * compile-time options: to be removed as soon as all the drivers are
+ * converted to the new debugging mechanism
  */
 #undef ATA_DEBUG		/* debugging output */
 #undef ATA_VERBOSE_DEBUG	/* yet more debugging output */
@@ -71,6 +72,38 @@
         }
 #endif
 
+/* NEW: debug levels */
+#define HAVE_LIBATA_MSG 1
+
+enum {
+	ATA_MSG_DRV	= 0x0001,
+	ATA_MSG_INFO	= 0x0002,
+	ATA_MSG_PROBE	= 0x0004,
+	ATA_MSG_WARN	= 0x0008,
+	ATA_MSG_MALLOC	= 0x0010,
+	ATA_MSG_CTL	= 0x0020,
+	ATA_MSG_INTR	= 0x0040,
+	ATA_MSG_ERR	= 0x0080,
+};
+
+#define ata_msg_drv(p)    ((p)->msg_enable & ATA_MSG_DRV)
+#define ata_msg_info(p)   ((p)->msg_enable & ATA_MSG_INFO)
+#define ata_msg_probe(p)  ((p)->msg_enable & ATA_MSG_PROBE)
+#define ata_msg_warn(p)   ((p)->msg_enable & ATA_MSG_WARN)
+#define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC)
+#define ata_msg_ctl(p)    ((p)->msg_enable & ATA_MSG_CTL)
+#define ata_msg_intr(p)   ((p)->msg_enable & ATA_MSG_INTR)
+#define ata_msg_err(p)    ((p)->msg_enable & ATA_MSG_ERR)
+
+static inline u32 ata_msg_init(int dval, int default_msg_enable_bits)
+{
+	if (dval < 0 || dval >= (sizeof(u32) * 8))
+		return default_msg_enable_bits; /* should be 0x1 - only driver info msgs */
+	if (!dval)
+		return 0;
+	return (1 << dval) - 1;
+}
+
 /* defines only for the constants which don't work well as enums */
 #define ATA_TAG_POISON		0xfafbfcfdU
 
@@ -356,6 +389,8 @@ struct ata_port {
 	unsigned int		hsm_task_state;
 	unsigned long		pio_task_timeout;
 
+	u32			msg_enable;
+
 	void			*private_data;
 };
 
@@ -640,9 +675,9 @@ static inline u8 ata_wait_idle(struct ata_port *ap)
 
 	if (status & (ATA_BUSY | ATA_DRQ)) {
 		unsigned long l = ap->ioaddr.status_addr;
-		printk(KERN_WARNING
-		       "ATA: abnormal status 0x%X on port 0x%lX\n",
-		       status, l);
+		if (ata_msg_warn(ap))
+			printk(KERN_WARNING "ATA: abnormal status 0x%X on port 0x%lX\n",
+				status, l);
 	}
 
 	return status;
@@ -734,7 +769,8 @@ static inline u8 ata_irq_ack(struct ata_port *ap, unsigned int chk_drq)
 
 	status = ata_busy_wait(ap, bits, 1000);
 	if (status & bits)
-		DPRINTK("abnormal status 0x%X\n", status);
+		if (ata_msg_err(ap))
+			printk(KERN_ERR "abnormal status 0x%X\n", status);
 
 	/* get controller status; clear intr, err bits */
 	if (ap->flags & ATA_FLAG_MMIO) {
@@ -752,8 +788,10 @@ static inline u8 ata_irq_ack(struct ata_port *ap, unsigned int chk_drq)
 		post_stat = inb(ap->ioaddr.bmdma_addr + ATA_DMA_STATUS);
 	}
 
-	VPRINTK("irq ack: host_stat 0x%X, new host_stat 0x%X, drv_stat 0x%X\n",
-		host_stat, post_stat, status);
+	if (ata_msg_intr(ap))
+		printk(KERN_INFO "%s: irq ack: host_stat 0x%X, new host_stat 0x%X, drv_stat 0x%X\n",
+			__FUNCTION__,
+			host_stat, post_stat, status);
 
 	return status;
 }
-- 
cgit v1.2.3


From 6ac48b458769059ee6147dd8bf2767e820407292 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@infradead.org>
Date: Mon, 23 Jan 2006 17:11:05 -0200
Subject: V4L/DVB (3408): Included new sliced VBI types to videodev2.h and
 tvp5150

- Added other sliced VBI types to videodev2.h
- tvp5150 now uses standard V4L2 API codes from videodev2.h
- Implemented VIDIOC_G_SLICED_VBI_CAP for tvp5150. This is
dynamically filled based on defined VDP C-RAM values filled
by the driver.

Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/cx25840/cx25840-vbi.c |   6 +-
 drivers/media/video/saa7115.c             |   6 +-
 drivers/media/video/tvp5150.c             | 161 +++++++++++++++++++-----------
 drivers/media/video/tvp5150_reg.h         |   7 ++
 include/linux/videodev2.h                 |  51 ++++++++--
 5 files changed, 161 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/cx25840/cx25840-vbi.c b/drivers/media/video/cx25840/cx25840-vbi.c
index 04d879da7d63..e96fd1f1d6dc 100644
--- a/drivers/media/video/cx25840/cx25840-vbi.c
+++ b/drivers/media/video/cx25840/cx25840-vbi.c
@@ -151,7 +151,7 @@ int cx25840_vbi(struct i2c_client *client, unsigned int cmd, void *arg)
 	case VIDIOC_G_FMT:
 	{
 		static u16 lcr2vbi[] = {
-			0, V4L2_SLICED_TELETEXT_B, 0,	/* 1 */
+			0, V4L2_SLICED_TELETEXT_PAL_B, 0,	/* 1 */
 			0, V4L2_SLICED_WSS_625, 0,	/* 4 */
 			V4L2_SLICED_CAPTION_525,	/* 6 */
 			0, 0, V4L2_SLICED_VPS, 0, 0,	/* 9 */
@@ -231,7 +231,7 @@ int cx25840_vbi(struct i2c_client *client, unsigned int cmd, void *arg)
 		for (i = 7; i <= 23; i++) {
 			for (x = 0; x <= 1; x++) {
 				switch (svbi->service_lines[1-x][i]) {
-				case V4L2_SLICED_TELETEXT_B:
+				case V4L2_SLICED_TELETEXT_PAL_B:
 					lcr[i] |= 1 << (4 * x);
 					break;
 				case V4L2_SLICED_WSS_625:
@@ -282,7 +282,7 @@ int cx25840_vbi(struct i2c_client *client, unsigned int cmd, void *arg)
 
 		switch (id2) {
 		case 1:
-			id2 = V4L2_SLICED_TELETEXT_B;
+			id2 = V4L2_SLICED_TELETEXT_PAL_B;
 			break;
 		case 4:
 			id2 = V4L2_SLICED_WSS_625;
diff --git a/drivers/media/video/saa7115.c b/drivers/media/video/saa7115.c
index 048d000941c7..487a42970963 100644
--- a/drivers/media/video/saa7115.c
+++ b/drivers/media/video/saa7115.c
@@ -791,7 +791,7 @@ static void saa7115_set_lcr(struct i2c_client *client, struct v4l2_sliced_vbi_fo
 					case 0:
 						lcr[i] |= 0xf << (4 * x);
 						break;
-					case V4L2_SLICED_TELETEXT_B:
+					case V4L2_SLICED_TELETEXT_PAL_B:
 						lcr[i] |= 1 << (4 * x);
 						break;
 					case V4L2_SLICED_CAPTION_525:
@@ -820,7 +820,7 @@ static void saa7115_set_lcr(struct i2c_client *client, struct v4l2_sliced_vbi_fo
 static int saa7115_get_v4lfmt(struct i2c_client *client, struct v4l2_format *fmt)
 {
 	static u16 lcr2vbi[] = {
-		0, V4L2_SLICED_TELETEXT_B, 0,	/* 1 */
+		0, V4L2_SLICED_TELETEXT_PAL_B, 0,	/* 1 */
 		0, V4L2_SLICED_CAPTION_525,	/* 4 */
 		V4L2_SLICED_WSS_625, 0,		/* 5 */
 		V4L2_SLICED_VPS, 0, 0, 0, 0,	/* 7 */
@@ -985,7 +985,7 @@ static void saa7115_decode_vbi_line(struct i2c_client *client,
 	/* decode payloads */
 	switch (id2) {
 	case 1:
-		vbi->type = V4L2_SLICED_TELETEXT_B;
+		vbi->type = V4L2_SLICED_TELETEXT_PAL_B;
 		break;
 	case 4:
 		if (!saa7115_odd_parity(p[0]) || !saa7115_odd_parity(p[1]))
diff --git a/drivers/media/video/tvp5150.c b/drivers/media/video/tvp5150.c
index f7fa93c64d11..17a8dd726912 100644
--- a/drivers/media/video/tvp5150.c
+++ b/drivers/media/video/tvp5150.c
@@ -1,8 +1,8 @@
 /*
- * tvp5150 - Texas Instruments TVP5150A(M) video decoder driver
+ * tvp5150 - Texas Instruments TVP5150A/AM1 video decoder driver
  *
- * Copyright (c) 2005 Mauro Carvalho Chehab (mchehab@brturbo.com.br)
- * This code is placed under the terms of the GNU General Public License
+ * Copyright (c) 2005,2006 Mauro Carvalho Chehab (mchehab@infradead.org)
+ * This code is placed under the terms of the GNU General Public License v2
  */
 
 #include <linux/i2c.h>
@@ -13,10 +13,11 @@
 
 #include "tvp5150_reg.h"
 
-MODULE_DESCRIPTION("Texas Instruments TVP5150A video decoder driver");	/* standard i2c insmod options */
+MODULE_DESCRIPTION("Texas Instruments TVP5150A video decoder driver");
 MODULE_AUTHOR("Mauro Carvalho Chehab");
 MODULE_LICENSE("GPL");
 
+/* standard i2c insmod options */
 static unsigned short normal_i2c[] = {
 	0xb8 >> 1,
 	0xba >> 1,
@@ -477,82 +478,101 @@ static const struct i2c_reg_value tvp5150_init_enable[] = {
 	}
 };
 
+struct tvp5150_vbi_type {
+	unsigned int vbi_type;
+	unsigned int ini_line;
+	unsigned int end_line;
+	unsigned int by_field :1;
+};
+
 struct i2c_vbi_ram_value {
 	u16 reg;
-	unsigned char values[26];
+	struct tvp5150_vbi_type type;
+	unsigned char values[16];
 };
 
-/* tvp5150_vbi_types should follow the same order as vbi_ram_default
+/* This struct have the values for each supported VBI Standard
+ * by
+ tvp5150_vbi_types should follow the same order as vbi_ram_default
  * value 0 means rom position 0x10, value 1 means rom position 0x30
  * and so on. There are 16 possible locations from 0 to 15.
  */
-enum tvp5150_vbi_types {	/* Video line number  Description */
-	VBI_WST_SECAM,		/* 6-23  (field 1,2)  Teletext, SECAM */
-	VBI_WST_PAL_B,		/* 6-22  (field 1,2)  Teletext, PAL, System B */
-	VBI_WST_PAL_C,		/* 6-22  (field 1,2)  Teletext, PAL, System C */
-	VBI_WST_NTSC_B,		/* 10-21 (field 1,2)  Teletext, NTSC, System B */
-	VBI_NABTS_NTSC_C,	/* 10-21 (field 1,2)  Teletext, NTSC, System C */
-	VBI_NABTS_NTSC_D,	/* 10-21 (field 1,2)  Teletext, NTSC, System D */
-	VBI_CC_PAL_SECAM,	/* 22    (field 1,2)  Closed Caption PAL/SECAM */
-	VBI_CC_NTSC,		/* 21    (field 1,2)  Closed Caption NTSC */
-	VBI_WSS_PAL_SECAM,	/* 23    (field 1,2)  Wide Screen Signal PAL/SECAM */
-	VBI_WSS_NTSC,		/* 20    (field 1,2)  Wide Screen Signal NTSC */
-	VBI_VITC_PAL_SECAM,	/* 6-22               Vertical Interval Timecode PAL/SECAM */
-	VBI_VITC_NTSC,		/* 10-20              Vertical Interval Timecode NTSC */
-	VBI_VPS_PAL,		/* 16                 Video Program System PAL */
-	VBI_EPG_GEMSTAR,	/*                    EPG/Gemstar Electronic program guide */
-	VBI_RESERVED,		/*                    not in use on vbi_ram_default table */
-	VBI_FULL_FIELD		/*                    Active video/Full Field */
-};
 
 static struct i2c_vbi_ram_value vbi_ram_default[] =
 {
-	{0x010, /* WST SECAM */
-		{ 0xaa, 0xaa, 0xff, 0xff , 0xe7, 0x2e, 0x20, 0x26, 0xe6, 0xb4, 0x0e, 0x0, 0x0, 0x0, 0x10, 0x0 }
+	{0x010, /* Teletext, SECAM, WST System A */
+		{V4L2_SLICED_TELETEXT_SECAM,6,23,1},
+		{ 0xaa, 0xaa, 0xff, 0xff, 0xe7, 0x2e, 0x20, 0x26,
+		  0xe6, 0xb4, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00 }
 	},
-	{0x030, /* WST PAL B */
-		{ 0xaa, 0xaa, 0xff, 0xff , 0x27, 0x2e, 0x20, 0x2b, 0xa6, 0x72, 0x10, 0x0, 0x0, 0x0, 0x10, 0x0 }
+	{0x030, /* Teletext, PAL, WST System B */
+		{V4L2_SLICED_TELETEXT_PAL_B,6,22,1},
+		{ 0xaa, 0xaa, 0xff, 0xff, 0x27, 0x2e, 0x20, 0x2b,
+		  0xa6, 0x72, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00 }
 	},
-	{0x050, /* WST PAL C */
-		{ 0xaa, 0xaa, 0xff, 0xff , 0xe7, 0x2e, 0x20, 0x22, 0xa6, 0x98, 0x0d, 0x0, 0x0, 0x0, 0x10, 0x0 }
+	{0x050, /* Teletext, PAL, WST System C */
+		{V4L2_SLICED_TELETEXT_PAL_C,6,22,1},
+		{ 0xaa, 0xaa, 0xff, 0xff, 0xe7, 0x2e, 0x20, 0x22,
+		  0xa6, 0x98, 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00 }
 	},
-	{0x070, /* WST NTSC B */
-		{ 0xaa, 0xaa, 0xff, 0xff , 0x27, 0x2e, 0x20, 0x23, 0x69, 0x93, 0x0d, 0x0, 0x0, 0x0, 0x10, 0x0 }
+	{0x070, /* Teletext, NTSC, WST System B */
+		{V4L2_SLICED_TELETEXT_NTSC_B,10,21,1},
+		{ 0xaa, 0xaa, 0xff, 0xff, 0x27, 0x2e, 0x20, 0x23,
+		  0x69, 0x93, 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00 }
 	},
-	{0x090, /* NABTS, NTSC */
-		{ 0xaa, 0xaa, 0xff, 0xff , 0xe7, 0x2e, 0x20, 0x22, 0x69, 0x93, 0x0d, 0x0, 0x0, 0x0, 0x15, 0x0 }
+	{0x090, /* Tetetext, NTSC NABTS System C */
+		{V4L2_SLICED_TELETEXT_NTSC_C,10,21,1},
+		{ 0xaa, 0xaa, 0xff, 0xff, 0xe7, 0x2e, 0x20, 0x22,
+		  0x69, 0x93, 0x0d, 0x00, 0x00, 0x00, 0x15, 0x00 }
 	},
-	{0x0b0, /* NABTS, NTSC-J */
-		{ 0xaa, 0xaa, 0xff, 0xff , 0xa7, 0x2e, 0x20, 0x23, 0x69, 0x93, 0x0d, 0x0, 0x0, 0x0, 0x10, 0x0 }
+	{0x0b0, /* Teletext, NTSC-J, NABTS System D */
+		{V4L2_SLICED_TELETEXT_NTSC_D,10,21,1},
+		{ 0xaa, 0xaa, 0xff, 0xff, 0xa7, 0x2e, 0x20, 0x23,
+		  0x69, 0x93, 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00 }
 	},
-	{0x0d0, /* CC, PAL/SECAM */
-		{ 0xaa, 0x2a, 0xff, 0x3f , 0x04, 0x51, 0x6e, 0x02, 0xa6, 0x7b, 0x09, 0x0, 0x0, 0x0, 0x27, 0x0 }
+	{0x0d0, /* Closed Caption, PAL/SECAM */
+		{V4L2_SLICED_CAPTION_625,22,22,1},
+		{ 0xaa, 0x2a, 0xff, 0x3f, 0x04, 0x51, 0x6e, 0x02,
+		  0xa6, 0x7b, 0x09, 0x00, 0x00, 0x00, 0x27, 0x00 }
 	},
-	{0x0f0, /* CC, NTSC */
-		{ 0xaa, 0x2a, 0xff, 0x3f , 0x04, 0x51, 0x6e, 0x02, 0x69, 0x8c, 0x09, 0x0, 0x0, 0x0, 0x27, 0x0 }
+	{0x0f0, /* Closed Caption, NTSC */
+		{V4L2_SLICED_CAPTION_525,21,21,1},
+		{ 0xaa, 0x2a, 0xff, 0x3f, 0x04, 0x51, 0x6e, 0x02,
+		  0x69, 0x8c, 0x09, 0x00, 0x00, 0x00, 0x27, 0x00 }
 	},
-	{0x110, /* WSS, PAL/SECAM */
-		{ 0x5b, 0x55, 0xc5, 0xff , 0x0, 0x71, 0x6e, 0x42, 0xa6, 0xcd, 0x0f, 0x0, 0x0, 0x0, 0x3a, 0x0 }
+	{0x110, /* Wide Screen Signal, PAL/SECAM */
+		{V4L2_SLICED_WSS_625,20,21,1},
+		{ 0x5b, 0x55, 0xc5, 0xff, 0x00, 0x71, 0x6e, 0x42,
+		  0xa6, 0xcd, 0x0f, 0x00, 0x00, 0x00, 0x3a, 0x00 }
 	},
-	{0x130, /* WSS, NTSC C */
-		{ 0x38, 0x00, 0x3f, 0x00 , 0x0, 0x71, 0x6e, 0x43, 0x69, 0x7c, 0x08, 0x0, 0x0, 0x0, 0x39, 0x0 }
+	{0x130, /* Wide Screen Signal, NTSC C */
+		{V4L2_SLICED_WSS_525,20,20,1},
+		{ 0x38, 0x00, 0x3f, 0x00, 0x00, 0x71, 0x6e, 0x43,
+		  0x69, 0x7c, 0x08, 0x00, 0x00, 0x00, 0x39, 0x00 }
 	},
-	{0x150, /* VITC, PAL/SECAM */
-		{ 0x0, 0x0, 0x0, 0x0 , 0x0, 0x8f, 0x6d, 0x49, 0xa6, 0x85, 0x08, 0x0, 0x0, 0x0, 0x4c, 0x0 }
+	{0x150, /* Vertical Interval Timecode (VITC), PAL/SECAM */
+		{V4l2_SLICED_VITC_625,6,22,0},
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x8f, 0x6d, 0x49,
+		  0xa6, 0x85, 0x08, 0x00, 0x00, 0x00, 0x4c, 0x00 }
 	},
-	{0x170, /* VITC, NTSC */
-		{ 0x0, 0x0, 0x0, 0x0 , 0x0, 0x8f, 0x6d, 0x49, 0x69, 0x94, 0x08, 0x0, 0x0, 0x0, 0x4c, 0x0 }
+	{0x170, /* Vertical Interval Timecode (VITC), NTSC */
+		{V4l2_SLICED_VITC_525,10,20,0},
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x8f, 0x6d, 0x49,
+		  0x69, 0x94, 0x08, 0x00, 0x00, 0x00, 0x4c, 0x00 }
 	},
-	{0x190, /* VPS, PAL */
-		{ 0xaa, 0xaa, 0xff, 0xff, 0xba, 0xce, 0x2b, 0x0d, 0xa6, 0xda, 0x0b, 0x0, 0x0, 0x0, 0x60, 0x0 }
-	},
-	{0x1b0, /* Gemstar Custom 1 */
-		{ 0xcc, 0xcc, 0xff, 0xff, 0x05, 0x51, 0x6e, 0x05, 0x69, 0x19, 0x13, 0x0, 0x0, 0x0, 0x60, 0x0 }
+	{0x190, /* Video Program System (VPS), PAL */
+		{V4L2_SLICED_VPS,16,16,0},
+		{ 0xaa, 0xaa, 0xff, 0xff, 0xba, 0xce, 0x2b, 0x0d,
+		  0xa6, 0xda, 0x0b, 0x00, 0x00, 0x00, 0x60, 0x00 }
 	},
+	/* 0x1d0 User programmable */
+
+	/* End of struct */
+	{ (u16)-1 }
 };
 
 static int tvp5150_write_inittab(struct i2c_client *c,
-				 const struct i2c_reg_value *regs)
+				const struct i2c_reg_value *regs)
 {
 	while (regs->reg != 0xff) {
 		tvp5150_write(c, regs->reg, regs->value);
@@ -562,7 +582,7 @@ static int tvp5150_write_inittab(struct i2c_client *c,
 }
 
 static int tvp5150_vdp_init(struct i2c_client *c,
-				 const struct i2c_vbi_ram_value *regs)
+				const struct i2c_vbi_ram_value *regs)
 {
 	unsigned int i;
 
@@ -586,6 +606,24 @@ static int tvp5150_vdp_init(struct i2c_client *c,
 	return 0;
 }
 
+/* Fills VBI capabilities based on i2c_vbi_ram_value struct */
+static void tvp5150_vbi_get_cap(const struct i2c_vbi_ram_value *regs,
+				struct v4l2_sliced_vbi_cap *cap)
+{
+	int line;
+
+	memset(cap, 0, sizeof *cap);
+
+	while (regs->reg != (u16)-1 ) {
+		for (line=regs->type.ini_line;line<=regs->type.end_line;line++) {
+			cap->service_lines[0][line] |= regs->type.vbi_type;
+		}
+		cap->service_set |= regs->type.vbi_type;
+
+		regs++;
+	}
+}
+
 /* Set vbi processing
  * type - one of tvp5150_vbi_types
  * line - line to gather data
@@ -599,7 +637,7 @@ static int tvp5150_vdp_init(struct i2c_client *c,
  *	LSB = field1
  *	MSB = field2
  */
-static int tvp5150_set_vbi(struct i2c_client *c, enum tvp5150_vbi_types type,
+static int tvp5150_set_vbi(struct i2c_client *c, unsigned int type,
 					u8 flags, int line, const int fields)
 {
 	struct tvp5150 *decoder = i2c_get_clientdata(c);
@@ -775,6 +813,15 @@ static int tvp5150_command(struct i2c_client *c,
 		*(v4l2_std_id *)arg = decoder->norm;
 		break;
 
+	case VIDIOC_G_SLICED_VBI_CAP:
+	{
+		struct v4l2_sliced_vbi_cap *cap = arg;
+		tvp5150_dbg(1, "VIDIOC_G_SLICED_VBI_CAP\n");
+
+		tvp5150_vbi_get_cap(vbi_ram_default, cap);
+		break;
+	}
+
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 	case VIDIOC_INT_G_REGISTER:
 	{
@@ -1021,7 +1068,7 @@ static int tvp5150_detect_client(struct i2c_adapter *adapter,
 		return rv;
 	}
 
-	if (debug > 1)
+//	if (debug > 1)
 		dump_reg(c);
 	return 0;
 }
diff --git a/drivers/media/video/tvp5150_reg.h b/drivers/media/video/tvp5150_reg.h
index c81587e06e37..4240043c0b2a 100644
--- a/drivers/media/video/tvp5150_reg.h
+++ b/drivers/media/video/tvp5150_reg.h
@@ -1,3 +1,10 @@
+/*
+ * tvp5150 - Texas Instruments TVP5150A/AM1 video decoder registers
+ *
+ * Copyright (c) 2005,2006 Mauro Carvalho Chehab (mchehab@infradead.org)
+ * This code is placed under the terms of the GNU General Public License v2
+ */
+
 #define TVP5150_VD_IN_SRC_SEL_1      0x00 /* Video input source selection #1 */
 #define TVP5150_ANAL_CHL_CTL         0x01 /* Analog channel controls */
 #define TVP5150_OP_MODE_CTL          0x02 /* Operation mode controls */
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index ce40675324bd..27ae3d679cbe 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -949,13 +949,50 @@ struct v4l2_sliced_vbi_format
 	__u32   reserved[2];            /* must be zero */
 };
 
-#define V4L2_SLICED_TELETEXT_B          (0x0001)
-#define V4L2_SLICED_VPS                 (0x0400)
-#define V4L2_SLICED_CAPTION_525         (0x1000)
-#define V4L2_SLICED_WSS_625             (0x4000)
-
-#define V4L2_SLICED_VBI_525             (V4L2_SLICED_CAPTION_525)
-#define V4L2_SLICED_VBI_625             (V4L2_SLICED_TELETEXT_B | V4L2_SLICED_VPS | V4L2_SLICED_WSS_625)
+/* Teletext WST, defined on ITU-R BT.653-2 */
+#define V4L2_SLICED_TELETEXT_PAL_B      (0x000001)
+#define V4L2_SLICED_TELETEXT_PAL_C      (0x000002)
+#define V4L2_SLICED_TELETEXT_NTSC_B     (0x000010)
+#define V4L2_SLICED_TELETEXT_SECAM      (0x000020)
+
+/* Teletext NABTS, defined on ITU-R BT.653-2 */
+#define V4L2_SLICED_TELETEXT_NTSC_C     (0x000040)
+#define V4L2_SLICED_TELETEXT_NTSC_D     (0x000080)
+
+/* Video Program System, defined on ETS 300 231*/
+#define V4L2_SLICED_VPS                 (0x000400)
+
+/* Closed Caption, defined on EIA-608 */
+#define V4L2_SLICED_CAPTION_525         (0x001000)
+#define V4L2_SLICED_CAPTION_625         (0x002000)
+
+/* Wide Screen System, defined on ITU-R BT1119.1 */
+#define V4L2_SLICED_WSS_625             (0x004000)
+
+/* Wide Screen System, defined on IEC 61880 */
+#define V4L2_SLICED_WSS_525             (0x008000)
+
+/* Vertical Interval Timecode (VITC), defined on SMPTE 12M */
+#define V4l2_SLICED_VITC_625		(0x010000)
+#define V4l2_SLICED_VITC_525		(0x020000)
+
+/* Compat macro - Should be removed for 2.6.18 */
+#define V4L2_SLICED_TELETEXT_B V4L2_SLICED_TELETEXT_PAL_B
+
+#define V4L2_SLICED_VBI_525             (V4L2_SLICED_TELETEXT_NTSC_B |\
+					 V4L2_SLICED_TELETEXT_NTSC_C |\
+					 V4L2_SLICED_TELETEXT_NTSC_D |\
+					 V4L2_SLICED_CAPTION_525     |\
+					 V4L2_SLICED_WSS_525         |\
+					 V4l2_SLICED_VITC_525)
+
+#define V4L2_SLICED_VBI_625             (V4L2_SLICED_TELETEXT_PAL_B  |\
+					 V4L2_SLICED_TELETEXT_PAL_C  |\
+					 V4L2_SLICED_TELETEXT_SECAM  |\
+					 V4L2_SLICED_VPS             |\
+					 V4L2_SLICED_CAPTION_625     |\
+					 V4L2_SLICED_WSS_625         |\
+					 V4l2_SLICED_VITC_625)
 
 struct v4l2_sliced_vbi_cap
 {
-- 
cgit v1.2.3


From 4d0dddb10723cee2b3048bd2389673703bc228e4 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@infradead.org>
Date: Mon, 23 Jan 2006 17:11:07 -0200
Subject: V4L/DVB (3419): added some VBI macros and moved minor definitions to
 header file

- Moved some hardcoded minor numbers to videodev2.h
- Included more comments for sliced VBI standards
- Included some VBI macros to group similar standards

Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/videodev.c | 16 ++++++++--------
 include/linux/videodev2.h      | 38 +++++++++++++++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 078880e4c8c0..908fbec776ac 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -279,23 +279,23 @@ int video_register_device(struct video_device *vfd, int type, int nr)
 	switch(type)
 	{
 		case VFL_TYPE_GRABBER:
-			base=0;
-			end=64;
+			base=MINOR_VFL_TYPE_GRABBER_MIN;
+			end=MINOR_VFL_TYPE_GRABBER_MAX+1;
 			name_base = "video";
 			break;
 		case VFL_TYPE_VTX:
-			base=192;
-			end=224;
+			base=MINOR_VFL_TYPE_VTX_MIN;
+			end=MINOR_VFL_TYPE_VTX_MAX+1;
 			name_base = "vtx";
 			break;
 		case VFL_TYPE_VBI:
-			base=224;
-			end=256;
+			base=MINOR_VFL_TYPE_VBI_MIN;
+			end=MINOR_VFL_TYPE_VBI_MAX+1;
 			name_base = "vbi";
 			break;
 		case VFL_TYPE_RADIO:
-			base=64;
-			end=128;
+			base=MINOR_VFL_TYPE_RADIO_MIN;
+			end=MINOR_VFL_TYPE_RADIO_MAX+1;
 			name_base = "radio";
 			break;
 		default:
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 27ae3d679cbe..6e33ce96cab0 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -21,7 +21,7 @@
 #include <linux/compiler.h> /* need __user */
 
 
-#define OBSOLETE_OWNER 1 /* It will be removed for 2.6.15 */
+#define OBSOLETE_OWNER 1 /* It will be removed for 2.6.17 */
 #define HAVE_V4L2 1
 
 /*
@@ -48,6 +48,16 @@
 
 #ifdef __KERNEL__
 
+/* Minor device allocation */
+#define MINOR_VFL_TYPE_GRABBER_MIN   0
+#define MINOR_VFL_TYPE_GRABBER_MAX  63
+#define MINOR_VFL_TYPE_RADIO_MIN    64
+#define MINOR_VFL_TYPE_RADIO_MAX   127
+#define MINOR_VFL_TYPE_VTX_MIN     192
+#define MINOR_VFL_TYPE_VTX_MAX     223
+#define MINOR_VFL_TYPE_VBI_MIN     224
+#define MINOR_VFL_TYPE_VBI_MAX     255
+
 #define VFL_TYPE_GRABBER	0
 #define VFL_TYPE_VBI		1
 #define VFL_TYPE_RADIO		2
@@ -949,13 +959,15 @@ struct v4l2_sliced_vbi_format
 	__u32   reserved[2];            /* must be zero */
 };
 
-/* Teletext WST, defined on ITU-R BT.653-2 */
+/* Teletext World System Teletext
+   (WST), defined on ITU-R BT.653-2 */
 #define V4L2_SLICED_TELETEXT_PAL_B      (0x000001)
 #define V4L2_SLICED_TELETEXT_PAL_C      (0x000002)
 #define V4L2_SLICED_TELETEXT_NTSC_B     (0x000010)
 #define V4L2_SLICED_TELETEXT_SECAM      (0x000020)
 
-/* Teletext NABTS, defined on ITU-R BT.653-2 */
+/* Teletext North American Broadcast Teletext Specification
+   (NABTS), defined on ITU-R BT.653-2 */
 #define V4L2_SLICED_TELETEXT_NTSC_C     (0x000040)
 #define V4L2_SLICED_TELETEXT_NTSC_D     (0x000080)
 
@@ -976,8 +988,24 @@ struct v4l2_sliced_vbi_format
 #define V4l2_SLICED_VITC_625		(0x010000)
 #define V4l2_SLICED_VITC_525		(0x020000)
 
-/* Compat macro - Should be removed for 2.6.18 */
-#define V4L2_SLICED_TELETEXT_B V4L2_SLICED_TELETEXT_PAL_B
+#define V4L2_SLICED_TELETEXT_B		(V4L2_SLICED_TELETEXT_PAL_B  |\
+					 V4L2_SLICED_TELETEXT_NTSC_B)
+
+#define V4L2_SLICED_TELETEXT		(V4L2_SLICED_TELETEXT_PAL_B  |\
+					 V4L2_SLICED_TELETEXT_PAL_C  |\
+					 V4L2_SLICED_TELETEXT_SECAM  |\
+					 V4L2_SLICED_TELETEXT_NTSC_B |\
+					 V4L2_SLICED_TELETEXT_NTSC_C |\
+					 V4L2_SLICED_TELETEXT_NTSC_D)
+
+#define V4L2_SLICED_CAPTION		(V4L2_SLICED_CAPTION_525     |\
+					 V4L2_SLICED_CAPTION_625)
+
+#define V4L2_SLICED_WSS			(V4L2_SLICED_WSS_525         |\
+					 V4L2_SLICED_WSS_625)
+
+#define V4L2_SLICED_VITC		(V4L2_SLICED_VITC_525        |\
+					 V4L2_SLICED_VITC_625)
 
 #define V4L2_SLICED_VBI_525             (V4L2_SLICED_TELETEXT_NTSC_B |\
 					 V4L2_SLICED_TELETEXT_NTSC_C |\
-- 
cgit v1.2.3


From 77853bf2b48e34449e826a9ef4df5ea0dbe947f4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 23 Jan 2006 13:09:36 +0900
Subject: [PATCH] libata: make the owner of a qc responsible for freeing it

qc used to be freed automatically on command completion.  However, as
a qc can carry information about its completion status, it can be
useful to its owner/issuer after command completion.  This patch makes
freeing qc responsibility of its owner.  This simplifies
ata_exec_internal() and makes command turn-around for atapi request
sensing less hackish.

This change was originally suggested by Jeff Garzik.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 47 +++++++++++++---------------------------------
 drivers/scsi/libata-scsi.c | 14 +++++++-------
 include/linux/libata.h     |  2 +-
 3 files changed, 21 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 0e4932362949..15df633521d0 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1072,24 +1072,12 @@ static unsigned int ata_pio_modes(const struct ata_device *adev)
 	   timing API will get this right anyway */
 }
 
-struct ata_exec_internal_arg {
-	unsigned int err_mask;
-	struct ata_taskfile *tf;
-	struct completion *waiting;
-};
-
-int ata_qc_complete_internal(struct ata_queued_cmd *qc)
+void ata_qc_complete_internal(struct ata_queued_cmd *qc)
 {
-	struct ata_exec_internal_arg *arg = qc->private_data;
-	struct completion *waiting = arg->waiting;
+	struct completion *waiting = qc->private_data;
 
-	if (!(qc->err_mask & ~AC_ERR_DEV))
-		qc->ap->ops->tf_read(qc->ap, arg->tf);
-	arg->err_mask = qc->err_mask;
-	arg->waiting = NULL;
+	qc->ap->ops->tf_read(qc->ap, &qc->tf);
 	complete(waiting);
-
-	return 0;
 }
 
 /**
@@ -1120,7 +1108,7 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 	struct ata_queued_cmd *qc;
 	DECLARE_COMPLETION(wait);
 	unsigned long flags;
-	struct ata_exec_internal_arg arg;
+	unsigned int err_mask;
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 
@@ -1134,9 +1122,7 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 		qc->nsect = buflen / ATA_SECT_SIZE;
 	}
 
-	arg.waiting = &wait;
-	arg.tf = tf;
-	qc->private_data = &arg;
+	qc->private_data = &wait;
 	qc->complete_fn = ata_qc_complete_internal;
 
 	if (ata_qc_issue(qc))
@@ -1153,7 +1139,7 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 		 * before the caller cleans up, it will result in a
 		 * spurious interrupt.  We can live with that.
 		 */
-		if (arg.waiting) {
+		if (qc->flags & ATA_QCFLAG_ACTIVE) {
 			qc->err_mask = AC_ERR_OTHER;
 			ata_qc_complete(qc);
 			printk(KERN_WARNING "ata%u: qc timeout (cmd 0x%x)\n",
@@ -1163,7 +1149,12 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 		spin_unlock_irqrestore(&ap->host_set->lock, flags);
 	}
 
-	return arg.err_mask;
+	*tf = qc->tf;
+	err_mask = qc->err_mask;
+
+	ata_qc_free(qc);
+
+	return err_mask;
 
  issue_fail:
 	ata_qc_free(qc);
@@ -3633,8 +3624,6 @@ void ata_qc_free(struct ata_queued_cmd *qc)
 
 void ata_qc_complete(struct ata_queued_cmd *qc)
 {
-	int rc;
-
 	assert(qc != NULL);	/* ata_qc_from_tag _might_ return NULL */
 	assert(qc->flags & ATA_QCFLAG_ACTIVE);
 
@@ -3648,17 +3637,7 @@ void ata_qc_complete(struct ata_queued_cmd *qc)
 	qc->flags &= ~ATA_QCFLAG_ACTIVE;
 
 	/* call completion callback */
-	rc = qc->complete_fn(qc);
-
-	/* if callback indicates not to complete command (non-zero),
-	 * return immediately
-	 */
-	if (rc != 0)
-		return;
-
-	ata_qc_free(qc);
-
-	VPRINTK("EXIT\n");
+	qc->complete_fn(qc);
 }
 
 static inline int ata_should_dma_map(struct ata_queued_cmd *qc)
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 0e65bfe92e6f..ce3fe928a386 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1219,7 +1219,7 @@ nothing_to_do:
 	return 1;
 }
 
-static int ata_scsi_qc_complete(struct ata_queued_cmd *qc)
+static void ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct scsi_cmnd *cmd = qc->scsicmd;
 	u8 *cdb = cmd->cmnd;
@@ -1256,7 +1256,7 @@ static int ata_scsi_qc_complete(struct ata_queued_cmd *qc)
 
 	qc->scsidone(cmd);
 
-	return 0;
+	ata_qc_free(qc);
 }
 
 /**
@@ -1982,7 +1982,7 @@ void ata_scsi_badcmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *), u8
 	done(cmd);
 }
 
-static int atapi_sense_complete(struct ata_queued_cmd *qc)
+static void atapi_sense_complete(struct ata_queued_cmd *qc)
 {
 	if (qc->err_mask && ((qc->err_mask & AC_ERR_DEV) == 0))
 		/* FIXME: not quite right; we don't want the
@@ -1993,7 +1993,7 @@ static int atapi_sense_complete(struct ata_queued_cmd *qc)
 		ata_gen_ata_desc_sense(qc);
 
 	qc->scsidone(qc->scsicmd);
-	return 0;
+	ata_qc_free(qc);
 }
 
 /* is it pointless to prefer PIO for "safety reasons"? */
@@ -2050,7 +2050,7 @@ static void atapi_request_sense(struct ata_queued_cmd *qc)
 	DPRINTK("EXIT\n");
 }
 
-static int atapi_qc_complete(struct ata_queued_cmd *qc)
+static void atapi_qc_complete(struct ata_queued_cmd *qc)
 {
 	struct scsi_cmnd *cmd = qc->scsicmd;
 	unsigned int err_mask = qc->err_mask;
@@ -2060,7 +2060,7 @@ static int atapi_qc_complete(struct ata_queued_cmd *qc)
 	if (unlikely(err_mask & AC_ERR_DEV)) {
 		cmd->result = SAM_STAT_CHECK_CONDITION;
 		atapi_request_sense(qc);
-		return 1;
+		return;
 	}
 
 	else if (unlikely(err_mask))
@@ -2100,7 +2100,7 @@ static int atapi_qc_complete(struct ata_queued_cmd *qc)
 	}
 
 	qc->scsidone(cmd);
-	return 0;
+	ata_qc_free(qc);
 }
 /**
  *	atapi_xlat - Initialize PACKET taskfile
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 46ccea215892..d58b659cf3f5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -235,7 +235,7 @@ struct ata_port;
 struct ata_queued_cmd;
 
 /* typedefs */
-typedef int (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
+typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
 
 struct ata_ioports {
 	unsigned long		cmd_addr;
-- 
cgit v1.2.3


From 11a56d2439259892319df81cf1582687d7e7fde5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 23 Jan 2006 13:09:36 +0900
Subject: [PATCH] libata: add detailed AC_ERR_* flags

Add detailed AC_ERR_* flags and use them.  Long-term goal is to
describe all errors with err_mask and tf combination (tf for failed
sector information, etc...).  After proper error diagnosis is
implemented, sense data should also be generated from err_mask instead
of directly from hardware tf registers as it is currently.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/ahci.c        |  2 +-
 drivers/scsi/libata-core.c | 12 ++++++------
 drivers/scsi/sata_mv.c     |  2 +-
 drivers/scsi/sata_sil24.c  |  2 +-
 include/linux/libata.h     | 15 ++++++++++-----
 5 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index a168b525d079..bb3686ae1885 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -680,7 +680,7 @@ static void ahci_eng_timeout(struct ata_port *ap)
 	 	 * not being called from the SCSI EH.
 	 	 */
 		qc->scsidone = scsi_finish_command;
-		qc->err_mask |= AC_ERR_OTHER;
+		qc->err_mask |= AC_ERR_TIMEOUT;
 		ata_qc_complete(qc);
 	}
 
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 43a23286d6fe..f5519f01491c 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1142,7 +1142,7 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 		 * spurious interrupt.  We can live with that.
 		 */
 		if (qc->flags & ATA_QCFLAG_ACTIVE) {
-			qc->err_mask = AC_ERR_OTHER;
+			qc->err_mask = AC_ERR_TIMEOUT;
 			ata_qc_complete(qc);
 			printk(KERN_WARNING "ata%u: qc timeout (cmd 0x%x)\n",
 			       ap->id, command);
@@ -2917,7 +2917,7 @@ static unsigned long ata_pio_poll(struct ata_port *ap)
 	status = ata_chk_status(ap);
 	if (status & ATA_BUSY) {
 		if (time_after(jiffies, ap->pio_task_timeout)) {
-			qc->err_mask |= AC_ERR_ATA_BUS;
+			qc->err_mask |= AC_ERR_TIMEOUT;
 			ap->hsm_task_state = HSM_ST_TMOUT;
 			return 0;
 		}
@@ -3295,7 +3295,7 @@ static void atapi_pio_bytes(struct ata_queued_cmd *qc)
 err_out:
 	printk(KERN_INFO "ata%u: dev %u: ATAPI check failed\n",
 	      ap->id, dev->devno);
-	qc->err_mask |= AC_ERR_ATA_BUS;
+	qc->err_mask |= AC_ERR_HSM;
 	ap->hsm_task_state = HSM_ST_ERR;
 }
 
@@ -3353,7 +3353,7 @@ static void ata_pio_block(struct ata_port *ap)
 	} else {
 		/* handle BSY=0, DRQ=0 as error */
 		if ((status & ATA_DRQ) == 0) {
-			qc->err_mask |= AC_ERR_ATA_BUS;
+			qc->err_mask |= AC_ERR_HSM;
 			ap->hsm_task_state = HSM_ST_ERR;
 			return;
 		}
@@ -4159,14 +4159,14 @@ static void atapi_packet_task(void *_data)
 	/* sleep-wait for BSY to clear */
 	DPRINTK("busy wait\n");
 	if (ata_busy_sleep(ap, ATA_TMOUT_CDB_QUICK, ATA_TMOUT_CDB)) {
-		qc->err_mask |= AC_ERR_ATA_BUS;
+		qc->err_mask |= AC_ERR_TIMEOUT;
 		goto err_out;
 	}
 
 	/* make sure DRQ is set */
 	status = ata_chk_status(ap);
 	if ((status & (ATA_BUSY | ATA_DRQ)) != ATA_DRQ) {
-		qc->err_mask |= AC_ERR_ATA_BUS;
+		qc->err_mask |= AC_ERR_HSM;
 		goto err_out;
 	}
 
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index cd54244058b5..89bcd85fa58c 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -1866,7 +1866,7 @@ static void mv_eng_timeout(struct ata_port *ap)
 	 	 */
 		spin_lock_irqsave(&ap->host_set->lock, flags);
 		qc->scsidone = scsi_finish_command;
-		qc->err_mask |= AC_ERR_OTHER;
+		qc->err_mask |= AC_ERR_TIMEOUT;
 		ata_qc_complete(qc);
 		spin_unlock_irqrestore(&ap->host_set->lock, flags);
 	}
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 923130185a9e..fb59012b9fbe 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -653,7 +653,7 @@ static void sil24_eng_timeout(struct ata_port *ap)
 	 */
 	printk(KERN_ERR "ata%u: command timeout\n", ap->id);
 	qc->scsidone = scsi_finish_command;
-	qc->err_mask |= AC_ERR_OTHER;
+	qc->err_mask |= AC_ERR_TIMEOUT;
 	ata_qc_complete(qc);
 
 	sil24_reset_controller(ap);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d58b659cf3f5..8ff3a7f6f63c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -222,10 +222,15 @@ enum hsm_task_states {
 };
 
 enum ata_completion_errors {
-	AC_ERR_OTHER		= (1 << 0),
-	AC_ERR_DEV		= (1 << 1),
-	AC_ERR_ATA_BUS		= (1 << 2),
-	AC_ERR_HOST_BUS		= (1 << 3),
+	AC_ERR_DEV		= (1 << 0), /* device reported error */
+	AC_ERR_HSM		= (1 << 1), /* host state machine violation */
+	AC_ERR_TIMEOUT		= (1 << 2), /* timeout */
+	AC_ERR_MEDIA		= (1 << 3), /* media error */
+	AC_ERR_ATA_BUS		= (1 << 4), /* ATA bus error */
+	AC_ERR_HOST_BUS		= (1 << 5), /* host bus error */
+	AC_ERR_SYSTEM		= (1 << 6), /* system error */
+	AC_ERR_INVALID		= (1 << 7), /* invalid argument */
+	AC_ERR_OTHER		= (1 << 8), /* unknown */
 };
 
 /* forward declarations */
@@ -833,7 +838,7 @@ static inline int ata_try_flush_cache(const struct ata_device *dev)
 static inline unsigned int ac_err_mask(u8 status)
 {
 	if (status & ATA_BUSY)
-		return AC_ERR_ATA_BUS;
+		return AC_ERR_HSM;
 	if (status & (ATA_ERR | ATA_DF))
 		return AC_ERR_DEV;
 	return 0;
-- 
cgit v1.2.3


From 9a3d9eb0177eb10500d49cd283b35576082a522d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 23 Jan 2006 13:09:36 +0900
Subject: [PATCH] libata: return AC_ERR_* from issue functions

Return AC_ERR_* mask from issue fuctions instead of 0/-1.  This
enables things like failing a qc with AC_ERR_HSM when the device
doesn't set DRDY when the qc is about to be issued.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/ahci.c         |  4 ++--
 drivers/scsi/libata-core.c  | 17 ++++++++---------
 drivers/scsi/libata-scsi.c  | 10 ++++------
 drivers/scsi/libata.h       |  2 +-
 drivers/scsi/pdc_adma.c     |  4 ++--
 drivers/scsi/sata_mv.c      |  4 ++--
 drivers/scsi/sata_promise.c |  4 ++--
 drivers/scsi/sata_qstor.c   |  4 ++--
 drivers/scsi/sata_sil24.c   |  4 ++--
 drivers/scsi/sata_sx4.c     |  4 ++--
 include/linux/libata.h      |  4 ++--
 11 files changed, 29 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index bb3686ae1885..0f6e4dd22901 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -184,7 +184,7 @@ struct ahci_port_priv {
 static u32 ahci_scr_read (struct ata_port *ap, unsigned int sc_reg);
 static void ahci_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val);
 static int ahci_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
-static int ahci_qc_issue(struct ata_queued_cmd *qc);
+static unsigned int ahci_qc_issue(struct ata_queued_cmd *qc);
 static irqreturn_t ahci_interrupt (int irq, void *dev_instance, struct pt_regs *regs);
 static void ahci_phy_reset(struct ata_port *ap);
 static void ahci_irq_clear(struct ata_port *ap);
@@ -800,7 +800,7 @@ static irqreturn_t ahci_interrupt (int irq, void *dev_instance, struct pt_regs *
 	return IRQ_RETVAL(handled);
 }
 
-static int ahci_qc_issue(struct ata_queued_cmd *qc)
+static unsigned int ahci_qc_issue(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 	void __iomem *port_mmio = (void __iomem *) ap->ioaddr.cmd_addr;
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index f5519f01491c..b29bf0dc948a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1125,10 +1125,9 @@ ata_exec_internal(struct ata_port *ap, struct ata_device *dev,
 	qc->private_data = &wait;
 	qc->complete_fn = ata_qc_complete_internal;
 
-	if (ata_qc_issue(qc)) {
-		qc->err_mask = AC_ERR_OTHER;
+	qc->err_mask = ata_qc_issue(qc);
+	if (qc->err_mask)
 		ata_qc_complete(qc);
-	}
 
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
@@ -3674,10 +3673,10 @@ static inline int ata_should_dma_map(struct ata_queued_cmd *qc)
  *	spin_lock_irqsave(host_set lock)
  *
  *	RETURNS:
- *	Zero on success, negative on error.
+ *	Zero on success, AC_ERR_* mask on failure
  */
 
-int ata_qc_issue(struct ata_queued_cmd *qc)
+unsigned int ata_qc_issue(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 
@@ -3702,7 +3701,7 @@ int ata_qc_issue(struct ata_queued_cmd *qc)
 
 sg_err:
 	qc->flags &= ~ATA_QCFLAG_DMAMAP;
-	return -1;
+	return AC_ERR_SYSTEM;
 }
 
 
@@ -3721,10 +3720,10 @@ sg_err:
  *	spin_lock_irqsave(host_set lock)
  *
  *	RETURNS:
- *	Zero on success, negative on error.
+ *	Zero on success, AC_ERR_* mask on failure
  */
 
-int ata_qc_issue_prot(struct ata_queued_cmd *qc)
+unsigned int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 
@@ -3769,7 +3768,7 @@ int ata_qc_issue_prot(struct ata_queued_cmd *qc)
 
 	default:
 		WARN_ON(1);
-		return -1;
+		return AC_ERR_SYSTEM;
 	}
 
 	return 0;
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index c496309f691a..95e3c278dd43 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1322,10 +1322,9 @@ static void ata_scsi_translate(struct ata_port *ap, struct ata_device *dev,
 		goto early_finish;
 
 	/* select device, send command to hardware */
-	if (ata_qc_issue(qc)) {
-		qc->err_mask |= AC_ERR_OTHER;
+	qc->err_mask = ata_qc_issue(qc);
+	if (qc->err_mask)
 		ata_qc_complete(qc);
-	}
 
 	VPRINTK("EXIT\n");
 	return;
@@ -2044,10 +2043,9 @@ static void atapi_request_sense(struct ata_queued_cmd *qc)
 
 	qc->complete_fn = atapi_sense_complete;
 
-	if (ata_qc_issue(qc)) {
-		qc->err_mask |= AC_ERR_OTHER;
+	qc->err_mask = ata_qc_issue(qc);
+	if (qc->err_mask)
 		ata_qc_complete(qc);
-	}
 
 	DPRINTK("EXIT\n");
 }
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index e03ce48b7b4b..9d76923a2253 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -45,7 +45,7 @@ extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 				      struct ata_device *dev);
 extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
-extern int ata_qc_issue(struct ata_queued_cmd *qc);
+extern unsigned int ata_qc_issue(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
 extern void ata_dev_select(struct ata_port *ap, unsigned int device,
                            unsigned int wait, unsigned int can_sleep);
diff --git a/drivers/scsi/pdc_adma.c b/drivers/scsi/pdc_adma.c
index e8df0c9ec1e6..3a6bf58dc37b 100644
--- a/drivers/scsi/pdc_adma.c
+++ b/drivers/scsi/pdc_adma.c
@@ -131,7 +131,7 @@ static void adma_host_stop(struct ata_host_set *host_set);
 static void adma_port_stop(struct ata_port *ap);
 static void adma_phy_reset(struct ata_port *ap);
 static void adma_qc_prep(struct ata_queued_cmd *qc);
-static int adma_qc_issue(struct ata_queued_cmd *qc);
+static unsigned int adma_qc_issue(struct ata_queued_cmd *qc);
 static int adma_check_atapi_dma(struct ata_queued_cmd *qc);
 static void adma_bmdma_stop(struct ata_queued_cmd *qc);
 static u8 adma_bmdma_status(struct ata_port *ap);
@@ -419,7 +419,7 @@ static inline void adma_packet_start(struct ata_queued_cmd *qc)
 	writew(aPIOMD4 | aGO, chan + ADMA_CONTROL);
 }
 
-static int adma_qc_issue(struct ata_queued_cmd *qc)
+static unsigned int adma_qc_issue(struct ata_queued_cmd *qc)
 {
 	struct adma_port_priv *pp = qc->ap->private_data;
 
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index 89bcd85fa58c..281223a0e45f 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -328,7 +328,7 @@ static void mv_host_stop(struct ata_host_set *host_set);
 static int mv_port_start(struct ata_port *ap);
 static void mv_port_stop(struct ata_port *ap);
 static void mv_qc_prep(struct ata_queued_cmd *qc);
-static int mv_qc_issue(struct ata_queued_cmd *qc);
+static unsigned int mv_qc_issue(struct ata_queued_cmd *qc);
 static irqreturn_t mv_interrupt(int irq, void *dev_instance,
 				struct pt_regs *regs);
 static void mv_eng_timeout(struct ata_port *ap);
@@ -1040,7 +1040,7 @@ static void mv_qc_prep(struct ata_queued_cmd *qc)
  *      LOCKING:
  *      Inherited from caller.
  */
-static int mv_qc_issue(struct ata_queued_cmd *qc)
+static unsigned int mv_qc_issue(struct ata_queued_cmd *qc)
 {
 	void __iomem *port_mmio = mv_ap_base(qc->ap);
 	struct mv_port_priv *pp = qc->ap->private_data;
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index b0b0a69b3563..bac36d5b7c3e 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -95,7 +95,7 @@ static void pdc_qc_prep(struct ata_queued_cmd *qc);
 static void pdc_tf_load_mmio(struct ata_port *ap, const struct ata_taskfile *tf);
 static void pdc_exec_command_mmio(struct ata_port *ap, const struct ata_taskfile *tf);
 static void pdc_irq_clear(struct ata_port *ap);
-static int pdc_qc_issue_prot(struct ata_queued_cmd *qc);
+static unsigned int pdc_qc_issue_prot(struct ata_queued_cmd *qc);
 
 
 static struct scsi_host_template pdc_ata_sht = {
@@ -544,7 +544,7 @@ static inline void pdc_packet_start(struct ata_queued_cmd *qc)
 	readl((void __iomem *) ap->ioaddr.cmd_addr + PDC_PKT_SUBMIT); /* flush */
 }
 
-static int pdc_qc_issue_prot(struct ata_queued_cmd *qc)
+static unsigned int pdc_qc_issue_prot(struct ata_queued_cmd *qc)
 {
 	switch (qc->tf.protocol) {
 	case ATA_PROT_DMA:
diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c
index de05e2883f9c..2afbeb77f6fe 100644
--- a/drivers/scsi/sata_qstor.c
+++ b/drivers/scsi/sata_qstor.c
@@ -120,7 +120,7 @@ static void qs_host_stop(struct ata_host_set *host_set);
 static void qs_port_stop(struct ata_port *ap);
 static void qs_phy_reset(struct ata_port *ap);
 static void qs_qc_prep(struct ata_queued_cmd *qc);
-static int qs_qc_issue(struct ata_queued_cmd *qc);
+static unsigned int qs_qc_issue(struct ata_queued_cmd *qc);
 static int qs_check_atapi_dma(struct ata_queued_cmd *qc);
 static void qs_bmdma_stop(struct ata_queued_cmd *qc);
 static u8 qs_bmdma_status(struct ata_port *ap);
@@ -352,7 +352,7 @@ static inline void qs_packet_start(struct ata_queued_cmd *qc)
 	readl(chan + QS_CCT_CFF);          /* flush */
 }
 
-static int qs_qc_issue(struct ata_queued_cmd *qc)
+static unsigned int qs_qc_issue(struct ata_queued_cmd *qc)
 {
 	struct qs_port_priv *pp = qc->ap->private_data;
 
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index fb59012b9fbe..5a7a7b1d4add 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -251,7 +251,7 @@ static void sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val);
 static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf);
 static void sil24_phy_reset(struct ata_port *ap);
 static void sil24_qc_prep(struct ata_queued_cmd *qc);
-static int sil24_qc_issue(struct ata_queued_cmd *qc);
+static unsigned int sil24_qc_issue(struct ata_queued_cmd *qc);
 static void sil24_irq_clear(struct ata_port *ap);
 static void sil24_eng_timeout(struct ata_port *ap);
 static irqreturn_t sil24_interrupt(int irq, void *dev_instance, struct pt_regs *regs);
@@ -557,7 +557,7 @@ static void sil24_qc_prep(struct ata_queued_cmd *qc)
 		sil24_fill_sg(qc, sge);
 }
 
-static int sil24_qc_issue(struct ata_queued_cmd *qc)
+static unsigned int sil24_qc_issue(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
 	void __iomem *port = (void __iomem *)ap->ioaddr.cmd_addr;
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index bc87c16c80d2..3175c6bb4fec 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -174,7 +174,7 @@ static void pdc20621_get_from_dimm(struct ata_probe_ent *pe,
 static void pdc20621_put_to_dimm(struct ata_probe_ent *pe,
 				 void *psource, u32 offset, u32 size);
 static void pdc20621_irq_clear(struct ata_port *ap);
-static int pdc20621_qc_issue_prot(struct ata_queued_cmd *qc);
+static unsigned int pdc20621_qc_issue_prot(struct ata_queued_cmd *qc);
 
 
 static struct scsi_host_template pdc_sata_sht = {
@@ -678,7 +678,7 @@ static void pdc20621_packet_start(struct ata_queued_cmd *qc)
 	}
 }
 
-static int pdc20621_qc_issue_prot(struct ata_queued_cmd *qc)
+static unsigned int pdc20621_qc_issue_prot(struct ata_queued_cmd *qc)
 {
 	switch (qc->tf.protocol) {
 	case ATA_PROT_DMA:
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 8ff3a7f6f63c..b1ea2f98bfbb 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -427,7 +427,7 @@ struct ata_port_operations {
 	void (*bmdma_start) (struct ata_queued_cmd *qc);
 
 	void (*qc_prep) (struct ata_queued_cmd *qc);
-	int (*qc_issue) (struct ata_queued_cmd *qc);
+	unsigned int (*qc_issue) (struct ata_queued_cmd *qc);
 
 	void (*eng_timeout) (struct ata_port *ap);
 
@@ -515,7 +515,7 @@ extern void ata_port_stop (struct ata_port *ap);
 extern void ata_host_stop (struct ata_host_set *host_set);
 extern irqreturn_t ata_interrupt (int irq, void *dev_instance, struct pt_regs *regs);
 extern void ata_qc_prep(struct ata_queued_cmd *qc);
-extern int ata_qc_issue_prot(struct ata_queued_cmd *qc);
+extern unsigned int ata_qc_issue_prot(struct ata_queued_cmd *qc);
 extern void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf,
 		unsigned int buflen);
 extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
-- 
cgit v1.2.3


From a72ec4ce6d3ae92e76baf5b2c65cc26e5e775e83 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 23 Jan 2006 13:09:37 +0900
Subject: [PATCH] libata: implement and apply ata_eh_qc_complete/retry()

Implement ata_eh_qc_complete/retry() using scsi_eh_finish_cmd() and
scsi_eh_flush_done_q().  This removes all eh scsicmd finish hacks from
low level drivers.

This change was first suggested by Jeff Garzik.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/ahci.c         | 12 +++------
 drivers/scsi/libata-core.c  | 14 ++++-------
 drivers/scsi/libata-scsi.c  | 59 ++++++++++++++++++++++++++++++++++++++++-----
 drivers/scsi/sata_mv.c      | 12 +--------
 drivers/scsi/sata_promise.c | 12 ++-------
 drivers/scsi/sata_sil24.c   | 10 +-------
 drivers/scsi/sata_sx4.c     | 12 ++-------
 include/linux/libata.h      |  3 +++
 8 files changed, 70 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 0f6e4dd22901..5a6b23009897 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -672,19 +672,13 @@ static void ahci_eng_timeout(struct ata_port *ap)
 		       ap->id);
 	} else {
 		ahci_restart_port(ap, readl(port_mmio + PORT_IRQ_STAT));
-
-		/* hack alert!  We cannot use the supplied completion
-	 	 * function from inside the ->eh_strategy_handler() thread.
-	 	 * libata is the only user of ->eh_strategy_handler() in
-	 	 * any kernel, so the default scsi_done() assumes it is
-	 	 * not being called from the SCSI EH.
-	 	 */
-		qc->scsidone = scsi_finish_command;
 		qc->err_mask |= AC_ERR_TIMEOUT;
-		ata_qc_complete(qc);
 	}
 
 	spin_unlock_irqrestore(&host_set->lock, flags);
+
+	if (qc)
+		ata_eh_qc_complete(qc);
 }
 
 static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index b29bf0dc948a..93ae2dcc837c 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3449,14 +3449,6 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 
 	spin_lock_irqsave(&host_set->lock, flags);
 
-	/* hack alert!  We cannot use the supplied completion
-	 * function from inside the ->eh_strategy_handler() thread.
-	 * libata is the only user of ->eh_strategy_handler() in
-	 * any kernel, so the default scsi_done() assumes it is
-	 * not being called from the SCSI EH.
-	 */
-	qc->scsidone = scsi_finish_command;
-
 	switch (qc->tf.protocol) {
 
 	case ATA_PROT_DMA:
@@ -3480,12 +3472,13 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 
 		/* complete taskfile transaction */
 		qc->err_mask |= ac_err_mask(drv_stat);
-		ata_qc_complete(qc);
 		break;
 	}
 
 	spin_unlock_irqrestore(&host_set->lock, flags);
 
+	ata_eh_qc_complete(qc);
+
 	DPRINTK("EXIT\n");
 }
 
@@ -4422,6 +4415,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 
 	INIT_WORK(&ap->packet_task, atapi_packet_task, ap);
 	INIT_WORK(&ap->pio_task, ata_pio_task, ap);
+	INIT_LIST_HEAD(&ap->eh_done_q);
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
 		ap->device[i].devno = i;
@@ -5165,6 +5159,8 @@ EXPORT_SYMBOL_GPL(ata_dev_classify);
 EXPORT_SYMBOL_GPL(ata_dev_id_string);
 EXPORT_SYMBOL_GPL(ata_dev_config);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
+EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
+EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
 
 EXPORT_SYMBOL_GPL(ata_pio_need_iordy);
 EXPORT_SYMBOL_GPL(ata_timing_compute);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 95e3c278dd43..ab6b53349d6f 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -738,17 +738,64 @@ int ata_scsi_error(struct Scsi_Host *host)
 	ap = (struct ata_port *) &host->hostdata[0];
 	ap->ops->eng_timeout(ap);
 
-	/* TODO: this is per-command; when queueing is supported
-	 * this code will either change or move to a more
-	 * appropriate place
-	 */
-	host->host_failed--;
-	INIT_LIST_HEAD(&host->eh_cmd_q);
+	assert(host->host_failed == 0 && list_empty(&host->eh_cmd_q));
+
+	scsi_eh_flush_done_q(&ap->eh_done_q);
 
 	DPRINTK("EXIT\n");
 	return 0;
 }
 
+static void ata_eh_scsidone(struct scsi_cmnd *scmd)
+{
+	/* nada */
+}
+
+static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
+{
+	struct ata_port *ap = qc->ap;
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	qc->scsidone = ata_eh_scsidone;
+	ata_qc_complete(qc);
+	assert(!ata_tag_valid(qc->tag));
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
+}
+
+/**
+ *	ata_eh_qc_complete - Complete an active ATA command from EH
+ *	@qc: Command to complete
+ *
+ *	Indicate to the mid and upper layers that an ATA command has
+ *	completed.  To be used from EH.
+ */
+void ata_eh_qc_complete(struct ata_queued_cmd *qc)
+{
+	struct scsi_cmnd *scmd = qc->scsicmd;
+	scmd->retries = scmd->allowed;
+	__ata_eh_qc_complete(qc);
+}
+
+/**
+ *	ata_eh_qc_retry - Tell midlayer to retry an ATA command after EH
+ *	@qc: Command to retry
+ *
+ *	Indicate to the mid and upper layers that an ATA command
+ *	should be retried.  To be used from EH.
+ *
+ *	SCSI midlayer limits the number of retries to scmd->allowed.
+ *	This function might need to adjust scmd->retries for commands
+ *	which get retried due to unrelated NCQ failures.
+ */
+void ata_eh_qc_retry(struct ata_queued_cmd *qc)
+{
+	__ata_eh_qc_complete(qc);
+}
+
 /**
  *	ata_scsi_start_stop_xlat - Translate SCSI START STOP UNIT command
  *	@qc: Storage for translated ATA taskfile
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index 281223a0e45f..d9a554ca45c7 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -1839,7 +1839,6 @@ static void mv_phy_reset(struct ata_port *ap)
 static void mv_eng_timeout(struct ata_port *ap)
 {
 	struct ata_queued_cmd *qc;
-	unsigned long flags;
 
 	printk(KERN_ERR "ata%u: Entering mv_eng_timeout\n",ap->id);
 	DPRINTK("All regs @ start of eng_timeout\n");
@@ -1858,17 +1857,8 @@ static void mv_eng_timeout(struct ata_port *ap)
 		printk(KERN_ERR "ata%u: BUG: timeout without command\n",
 		       ap->id);
 	} else {
-		/* hack alert!  We cannot use the supplied completion
-	 	 * function from inside the ->eh_strategy_handler() thread.
-	 	 * libata is the only user of ->eh_strategy_handler() in
-	 	 * any kernel, so the default scsi_done() assumes it is
-	 	 * not being called from the SCSI EH.
-	 	 */
-		spin_lock_irqsave(&ap->host_set->lock, flags);
-		qc->scsidone = scsi_finish_command;
 		qc->err_mask |= AC_ERR_TIMEOUT;
-		ata_qc_complete(qc);
-		spin_unlock_irqrestore(&ap->host_set->lock, flags);
+		ata_eh_qc_complete(qc);
 	}
 }
 
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index bac36d5b7c3e..77ef646b0f92 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -400,21 +400,12 @@ static void pdc_eng_timeout(struct ata_port *ap)
 		goto out;
 	}
 
-	/* hack alert!  We cannot use the supplied completion
-	 * function from inside the ->eh_strategy_handler() thread.
-	 * libata is the only user of ->eh_strategy_handler() in
-	 * any kernel, so the default scsi_done() assumes it is
-	 * not being called from the SCSI EH.
-	 */
-	qc->scsidone = scsi_finish_command;
-
 	switch (qc->tf.protocol) {
 	case ATA_PROT_DMA:
 	case ATA_PROT_NODATA:
 		printk(KERN_ERR "ata%u: command timeout\n", ap->id);
 		drv_stat = ata_wait_idle(ap);
 		qc->err_mask |= __ac_err_mask(drv_stat);
-		ata_qc_complete(qc);
 		break;
 
 	default:
@@ -424,12 +415,13 @@ static void pdc_eng_timeout(struct ata_port *ap)
 		       ap->id, qc->tf.command, drv_stat);
 
 		qc->err_mask |= ac_err_mask(drv_stat);
-		ata_qc_complete(qc);
 		break;
 	}
 
 out:
 	spin_unlock_irqrestore(&host_set->lock, flags);
+	if (qc)
+		ata_eh_qc_complete(qc);
 	DPRINTK("EXIT\n");
 }
 
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 5a7a7b1d4add..7222fc7ff3fc 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -644,17 +644,9 @@ static void sil24_eng_timeout(struct ata_port *ap)
 		return;
 	}
 
-	/*
-	 * hack alert!  We cannot use the supplied completion
-	 * function from inside the ->eh_strategy_handler() thread.
-	 * libata is the only user of ->eh_strategy_handler() in
-	 * any kernel, so the default scsi_done() assumes it is
-	 * not being called from the SCSI EH.
-	 */
 	printk(KERN_ERR "ata%u: command timeout\n", ap->id);
-	qc->scsidone = scsi_finish_command;
 	qc->err_mask |= AC_ERR_TIMEOUT;
-	ata_qc_complete(qc);
+	ata_eh_qc_complete(qc);
 
 	sil24_reset_controller(ap);
 }
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 3175c6bb4fec..9f992fbcf2e7 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -872,20 +872,11 @@ static void pdc_eng_timeout(struct ata_port *ap)
 		goto out;
 	}
 
-	/* hack alert!  We cannot use the supplied completion
-	 * function from inside the ->eh_strategy_handler() thread.
-	 * libata is the only user of ->eh_strategy_handler() in
-	 * any kernel, so the default scsi_done() assumes it is
-	 * not being called from the SCSI EH.
-	 */
-	qc->scsidone = scsi_finish_command;
-
 	switch (qc->tf.protocol) {
 	case ATA_PROT_DMA:
 	case ATA_PROT_NODATA:
 		printk(KERN_ERR "ata%u: command timeout\n", ap->id);
 		qc->err_mask |= __ac_err_mask(ata_wait_idle(ap));
-		ata_qc_complete(qc);
 		break;
 
 	default:
@@ -895,12 +886,13 @@ static void pdc_eng_timeout(struct ata_port *ap)
 		       ap->id, qc->tf.command, drv_stat);
 
 		qc->err_mask |= ac_err_mask(drv_stat);
-		ata_qc_complete(qc);
 		break;
 	}
 
 out:
 	spin_unlock_irqrestore(&host_set->lock, flags);
+	if (qc)
+		ata_eh_qc_complete(qc);
 	DPRINTK("EXIT\n");
 }
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index b1ea2f98bfbb..576788de962a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -398,6 +398,7 @@ struct ata_port {
 	unsigned long		pio_task_timeout;
 
 	u32			msg_enable;
+	struct list_head	eh_done_q;
 
 	void			*private_data;
 };
@@ -490,6 +491,8 @@ extern int ata_scsi_detect(struct scsi_host_template *sht);
 extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
 extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *));
 extern int ata_scsi_error(struct Scsi_Host *host);
+extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
+extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 extern int ata_scsi_release(struct Scsi_Host *host);
 extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
 extern int ata_scsi_device_resume(struct scsi_device *);
-- 
cgit v1.2.3


From 6f8b99589524f3e759e44721376abcdf88ed8915 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 24 Jan 2006 17:05:21 +0900
Subject: [PATCH] libata: export ata_busy_sleep

Export ata_busy_sleep(), to be used by low level driver reset functions.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 9 +++------
 include/linux/libata.h     | 3 +++
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 71de697e2327..4336fc889acd 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -61,9 +61,6 @@
 
 #include "libata.h"
 
-static unsigned int ata_busy_sleep (struct ata_port *ap,
-				    unsigned long tmout_pat,
-			    	    unsigned long tmout);
 static void ata_dev_reread_id(struct ata_port *ap, struct ata_device *dev);
 static void ata_dev_init_params(struct ata_port *ap, struct ata_device *dev);
 static void ata_set_mode(struct ata_port *ap);
@@ -1960,9 +1957,8 @@ err_out:
  *
  */
 
-static unsigned int ata_busy_sleep (struct ata_port *ap,
-				    unsigned long tmout_pat,
-			    	    unsigned long tmout)
+unsigned int ata_busy_sleep (struct ata_port *ap,
+			     unsigned long tmout_pat, unsigned long tmout)
 {
 	unsigned long timer_start, timeout;
 	u8 status;
@@ -5167,6 +5163,7 @@ EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
 EXPORT_SYMBOL_GPL(ata_port_disable);
 EXPORT_SYMBOL_GPL(ata_ratelimit);
+EXPORT_SYMBOL_GPL(ata_busy_sleep);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
 EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
 EXPORT_SYMBOL_GPL(ata_scsi_error);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 576788de962a..45646f6ebbf5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -500,6 +500,9 @@ extern int ata_scsi_device_suspend(struct scsi_device *);
 extern int ata_device_resume(struct ata_port *, struct ata_device *);
 extern int ata_device_suspend(struct ata_port *, struct ata_device *);
 extern int ata_ratelimit(void);
+extern unsigned int ata_busy_sleep(struct ata_port *ap,
+				   unsigned long timeout_pat,
+				   unsigned long timeout);
 
 /*
  * Default driver ops implementations
-- 
cgit v1.2.3


From c19ba8af4f104cca28d548cac55c128b28dd31fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 24 Jan 2006 17:05:22 +0900
Subject: [PATCH] libata: new ->probe_reset operation

Add new ->probe_reset operation to ata_port_operations obsoleting
->phy_reset.  The main difference from ->phy_reset is that the new
operation is not allowed to manipulate libata internals directly.
It's not allowed to configure or disable the port or devices.  It can
only succeed or fail and classify attached devices into passed
@classes.

This change gives more control to higher level and eases sharing reset
methods with EH.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 19 ++++++++++++++++++-
 include/linux/libata.h     |  8 +++++---
 2 files changed, 23 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 1f78e246f5e0..147e1461062d 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1485,7 +1485,24 @@ static int ata_bus_probe(struct ata_port *ap)
 {
 	unsigned int i, found = 0;
 
-	ap->ops->phy_reset(ap);
+	if (ap->ops->probe_reset) {
+		unsigned int classes[ATA_MAX_DEVICES];
+		int rc;
+
+		ata_port_probe(ap);
+
+		rc = ap->ops->probe_reset(ap, classes);
+		if (rc == 0) {
+			for (i = 0; i < ATA_MAX_DEVICES; i++)
+				ap->device[i].class = classes[i];
+		} else {
+			printk(KERN_ERR "ata%u: probe reset failed, "
+			       "disabling port\n", ap->id);
+			ata_port_disable(ap);
+		}
+	} else
+		ap->ops->phy_reset(ap);
+
 	if (ap->flags & ATA_FLAG_PORT_DISABLED)
 		goto err_out;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 45646f6ebbf5..a84d1c3a5429 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -148,9 +148,9 @@ enum {
 	ATA_FLAG_PORT_DISABLED	= (1 << 2), /* port is disabled, ignore it */
 	ATA_FLAG_SATA		= (1 << 3),
 	ATA_FLAG_NO_LEGACY	= (1 << 4), /* no legacy mode check */
-	ATA_FLAG_SRST		= (1 << 5), /* use ATA SRST, not E.D.D. */
+	ATA_FLAG_SRST		= (1 << 5), /* (obsolete) use ATA SRST, not E.D.D. */
 	ATA_FLAG_MMIO		= (1 << 6), /* use MMIO, not PIO */
-	ATA_FLAG_SATA_RESET	= (1 << 7), /* use COMRESET */
+	ATA_FLAG_SATA_RESET	= (1 << 7), /* (obsolete) use COMRESET */
 	ATA_FLAG_PIO_DMA	= (1 << 8), /* PIO cmds via DMA */
 	ATA_FLAG_NOINTR		= (1 << 9), /* FIXME: Remove this once
 					     * proper HSM is in place. */
@@ -419,7 +419,9 @@ struct ata_port_operations {
 	u8   (*check_altstatus)(struct ata_port *ap);
 	void (*dev_select)(struct ata_port *ap, unsigned int device);
 
-	void (*phy_reset) (struct ata_port *ap);
+	void (*phy_reset) (struct ata_port *ap); /* obsolete */
+	int (*probe_reset) (struct ata_port *ap, unsigned int *classes);
+
 	void (*post_set_mode) (struct ata_port *ap);
 
 	int (*check_atapi_dma) (struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From d0412d967032b9e147bcbacc9ff0c0342636cf2d Mon Sep 17 00:00:00 2001
From: James Chapman <jchapman@katalix.com>
Date: Fri, 27 Jan 2006 01:15:30 -0700
Subject: [PATCH] mv643xx_eth: use MII library for ethtool functions

Use the common ethtool support functions of the MII library.
Add generic MII ioctl handler.
Add PHY parameter speed/duplex/negotiation initialization and modification.

Signed-off-by: James Chapman <jchapman@katalix.com>
Signed-off-by: Dale Farnsworth <dale@farnsworth.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/net/mv643xx_eth.c | 351 +++++++++++++++++++++++++++-------------------
 include/linux/mv643xx.h   |   3 +
 2 files changed, 213 insertions(+), 141 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index bca7257c707a..ff2b613a7436 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -101,6 +101,7 @@ static void ethernet_phy_set(unsigned int eth_port_num, int phy_addr);
 static int ethernet_phy_detect(unsigned int eth_port_num);
 static int mv643xx_mdio_read(struct net_device *dev, int phy_id, int location);
 static void mv643xx_mdio_write(struct net_device *dev, int phy_id, int location, int val);
+static int mv643xx_eth_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
 static struct ethtool_ops mv643xx_ethtool_ops;
 
 static char mv643xx_driver_name[] = "mv643xx_eth";
@@ -457,6 +458,56 @@ static int mv643xx_eth_receive_queue(struct net_device *dev)
 	return received_packets;
 }
 
+/* Set the mv643xx port configuration register for the speed/duplex mode. */
+static void mv643xx_eth_update_pscr(struct net_device *dev,
+				    struct ethtool_cmd *ecmd)
+{
+	struct mv643xx_private *mp = netdev_priv(dev);
+	int port_num = mp->port_num;
+	u32 o_pscr, n_pscr;
+	unsigned int channels;
+
+	o_pscr = mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num));
+	n_pscr = o_pscr;
+
+	/* clear speed, duplex and rx buffer size fields */
+	n_pscr &= ~(MV643XX_ETH_SET_MII_SPEED_TO_100  |
+		   MV643XX_ETH_SET_GMII_SPEED_TO_1000 |
+		   MV643XX_ETH_SET_FULL_DUPLEX_MODE   |
+		   MV643XX_ETH_MAX_RX_PACKET_MASK);
+
+	if (ecmd->duplex == DUPLEX_FULL)
+		n_pscr |= MV643XX_ETH_SET_FULL_DUPLEX_MODE;
+
+	if (ecmd->speed == SPEED_1000)
+		n_pscr |= MV643XX_ETH_SET_GMII_SPEED_TO_1000 |
+			  MV643XX_ETH_MAX_RX_PACKET_9700BYTE;
+	else {
+		if (ecmd->speed == SPEED_100)
+			n_pscr |= MV643XX_ETH_SET_MII_SPEED_TO_100;
+		n_pscr |= MV643XX_ETH_MAX_RX_PACKET_1522BYTE;
+	}
+
+	if (n_pscr != o_pscr) {
+		if ((o_pscr & MV643XX_ETH_SERIAL_PORT_ENABLE) == 0)
+			mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num),
+								n_pscr);
+		else {
+			channels = mv643xx_eth_port_disable_tx(port_num);
+
+			o_pscr &= ~MV643XX_ETH_SERIAL_PORT_ENABLE;
+			mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num),
+								o_pscr);
+			mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num),
+								n_pscr);
+			mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num),
+								n_pscr);
+			if (channels)
+				mv643xx_eth_port_enable_tx(port_num, channels);
+		}
+	}
+}
+
 /*
  * mv643xx_eth_int_handler
  *
@@ -539,13 +590,19 @@ static irqreturn_t mv643xx_eth_int_handler(int irq, void *dev_id,
 	}
 	/* PHY status changed */
 	if (eth_int_cause_ext & (BIT16 | BIT20)) {
+		struct ethtool_cmd cmd;
+
 		if (mii_link_ok(&mp->mii)) {
+			mii_ethtool_gset(&mp->mii, &cmd);
+			mv643xx_eth_update_pscr(dev, &cmd);
 			if (!netif_carrier_ok(dev)) {
 				netif_carrier_on(dev);
-				netif_wake_queue(dev);
-				/* Start TX queue */
-				mv643xx_eth_port_enable_tx(port_num,
-						mp->port_tx_queue_command);
+				if (mp->tx_ring_size > mp->tx_desc_count +
+							MAX_DESCS_PER_SKB) {
+					netif_wake_queue(dev);
+					/* Start TX queue */
+					mv643xx_eth_port_enable_tx(port_num, mp->port_tx_queue_command);
+				}
 			}
 		} else if (netif_carrier_ok(dev)) {
 			netif_stop_queue(dev);
@@ -729,6 +786,34 @@ static void ether_init_tx_desc_ring(struct mv643xx_private *mp)
 	mp->port_tx_queue_command = 1;
 }
 
+static int mv643xx_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct mv643xx_private *mp = netdev_priv(dev);
+	int err;
+
+	spin_lock_irq(&mp->lock);
+	err = mii_ethtool_sset(&mp->mii, cmd);
+	spin_unlock_irq(&mp->lock);
+
+	return err;
+}
+
+static int mv643xx_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct mv643xx_private *mp = netdev_priv(dev);
+	int err;
+
+	spin_lock_irq(&mp->lock);
+	err = mii_ethtool_gset(&mp->mii, cmd);
+	spin_unlock_irq(&mp->lock);
+
+	/* The PHY may support 1000baseT_Half, but the mv643xx does not */
+	cmd->supported &= ~SUPPORTED_1000baseT_Half;
+	cmd->advertising &= ~ADVERTISED_1000baseT_Half;
+
+	return err;
+}
+
 /*
  * mv643xx_eth_open
  *
@@ -842,6 +927,10 @@ static int mv643xx_eth_open(struct net_device *dev)
 
 	mv643xx_eth_rx_task(dev);	/* Fill RX ring with skb's */
 
+	/* Clear any pending ethernet port interrupts */
+	mv_write(MV643XX_ETH_INTERRUPT_CAUSE_REG(port_num), 0);
+	mv_write(MV643XX_ETH_INTERRUPT_CAUSE_EXTEND_REG(port_num), 0);
+
 	eth_port_start(dev);
 
 	/* Interrupt Coalescing */
@@ -854,16 +943,13 @@ static int mv643xx_eth_open(struct net_device *dev)
 	mp->tx_int_coal =
 		eth_port_set_tx_coal(port_num, 133000000, MV643XX_TX_COAL);
 
-	/* Clear any pending ethernet port interrupts */
-	mv_write(MV643XX_ETH_INTERRUPT_CAUSE_REG(port_num), 0);
-	mv_write(MV643XX_ETH_INTERRUPT_CAUSE_EXTEND_REG(port_num), 0);
-
 	/* Unmask phy and link status changes interrupts */
 	mv_write(MV643XX_ETH_INTERRUPT_EXTEND_MASK_REG(port_num),
 						INT_UNMASK_ALL_EXT);
 
 	/* Unmask RX buffer and TX end interrupt */
 	mv_write(MV643XX_ETH_INTERRUPT_MASK_REG(port_num), INT_UNMASK_ALL);
+
 	return 0;
 
 out_free_tx_skb:
@@ -1318,6 +1404,35 @@ static void mv643xx_netpoll(struct net_device *netdev)
 }
 #endif
 
+static void mv643xx_init_ethtool_cmd(struct net_device *dev, int phy_address,
+				     int speed, int duplex,
+				     struct ethtool_cmd *cmd)
+{
+	struct mv643xx_private *mp = netdev_priv(dev);
+
+	memset(cmd, 0, sizeof(*cmd));
+
+	cmd->port = PORT_MII;
+	cmd->transceiver = XCVR_INTERNAL;
+	cmd->phy_address = phy_address;
+
+	if (speed == 0) {
+		cmd->autoneg = AUTONEG_ENABLE;
+		/* mii lib checks, but doesn't use speed on AUTONEG_ENABLE */
+		cmd->speed = SPEED_100;
+		cmd->advertising = ADVERTISED_10baseT_Half  |
+				   ADVERTISED_10baseT_Full  |
+				   ADVERTISED_100baseT_Half |
+				   ADVERTISED_100baseT_Full;
+		if (mp->mii.supports_gmii)
+			cmd->advertising |= ADVERTISED_1000baseT_Full;
+	} else {
+		cmd->autoneg = AUTONEG_DISABLE;
+		cmd->speed = speed;
+		cmd->duplex = duplex;
+	}
+}
+
 /*/
  * mv643xx_eth_probe
  *
@@ -1338,6 +1453,10 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 	u8 *p;
 	struct resource *res;
 	int err;
+	struct ethtool_cmd cmd;
+	u32 pscr;
+	int duplex;
+	int speed;
 
 	dev = alloc_etherdev(sizeof(struct mv643xx_private));
 	if (!dev)
@@ -1375,6 +1494,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 	dev->tx_queue_len = mp->tx_ring_size;
 	dev->base_addr = 0;
 	dev->change_mtu = mv643xx_eth_change_mtu;
+	dev->do_ioctl = mv643xx_eth_do_ioctl;
 	SET_ETHTOOL_OPS(dev, &mv643xx_ethtool_ops);
 
 #ifdef MV643XX_CHECKSUM_OFFLOAD_TX
@@ -1452,10 +1572,35 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 		pr_debug("MV643xx ethernet port %d: "
 					"No PHY detected at addr %d\n",
 					port_num, ethernet_phy_get(port_num));
-		return err;
+		goto out;
 	}
 
+	pscr = mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num));
+	pscr &= ~MV643XX_ETH_SERIAL_PORT_ENABLE;
+	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
+	pscr = mp->port_serial_control;
+	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
+
+	if (!(pscr & MV643XX_ETH_DISABLE_AUTO_NEG_FOR_DUPLX) &&
+	    !(pscr & MV643XX_ETH_DISABLE_AUTO_NEG_SPEED_GMII))
+		speed = 0;
+	else if (pscr & MV643XX_ETH_PORT_STATUS_GMII_1000)
+		speed = SPEED_1000;
+	else if (pscr & MV643XX_ETH_PORT_STATUS_MII_100)
+		speed = SPEED_100;
+	else
+		speed = SPEED_10;
+
+	if (pscr & MV643XX_ETH_PORT_STATUS_FULL_DUPLEX)
+		duplex = DUPLEX_FULL;
+	else
+		duplex = DUPLEX_HALF;
+
+	ethernet_phy_reset(mp->port_num);
 	mp->mii.supports_gmii = mii_check_gmii_support(&mp->mii);
+	mv643xx_init_ethtool_cmd(dev, mp->mii.phy_id, speed, duplex, &cmd);
+	mv643xx_eth_update_pscr(dev, &cmd);
+	mv643xx_set_settings(dev, &cmd);
 
 	err = register_netdev(dev);
 	if (err)
@@ -1775,8 +1920,6 @@ static void eth_port_init(struct mv643xx_private *mp)
 	eth_port_reset(mp->port_num);
 
 	eth_port_init_mac_tables(mp->port_num);
-
-	ethernet_phy_reset(mp->port_num);
 }
 
 /*
@@ -1811,6 +1954,8 @@ static void eth_port_start(struct net_device *dev)
 	struct mv643xx_private *mp = netdev_priv(dev);
 	unsigned int port_num = mp->port_num;
 	int tx_curr_desc, rx_curr_desc;
+	u32 pscr;
+	struct ethtool_cmd ethtool_cmd;
 
 	/* Assignment of Tx CTRP of given queue */
 	tx_curr_desc = mp->tx_curr_desc_q;
@@ -1828,31 +1973,35 @@ static void eth_port_start(struct net_device *dev)
 	/* Assign port configuration and command. */
 	mv_write(MV643XX_ETH_PORT_CONFIG_REG(port_num), mp->port_config);
 
-	mv_write(MV643XX_ETH_PORT_CONFIG_EXTEND_REG(port_num),
-						mp->port_config_extend);
+	pscr = mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num));
+	pscr &= ~MV643XX_ETH_SERIAL_PORT_ENABLE;
+	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
 
+	pscr &= ~MV643XX_ETH_FORCE_LINK_PASS;
+	pscr |= MV643XX_ETH_DISABLE_AUTO_NEG_FOR_FLOW_CTRL |
+		MV643XX_ETH_DISABLE_AUTO_NEG_SPEED_GMII    |
+		MV643XX_ETH_DISABLE_AUTO_NEG_FOR_DUPLX     |
+		MV643XX_ETH_DO_NOT_FORCE_LINK_FAIL	   |
+		MV643XX_ETH_SERIAL_PORT_CONTROL_RESERVED;
 
-	/* Increase the Rx side buffer size if supporting GigE */
-	if (mp->port_serial_control & MV643XX_ETH_SET_GMII_SPEED_TO_1000)
-		mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num),
-			(mp->port_serial_control & 0xfff1ffff) | (0x5 << 17));
-	else
-		mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num),
-						mp->port_serial_control);
+	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
 
-	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num),
-		mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num)) |
-						MV643XX_ETH_SERIAL_PORT_ENABLE);
+	pscr |= MV643XX_ETH_SERIAL_PORT_ENABLE;
+	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
 
 	/* Assign port SDMA configuration */
-	mv_write(MV643XX_ETH_SDMA_CONFIG_REG(port_num),
-							mp->port_sdma_config);
+	mv_write(MV643XX_ETH_SDMA_CONFIG_REG(port_num), mp->port_sdma_config);
 
 	/* Enable port Rx. */
 	mv643xx_eth_port_enable_rx(port_num, mp->port_rx_queue_command);
 
 	/* Disable port bandwidth limits by clearing MTU register */
 	mv_write(MV643XX_ETH_MAXIMUM_TRANSMIT_UNIT(port_num), 0);
+
+	/* save phy settings across reset */
+	mv643xx_get_settings(dev, &ethtool_cmd);
+	ethernet_phy_reset(mp->port_num);
+	mv643xx_set_settings(dev, &ethtool_cmd);
 }
 
 /*
@@ -2324,6 +2473,12 @@ static void ethernet_phy_reset(unsigned int eth_port_num)
 	eth_port_read_smi_reg(eth_port_num, 0, &phy_reg_data);
 	phy_reg_data |= 0x8000;	/* Set bit 15 to reset the PHY */
 	eth_port_write_smi_reg(eth_port_num, 0, phy_reg_data);
+
+	/* wait for PHY to come out of reset */
+	do {
+		udelay(1);
+		eth_port_read_smi_reg(eth_port_num, 0, &phy_reg_data);
+	} while (phy_reg_data & 0x8000);
 }
 
 static void mv643xx_eth_port_enable_tx(unsigned int port_num,
@@ -2417,20 +2572,13 @@ static void eth_port_reset(unsigned int port_num)
 
 	/* Reset the Enable bit in the Configuration Register */
 	reg_data = mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num));
-	reg_data &= ~MV643XX_ETH_SERIAL_PORT_ENABLE;
+	reg_data &= ~(MV643XX_ETH_SERIAL_PORT_ENABLE		|
+			MV643XX_ETH_DO_NOT_FORCE_LINK_FAIL	|
+			MV643XX_ETH_FORCE_LINK_PASS);
 	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), reg_data);
 }
 
 
-static int eth_port_autoneg_supported(unsigned int eth_port_num)
-{
-	unsigned int phy_reg_data0;
-
-	eth_port_read_smi_reg(eth_port_num, 0, &phy_reg_data0);
-
-	return phy_reg_data0 & 0x1000;
-}
-
 /*
  * eth_port_read_smi_reg - Read PHY registers
  *
@@ -2989,111 +3137,6 @@ static const struct mv643xx_stats mv643xx_gstrings_stats[] = {
 #define MV643XX_STATS_LEN	\
 	sizeof(mv643xx_gstrings_stats) / sizeof(struct mv643xx_stats)
 
-static int
-mv643xx_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
-{
-	struct mv643xx_private *mp = netdev->priv;
-	int port_num = mp->port_num;
-	int autoneg = eth_port_autoneg_supported(port_num);
-	int mode_10_bit;
-	int auto_duplex;
-	int half_duplex = 0;
-	int full_duplex = 0;
-	int auto_speed;
-	int speed_10 = 0;
-	int speed_100 = 0;
-	int speed_1000 = 0;
-
-	u32 pcs = mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num));
-	u32 psr = mv_read(MV643XX_ETH_PORT_STATUS_REG(port_num));
-
-	mode_10_bit = psr & MV643XX_ETH_PORT_STATUS_MODE_10_BIT;
-
-	if (mode_10_bit) {
-		ecmd->supported = SUPPORTED_10baseT_Half;
-	} else {
-		ecmd->supported = (SUPPORTED_10baseT_Half		|
-				   SUPPORTED_10baseT_Full		|
-				   SUPPORTED_100baseT_Half		|
-				   SUPPORTED_100baseT_Full		|
-				   SUPPORTED_1000baseT_Full		|
-				   (autoneg ? SUPPORTED_Autoneg : 0)	|
-				   SUPPORTED_TP);
-
-		auto_duplex = !(pcs & MV643XX_ETH_DISABLE_AUTO_NEG_FOR_DUPLX);
-		auto_speed = !(pcs & MV643XX_ETH_DISABLE_AUTO_NEG_SPEED_GMII);
-
-		ecmd->advertising = ADVERTISED_TP;
-
-		if (autoneg) {
-			ecmd->advertising |= ADVERTISED_Autoneg;
-
-			if (auto_duplex) {
-				half_duplex = 1;
-				full_duplex = 1;
-			} else {
-				if (pcs & MV643XX_ETH_SET_FULL_DUPLEX_MODE)
-					full_duplex = 1;
-				else
-					half_duplex = 1;
-			}
-
-			if (auto_speed) {
-				speed_10 = 1;
-				speed_100 = 1;
-				speed_1000 = 1;
-			} else {
-				if (pcs & MV643XX_ETH_SET_GMII_SPEED_TO_1000)
-					speed_1000 = 1;
-				else if (pcs & MV643XX_ETH_SET_MII_SPEED_TO_100)
-					speed_100 = 1;
-				else
-					speed_10 = 1;
-			}
-
-			if (speed_10 & half_duplex)
-				ecmd->advertising |= ADVERTISED_10baseT_Half;
-			if (speed_10 & full_duplex)
-				ecmd->advertising |= ADVERTISED_10baseT_Full;
-			if (speed_100 & half_duplex)
-				ecmd->advertising |= ADVERTISED_100baseT_Half;
-			if (speed_100 & full_duplex)
-				ecmd->advertising |= ADVERTISED_100baseT_Full;
-			if (speed_1000)
-				ecmd->advertising |= ADVERTISED_1000baseT_Full;
-		}
-	}
-
-	ecmd->port = PORT_TP;
-	ecmd->phy_address = ethernet_phy_get(port_num);
-
-	ecmd->transceiver = XCVR_EXTERNAL;
-
-	if (netif_carrier_ok(netdev)) {
-		if (mode_10_bit)
-			ecmd->speed = SPEED_10;
-		else {
-			if (psr & MV643XX_ETH_PORT_STATUS_GMII_1000)
-				ecmd->speed = SPEED_1000;
-			else if (psr & MV643XX_ETH_PORT_STATUS_MII_100)
-				ecmd->speed = SPEED_100;
-			else
-				ecmd->speed = SPEED_10;
-		}
-
-		if (psr & MV643XX_ETH_PORT_STATUS_FULL_DUPLEX)
-			ecmd->duplex = DUPLEX_FULL;
-		else
-			ecmd->duplex = DUPLEX_HALF;
-	} else {
-		ecmd->speed = -1;
-		ecmd->duplex = -1;
-	}
-
-	ecmd->autoneg = autoneg ? AUTONEG_ENABLE : AUTONEG_DISABLE;
-	return 0;
-}
-
 static void mv643xx_get_drvinfo(struct net_device *netdev,
 				struct ethtool_drvinfo *drvinfo)
 {
@@ -3140,15 +3183,41 @@ static void mv643xx_get_strings(struct net_device *netdev, uint32_t stringset,
 	}
 }
 
+static u32 mv643xx_eth_get_link(struct net_device *dev)
+{
+	struct mv643xx_private *mp = netdev_priv(dev);
+
+	return mii_link_ok(&mp->mii);
+}
+
+static int mv643xx_eth_nway_restart(struct net_device *dev)
+{
+	struct mv643xx_private *mp = netdev_priv(dev);
+
+	return mii_nway_restart(&mp->mii);
+}
+
+static int mv643xx_eth_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct mv643xx_private *mp = netdev_priv(dev);
+
+	return generic_mii_ioctl(&mp->mii, if_mii(ifr), cmd, NULL);
+}
+
 static struct ethtool_ops mv643xx_ethtool_ops = {
 	.get_settings           = mv643xx_get_settings,
+	.set_settings           = mv643xx_set_settings,
 	.get_drvinfo            = mv643xx_get_drvinfo,
-	.get_link               = ethtool_op_get_link,
+	.get_link               = mv643xx_eth_get_link,
 	.get_sg			= ethtool_op_get_sg,
 	.set_sg			= ethtool_op_set_sg,
 	.get_strings            = mv643xx_get_strings,
 	.get_stats_count        = mv643xx_get_stats_count,
 	.get_ethtool_stats      = mv643xx_get_ethtool_stats,
+	.get_strings            = mv643xx_get_strings,
+	.get_stats_count        = mv643xx_get_stats_count,
+	.get_ethtool_stats      = mv643xx_get_ethtool_stats,
+	.nway_reset		= mv643xx_eth_nway_restart,
 };
 
 /************* End ethtool support *************************/
diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h
index 0b08cd692201..7ffbeac7d2bb 100644
--- a/include/linux/mv643xx.h
+++ b/include/linux/mv643xx.h
@@ -1214,6 +1214,7 @@ struct mv64xxx_i2c_pdata {
 #define MV643XX_ETH_FORCE_BP_MODE_NO_JAM		0
 #define MV643XX_ETH_FORCE_BP_MODE_JAM_TX		(1<<7)
 #define MV643XX_ETH_FORCE_BP_MODE_JAM_TX_ON_RX_ERR	(1<<8)
+#define MV643XX_ETH_SERIAL_PORT_CONTROL_RESERVED	(1<<9)
 #define MV643XX_ETH_FORCE_LINK_FAIL			0
 #define MV643XX_ETH_DO_NOT_FORCE_LINK_FAIL		(1<<10)
 #define MV643XX_ETH_RETRANSMIT_16_ATTEMPTS		0
@@ -1243,6 +1244,8 @@ struct mv64xxx_i2c_pdata {
 #define MV643XX_ETH_SET_MII_SPEED_TO_10			0
 #define MV643XX_ETH_SET_MII_SPEED_TO_100		(1<<24)
 
+#define MV643XX_ETH_MAX_RX_PACKET_MASK			(0x7<<17)
+
 #define	MV643XX_ETH_PORT_SERIAL_CONTROL_DEFAULT_VALUE		\
 		MV643XX_ETH_DO_NOT_FORCE_LINK_PASS	|	\
 		MV643XX_ETH_ENABLE_AUTO_NEG_FOR_DUPLX	|	\
-- 
cgit v1.2.3


From 01999873a455fe9104e91820c72849e608239928 Mon Sep 17 00:00:00 2001
From: Dale Farnsworth <dale@farnsworth.org>
Date: Fri, 27 Jan 2006 01:18:01 -0700
Subject: [PATCH] mv643xx_eth: Clean up platform_data configuration

We shouldn't expose the hardware register contents in platform_data.
The only things we allow the user to configure are autoneg, speed, and
duplex.  Add specific platform_data fields for these values and remove
the registers configs.

Signed-off-by: Dale Farnsworth <dale@farnsworth.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 arch/ppc/platforms/hdpu.c |  5 ++--
 drivers/net/mv643xx_eth.c | 71 ++++++++++++++---------------------------------
 drivers/net/mv643xx_eth.h |  4 ---
 include/linux/mv643xx.h   | 24 ++++++----------
 4 files changed, 31 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/platforms/hdpu.c b/arch/ppc/platforms/hdpu.c
index 50039a204c24..f945416960e9 100644
--- a/arch/ppc/platforms/hdpu.c
+++ b/arch/ppc/platforms/hdpu.c
@@ -319,11 +319,10 @@ static void __init hdpu_fixup_eth_pdata(struct platform_device *pd)
 	struct mv643xx_eth_platform_data *eth_pd;
 	eth_pd = pd->dev.platform_data;
 
-	eth_pd->port_serial_control =
-	    mv64x60_read(&bh, MV643XX_ETH_PORT_SERIAL_CONTROL_REG(pd->id) & ~1);
-
 	eth_pd->force_phy_addr = 1;
 	eth_pd->phy_addr = pd->id;
+	eth_pd->speed = SPEED_100;
+	eth_pd->duplex = DUPLEX_FULL;
 	eth_pd->tx_queue_size = 400;
 	eth_pd->rx_queue_size = 800;
 }
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index ff2b613a7436..df572018595e 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -266,13 +266,14 @@ static void mv643xx_eth_update_mac_address(struct net_device *dev)
 static void mv643xx_eth_set_rx_mode(struct net_device *dev)
 {
 	struct mv643xx_private *mp = netdev_priv(dev);
+	u32 config_reg;
 
+	config_reg = mv_read(MV643XX_ETH_PORT_CONFIG_REG(mp->port_num));
 	if (dev->flags & IFF_PROMISC)
-		mp->port_config |= (u32) MV643XX_ETH_UNICAST_PROMISCUOUS_MODE;
+		config_reg |= (u32) MV643XX_ETH_UNICAST_PROMISCUOUS_MODE;
 	else
-		mp->port_config &= ~(u32) MV643XX_ETH_UNICAST_PROMISCUOUS_MODE;
-
-	mv_write(MV643XX_ETH_PORT_CONFIG_REG(mp->port_num), mp->port_config);
+		config_reg &= ~(u32) MV643XX_ETH_UNICAST_PROMISCUOUS_MODE;
+	mv_write(MV643XX_ETH_PORT_CONFIG_REG(mp->port_num), config_reg);
 
 	eth_port_set_multicast_list(dev);
 }
@@ -1454,9 +1455,8 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 	struct resource *res;
 	int err;
 	struct ethtool_cmd cmd;
-	u32 pscr;
-	int duplex;
-	int speed;
+	int duplex = DUPLEX_HALF;
+	int speed = 0;			/* default to auto-negotiation */
 
 	dev = alloc_etherdev(sizeof(struct mv643xx_private));
 	if (!dev)
@@ -1515,33 +1515,17 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 
 	/* set default config values */
 	eth_port_uc_addr_get(dev, dev->dev_addr);
-	mp->port_config = MV643XX_ETH_PORT_CONFIG_DEFAULT_VALUE;
-	mp->port_config_extend = MV643XX_ETH_PORT_CONFIG_EXTEND_DEFAULT_VALUE;
-	mp->port_sdma_config = MV643XX_ETH_PORT_SDMA_CONFIG_DEFAULT_VALUE;
-	mp->port_serial_control = MV643XX_ETH_PORT_SERIAL_CONTROL_DEFAULT_VALUE;
 	mp->rx_ring_size = MV643XX_ETH_PORT_DEFAULT_RECEIVE_QUEUE_SIZE;
 	mp->tx_ring_size = MV643XX_ETH_PORT_DEFAULT_TRANSMIT_QUEUE_SIZE;
 
 	pd = pdev->dev.platform_data;
 	if (pd) {
-		if (pd->mac_addr != NULL)
+		if (pd->mac_addr)
 			memcpy(dev->dev_addr, pd->mac_addr, 6);
 
 		if (pd->phy_addr || pd->force_phy_addr)
 			ethernet_phy_set(port_num, pd->phy_addr);
 
-		if (pd->port_config || pd->force_port_config)
-			mp->port_config = pd->port_config;
-
-		if (pd->port_config_extend || pd->force_port_config_extend)
-			mp->port_config_extend = pd->port_config_extend;
-
-		if (pd->port_sdma_config || pd->force_port_sdma_config)
-			mp->port_sdma_config = pd->port_sdma_config;
-
-		if (pd->port_serial_control || pd->force_port_serial_control)
-			mp->port_serial_control = pd->port_serial_control;
-
 		if (pd->rx_queue_size)
 			mp->rx_ring_size = pd->rx_queue_size;
 
@@ -1557,6 +1541,9 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 			mp->rx_sram_size = pd->rx_sram_size;
 			mp->rx_sram_addr = pd->rx_sram_addr;
 		}
+
+		duplex = pd->duplex;
+		speed = pd->speed;
 	}
 
 	/* Hook up MII support for ethtool */
@@ -1575,28 +1562,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 		goto out;
 	}
 
-	pscr = mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num));
-	pscr &= ~MV643XX_ETH_SERIAL_PORT_ENABLE;
-	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
-	pscr = mp->port_serial_control;
-	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
-
-	if (!(pscr & MV643XX_ETH_DISABLE_AUTO_NEG_FOR_DUPLX) &&
-	    !(pscr & MV643XX_ETH_DISABLE_AUTO_NEG_SPEED_GMII))
-		speed = 0;
-	else if (pscr & MV643XX_ETH_PORT_STATUS_GMII_1000)
-		speed = SPEED_1000;
-	else if (pscr & MV643XX_ETH_PORT_STATUS_MII_100)
-		speed = SPEED_100;
-	else
-		speed = SPEED_10;
-
-	if (pscr & MV643XX_ETH_PORT_STATUS_FULL_DUPLEX)
-		duplex = DUPLEX_FULL;
-	else
-		duplex = DUPLEX_HALF;
-
-	ethernet_phy_reset(mp->port_num);
+	ethernet_phy_reset(port_num);
 	mp->mii.supports_gmii = mii_check_gmii_support(&mp->mii);
 	mv643xx_init_ethtool_cmd(dev, mp->mii.phy_id, speed, duplex, &cmd);
 	mv643xx_eth_update_pscr(dev, &cmd);
@@ -1971,13 +1937,17 @@ static void eth_port_start(struct net_device *dev)
 	eth_port_uc_addr_set(port_num, dev->dev_addr);
 
 	/* Assign port configuration and command. */
-	mv_write(MV643XX_ETH_PORT_CONFIG_REG(port_num), mp->port_config);
+	mv_write(MV643XX_ETH_PORT_CONFIG_REG(port_num),
+			  MV643XX_ETH_PORT_CONFIG_DEFAULT_VALUE);
+
+	mv_write(MV643XX_ETH_PORT_CONFIG_EXTEND_REG(port_num),
+			  MV643XX_ETH_PORT_CONFIG_EXTEND_DEFAULT_VALUE);
 
 	pscr = mv_read(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num));
-	pscr &= ~MV643XX_ETH_SERIAL_PORT_ENABLE;
+
+	pscr &= ~(MV643XX_ETH_SERIAL_PORT_ENABLE | MV643XX_ETH_FORCE_LINK_PASS);
 	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
 
-	pscr &= ~MV643XX_ETH_FORCE_LINK_PASS;
 	pscr |= MV643XX_ETH_DISABLE_AUTO_NEG_FOR_FLOW_CTRL |
 		MV643XX_ETH_DISABLE_AUTO_NEG_SPEED_GMII    |
 		MV643XX_ETH_DISABLE_AUTO_NEG_FOR_DUPLX     |
@@ -1990,7 +1960,8 @@ static void eth_port_start(struct net_device *dev)
 	mv_write(MV643XX_ETH_PORT_SERIAL_CONTROL_REG(port_num), pscr);
 
 	/* Assign port SDMA configuration */
-	mv_write(MV643XX_ETH_SDMA_CONFIG_REG(port_num), mp->port_sdma_config);
+	mv_write(MV643XX_ETH_SDMA_CONFIG_REG(port_num),
+			  MV643XX_ETH_PORT_SDMA_CONFIG_DEFAULT_VALUE);
 
 	/* Enable port Rx. */
 	mv643xx_eth_port_enable_rx(port_num, mp->port_rx_queue_command);
diff --git a/drivers/net/mv643xx_eth.h b/drivers/net/mv643xx_eth.h
index f2e5da79dde8..a553054e8da7 100644
--- a/drivers/net/mv643xx_eth.h
+++ b/drivers/net/mv643xx_eth.h
@@ -321,10 +321,6 @@ struct mv643xx_mib_counters {
 
 struct mv643xx_private {
 	int port_num;			/* User Ethernet port number	*/
-	u32 port_config;		/* User port configuration value*/
-	u32 port_config_extend;		/* User port config extend value*/
-	u32 port_sdma_config;		/* User port SDMA config value	*/
-	u32 port_serial_control;	/* User port serial control value */
 	u32 port_tx_queue_command;	/* Port active Tx queues summary*/
 	u32 port_rx_queue_command;	/* Port active Rx queues summary*/
 
diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h
index 7ffbeac7d2bb..955d3069d727 100644
--- a/include/linux/mv643xx.h
+++ b/include/linux/mv643xx.h
@@ -1288,23 +1288,15 @@ struct mv64xxx_i2c_pdata {
 #define MV643XX_ETH_NAME	"mv643xx_eth"
 
 struct mv643xx_eth_platform_data {
-	/* 
-	 * Non-values for mac_addr, phy_addr, port_config, etc.
-	 * override the default value.  Setting the corresponding
-	 * force_* field, causes the default value to be overridden
-	 * even when zero.
-	 */
-	unsigned int	force_phy_addr:1;
-	unsigned int	force_port_config:1;
-	unsigned int	force_port_config_extend:1;
-	unsigned int	force_port_sdma_config:1;
-	unsigned int	force_port_serial_control:1;
-	int		phy_addr;
 	char		*mac_addr;	/* pointer to mac address */
-	u32		port_config;
-	u32		port_config_extend;
-	u32		port_sdma_config;
-	u32		port_serial_control;
+	u16		force_phy_addr;	/* force override if phy_addr == 0 */
+	u16		phy_addr;
+
+	/* If speed is 0, then speed and duplex are autonegotiated. */
+	int		speed;		/* 0, SPEED_10, SPEED_100, SPEED_1000 */
+	int		duplex;		/* DUPLEX_HALF or DUPLEX_FULL */
+
+	/* non-zero values of the following fields override defaults */
 	u32		tx_queue_size;
 	u32		rx_queue_size;
 	u32		tx_sram_addr;
-- 
cgit v1.2.3


From a62c0fc526c344d8163f7a9e45e68cc63826ffd3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 24 Jan 2006 17:05:22 +0900
Subject: [PATCH] libata: implement ata_drive_probe_reset()

Most low level drivers share supported reset/classify actions and
sequence.  This patch implements ata_drive_probe_reset() which helps
constructing ->probe_reset from three component operations -
softreset, hardreset and postreset.  This minimizes duplicate code and
yet allows flexibility if needed. The three component operations can
also be shared by EH later.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |  5 +++
 2 files changed, 94 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 147e1461062d..cf91fdc08ef6 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2233,6 +2233,94 @@ err_out:
 	DPRINTK("EXIT\n");
 }
 
+static int do_probe_reset(struct ata_port *ap, ata_reset_fn_t reset,
+			  ata_postreset_fn_t postreset,
+			  unsigned int *classes)
+{
+	int i, rc;
+
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		classes[i] = ATA_DEV_UNKNOWN;
+
+	rc = reset(ap, 0, classes);
+	if (rc)
+		return rc;
+
+	/* If any class isn't ATA_DEV_UNKNOWN, consider classification
+	 * is complete and convert all ATA_DEV_UNKNOWN to
+	 * ATA_DEV_NONE.
+	 */
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		if (classes[i] != ATA_DEV_UNKNOWN)
+			break;
+
+	if (i < ATA_MAX_DEVICES)
+		for (i = 0; i < ATA_MAX_DEVICES; i++)
+			if (classes[i] == ATA_DEV_UNKNOWN)
+				classes[i] = ATA_DEV_NONE;
+
+	if (postreset)
+		postreset(ap, classes);
+
+	return classes[0] != ATA_DEV_UNKNOWN ? 0 : -ENODEV;
+}
+
+/**
+ *	ata_drive_probe_reset - Perform probe reset with given methods
+ *	@ap: port to reset
+ *	@softreset: softreset method (can be NULL)
+ *	@hardreset: hardreset method (can be NULL)
+ *	@postreset: postreset method (can be NULL)
+ *	@classes: resulting classes of attached devices
+ *
+ *	Reset the specified port and classify attached devices using
+ *	given methods.  This function prefers softreset but tries all
+ *	possible reset sequences to reset and classify devices.  This
+ *	function is intended to be used for constructing ->probe_reset
+ *	callback by low level drivers.
+ *
+ *	Reset methods should follow the following rules.
+ *
+ *	- Return 0 on sucess, -errno on failure.
+ *	- If classification is supported, fill classes[] with
+ *	  recognized class codes.
+ *	- If classification is not supported, leave classes[] alone.
+ *	- If verbose is non-zero, print error message on failure;
+ *	  otherwise, shut up.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, -EINVAL if no reset method is avaliable, -ENODEV
+ *	if classification fails, and any error code from reset
+ *	methods.
+ */
+int ata_drive_probe_reset(struct ata_port *ap,
+			  ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
+			  ata_postreset_fn_t postreset, unsigned int *classes)
+{
+	int rc = -EINVAL;
+
+	if (softreset) {
+		rc = do_probe_reset(ap, softreset, postreset, classes);
+		if (rc == 0)
+			return 0;
+	}
+
+	if (!hardreset)
+		return rc;
+
+	rc = do_probe_reset(ap, hardreset, postreset, classes);
+	if (rc == 0 || rc != -ENODEV)
+		return rc;
+
+	if (softreset)
+		rc = do_probe_reset(ap, softreset, postreset, classes);
+
+	return rc;
+}
+
 static void ata_pr_blacklisted(const struct ata_port *ap,
 			       const struct ata_device *dev)
 {
@@ -5180,6 +5268,7 @@ EXPORT_SYMBOL_GPL(ata_port_probe);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
+EXPORT_SYMBOL_GPL(ata_drive_probe_reset);
 EXPORT_SYMBOL_GPL(ata_port_disable);
 EXPORT_SYMBOL_GPL(ata_ratelimit);
 EXPORT_SYMBOL_GPL(ata_busy_sleep);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a84d1c3a5429..38e08ce2d1af 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -241,6 +241,8 @@ struct ata_queued_cmd;
 
 /* typedefs */
 typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
+typedef int (*ata_reset_fn_t)(struct ata_port *, int, unsigned int *);
+typedef void (*ata_postreset_fn_t)(struct ata_port *ap, unsigned int *);
 
 struct ata_ioports {
 	unsigned long		cmd_addr;
@@ -478,6 +480,9 @@ extern void ata_port_probe(struct ata_port *);
 extern void __sata_phy_reset(struct ata_port *ap);
 extern void sata_phy_reset(struct ata_port *ap);
 extern void ata_bus_reset(struct ata_port *ap);
+extern int ata_drive_probe_reset(struct ata_port *ap,
+			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
+			ata_postreset_fn_t postreset, unsigned int *classes);
 extern void ata_port_disable(struct ata_port *);
 extern void ata_std_ports(struct ata_ioports *ioaddr);
 #ifdef CONFIG_PCI
-- 
cgit v1.2.3


From c2bd58047b9b5c91a3b0a851de66a877f2eb7ae3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 24 Jan 2006 17:05:22 +0900
Subject: [PATCH] libata: implement standard reset component operations and
 ->probe_reset

Implement SRST, COMRESET and standard postreset component operations
for ata_drive_probe_reset(), and use these three functions to
implement ata_std_probe_reset.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 206 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |   6 ++
 2 files changed, 212 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index cf91fdc08ef6..b369dbd7cb23 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2233,6 +2233,208 @@ err_out:
 	DPRINTK("EXIT\n");
 }
 
+/**
+ *	ata_std_softreset - reset host port via ATA SRST
+ *	@ap: port to reset
+ *	@verbose: fail verbosely
+ *	@classes: resulting classes of attached devices
+ *
+ *	Reset host port using ATA SRST.  This function is to be used
+ *	as standard callback for ata_drive_*_reset() functions.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, -errno otherwise.
+ */
+int ata_std_softreset(struct ata_port *ap, int verbose, unsigned int *classes)
+{
+	unsigned int slave_possible = ap->flags & ATA_FLAG_SLAVE_POSS;
+	unsigned int devmask = 0, err_mask;
+	u8 err;
+
+	DPRINTK("ENTER\n");
+
+	/* determine if device 0/1 are present */
+	if (ata_devchk(ap, 0))
+		devmask |= (1 << 0);
+	if (slave_possible && ata_devchk(ap, 1))
+		devmask |= (1 << 1);
+
+	/* devchk reports device presence without actual device on
+	 * most SATA controllers.  Check SStatus and turn devmask off
+	 * if link is offline.  Note that we should continue resetting
+	 * even when it seems like there's no device.
+	 */
+	if (ap->ops->scr_read && !sata_dev_present(ap))
+		devmask = 0;
+
+	/* select device 0 again */
+	ap->ops->dev_select(ap, 0);
+
+	/* issue bus reset */
+	DPRINTK("about to softreset, devmask=%x\n", devmask);
+	err_mask = ata_bus_softreset(ap, devmask);
+	if (err_mask) {
+		if (verbose)
+			printk(KERN_ERR "ata%u: SRST failed (err_mask=0x%x)\n",
+			       ap->id, err_mask);
+		else
+			DPRINTK("EXIT, softreset failed (err_mask=0x%x)\n",
+				err_mask);
+		return -EIO;
+	}
+
+	/* determine by signature whether we have ATA or ATAPI devices */
+	classes[0] = ata_dev_try_classify(ap, 0, &err);
+	if (slave_possible && err != 0x81)
+		classes[1] = ata_dev_try_classify(ap, 1, &err);
+
+	DPRINTK("EXIT, classes[0]=%u [1]=%u\n", classes[0], classes[1]);
+	return 0;
+}
+
+/**
+ *	sata_std_hardreset - reset host port via SATA phy reset
+ *	@ap: port to reset
+ *	@verbose: fail verbosely
+ *	@class: resulting class of attached device
+ *
+ *	SATA phy-reset host port using DET bits of SControl register.
+ *	This function is to be used as standard callback for
+ *	ata_drive_*_reset().
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, -errno otherwise.
+ */
+int sata_std_hardreset(struct ata_port *ap, int verbose, unsigned int *class)
+{
+	u32 sstatus, serror;
+	unsigned long timeout = jiffies + (HZ * 5);
+
+	DPRINTK("ENTER\n");
+
+	/* Issue phy wake/reset */
+	scr_write_flush(ap, SCR_CONTROL, 0x301);
+
+	/*
+	 * Couldn't find anything in SATA I/II specs, but AHCI-1.1
+	 * 10.4.2 says at least 1 ms.
+	 */
+	msleep(1);
+
+	scr_write_flush(ap, SCR_CONTROL, 0x300);
+
+	/* Wait for phy to become ready, if necessary. */
+	do {
+		msleep(200);
+		sstatus = scr_read(ap, SCR_STATUS);
+		if ((sstatus & 0xf) != 1)
+			break;
+	} while (time_before(jiffies, timeout));
+
+	/* Clear SError */
+	serror = scr_read(ap, SCR_ERROR);
+	scr_write(ap, SCR_ERROR, serror);
+
+	/* TODO: phy layer with polling, timeouts, etc. */
+	if (!sata_dev_present(ap)) {
+		*class = ATA_DEV_NONE;
+		DPRINTK("EXIT, link offline\n");
+		return 0;
+	}
+
+	if (ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT)) {
+		if (verbose)
+			printk(KERN_ERR "ata%u: COMRESET failed "
+			       "(device not ready)\n", ap->id);
+		else
+			DPRINTK("EXIT, device not ready\n");
+		return -EIO;
+	}
+
+	*class = ata_dev_try_classify(ap, 0, NULL);
+
+	DPRINTK("EXIT, class=%u\n", *class);
+	return 0;
+}
+
+/**
+ *	ata_std_postreset - standard postreset callback
+ *	@ap: the target ata_port
+ *	@classes: classes of attached devices
+ *
+ *	This function is invoked after a successful reset.  Note that
+ *	the device might have been reset more than once using
+ *	different reset methods before postreset is invoked.
+ *	postreset is also reponsible for setting cable type.
+ *
+ *	This function is to be used as standard callback for
+ *	ata_drive_*_reset().
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ */
+void ata_std_postreset(struct ata_port *ap, unsigned int *classes)
+{
+	DPRINTK("ENTER\n");
+
+	/* set cable type */
+	if (ap->cbl == ATA_CBL_NONE && ap->flags & ATA_FLAG_SATA)
+		ap->cbl = ATA_CBL_SATA;
+
+	/* print link status */
+	if (ap->cbl == ATA_CBL_SATA)
+		sata_print_link_status(ap);
+
+	/* bail out if no device is present */
+	if (classes[0] == ATA_DEV_NONE && classes[1] == ATA_DEV_NONE) {
+		DPRINTK("EXIT, no device\n");
+		return;
+	}
+
+	/* is double-select really necessary? */
+	if (classes[0] != ATA_DEV_NONE)
+		ap->ops->dev_select(ap, 1);
+	if (classes[1] != ATA_DEV_NONE)
+		ap->ops->dev_select(ap, 0);
+
+	/* re-enable interrupts & set up device control */
+	if (ap->ioaddr.ctl_addr)	/* FIXME: hack. create a hook instead */
+		ata_irq_on(ap);
+
+	DPRINTK("EXIT\n");
+}
+
+/**
+ *	ata_std_probe_reset - standard probe reset method
+ *	@ap: prot to perform probe-reset
+ *	@classes: resulting classes of attached devices
+ *
+ *	The stock off-the-shelf ->probe_reset method.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, -errno otherwise.
+ */
+int ata_std_probe_reset(struct ata_port *ap, unsigned int *classes)
+{
+	ata_reset_fn_t hardreset;
+
+	hardreset = NULL;
+	if (ap->cbl == ATA_CBL_SATA && ap->ops->scr_read)
+		hardreset = sata_std_hardreset;
+
+	return ata_drive_probe_reset(ap, ata_std_softreset, hardreset,
+				     ata_std_postreset, classes);
+}
+
 static int do_probe_reset(struct ata_port *ap, ata_reset_fn_t reset,
 			  ata_postreset_fn_t postreset,
 			  unsigned int *classes)
@@ -5268,6 +5470,10 @@ EXPORT_SYMBOL_GPL(ata_port_probe);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
+EXPORT_SYMBOL_GPL(ata_std_softreset);
+EXPORT_SYMBOL_GPL(sata_std_hardreset);
+EXPORT_SYMBOL_GPL(ata_std_postreset);
+EXPORT_SYMBOL_GPL(ata_std_probe_reset);
 EXPORT_SYMBOL_GPL(ata_drive_probe_reset);
 EXPORT_SYMBOL_GPL(ata_port_disable);
 EXPORT_SYMBOL_GPL(ata_ratelimit);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 38e08ce2d1af..474cdfa35d1e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -483,6 +483,11 @@ extern void ata_bus_reset(struct ata_port *ap);
 extern int ata_drive_probe_reset(struct ata_port *ap,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			ata_postreset_fn_t postreset, unsigned int *classes);
+extern int ata_std_softreset(struct ata_port *ap, int verbose,
+			     unsigned int *classes);
+extern int sata_std_hardreset(struct ata_port *ap, int verbose,
+			      unsigned int *class);
+extern void ata_std_postreset(struct ata_port *ap, unsigned int *classes);
 extern void ata_port_disable(struct ata_port *);
 extern void ata_std_ports(struct ata_ioports *ioaddr);
 #ifdef CONFIG_PCI
@@ -523,6 +528,7 @@ extern void ata_std_dev_select (struct ata_port *ap, unsigned int device);
 extern u8 ata_check_status(struct ata_port *ap);
 extern u8 ata_altstatus(struct ata_port *ap);
 extern void ata_exec_command(struct ata_port *ap, const struct ata_taskfile *tf);
+extern int ata_std_probe_reset(struct ata_port *ap, unsigned int *classes);
 extern int ata_port_start (struct ata_port *ap);
 extern void ata_port_stop (struct ata_port *ap);
 extern void ata_host_stop (struct ata_host_set *host_set);
-- 
cgit v1.2.3


From d97a11e091a0bf40f1cfb0bbf443ddd7b455b133 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Tue, 7 Feb 2006 06:48:40 -0200
Subject: V4L/DVB (3300): Add standard for South Korean NTSC-M using A2 audio.

South Korea uses NTSC-M but with A2 audio instead of BTSC. Several audio
chips need this information in order to set the correct audio processing
registers.

Acked-by: Mauro Carvalho Chehab <mauro_chehab@yahoo.com.br>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/video/bttv-driver.c          |  2 +-
 drivers/media/video/cx25840/cx25840-core.c | 50 ++++++++++++------------------
 drivers/media/video/tda9887.c              |  7 ++++-
 drivers/media/video/tuner-core.c           |  5 +++
 include/linux/videodev2.h                  |  4 ++-
 5 files changed, 35 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c
index aa4c4c521880..578b20085082 100644
--- a/drivers/media/video/bttv-driver.c
+++ b/drivers/media/video/bttv-driver.c
@@ -214,7 +214,7 @@ const struct bttv_tvnorm bttv_tvnorms[] = {
 		   we can capture, of the first and second field. */
 		.vbistart	= { 7,320 },
 	},{
-		.v4l2_id        = V4L2_STD_NTSC_M,
+		.v4l2_id        = V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR,
 		.name           = "NTSC",
 		.Fsc            = 28636363,
 		.swidth         = 768,
diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c
index 3acd587b160c..8d8aa8ec8304 100644
--- a/drivers/media/video/cx25840/cx25840-core.c
+++ b/drivers/media/video/cx25840/cx25840-core.c
@@ -220,33 +220,23 @@ static void input_change(struct i2c_client *client)
 		cx25840_write(client, 0x808, 0xff);
 		cx25840_write(client, 0x80b, 0x10);
 	} else if (std & V4L2_STD_NTSC) {
-		/* NTSC */
-		if (state->pvr150_workaround) {
-			/* Certain Hauppauge PVR150 models have a hardware bug
-			   that causes audio to drop out. For these models the
-			   audio standard must be set explicitly.
-			   To be precise: it affects cards with tuner models
-			   85, 99 and 112 (model numbers from tveeprom). */
-			if (std == V4L2_STD_NTSC_M_JP) {
-				/* Japan uses EIAJ audio standard */
-				cx25840_write(client, 0x808, 0x2f);
-			} else {
-				/* Others use the BTSC audio standard */
-				cx25840_write(client, 0x808, 0x1f);
-			}
-			/* South Korea uses the A2-M (aka Zweiton M) audio
-			   standard, and should set 0x808 to 0x3f, but I don't
-			   know how to detect this. */
-		} else if (std == V4L2_STD_NTSC_M_JP) {
+		/* Certain Hauppauge PVR150 models have a hardware bug
+		   that causes audio to drop out. For these models the
+		   audio standard must be set explicitly.
+		   To be precise: it affects cards with tuner models
+		   85, 99 and 112 (model numbers from tveeprom). */
+		int hw_fix = state->pvr150_workaround;
+
+		if (std == V4L2_STD_NTSC_M_JP) {
 			/* Japan uses EIAJ audio standard */
-			cx25840_write(client, 0x808, 0xf7);
+			cx25840_write(client, 0x808, hw_fix ? 0x2f : 0xf7);
+		} else if (std == V4L2_STD_NTSC_M_KR) {
+			/* South Korea uses A2 audio standard */
+			cx25840_write(client, 0x808, hw_fix ? 0x3f : 0xf8);
 		} else {
 			/* Others use the BTSC audio standard */
-			cx25840_write(client, 0x808, 0xf6);
+			cx25840_write(client, 0x808, hw_fix ? 0x1f : 0xf6);
 		}
-		/* South Korea uses the A2-M (aka Zweiton M) audio standard,
-		   and should set 0x808 to 0xf8, but I don't know how to
-		   detect this. */
 		cx25840_write(client, 0x80b, 0x00);
 	}
 
@@ -330,17 +320,17 @@ static int set_v4lstd(struct i2c_client *client, v4l2_std_id std)
 	u8 fmt=0; 	/* zero is autodetect */
 
 	/* First tests should be against specific std */
-	if (std & V4L2_STD_NTSC_M_JP) {
+	if (std == V4L2_STD_NTSC_M_JP) {
 		fmt=0x2;
-	} else if (std & V4L2_STD_NTSC_443) {
+	} else if (std == V4L2_STD_NTSC_443) {
 		fmt=0x3;
-	} else if (std & V4L2_STD_PAL_M) {
+	} else if (std == V4L2_STD_PAL_M) {
 		fmt=0x5;
-	} else if (std & V4L2_STD_PAL_N) {
+	} else if (std == V4L2_STD_PAL_N) {
 		fmt=0x6;
-	} else if (std & V4L2_STD_PAL_Nc) {
+	} else if (std == V4L2_STD_PAL_Nc) {
 		fmt=0x7;
-	} else if (std & V4L2_STD_PAL_60) {
+	} else if (std == V4L2_STD_PAL_60) {
 		fmt=0x8;
 	} else {
 		/* Then, test against generic ones */
@@ -369,7 +359,7 @@ v4l2_std_id cx25840_get_v4lstd(struct i2c_client * client)
 	}
 
 	switch (fmt) {
-	case 0x1: return V4L2_STD_NTSC_M;
+	case 0x1: return V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR;
 	case 0x2: return V4L2_STD_NTSC_M_JP;
 	case 0x3: return V4L2_STD_NTSC_443;
 	case 0x4: return V4L2_STD_PAL;
diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c
index 7c71422f5d3f..0d54f6c1982b 100644
--- a/drivers/media/video/tda9887.c
+++ b/drivers/media/video/tda9887.c
@@ -231,7 +231,7 @@ static struct tvnorm tvnorms[] = {
 			   cAudioIF_6_5   |
 			   cVideoIF_38_90 ),
 	},{
-		.std   = V4L2_STD_NTSC_M,
+		.std   = V4L2_STD_NTSC_M | V4L2_STD_NTSC_M_KR,
 		.name  = "NTSC-M",
 		.b     = ( cNegativeFmTV  |
 			   cQSS           ),
@@ -619,6 +619,11 @@ static int tda9887_fixup_std(struct tda9887 *t)
 			tda9887_dbg("insmod fixup: NTSC => NTSC_M_JP\n");
 			t->std = V4L2_STD_NTSC_M_JP;
 			break;
+		case 'k':
+		case 'K':
+			tda9887_dbg("insmod fixup: NTSC => NTSC_M_KR\n");
+			t->std = V4L2_STD_NTSC_M_KR;
+			break;
 		case '-':
 			/* default parameter, do nothing */
 			break;
diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c
index 788eadaae672..e34f801c9a10 100644
--- a/drivers/media/video/tuner-core.c
+++ b/drivers/media/video/tuner-core.c
@@ -366,6 +366,11 @@ static int tuner_fixup_std(struct tuner *t)
 			tuner_dbg("insmod fixup: NTSC => NTSC_M_JP\n");
 			t->std = V4L2_STD_NTSC_M_JP;
 			break;
+		case 'k':
+		case 'K':
+			tuner_dbg("insmod fixup: NTSC => NTSC_M_KR\n");
+			t->std = V4L2_STD_NTSC_M_KR;
+			break;
 		case '-':
 			/* default parameter, do nothing */
 			break;
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 6e33ce96cab0..965c8902fe60 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -638,6 +638,7 @@ typedef __u64 v4l2_std_id;
 #define V4L2_STD_NTSC_M         ((v4l2_std_id)0x00001000)
 #define V4L2_STD_NTSC_M_JP      ((v4l2_std_id)0x00002000)
 #define V4L2_STD_NTSC_443       ((v4l2_std_id)0x00004000)
+#define V4L2_STD_NTSC_M_KR      ((v4l2_std_id)0x00008000)
 
 #define V4L2_STD_SECAM_B        ((v4l2_std_id)0x00010000)
 #define V4L2_STD_SECAM_D        ((v4l2_std_id)0x00020000)
@@ -670,7 +671,8 @@ typedef __u64 v4l2_std_id;
 				 V4L2_STD_PAL_H		|\
 				 V4L2_STD_PAL_I)
 #define V4L2_STD_NTSC           (V4L2_STD_NTSC_M	|\
-				 V4L2_STD_NTSC_M_JP)
+				 V4L2_STD_NTSC_M_JP     |\
+				 V4L2_STD_NTSC_M_KR)
 #define V4L2_STD_SECAM_DK      	(V4L2_STD_SECAM_D	|\
 				 V4L2_STD_SECAM_K	|\
 				 V4L2_STD_SECAM_K1)
-- 
cgit v1.2.3


From 3593cab5d62c4c7abced1076710f9bc2d8847433 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 7 Feb 2006 06:49:14 -0200
Subject: V4L/DVB (3318b): sem2mutex: drivers/media/, #2

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/common/saa7146_core.c               |   6 +-
 drivers/media/common/saa7146_fops.c               |  18 ++--
 drivers/media/common/saa7146_i2c.c                |   4 +-
 drivers/media/common/saa7146_vbi.c                |   2 +-
 drivers/media/common/saa7146_video.c              |  30 +++---
 drivers/media/dvb/b2c2/flexcop-common.h           |   4 +-
 drivers/media/dvb/b2c2/flexcop-i2c.c              |   6 +-
 drivers/media/dvb/bt8xx/bt878.c                   |   4 +-
 drivers/media/dvb/bt8xx/bt878.h                   |   4 +-
 drivers/media/dvb/bt8xx/dst.c                     |  14 +--
 drivers/media/dvb/bt8xx/dst_ca.c                  |   6 +-
 drivers/media/dvb/bt8xx/dst_common.h              |   3 +-
 drivers/media/dvb/bt8xx/dvb-bt8xx.c               |  12 +--
 drivers/media/dvb/bt8xx/dvb-bt8xx.h               |   3 +-
 drivers/media/dvb/cinergyT2/cinergyT2.c           |  47 ++++-----
 drivers/media/dvb/dvb-core/dmxdev.c               |  86 ++++++++---------
 drivers/media/dvb/dvb-core/dmxdev.h               |   6 +-
 drivers/media/dvb/dvb-core/dvb_demux.c            | 104 ++++++++++----------
 drivers/media/dvb/dvb-core/dvb_demux.h            |   4 +-
 drivers/media/dvb/dvb-core/dvb_frontend.c         |  13 ++-
 drivers/media/dvb/dvb-core/dvb_net.c              |  14 +--
 drivers/media/dvb/dvb-usb/cxusb.c                 |   4 +-
 drivers/media/dvb/dvb-usb/dibusb-common.c         |   4 +-
 drivers/media/dvb/dvb-usb/digitv.c                |   4 +-
 drivers/media/dvb/dvb-usb/dvb-usb-init.c          |   4 +-
 drivers/media/dvb/dvb-usb/dvb-usb-urb.c           |   4 +-
 drivers/media/dvb/dvb-usb/dvb-usb.h               |   9 +-
 drivers/media/dvb/dvb-usb/vp702x.c                |   4 +-
 drivers/media/dvb/dvb-usb/vp7045.c                |   4 +-
 drivers/media/dvb/frontends/bcm3510.c             |   9 +-
 drivers/media/dvb/ttpci/av7110.c                  |  23 +++--
 drivers/media/dvb/ttpci/av7110.h                  |   7 +-
 drivers/media/dvb/ttpci/av7110_hw.c               |  40 ++++----
 drivers/media/dvb/ttpci/budget.h                  |   4 +-
 drivers/media/dvb/ttusb-budget/dvb-ttusb-budget.c |  32 ++++---
 drivers/media/dvb/ttusb-dec/ttusb_dec.c           |  31 +++---
 drivers/media/radio/miropcm20-rds-core.c          |  11 ++-
 drivers/media/radio/radio-aimslab.c               |  20 ++--
 drivers/media/radio/radio-aztech.c                |  12 +--
 drivers/media/radio/radio-maestro.c               |  11 ++-
 drivers/media/radio/radio-maxiradio.c             |  11 ++-
 drivers/media/radio/radio-sf16fmi.c               |  22 ++---
 drivers/media/radio/radio-sf16fmr2.c              |  22 ++---
 drivers/media/radio/radio-typhoon.c               |  12 +--
 drivers/media/radio/radio-zoltrix.c               |  26 ++---
 drivers/media/video/arv.c                         |  16 ++--
 drivers/media/video/bttv-driver.c                 |  48 +++++-----
 drivers/media/video/bw-qcam.c                     |  16 ++--
 drivers/media/video/bw-qcam.h                     |   2 +-
 drivers/media/video/c-qcam.c                      |  19 ++--
 drivers/media/video/cpia.c                        | 102 ++++++++++----------
 drivers/media/video/cpia.h                        |   5 +-
 drivers/media/video/cx88/cx88-core.c              |   2 +-
 drivers/media/video/cx88/cx88-video.c             |  26 ++---
 drivers/media/video/cx88/cx88.h                   |   4 +-
 drivers/media/video/em28xx/em28xx-video.c         | 106 ++++++++++----------
 drivers/media/video/em28xx/em28xx.h               |   3 +-
 drivers/media/video/meye.c                        | 112 +++++++++++-----------
 drivers/media/video/meye.h                        |   4 +-
 drivers/media/video/mxb.c                         |   4 +-
 drivers/media/video/planb.c                       |   8 +-
 drivers/media/video/planb.h                       |   2 +-
 drivers/media/video/pms.c                         |  28 +++---
 drivers/media/video/saa5246a.c                    |  10 +-
 drivers/media/video/saa5249.c                     |  10 +-
 drivers/media/video/saa7134/saa7134-alsa.c        |   6 +-
 drivers/media/video/saa7134/saa7134-core.c        |   2 +-
 drivers/media/video/saa7134/saa7134-empress.c     |   8 +-
 drivers/media/video/saa7134/saa7134-oss.c         |  40 ++++----
 drivers/media/video/saa7134/saa7134-video.c       |  40 ++++----
 drivers/media/video/saa7134/saa7134.h             |   5 +-
 drivers/media/video/video-buf-dvb.c               |  10 +-
 drivers/media/video/video-buf.c                   |  42 ++++----
 drivers/media/video/videodev.c                    |   6 +-
 drivers/media/video/vino.c                        |  33 +++----
 include/linux/videodev2.h                         |   3 +-
 include/media/saa7146.h                           |  21 ++--
 include/media/video-buf-dvb.h                     |   2 +-
 include/media/video-buf.h                         |   2 +-
 79 files changed, 749 insertions(+), 718 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/common/saa7146_core.c b/drivers/media/common/saa7146_core.c
index 04c1938b9c91..ee16c042ef6e 100644
--- a/drivers/media/common/saa7146_core.c
+++ b/drivers/media/common/saa7146_core.c
@@ -21,7 +21,7 @@
 #include <media/saa7146.h>
 
 LIST_HEAD(saa7146_devices);
-DECLARE_MUTEX(saa7146_devices_lock);
+DEFINE_MUTEX(saa7146_devices_lock);
 
 static int saa7146_num;
 
@@ -402,11 +402,11 @@ static int saa7146_init_one(struct pci_dev *pci, const struct pci_device_id *ent
 
 	pci_set_drvdata(pci, dev);
 
-	init_MUTEX(&dev->lock);
+	mutex_init(&dev->lock);
 	spin_lock_init(&dev->int_slock);
 	spin_lock_init(&dev->slock);
 
-	init_MUTEX(&dev->i2c_lock);
+	mutex_init(&dev->i2c_lock);
 
 	dev->module = THIS_MODULE;
 	init_waitqueue_head(&dev->i2c_wq);
diff --git a/drivers/media/common/saa7146_fops.c b/drivers/media/common/saa7146_fops.c
index f8cf73ed49ad..dc7fb20f47b5 100644
--- a/drivers/media/common/saa7146_fops.c
+++ b/drivers/media/common/saa7146_fops.c
@@ -17,18 +17,18 @@ int saa7146_res_get(struct saa7146_fh *fh, unsigned int bit)
 	}
 
 	/* is it free? */
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	if (vv->resources & bit) {
 		DEB_D(("locked! vv->resources:0x%02x, we want:0x%02x\n",vv->resources,bit));
 		/* no, someone else uses it */
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	}
 	/* it's free, grab it */
 	fh->resources  |= bit;
 	vv->resources |= bit;
 	DEB_D(("res: get 0x%02x, cur:0x%02x\n",bit,vv->resources));
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 	return 1;
 }
 
@@ -40,11 +40,11 @@ void saa7146_res_free(struct saa7146_fh *fh, unsigned int bits)
 	if ((fh->resources & bits) != bits)
 		BUG();
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	fh->resources  &= ~bits;
 	vv->resources &= ~bits;
 	DEB_D(("res: put 0x%02x, cur:0x%02x\n",bits,vv->resources));
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 }
 
 
@@ -204,7 +204,7 @@ static int fops_open(struct inode *inode, struct file *file)
 
 	DEB_EE(("inode:%p, file:%p, minor:%d\n",inode,file,minor));
 
-	if (down_interruptible(&saa7146_devices_lock))
+	if (mutex_lock_interruptible(&saa7146_devices_lock))
 		return -ERESTARTSYS;
 
 	list_for_each(list,&saa7146_devices) {
@@ -276,7 +276,7 @@ out:
 		kfree(fh);
 		file->private_data = NULL;
 	}
-	up(&saa7146_devices_lock);
+	mutex_unlock(&saa7146_devices_lock);
 	return result;
 }
 
@@ -287,7 +287,7 @@ static int fops_release(struct inode *inode, struct file *file)
 
 	DEB_EE(("inode:%p, file:%p\n",inode,file));
 
-	if (down_interruptible(&saa7146_devices_lock))
+	if (mutex_lock_interruptible(&saa7146_devices_lock))
 		return -ERESTARTSYS;
 
 	if( fh->type == V4L2_BUF_TYPE_VBI_CAPTURE) {
@@ -303,7 +303,7 @@ static int fops_release(struct inode *inode, struct file *file)
 	file->private_data = NULL;
 	kfree(fh);
 
-	up(&saa7146_devices_lock);
+	mutex_unlock(&saa7146_devices_lock);
 
 	return 0;
 }
diff --git a/drivers/media/common/saa7146_i2c.c b/drivers/media/common/saa7146_i2c.c
index 8aabdd8fb3c5..d9953f7a8b6b 100644
--- a/drivers/media/common/saa7146_i2c.c
+++ b/drivers/media/common/saa7146_i2c.c
@@ -279,7 +279,7 @@ int saa7146_i2c_transfer(struct saa7146_dev *dev, const struct i2c_msg *msgs, in
 	int address_err = 0;
 	int short_delay = 0;
 
-	if (down_interruptible (&dev->i2c_lock))
+	if (mutex_lock_interruptible(&dev->i2c_lock))
 		return -ERESTARTSYS;
 
 	for(i=0;i<num;i++) {
@@ -366,7 +366,7 @@ out:
 		}
 	}
 
-	up(&dev->i2c_lock);
+	mutex_unlock(&dev->i2c_lock);
 	return err;
 }
 
diff --git a/drivers/media/common/saa7146_vbi.c b/drivers/media/common/saa7146_vbi.c
index 468d3c959075..500bd3f05e16 100644
--- a/drivers/media/common/saa7146_vbi.c
+++ b/drivers/media/common/saa7146_vbi.c
@@ -410,7 +410,7 @@ static int vbi_open(struct saa7146_dev *dev, struct file *file)
 			    V4L2_FIELD_SEQ_TB, // FIXME: does this really work?
 			    sizeof(struct saa7146_buf),
 			    file);
-	init_MUTEX(&fh->vbi_q.lock);
+	mutex_init(&fh->vbi_q.lock);
 
 	init_timer(&fh->vbi_read_timeout);
 	fh->vbi_read_timeout.function = vbi_read_timeout;
diff --git a/drivers/media/common/saa7146_video.c b/drivers/media/common/saa7146_video.c
index 7ebac7949df3..6b42713d97f4 100644
--- a/drivers/media/common/saa7146_video.c
+++ b/drivers/media/common/saa7146_video.c
@@ -378,20 +378,20 @@ static int s_fmt(struct saa7146_fh *fh, struct v4l2_format *f)
 		err = try_win(dev,&f->fmt.win);
 		if (0 != err)
 			return err;
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		fh->ov.win    = f->fmt.win;
 		fh->ov.nclips = f->fmt.win.clipcount;
 		if (fh->ov.nclips > 16)
 			fh->ov.nclips = 16;
 		if (copy_from_user(fh->ov.clips,f->fmt.win.clips,sizeof(struct v4l2_clip)*fh->ov.nclips)) {
-			up(&dev->lock);
+			mutex_unlock(&dev->lock);
 			return -EFAULT;
 		}
 
 		/* fh->ov.fh is used to indicate that we have valid overlay informations, too */
 		fh->ov.fh = fh;
 
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 
 		/* check if our current overlay is active */
 		if (IS_OVERLAY_ACTIVE(fh) != 0) {
@@ -516,7 +516,7 @@ static int set_control(struct saa7146_fh *fh, struct v4l2_control *c)
 		return -EINVAL;
 	}
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 
 	switch (ctrl->type) {
 	case V4L2_CTRL_TYPE_BOOLEAN:
@@ -560,7 +560,7 @@ static int set_control(struct saa7146_fh *fh, struct v4l2_control *c)
 		/* fixme: we can support changing VFLIP and HFLIP here... */
 		if (IS_CAPTURE_ACTIVE(fh) != 0) {
 			DEB_D(("V4L2_CID_HFLIP while active capture.\n"));
-			up(&dev->lock);
+			mutex_unlock(&dev->lock);
 			return -EINVAL;
 		}
 		vv->hflip = c->value;
@@ -568,7 +568,7 @@ static int set_control(struct saa7146_fh *fh, struct v4l2_control *c)
 	case V4L2_CID_VFLIP:
 		if (IS_CAPTURE_ACTIVE(fh) != 0) {
 			DEB_D(("V4L2_CID_VFLIP while active capture.\n"));
-			up(&dev->lock);
+			mutex_unlock(&dev->lock);
 			return -EINVAL;
 		}
 		vv->vflip = c->value;
@@ -577,7 +577,7 @@ static int set_control(struct saa7146_fh *fh, struct v4l2_control *c)
 		return -EINVAL;
 	}
 	}
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 
 	if (IS_OVERLAY_ACTIVE(fh) != 0) {
 		saa7146_stop_preview(fh);
@@ -939,7 +939,7 @@ int saa7146_video_do_ioctl(struct inode *inode, struct file *file, unsigned int
 			}
 		}
 
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 
 		/* ok, accept it */
 		vv->ov_fb = *fb;
@@ -948,7 +948,7 @@ int saa7146_video_do_ioctl(struct inode *inode, struct file *file, unsigned int
 			vv->ov_fb.fmt.bytesperline =
 				vv->ov_fb.fmt.width*fmt->depth/8;
 
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 
 		return 0;
 	}
@@ -1086,7 +1086,7 @@ int saa7146_video_do_ioctl(struct inode *inode, struct file *file, unsigned int
 			}
 		}
 
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 
 		for(i = 0; i < dev->ext_vv_data->num_stds; i++)
 			if (*id & dev->ext_vv_data->stds[i].id)
@@ -1098,7 +1098,7 @@ int saa7146_video_do_ioctl(struct inode *inode, struct file *file, unsigned int
 			found = 1;
 		}
 
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 
 		if (vv->ov_suspend != NULL) {
 			saa7146_start_preview(vv->ov_suspend);
@@ -1201,11 +1201,11 @@ int saa7146_video_do_ioctl(struct inode *inode, struct file *file, unsigned int
 		DEB_D(("VIDIOCGMBUF \n"));
 
 		q = &fh->video_q;
-		down(&q->lock);
+		mutex_lock(&q->lock);
 		err = videobuf_mmap_setup(q,gbuffers,gbufsize,
 					  V4L2_MEMORY_MMAP);
 		if (err < 0) {
-			up(&q->lock);
+			mutex_unlock(&q->lock);
 			return err;
 		}
 		memset(mbuf,0,sizeof(*mbuf));
@@ -1213,7 +1213,7 @@ int saa7146_video_do_ioctl(struct inode *inode, struct file *file, unsigned int
 		mbuf->size   = gbuffers * gbufsize;
 		for (i = 0; i < gbuffers; i++)
 			mbuf->offsets[i] = i * gbufsize;
-		up(&q->lock);
+		mutex_unlock(&q->lock);
 		return 0;
 	}
 	default:
@@ -1414,7 +1414,7 @@ static int video_open(struct saa7146_dev *dev, struct file *file)
 			    sizeof(struct saa7146_buf),
 			    file);
 
-	init_MUTEX(&fh->video_q.lock);
+	mutex_init(&fh->video_q.lock);
 
 	return 0;
 }
diff --git a/drivers/media/dvb/b2c2/flexcop-common.h b/drivers/media/dvb/b2c2/flexcop-common.h
index 7d7e1613c5a7..b3dd0603cd92 100644
--- a/drivers/media/dvb/b2c2/flexcop-common.h
+++ b/drivers/media/dvb/b2c2/flexcop-common.h
@@ -10,6 +10,7 @@
 
 #include <linux/config.h>
 #include <linux/pci.h>
+#include <linux/mutex.h>
 
 #include "flexcop-reg.h"
 
@@ -73,8 +74,7 @@ struct flexcop_device {
 	int (*fe_sleep) (struct dvb_frontend *);
 
 	struct i2c_adapter i2c_adap;
-	struct semaphore i2c_sem;
-
+	struct mutex i2c_mutex;
 	struct module *owner;
 
 	/* options and status */
diff --git a/drivers/media/dvb/b2c2/flexcop-i2c.c b/drivers/media/dvb/b2c2/flexcop-i2c.c
index 56495cb6cd02..e0bd2d8f0f0c 100644
--- a/drivers/media/dvb/b2c2/flexcop-i2c.c
+++ b/drivers/media/dvb/b2c2/flexcop-i2c.c
@@ -135,7 +135,7 @@ static int flexcop_master_xfer(struct i2c_adapter *i2c_adap, struct i2c_msg msgs
 	struct flexcop_device *fc = i2c_get_adapdata(i2c_adap);
 	int i, ret = 0;
 
-	if (down_interruptible(&fc->i2c_sem))
+	if (mutex_lock_interruptible(&fc->i2c_mutex))
 		return -ERESTARTSYS;
 
 	/* reading */
@@ -161,7 +161,7 @@ static int flexcop_master_xfer(struct i2c_adapter *i2c_adap, struct i2c_msg msgs
 	else
 		ret = num;
 
-	up(&fc->i2c_sem);
+	mutex_unlock(&fc->i2c_mutex);
 
 	return ret;
 }
@@ -180,7 +180,7 @@ int flexcop_i2c_init(struct flexcop_device *fc)
 {
 	int ret;
 
-	sema_init(&fc->i2c_sem,1);
+	mutex_init(&fc->i2c_mutex);
 
 	memset(&fc->i2c_adap, 0, sizeof(struct i2c_adapter));
 	strncpy(fc->i2c_adap.name, "B2C2 FlexCop device",I2C_NAME_SIZE);
diff --git a/drivers/media/dvb/bt8xx/bt878.c b/drivers/media/dvb/bt8xx/bt878.c
index 34c3189a1a33..d276ce6b3661 100644
--- a/drivers/media/dvb/bt8xx/bt878.c
+++ b/drivers/media/dvb/bt8xx/bt878.c
@@ -344,7 +344,7 @@ bt878_device_control(struct bt878 *bt, unsigned int cmd, union dst_gpio_packet *
 	int retval;
 
 	retval = 0;
-	if (down_interruptible (&bt->gpio_lock))
+	if (mutex_lock_interruptible(&bt->gpio_lock))
 		return -ERESTARTSYS;
 	/* special gpio signal */
 	switch (cmd) {
@@ -375,7 +375,7 @@ bt878_device_control(struct bt878 *bt, unsigned int cmd, union dst_gpio_packet *
 		retval = -EINVAL;
 		break;
 	}
-	up(&bt->gpio_lock);
+	mutex_unlock(&bt->gpio_lock);
 	return retval;
 }
 
diff --git a/drivers/media/dvb/bt8xx/bt878.h b/drivers/media/dvb/bt8xx/bt878.h
index 9faf93770d08..f685bc129609 100644
--- a/drivers/media/dvb/bt8xx/bt878.h
+++ b/drivers/media/dvb/bt8xx/bt878.h
@@ -25,6 +25,8 @@
 #include <linux/pci.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
+#include <linux/mutex.h>
+
 #include "bt848.h"
 #include "bttv.h"
 
@@ -108,7 +110,7 @@ struct cards {
 extern int bt878_num;
 
 struct bt878 {
-	struct semaphore  gpio_lock;
+	struct mutex gpio_lock;
 	unsigned int nr;
 	unsigned int bttv_nr;
 	struct i2c_adapter *adapter;
diff --git a/drivers/media/dvb/bt8xx/dst.c b/drivers/media/dvb/bt8xx/dst.c
index 3a2ff1cc24b7..d800df1212c5 100644
--- a/drivers/media/dvb/bt8xx/dst.c
+++ b/drivers/media/dvb/bt8xx/dst.c
@@ -910,7 +910,7 @@ static int dst_get_device_id(struct dst_state *state)
 
 static int dst_probe(struct dst_state *state)
 {
-	sema_init(&state->dst_mutex, 1);
+	mutex_init(&state->dst_mutex);
 	if ((rdc_8820_reset(state)) < 0) {
 		dprintk(verbose, DST_ERROR, 1, "RDC 8820 RESET Failed.");
 		return -1;
@@ -962,7 +962,7 @@ int dst_command(struct dst_state *state, u8 *data, u8 len)
 {
 	u8 reply;
 
-	down(&state->dst_mutex);
+	mutex_lock(&state->dst_mutex);
 	if ((dst_comm_init(state)) < 0) {
 		dprintk(verbose, DST_NOTICE, 1, "DST Communication Initialization Failed.");
 		goto error;
@@ -1013,11 +1013,11 @@ int dst_command(struct dst_state *state, u8 *data, u8 len)
 		dprintk(verbose, DST_INFO, 1, "checksum failure");
 		goto error;
 	}
-	up(&state->dst_mutex);
+	mutex_unlock(&state->dst_mutex);
 	return 0;
 
 error:
-	up(&state->dst_mutex);
+	mutex_unlock(&state->dst_mutex);
 	return -EIO;
 
 }
@@ -1128,7 +1128,7 @@ static int dst_write_tuna(struct dvb_frontend *fe)
 			dst_set_voltage(fe, SEC_VOLTAGE_13);
 	}
 	state->diseq_flags &= ~(HAS_LOCK | ATTEMPT_TUNE);
-	down(&state->dst_mutex);
+	mutex_lock(&state->dst_mutex);
 	if ((dst_comm_init(state)) < 0) {
 		dprintk(verbose, DST_DEBUG, 1, "DST Communication initialization failed.");
 		goto error;
@@ -1160,11 +1160,11 @@ static int dst_write_tuna(struct dvb_frontend *fe)
 	state->diseq_flags |= ATTEMPT_TUNE;
 	retval = dst_get_tuna(state);
 werr:
-	up(&state->dst_mutex);
+	mutex_unlock(&state->dst_mutex);
 	return retval;
 
 error:
-	up(&state->dst_mutex);
+	mutex_unlock(&state->dst_mutex);
 	return -EIO;
 }
 
diff --git a/drivers/media/dvb/bt8xx/dst_ca.c b/drivers/media/dvb/bt8xx/dst_ca.c
index c650b4bf7f5f..f6b49a801eba 100644
--- a/drivers/media/dvb/bt8xx/dst_ca.c
+++ b/drivers/media/dvb/bt8xx/dst_ca.c
@@ -81,7 +81,7 @@ static int dst_ci_command(struct dst_state* state, u8 * data, u8 *ca_string, u8
 {
 	u8 reply;
 
-	down(&state->dst_mutex);
+	mutex_lock(&state->dst_mutex);
 	dst_comm_init(state);
 	msleep(65);
 
@@ -110,11 +110,11 @@ static int dst_ci_command(struct dst_state* state, u8 * data, u8 *ca_string, u8
 			goto error;
 		}
 	}
-	up(&state->dst_mutex);
+	mutex_unlock(&state->dst_mutex);
 	return 0;
 
 error:
-	up(&state->dst_mutex);
+	mutex_unlock(&state->dst_mutex);
 	return -EIO;
 }
 
diff --git a/drivers/media/dvb/bt8xx/dst_common.h b/drivers/media/dvb/bt8xx/dst_common.h
index 81557f38fe38..51d4e043716c 100644
--- a/drivers/media/dvb/bt8xx/dst_common.h
+++ b/drivers/media/dvb/bt8xx/dst_common.h
@@ -25,6 +25,7 @@
 #include <linux/smp_lock.h>
 #include <linux/dvb/frontend.h>
 #include <linux/device.h>
+#include <linux/mutex.h>
 #include "bt878.h"
 
 #include "dst_ca.h"
@@ -121,7 +122,7 @@ struct dst_state {
 	u8 vendor[8];
 	u8 board_info[8];
 
-	struct semaphore dst_mutex;
+	struct mutex dst_mutex;
 };
 
 struct dst_types {
diff --git a/drivers/media/dvb/bt8xx/dvb-bt8xx.c b/drivers/media/dvb/bt8xx/dvb-bt8xx.c
index ea27b15007e9..1649846f9ceb 100644
--- a/drivers/media/dvb/bt8xx/dvb-bt8xx.c
+++ b/drivers/media/dvb/bt8xx/dvb-bt8xx.c
@@ -76,13 +76,13 @@ static int dvb_bt8xx_start_feed(struct dvb_demux_feed *dvbdmxfeed)
 	if (!dvbdmx->dmx.frontend)
 		return -EINVAL;
 
-	down(&card->lock);
+	mutex_lock(&card->lock);
 	card->nfeeds++;
 	rc = card->nfeeds;
 	if (card->nfeeds == 1)
 		bt878_start(card->bt, card->gpio_mode,
 			    card->op_sync_orin, card->irq_err_ignore);
-	up(&card->lock);
+	mutex_unlock(&card->lock);
 	return rc;
 }
 
@@ -96,11 +96,11 @@ static int dvb_bt8xx_stop_feed(struct dvb_demux_feed *dvbdmxfeed)
 	if (!dvbdmx->dmx.frontend)
 		return -EINVAL;
 
-	down(&card->lock);
+	mutex_lock(&card->lock);
 	card->nfeeds--;
 	if (card->nfeeds == 0)
 		bt878_stop(card->bt);
-	up(&card->lock);
+	mutex_unlock(&card->lock);
 
 	return 0;
 }
@@ -788,7 +788,7 @@ static int dvb_bt8xx_probe(struct bttv_sub_device *sub)
 	if (!(card = kzalloc(sizeof(struct dvb_bt8xx_card), GFP_KERNEL)))
 		return -ENOMEM;
 
-	init_MUTEX(&card->lock);
+	mutex_init(&card->lock);
 	card->bttv_nr = sub->core->nr;
 	strncpy(card->card_name, sub->core->name, sizeof(sub->core->name));
 	card->i2c_adapter = &sub->core->i2c_adap;
@@ -881,7 +881,7 @@ static int dvb_bt8xx_probe(struct bttv_sub_device *sub)
 		return -EFAULT;
 	}
 
-	init_MUTEX(&card->bt->gpio_lock);
+	mutex_init(&card->bt->gpio_lock);
 	card->bt->bttv_nr = sub->core->nr;
 
 	if ( (ret = dvb_bt8xx_load_card(card, sub->core->type)) ) {
diff --git a/drivers/media/dvb/bt8xx/dvb-bt8xx.h b/drivers/media/dvb/bt8xx/dvb-bt8xx.h
index cf035a80361c..00dd9fa54c82 100644
--- a/drivers/media/dvb/bt8xx/dvb-bt8xx.h
+++ b/drivers/media/dvb/bt8xx/dvb-bt8xx.h
@@ -26,6 +26,7 @@
 #define DVB_BT8XX_H
 
 #include <linux/i2c.h>
+#include <linux/mutex.h>
 #include "dvbdev.h"
 #include "dvb_net.h"
 #include "bttv.h"
@@ -38,7 +39,7 @@
 #include "lgdt330x.h"
 
 struct dvb_bt8xx_card {
-	struct semaphore lock;
+	struct mutex lock;
 	int nfeeds;
 	char card_name[32];
 	struct dvb_adapter dvb_adapter;
diff --git a/drivers/media/dvb/cinergyT2/cinergyT2.c b/drivers/media/dvb/cinergyT2/cinergyT2.c
index c4b4c5b6b7c8..29b7be5271d4 100644
--- a/drivers/media/dvb/cinergyT2/cinergyT2.c
+++ b/drivers/media/dvb/cinergyT2/cinergyT2.c
@@ -30,6 +30,7 @@
 #include <linux/pci.h>
 #include <linux/input.h>
 #include <linux/dvb/frontend.h>
+#include <linux/mutex.h>
 
 #include "dmxdev.h"
 #include "dvb_demux.h"
@@ -116,7 +117,7 @@ static struct dvb_frontend_info cinergyt2_fe_info = {
 struct cinergyt2 {
 	struct dvb_demux demux;
 	struct usb_device *udev;
-	struct semaphore sem;
+	struct mutex sem;
 	struct dvb_adapter adapter;
 	struct dvb_device *fedev;
 	struct dmxdev dmxdev;
@@ -345,14 +346,14 @@ static int cinergyt2_start_feed(struct dvb_demux_feed *dvbdmxfeed)
 	struct dvb_demux *demux = dvbdmxfeed->demux;
 	struct cinergyt2 *cinergyt2 = demux->priv;
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return -ERESTARTSYS;
 
 	if (cinergyt2->streaming == 0)
 		cinergyt2_start_stream_xfer(cinergyt2);
 
 	cinergyt2->streaming++;
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 	return 0;
 }
 
@@ -361,13 +362,13 @@ static int cinergyt2_stop_feed(struct dvb_demux_feed *dvbdmxfeed)
 	struct dvb_demux *demux = dvbdmxfeed->demux;
 	struct cinergyt2 *cinergyt2 = demux->priv;
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return -ERESTARTSYS;
 
 	if (--cinergyt2->streaming == 0)
 		cinergyt2_stop_stream_xfer(cinergyt2);
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 	return 0;
 }
 
@@ -483,11 +484,11 @@ static int cinergyt2_open (struct inode *inode, struct file *file)
 	struct cinergyt2 *cinergyt2 = dvbdev->priv;
 	int err = -ERESTARTSYS;
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return -ERESTARTSYS;
 
 	if ((err = dvb_generic_open(inode, file))) {
-		up(&cinergyt2->sem);
+		mutex_unlock(&cinergyt2->sem);
 		return err;
 	}
 
@@ -499,7 +500,7 @@ static int cinergyt2_open (struct inode *inode, struct file *file)
 
 	atomic_inc(&cinergyt2->inuse);
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 	return 0;
 }
 
@@ -517,7 +518,7 @@ static int cinergyt2_release (struct inode *inode, struct file *file)
 	struct dvb_device *dvbdev = file->private_data;
 	struct cinergyt2 *cinergyt2 = dvbdev->priv;
 
-	if (down_interruptible(&cinergyt2->sem))
+	if (mutex_lock_interruptible(&cinergyt2->sem))
 		return -ERESTARTSYS;
 
 	if (!cinergyt2->disconnect_pending && (file->f_flags & O_ACCMODE) != O_RDONLY) {
@@ -526,7 +527,7 @@ static int cinergyt2_release (struct inode *inode, struct file *file)
 		cinergyt2_sleep(cinergyt2, 1);
 	}
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 
 	if (atomic_dec_and_test(&cinergyt2->inuse) && cinergyt2->disconnect_pending) {
 		warn("delayed unregister in release");
@@ -541,12 +542,12 @@ static unsigned int cinergyt2_poll (struct file *file, struct poll_table_struct
 	struct dvb_device *dvbdev = file->private_data;
 	struct cinergyt2 *cinergyt2 = dvbdev->priv;
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return -ERESTARTSYS;
 
 	poll_wait(file, &cinergyt2->poll_wq, wait);
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 
 	return (POLLIN | POLLRDNORM | POLLPRI);
 }
@@ -613,7 +614,7 @@ static int cinergyt2_ioctl (struct inode *inode, struct file *file,
 		if (copy_from_user(&p, (void  __user*) arg, sizeof(p)))
 			return -EFAULT;
 
-		if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+		if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 			return -ERESTARTSYS;
 
 		param->cmd = CINERGYT2_EP1_SET_TUNER_PARAMETERS;
@@ -629,7 +630,7 @@ static int cinergyt2_ioctl (struct inode *inode, struct file *file,
 					(char *) param, sizeof(*param),
 					NULL, 0);
 
-		up(&cinergyt2->sem);
+		mutex_unlock(&cinergyt2->sem);
 
 		return (err < 0) ? err : 0;
 	}
@@ -724,7 +725,7 @@ static void cinergyt2_query_rc (void *data)
 	struct cinergyt2_rc_event rc_events[12];
 	int n, len, i;
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return;
 
 	len = cinergyt2_command(cinergyt2, buf, sizeof(buf),
@@ -784,7 +785,7 @@ out:
 	schedule_delayed_work(&cinergyt2->rc_query_work,
 			      msecs_to_jiffies(RC_QUERY_INTERVAL));
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 }
 
 static int cinergyt2_register_rc(struct cinergyt2 *cinergyt2)
@@ -849,7 +850,7 @@ static void cinergyt2_query (void *data)
 	uint8_t lock_bits;
 	uint32_t unc;
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return;
 
 	unc = s->uncorrected_block_count;
@@ -868,7 +869,7 @@ static void cinergyt2_query (void *data)
 	schedule_delayed_work(&cinergyt2->query_work,
 			      msecs_to_jiffies(QUERY_INTERVAL));
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 }
 
 static int cinergyt2_probe (struct usb_interface *intf,
@@ -885,7 +886,7 @@ static int cinergyt2_probe (struct usb_interface *intf,
 	memset (cinergyt2, 0, sizeof (struct cinergyt2));
 	usb_set_intfdata (intf, (void *) cinergyt2);
 
-	init_MUTEX(&cinergyt2->sem);
+	mutex_init(&cinergyt2->sem);
 	init_waitqueue_head (&cinergyt2->poll_wq);
 	INIT_WORK(&cinergyt2->query_work, cinergyt2_query, cinergyt2);
 
@@ -967,7 +968,7 @@ static int cinergyt2_suspend (struct usb_interface *intf, pm_message_t state)
 {
 	struct cinergyt2 *cinergyt2 = usb_get_intfdata (intf);
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return -ERESTARTSYS;
 
 	if (state.event > PM_EVENT_ON) {
@@ -981,7 +982,7 @@ static int cinergyt2_suspend (struct usb_interface *intf, pm_message_t state)
 		cinergyt2_sleep(cinergyt2, 1);
 	}
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 	return 0;
 }
 
@@ -990,7 +991,7 @@ static int cinergyt2_resume (struct usb_interface *intf)
 	struct cinergyt2 *cinergyt2 = usb_get_intfdata (intf);
 	struct dvbt_set_parameters_msg *param = &cinergyt2->param;
 
-	if (cinergyt2->disconnect_pending || down_interruptible(&cinergyt2->sem))
+	if (cinergyt2->disconnect_pending || mutex_lock_interruptible(&cinergyt2->sem))
 		return -ERESTARTSYS;
 
 	if (!cinergyt2->sleeping) {
@@ -1003,7 +1004,7 @@ static int cinergyt2_resume (struct usb_interface *intf)
 
 	cinergyt2_resume_rc(cinergyt2);
 
-	up(&cinergyt2->sem);
+	mutex_unlock(&cinergyt2->sem);
 	return 0;
 }
 
diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c
index 7b8373ad121b..ead5343d7706 100644
--- a/drivers/media/dvb/dvb-core/dmxdev.c
+++ b/drivers/media/dvb/dvb-core/dmxdev.c
@@ -175,12 +175,12 @@ static int dvb_dvr_open(struct inode *inode, struct file *file)
 
 	dprintk ("function : %s\n", __FUNCTION__);
 
-	if (down_interruptible (&dmxdev->mutex))
+	if (mutex_lock_interruptible(&dmxdev->mutex))
 		return -ERESTARTSYS;
 
 	if ((file->f_flags&O_ACCMODE)==O_RDWR) {
 		if (!(dmxdev->capabilities&DMXDEV_CAP_DUPLEX)) {
-			up(&dmxdev->mutex);
+			mutex_unlock(&dmxdev->mutex);
 			return -EOPNOTSUPP;
 		}
 	}
@@ -190,7 +190,7 @@ static int dvb_dvr_open(struct inode *inode, struct file *file)
 	      dmxdev->dvr_buffer.size=DVR_BUFFER_SIZE;
 	      dmxdev->dvr_buffer.data=vmalloc(DVR_BUFFER_SIZE);
 	      if (!dmxdev->dvr_buffer.data) {
-		      up(&dmxdev->mutex);
+		      mutex_unlock(&dmxdev->mutex);
 		      return -ENOMEM;
 	      }
 	}
@@ -199,20 +199,20 @@ static int dvb_dvr_open(struct inode *inode, struct file *file)
 		dmxdev->dvr_orig_fe=dmxdev->demux->frontend;
 
 		if (!dmxdev->demux->write) {
-			up(&dmxdev->mutex);
+			mutex_unlock(&dmxdev->mutex);
 			return -EOPNOTSUPP;
 		}
 
 		front=get_fe(dmxdev->demux, DMX_MEMORY_FE);
 
 		if (!front) {
-			up(&dmxdev->mutex);
+			mutex_unlock(&dmxdev->mutex);
 			return -EINVAL;
 		}
 		dmxdev->demux->disconnect_frontend(dmxdev->demux);
 		dmxdev->demux->connect_frontend(dmxdev->demux, front);
 	}
-	up(&dmxdev->mutex);
+	mutex_unlock(&dmxdev->mutex);
 	return 0;
 }
 
@@ -221,7 +221,7 @@ static int dvb_dvr_release(struct inode *inode, struct file *file)
 	struct dvb_device *dvbdev = file->private_data;
 	struct dmxdev *dmxdev = dvbdev->priv;
 
-	if (down_interruptible (&dmxdev->mutex))
+	if (mutex_lock_interruptible(&dmxdev->mutex))
 		return -ERESTARTSYS;
 
 	if ((file->f_flags&O_ACCMODE)==O_WRONLY) {
@@ -239,7 +239,7 @@ static int dvb_dvr_release(struct inode *inode, struct file *file)
 			vfree(mem);
 		}
 	}
-	up(&dmxdev->mutex);
+	mutex_unlock(&dmxdev->mutex);
 	return 0;
 }
 
@@ -254,10 +254,10 @@ static ssize_t dvb_dvr_write(struct file *file, const char __user *buf,
 		return -EOPNOTSUPP;
 	if ((file->f_flags&O_ACCMODE)!=O_WRONLY)
 		return -EINVAL;
-	if (down_interruptible (&dmxdev->mutex))
+	if (mutex_lock_interruptible(&dmxdev->mutex))
 		return -ERESTARTSYS;
 	ret=dmxdev->demux->write(dmxdev->demux, buf, count);
-	up(&dmxdev->mutex);
+	mutex_unlock(&dmxdev->mutex);
 	return ret;
 }
 
@@ -268,11 +268,11 @@ static ssize_t dvb_dvr_read(struct file *file, char __user *buf, size_t count,
 	struct dmxdev *dmxdev = dvbdev->priv;
 	int ret;
 
-	//down(&dmxdev->mutex);
+	//mutex_lock(&dmxdev->mutex);
 	ret= dvb_dmxdev_buffer_read(&dmxdev->dvr_buffer,
 			      file->f_flags&O_NONBLOCK,
 			      buf, count, ppos);
-	//up(&dmxdev->mutex);
+	//mutex_unlock(&dmxdev->mutex);
 	return ret;
 }
 
@@ -688,7 +688,7 @@ static int dvb_demux_open(struct inode *inode, struct file *file)
 	if (!dmxdev->filter)
 		return -EINVAL;
 
-	if (down_interruptible(&dmxdev->mutex))
+	if (mutex_lock_interruptible(&dmxdev->mutex))
 		return -ERESTARTSYS;
 
 	for (i=0; i<dmxdev->filternum; i++)
@@ -696,12 +696,12 @@ static int dvb_demux_open(struct inode *inode, struct file *file)
 			break;
 
 	if (i==dmxdev->filternum) {
-		up(&dmxdev->mutex);
+		mutex_unlock(&dmxdev->mutex);
 		return -EMFILE;
 	}
 
 	dmxdevfilter=&dmxdev->filter[i];
-	sema_init(&dmxdevfilter->mutex, 1);
+	mutex_init(&dmxdevfilter->mutex);
 	dmxdevfilter->dvbdev=dmxdev->dvbdev;
 	file->private_data=dmxdevfilter;
 
@@ -711,18 +711,18 @@ static int dvb_demux_open(struct inode *inode, struct file *file)
 	dmxdevfilter->feed.ts=NULL;
 	init_timer(&dmxdevfilter->timer);
 
-	up(&dmxdev->mutex);
+	mutex_unlock(&dmxdev->mutex);
 	return 0;
 }
 
 
 static int dvb_dmxdev_filter_free(struct dmxdev *dmxdev, struct dmxdev_filter *dmxdevfilter)
 {
-	if (down_interruptible(&dmxdev->mutex))
+	if (mutex_lock_interruptible(&dmxdev->mutex))
 		return -ERESTARTSYS;
 
-	if (down_interruptible(&dmxdevfilter->mutex)) {
-		up(&dmxdev->mutex);
+	if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+		mutex_unlock(&dmxdev->mutex);
 		return -ERESTARTSYS;
 	}
 
@@ -740,8 +740,8 @@ static int dvb_dmxdev_filter_free(struct dmxdev *dmxdev, struct dmxdev_filter *d
 
 	dvb_dmxdev_filter_state_set(dmxdevfilter, DMXDEV_STATE_FREE);
 	wake_up(&dmxdevfilter->buffer.queue);
-	up(&dmxdevfilter->mutex);
-	up(&dmxdev->mutex);
+	mutex_unlock(&dmxdevfilter->mutex);
+	mutex_unlock(&dmxdev->mutex);
 	return 0;
 }
 
@@ -841,7 +841,7 @@ dvb_demux_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	struct dmxdev_filter *dmxdevfilter= file->private_data;
 	int ret=0;
 
-	if (down_interruptible(&dmxdevfilter->mutex))
+	if (mutex_lock_interruptible(&dmxdevfilter->mutex))
 		return -ERESTARTSYS;
 
 	if (dmxdevfilter->type==DMXDEV_TYPE_SEC)
@@ -851,7 +851,7 @@ dvb_demux_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 				     file->f_flags&O_NONBLOCK,
 				     buf, count, ppos);
 
-	up(&dmxdevfilter->mutex);
+	mutex_unlock(&dmxdevfilter->mutex);
 	return ret;
 }
 
@@ -864,58 +864,58 @@ static int dvb_demux_do_ioctl(struct inode *inode, struct file *file,
 	unsigned long arg=(unsigned long) parg;
 	int ret=0;
 
-	if (down_interruptible (&dmxdev->mutex))
+	if (mutex_lock_interruptible(&dmxdev->mutex))
 		return -ERESTARTSYS;
 
 	switch (cmd) {
 	case DMX_START:
-		if (down_interruptible(&dmxdevfilter->mutex)) {
-			up(&dmxdev->mutex);
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
 			return -ERESTARTSYS;
 		}
 		if (dmxdevfilter->state<DMXDEV_STATE_SET)
 			ret = -EINVAL;
 		else
 			ret = dvb_dmxdev_filter_start(dmxdevfilter);
-		up(&dmxdevfilter->mutex);
+		mutex_unlock(&dmxdevfilter->mutex);
 		break;
 
 	case DMX_STOP:
-		if (down_interruptible(&dmxdevfilter->mutex)) {
-			up(&dmxdev->mutex);
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
 			return -ERESTARTSYS;
 		}
 		ret=dvb_dmxdev_filter_stop(dmxdevfilter);
-		up(&dmxdevfilter->mutex);
+		mutex_unlock(&dmxdevfilter->mutex);
 		break;
 
 	case DMX_SET_FILTER:
-		if (down_interruptible(&dmxdevfilter->mutex)) {
-			up(&dmxdev->mutex);
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
 			return -ERESTARTSYS;
 		}
 		ret = dvb_dmxdev_filter_set(dmxdev, dmxdevfilter,
 				    (struct dmx_sct_filter_params *)parg);
-		up(&dmxdevfilter->mutex);
+		mutex_unlock(&dmxdevfilter->mutex);
 		break;
 
 	case DMX_SET_PES_FILTER:
-		if (down_interruptible(&dmxdevfilter->mutex)) {
-			up(&dmxdev->mutex);
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
 			return -ERESTARTSYS;
 		}
 		ret=dvb_dmxdev_pes_filter_set(dmxdev, dmxdevfilter,
 					       (struct dmx_pes_filter_params *)parg);
-		up(&dmxdevfilter->mutex);
+		mutex_unlock(&dmxdevfilter->mutex);
 		break;
 
 	case DMX_SET_BUFFER_SIZE:
-		if (down_interruptible(&dmxdevfilter->mutex)) {
-			up(&dmxdev->mutex);
+		if (mutex_lock_interruptible(&dmxdevfilter->mutex)) {
+			mutex_unlock(&dmxdev->mutex);
 			return -ERESTARTSYS;
 		}
 		ret=dvb_dmxdev_set_buffer_size(dmxdevfilter, arg);
-		up(&dmxdevfilter->mutex);
+		mutex_unlock(&dmxdevfilter->mutex);
 		break;
 
 	case DMX_GET_EVENT:
@@ -959,7 +959,7 @@ static int dvb_demux_do_ioctl(struct inode *inode, struct file *file,
 	default:
 		ret=-EINVAL;
 	}
-	up(&dmxdev->mutex);
+	mutex_unlock(&dmxdev->mutex);
 	return ret;
 }
 
@@ -1030,7 +1030,7 @@ static int dvb_dvr_do_ioctl(struct inode *inode, struct file *file,
 
 	int ret=0;
 
-	if (down_interruptible (&dmxdev->mutex))
+	if (mutex_lock_interruptible(&dmxdev->mutex))
 		return -ERESTARTSYS;
 
 	switch (cmd) {
@@ -1042,7 +1042,7 @@ static int dvb_dvr_do_ioctl(struct inode *inode, struct file *file,
 	default:
 		ret=-EINVAL;
 	}
-	up(&dmxdev->mutex);
+	mutex_unlock(&dmxdev->mutex);
 	return ret;
 }
 
@@ -1113,7 +1113,7 @@ dvb_dmxdev_init(struct dmxdev *dmxdev, struct dvb_adapter *dvb_adapter)
 		return -ENOMEM;
 	}
 
-	sema_init(&dmxdev->mutex, 1);
+	mutex_init(&dmxdev->mutex);
 	spin_lock_init(&dmxdev->lock);
 	for (i=0; i<dmxdev->filternum; i++) {
 		dmxdev->filter[i].dev=dmxdev;
diff --git a/drivers/media/dvb/dvb-core/dmxdev.h b/drivers/media/dvb/dvb-core/dmxdev.h
index fd72920c2199..ec2a7a4da5e4 100644
--- a/drivers/media/dvb/dvb-core/dmxdev.h
+++ b/drivers/media/dvb/dvb-core/dmxdev.h
@@ -30,7 +30,7 @@
 #include <linux/wait.h>
 #include <linux/fs.h>
 #include <linux/string.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #include <linux/dvb/dmx.h>
 
@@ -83,7 +83,7 @@ struct dmxdev_filter {
 	struct dmxdev *dev;
 	struct dmxdev_buffer buffer;
 
-	struct semaphore mutex;
+	struct mutex mutex;
 
 	/* only for sections */
 	struct timer_list timer;
@@ -117,7 +117,7 @@ struct dmxdev {
 	struct dmxdev_buffer dvr_buffer;
 #define DVR_BUFFER_SIZE (10*188*1024)
 
-	struct semaphore mutex;
+	struct mutex mutex;
 	spinlock_t lock;
 };
 
diff --git a/drivers/media/dvb/dvb-core/dvb_demux.c b/drivers/media/dvb/dvb-core/dvb_demux.c
index b4c899b15959..83ec5e06c482 100644
--- a/drivers/media/dvb/dvb-core/dvb_demux.c
+++ b/drivers/media/dvb/dvb-core/dvb_demux.c
@@ -589,18 +589,18 @@ static int dmx_ts_feed_set(struct dmx_ts_feed *ts_feed, u16 pid, int ts_type,
 	if (pid > DMX_MAX_PID)
 		return -EINVAL;
 
-	if (down_interruptible(&demux->mutex))
+	if (mutex_lock_interruptible(&demux->mutex))
 		return -ERESTARTSYS;
 
 	if (ts_type & TS_DECODER) {
 		if (pes_type >= DMX_TS_PES_OTHER) {
-			up(&demux->mutex);
+			mutex_unlock(&demux->mutex);
 			return -EINVAL;
 		}
 
 		if (demux->pesfilter[pes_type] &&
 		    demux->pesfilter[pes_type] != feed) {
-			up(&demux->mutex);
+			mutex_unlock(&demux->mutex);
 			return -EINVAL;
 		}
 
@@ -622,14 +622,14 @@ static int dmx_ts_feed_set(struct dmx_ts_feed *ts_feed, u16 pid, int ts_type,
 #else
 		feed->buffer = vmalloc(feed->buffer_size);
 		if (!feed->buffer) {
-			up(&demux->mutex);
+			mutex_unlock(&demux->mutex);
 			return -ENOMEM;
 		}
 #endif
 	}
 
 	feed->state = DMX_STATE_READY;
-	up(&demux->mutex);
+	mutex_unlock(&demux->mutex);
 
 	return 0;
 }
@@ -640,21 +640,21 @@ static int dmx_ts_feed_start_filtering(struct dmx_ts_feed *ts_feed)
 	struct dvb_demux *demux = feed->demux;
 	int ret;
 
-	if (down_interruptible(&demux->mutex))
+	if (mutex_lock_interruptible(&demux->mutex))
 		return -ERESTARTSYS;
 
 	if (feed->state != DMX_STATE_READY || feed->type != DMX_TYPE_TS) {
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return -EINVAL;
 	}
 
 	if (!demux->start_feed) {
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return -ENODEV;
 	}
 
 	if ((ret = demux->start_feed(feed)) < 0) {
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return ret;
 	}
 
@@ -662,7 +662,7 @@ static int dmx_ts_feed_start_filtering(struct dmx_ts_feed *ts_feed)
 	ts_feed->is_filtering = 1;
 	feed->state = DMX_STATE_GO;
 	spin_unlock_irq(&demux->lock);
-	up(&demux->mutex);
+	mutex_unlock(&demux->mutex);
 
 	return 0;
 }
@@ -673,16 +673,16 @@ static int dmx_ts_feed_stop_filtering(struct dmx_ts_feed *ts_feed)
 	struct dvb_demux *demux = feed->demux;
 	int ret;
 
-	if (down_interruptible(&demux->mutex))
+	if (mutex_lock_interruptible(&demux->mutex))
 		return -ERESTARTSYS;
 
 	if (feed->state < DMX_STATE_GO) {
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return -EINVAL;
 	}
 
 	if (!demux->stop_feed) {
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return -ENODEV;
 	}
 
@@ -692,7 +692,7 @@ static int dmx_ts_feed_stop_filtering(struct dmx_ts_feed *ts_feed)
 	ts_feed->is_filtering = 0;
 	feed->state = DMX_STATE_ALLOCATED;
 	spin_unlock_irq(&demux->lock);
-	up(&demux->mutex);
+	mutex_unlock(&demux->mutex);
 
 	return ret;
 }
@@ -704,11 +704,11 @@ static int dvbdmx_allocate_ts_feed(struct dmx_demux *dmx,
 	struct dvb_demux *demux = (struct dvb_demux *)dmx;
 	struct dvb_demux_feed *feed;
 
-	if (down_interruptible(&demux->mutex))
+	if (mutex_lock_interruptible(&demux->mutex))
 		return -ERESTARTSYS;
 
 	if (!(feed = dvb_dmx_feed_alloc(demux))) {
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return -EBUSY;
 	}
 
@@ -729,7 +729,7 @@ static int dvbdmx_allocate_ts_feed(struct dmx_demux *dmx,
 
 	if (!(feed->filter = dvb_dmx_filter_alloc(demux))) {
 		feed->state = DMX_STATE_FREE;
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return -EBUSY;
 	}
 
@@ -737,7 +737,7 @@ static int dvbdmx_allocate_ts_feed(struct dmx_demux *dmx,
 	feed->filter->feed = feed;
 	feed->filter->state = DMX_STATE_READY;
 
-	up(&demux->mutex);
+	mutex_unlock(&demux->mutex);
 
 	return 0;
 }
@@ -748,11 +748,11 @@ static int dvbdmx_release_ts_feed(struct dmx_demux *dmx,
 	struct dvb_demux *demux = (struct dvb_demux *)dmx;
 	struct dvb_demux_feed *feed = (struct dvb_demux_feed *)ts_feed;
 
-	if (down_interruptible(&demux->mutex))
+	if (mutex_lock_interruptible(&demux->mutex))
 		return -ERESTARTSYS;
 
 	if (feed->state == DMX_STATE_FREE) {
-		up(&demux->mutex);
+		mutex_unlock(&demux->mutex);
 		return -EINVAL;
 	}
 #ifndef NOBUFS
@@ -770,7 +770,7 @@ static int dvbdmx_release_ts_feed(struct dmx_demux *dmx,
 	if (feed->ts_type & TS_DECODER && feed->pes_type < DMX_TS_PES_OTHER)
 		demux->pesfilter[feed->pes_type] = NULL;
 
-	up(&demux->mutex);
+	mutex_unlock(&demux->mutex);
 	return 0;
 }
 
@@ -785,12 +785,12 @@ static int dmx_section_feed_allocate_filter(struct dmx_section_feed *feed,
 	struct dvb_demux *dvbdemux = dvbdmxfeed->demux;
 	struct dvb_demux_filter *dvbdmxfilter;
 
-	if (down_interruptible(&dvbdemux->mutex))
+	if (mutex_lock_interruptible(&dvbdemux->mutex))
 		return -ERESTARTSYS;
 
 	dvbdmxfilter = dvb_dmx_filter_alloc(dvbdemux);
 	if (!dvbdmxfilter) {
-		up(&dvbdemux->mutex);
+		mutex_unlock(&dvbdemux->mutex);
 		return -EBUSY;
 	}
 
@@ -805,7 +805,7 @@ static int dmx_section_feed_allocate_filter(struct dmx_section_feed *feed,
 	dvbdmxfeed->filter = dvbdmxfilter;
 	spin_unlock_irq(&dvbdemux->lock);
 
-	up(&dvbdemux->mutex);
+	mutex_unlock(&dvbdemux->mutex);
 	return 0;
 }
 
@@ -819,7 +819,7 @@ static int dmx_section_feed_set(struct dmx_section_feed *feed,
 	if (pid > 0x1fff)
 		return -EINVAL;
 
-	if (down_interruptible(&dvbdmx->mutex))
+	if (mutex_lock_interruptible(&dvbdmx->mutex))
 		return -ERESTARTSYS;
 
 	dvb_demux_feed_add(dvbdmxfeed);
@@ -833,13 +833,13 @@ static int dmx_section_feed_set(struct dmx_section_feed *feed,
 #else
 	dvbdmxfeed->buffer = vmalloc(dvbdmxfeed->buffer_size);
 	if (!dvbdmxfeed->buffer) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -ENOMEM;
 	}
 #endif
 
 	dvbdmxfeed->state = DMX_STATE_READY;
-	up(&dvbdmx->mutex);
+	mutex_unlock(&dvbdmx->mutex);
 	return 0;
 }
 
@@ -871,16 +871,16 @@ static int dmx_section_feed_start_filtering(struct dmx_section_feed *feed)
 	struct dvb_demux *dvbdmx = dvbdmxfeed->demux;
 	int ret;
 
-	if (down_interruptible(&dvbdmx->mutex))
+	if (mutex_lock_interruptible(&dvbdmx->mutex))
 		return -ERESTARTSYS;
 
 	if (feed->is_filtering) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -EBUSY;
 	}
 
 	if (!dvbdmxfeed->filter) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -EINVAL;
 	}
 
@@ -890,14 +890,14 @@ static int dmx_section_feed_start_filtering(struct dmx_section_feed *feed)
 	dvbdmxfeed->feed.sec.seclen = 0;
 
 	if (!dvbdmx->start_feed) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -ENODEV;
 	}
 
 	prepare_secfilters(dvbdmxfeed);
 
 	if ((ret = dvbdmx->start_feed(dvbdmxfeed)) < 0) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return ret;
 	}
 
@@ -906,7 +906,7 @@ static int dmx_section_feed_start_filtering(struct dmx_section_feed *feed)
 	dvbdmxfeed->state = DMX_STATE_GO;
 	spin_unlock_irq(&dvbdmx->lock);
 
-	up(&dvbdmx->mutex);
+	mutex_unlock(&dvbdmx->mutex);
 	return 0;
 }
 
@@ -916,11 +916,11 @@ static int dmx_section_feed_stop_filtering(struct dmx_section_feed *feed)
 	struct dvb_demux *dvbdmx = dvbdmxfeed->demux;
 	int ret;
 
-	if (down_interruptible(&dvbdmx->mutex))
+	if (mutex_lock_interruptible(&dvbdmx->mutex))
 		return -ERESTARTSYS;
 
 	if (!dvbdmx->stop_feed) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -ENODEV;
 	}
 
@@ -931,7 +931,7 @@ static int dmx_section_feed_stop_filtering(struct dmx_section_feed *feed)
 	feed->is_filtering = 0;
 	spin_unlock_irq(&dvbdmx->lock);
 
-	up(&dvbdmx->mutex);
+	mutex_unlock(&dvbdmx->mutex);
 	return ret;
 }
 
@@ -942,11 +942,11 @@ static int dmx_section_feed_release_filter(struct dmx_section_feed *feed,
 	struct dvb_demux_feed *dvbdmxfeed = (struct dvb_demux_feed *)feed;
 	struct dvb_demux *dvbdmx = dvbdmxfeed->demux;
 
-	if (down_interruptible(&dvbdmx->mutex))
+	if (mutex_lock_interruptible(&dvbdmx->mutex))
 		return -ERESTARTSYS;
 
 	if (dvbdmxfilter->feed != dvbdmxfeed) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -EINVAL;
 	}
 
@@ -966,7 +966,7 @@ static int dmx_section_feed_release_filter(struct dmx_section_feed *feed,
 
 	dvbdmxfilter->state = DMX_STATE_FREE;
 	spin_unlock_irq(&dvbdmx->lock);
-	up(&dvbdmx->mutex);
+	mutex_unlock(&dvbdmx->mutex);
 	return 0;
 }
 
@@ -977,11 +977,11 @@ static int dvbdmx_allocate_section_feed(struct dmx_demux *demux,
 	struct dvb_demux *dvbdmx = (struct dvb_demux *)demux;
 	struct dvb_demux_feed *dvbdmxfeed;
 
-	if (down_interruptible(&dvbdmx->mutex))
+	if (mutex_lock_interruptible(&dvbdmx->mutex))
 		return -ERESTARTSYS;
 
 	if (!(dvbdmxfeed = dvb_dmx_feed_alloc(dvbdmx))) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -EBUSY;
 	}
 
@@ -1006,7 +1006,7 @@ static int dvbdmx_allocate_section_feed(struct dmx_demux *demux,
 	(*feed)->stop_filtering = dmx_section_feed_stop_filtering;
 	(*feed)->release_filter = dmx_section_feed_release_filter;
 
-	up(&dvbdmx->mutex);
+	mutex_unlock(&dvbdmx->mutex);
 	return 0;
 }
 
@@ -1016,11 +1016,11 @@ static int dvbdmx_release_section_feed(struct dmx_demux *demux,
 	struct dvb_demux_feed *dvbdmxfeed = (struct dvb_demux_feed *)feed;
 	struct dvb_demux *dvbdmx = (struct dvb_demux *)demux;
 
-	if (down_interruptible(&dvbdmx->mutex))
+	if (mutex_lock_interruptible(&dvbdmx->mutex))
 		return -ERESTARTSYS;
 
 	if (dvbdmxfeed->state == DMX_STATE_FREE) {
-		up(&dvbdmx->mutex);
+		mutex_unlock(&dvbdmx->mutex);
 		return -EINVAL;
 	}
 #ifndef NOBUFS
@@ -1033,7 +1033,7 @@ static int dvbdmx_release_section_feed(struct dmx_demux *demux,
 
 	dvbdmxfeed->pid = 0xffff;
 
-	up(&dvbdmx->mutex);
+	mutex_unlock(&dvbdmx->mutex);
 	return 0;
 }
 
@@ -1071,10 +1071,10 @@ static int dvbdmx_write(struct dmx_demux *demux, const char *buf, size_t count)
 	if ((!demux->frontend) || (demux->frontend->source != DMX_MEMORY_FE))
 		return -EINVAL;
 
-	if (down_interruptible(&dvbdemux->mutex))
+	if (mutex_lock_interruptible(&dvbdemux->mutex))
 		return -ERESTARTSYS;
 	dvb_dmx_swfilter(dvbdemux, buf, count);
-	up(&dvbdemux->mutex);
+	mutex_unlock(&dvbdemux->mutex);
 
 	if (signal_pending(current))
 		return -EINTR;
@@ -1126,11 +1126,11 @@ static int dvbdmx_connect_frontend(struct dmx_demux *demux,
 	if (demux->frontend)
 		return -EINVAL;
 
-	if (down_interruptible(&dvbdemux->mutex))
+	if (mutex_lock_interruptible(&dvbdemux->mutex))
 		return -ERESTARTSYS;
 
 	demux->frontend = frontend;
-	up(&dvbdemux->mutex);
+	mutex_unlock(&dvbdemux->mutex);
 	return 0;
 }
 
@@ -1138,11 +1138,11 @@ static int dvbdmx_disconnect_frontend(struct dmx_demux *demux)
 {
 	struct dvb_demux *dvbdemux = (struct dvb_demux *)demux;
 
-	if (down_interruptible(&dvbdemux->mutex))
+	if (mutex_lock_interruptible(&dvbdemux->mutex))
 		return -ERESTARTSYS;
 
 	demux->frontend = NULL;
-	up(&dvbdemux->mutex);
+	mutex_unlock(&dvbdemux->mutex);
 	return 0;
 }
 
@@ -1215,7 +1215,7 @@ int dvb_dmx_init(struct dvb_demux *dvbdemux)
 	dmx->disconnect_frontend = dvbdmx_disconnect_frontend;
 	dmx->get_pes_pids = dvbdmx_get_pes_pids;
 
-	sema_init(&dvbdemux->mutex, 1);
+	mutex_init(&dvbdemux->mutex);
 	spin_lock_init(&dvbdemux->lock);
 
 	return 0;
diff --git a/drivers/media/dvb/dvb-core/dvb_demux.h b/drivers/media/dvb/dvb-core/dvb_demux.h
index 0cc888339d52..2c5f915329ca 100644
--- a/drivers/media/dvb/dvb-core/dvb_demux.h
+++ b/drivers/media/dvb/dvb-core/dvb_demux.h
@@ -26,7 +26,7 @@
 #include <linux/time.h>
 #include <linux/timer.h>
 #include <linux/spinlock.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #include "demux.h"
 
@@ -125,7 +125,7 @@ struct dvb_demux {
 	u8 tsbuf[204];
 	int tsbufp;
 
-	struct semaphore mutex;
+	struct mutex mutex;
 	spinlock_t lock;
 };
 
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index 771f32d889e6..22e96cf8497a 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -37,7 +37,6 @@
 #include <linux/suspend.h>
 #include <linux/jiffies.h>
 #include <asm/processor.h>
-#include <asm/semaphore.h>
 
 #include "dvb_frontend.h"
 #include "dvbdev.h"
@@ -88,7 +87,7 @@ MODULE_PARM_DESC(dvb_powerdown_on_sleep, "0: do not power down, 1: turn LNB vola
  * FESTATE_LOSTLOCK. When the lock has been lost, and we're searching it again.
  */
 
-static DECLARE_MUTEX(frontend_mutex);
+static DEFINE_MUTEX(frontend_mutex);
 
 struct dvb_frontend_private {
 
@@ -1021,12 +1020,12 @@ int dvb_register_frontend(struct dvb_adapter* dvb,
 
 	dprintk ("%s\n", __FUNCTION__);
 
-	if (down_interruptible (&frontend_mutex))
+	if (mutex_lock_interruptible(&frontend_mutex))
 		return -ERESTARTSYS;
 
 	fe->frontend_priv = kzalloc(sizeof(struct dvb_frontend_private), GFP_KERNEL);
 	if (fe->frontend_priv == NULL) {
-		up(&frontend_mutex);
+		mutex_unlock(&frontend_mutex);
 		return -ENOMEM;
 	}
 	fepriv = fe->frontend_priv;
@@ -1045,7 +1044,7 @@ int dvb_register_frontend(struct dvb_adapter* dvb,
 	dvb_register_device (fe->dvb, &fepriv->dvbdev, &dvbdev_template,
 			     fe, DVB_DEVICE_FRONTEND);
 
-	up (&frontend_mutex);
+	mutex_unlock(&frontend_mutex);
 	return 0;
 }
 EXPORT_SYMBOL(dvb_register_frontend);
@@ -1055,7 +1054,7 @@ int dvb_unregister_frontend(struct dvb_frontend* fe)
 	struct dvb_frontend_private *fepriv = fe->frontend_priv;
 	dprintk ("%s\n", __FUNCTION__);
 
-	down (&frontend_mutex);
+	mutex_lock(&frontend_mutex);
 	dvb_unregister_device (fepriv->dvbdev);
 	dvb_frontend_stop (fe);
 	if (fe->ops->release)
@@ -1064,7 +1063,7 @@ int dvb_unregister_frontend(struct dvb_frontend* fe)
 		printk("dvb_frontend: Demodulator (%s) does not have a release callback!\n", fe->ops->info.name);
 	/* fe is invalid now */
 	kfree(fepriv);
-	up (&frontend_mutex);
+	mutex_unlock(&frontend_mutex);
 	return 0;
 }
 EXPORT_SYMBOL(dvb_unregister_frontend);
diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c
index 6711eb6a058c..2f0f35811bf7 100644
--- a/drivers/media/dvb/dvb-core/dvb_net.c
+++ b/drivers/media/dvb/dvb-core/dvb_net.c
@@ -62,6 +62,7 @@
 #include <linux/uio.h>
 #include <asm/uaccess.h>
 #include <linux/crc32.h>
+#include <linux/mutex.h>
 
 #include "dvb_demux.h"
 #include "dvb_net.h"
@@ -151,8 +152,7 @@ struct dvb_net_priv {
 	unsigned char ule_bridged;		/* Whether the ULE_BRIDGED extension header was found. */
 	int ule_sndu_remain;			/* Nr. of bytes still required for current ULE SNDU. */
 	unsigned long ts_count;			/* Current ts cell counter. */
-
-	struct semaphore mutex;
+	struct mutex mutex;
 };
 
 
@@ -889,7 +889,7 @@ static int dvb_net_feed_start(struct net_device *dev)
 	unsigned char *mac = (unsigned char *) dev->dev_addr;
 
 	dprintk("%s: rx_mode %i\n", __FUNCTION__, priv->rx_mode);
-	down(&priv->mutex);
+	mutex_lock(&priv->mutex);
 	if (priv->tsfeed || priv->secfeed || priv->secfilter || priv->multi_secfilter[0])
 		printk("%s: BUG %d\n", __FUNCTION__, __LINE__);
 
@@ -974,7 +974,7 @@ static int dvb_net_feed_start(struct net_device *dev)
 		ret = -EINVAL;
 
 error:
-	up(&priv->mutex);
+	mutex_unlock(&priv->mutex);
 	return ret;
 }
 
@@ -984,7 +984,7 @@ static int dvb_net_feed_stop(struct net_device *dev)
 	int i, ret = 0;
 
 	dprintk("%s\n", __FUNCTION__);
-	down(&priv->mutex);
+	mutex_lock(&priv->mutex);
 	if (priv->feedtype == DVB_NET_FEEDTYPE_MPE) {
 		if (priv->secfeed) {
 			if (priv->secfeed->is_filtering) {
@@ -1026,7 +1026,7 @@ static int dvb_net_feed_stop(struct net_device *dev)
 			printk("%s: no ts feed to stop\n", dev->name);
 	} else
 		ret = -EINVAL;
-	up(&priv->mutex);
+	mutex_unlock(&priv->mutex);
 	return ret;
 }
 
@@ -1208,7 +1208,7 @@ static int dvb_net_add_if(struct dvb_net *dvbnet, u16 pid, u8 feedtype)
 
 	INIT_WORK(&priv->set_multicast_list_wq, wq_set_multicast_list, net);
 	INIT_WORK(&priv->restart_net_feed_wq, wq_restart_net_feed, net);
-	init_MUTEX(&priv->mutex);
+	mutex_init(&priv->mutex);
 
 	net->base_addr = pid;
 
diff --git a/drivers/media/dvb/dvb-usb/cxusb.c b/drivers/media/dvb/dvb-usb/cxusb.c
index f327fac1688e..e62a293c7e5a 100644
--- a/drivers/media/dvb/dvb-usb/cxusb.c
+++ b/drivers/media/dvb/dvb-usb/cxusb.c
@@ -77,7 +77,7 @@ static int cxusb_i2c_xfer(struct i2c_adapter *adap,struct i2c_msg msg[],int num)
 	struct dvb_usb_device *d = i2c_get_adapdata(adap);
 	int i;
 
-	if (down_interruptible(&d->i2c_sem) < 0)
+	if (mutex_lock_interruptible(&d->i2c_mutex) < 0)
 		return -EAGAIN;
 
 	if (num > 2)
@@ -126,7 +126,7 @@ static int cxusb_i2c_xfer(struct i2c_adapter *adap,struct i2c_msg msg[],int num)
 		}
 	}
 
-	up(&d->i2c_sem);
+	mutex_unlock(&d->i2c_mutex);
 	return i;
 }
 
diff --git a/drivers/media/dvb/dvb-usb/dibusb-common.c b/drivers/media/dvb/dvb-usb/dibusb-common.c
index 269d899da488..2d52b76671d3 100644
--- a/drivers/media/dvb/dvb-usb/dibusb-common.c
+++ b/drivers/media/dvb/dvb-usb/dibusb-common.c
@@ -128,7 +128,7 @@ static int dibusb_i2c_xfer(struct i2c_adapter *adap,struct i2c_msg msg[],int num
 	struct dvb_usb_device *d = i2c_get_adapdata(adap);
 	int i;
 
-	if (down_interruptible(&d->i2c_sem) < 0)
+	if (mutex_lock_interruptible(&d->i2c_mutex) < 0)
 		return -EAGAIN;
 
 	if (num > 2)
@@ -146,7 +146,7 @@ static int dibusb_i2c_xfer(struct i2c_adapter *adap,struct i2c_msg msg[],int num
 				break;
 	}
 
-	up(&d->i2c_sem);
+	mutex_unlock(&d->i2c_mutex);
 	return i;
 }
 
diff --git a/drivers/media/dvb/dvb-usb/digitv.c b/drivers/media/dvb/dvb-usb/digitv.c
index caa1346e3063..91136c00ce9d 100644
--- a/drivers/media/dvb/dvb-usb/digitv.c
+++ b/drivers/media/dvb/dvb-usb/digitv.c
@@ -48,7 +48,7 @@ static int digitv_i2c_xfer(struct i2c_adapter *adap,struct i2c_msg msg[],int num
 	struct dvb_usb_device *d = i2c_get_adapdata(adap);
 	int i;
 
-	if (down_interruptible(&d->i2c_sem) < 0)
+	if (mutex_lock_interruptible(&d->i2c_mutex) < 0)
 		return -EAGAIN;
 
 	if (num > 2)
@@ -67,7 +67,7 @@ static int digitv_i2c_xfer(struct i2c_adapter *adap,struct i2c_msg msg[],int num
 				break;
 	}
 
-	up(&d->i2c_sem);
+	mutex_unlock(&d->i2c_mutex);
 	return i;
 }
 
diff --git a/drivers/media/dvb/dvb-usb/dvb-usb-init.c b/drivers/media/dvb/dvb-usb/dvb-usb-init.c
index 716f8bf528cd..4258a995dce1 100644
--- a/drivers/media/dvb/dvb-usb/dvb-usb-init.c
+++ b/drivers/media/dvb/dvb-usb/dvb-usb-init.c
@@ -42,8 +42,8 @@ static int dvb_usb_init(struct dvb_usb_device *d)
 {
 	int ret = 0;
 
-	sema_init(&d->usb_sem, 1);
-	sema_init(&d->i2c_sem, 1);
+	mutex_init(&d->usb_mutex);
+	mutex_init(&d->i2c_mutex);
 
 	d->state = DVB_USB_STATE_INIT;
 
diff --git a/drivers/media/dvb/dvb-usb/dvb-usb-urb.c b/drivers/media/dvb/dvb-usb/dvb-usb-urb.c
index ee821974dc60..9002f35aa952 100644
--- a/drivers/media/dvb/dvb-usb/dvb-usb-urb.c
+++ b/drivers/media/dvb/dvb-usb/dvb-usb-urb.c
@@ -21,7 +21,7 @@ int dvb_usb_generic_rw(struct dvb_usb_device *d, u8 *wbuf, u16 wlen, u8 *rbuf,
 	if (wbuf == NULL || wlen == 0)
 		return -EINVAL;
 
-	if ((ret = down_interruptible(&d->usb_sem)))
+	if ((ret = mutex_lock_interruptible(&d->usb_mutex)))
 		return ret;
 
 	deb_xfer(">>> ");
@@ -53,7 +53,7 @@ int dvb_usb_generic_rw(struct dvb_usb_device *d, u8 *wbuf, u16 wlen, u8 *rbuf,
 		}
 	}
 
-	up(&d->usb_sem);
+	mutex_unlock(&d->usb_mutex);
 	return ret;
 }
 EXPORT_SYMBOL(dvb_usb_generic_rw);
diff --git a/drivers/media/dvb/dvb-usb/dvb-usb.h b/drivers/media/dvb/dvb-usb/dvb-usb.h
index 5e5d21ad93c9..d2be37cc43b7 100644
--- a/drivers/media/dvb/dvb-usb/dvb-usb.h
+++ b/drivers/media/dvb/dvb-usb/dvb-usb.h
@@ -12,6 +12,7 @@
 #include <linux/input.h>
 #include <linux/usb.h>
 #include <linux/firmware.h>
+#include <linux/mutex.h>
 
 #include "dvb_frontend.h"
 #include "dvb_demux.h"
@@ -227,8 +228,8 @@ struct dvb_usb_properties {
  * @feedcount: number of reqested feeds (used for streaming-activation)
  * @pid_filtering: is hardware pid_filtering used or not.
  *
- * @usb_sem: semaphore of USB control messages (reading needs two messages)
- * @i2c_sem: semaphore for i2c-transfers
+ * @usb_mutex: semaphore of USB control messages (reading needs two messages)
+ * @i2c_mutex: semaphore for i2c-transfers
  *
  * @i2c_adap: device's i2c_adapter if it uses I2CoverUSB
  * @pll_addr: I2C address of the tuner for programming
@@ -283,10 +284,10 @@ struct dvb_usb_device {
 	int pid_filtering;
 
 	/* locking */
-	struct semaphore usb_sem;
+	struct mutex usb_mutex;
 
 	/* i2c */
-	struct semaphore i2c_sem;
+	struct mutex i2c_mutex;
 	struct i2c_adapter i2c_adap;
 
 	/* tuner programming information */
diff --git a/drivers/media/dvb/dvb-usb/vp702x.c b/drivers/media/dvb/dvb-usb/vp702x.c
index 4a95eca81c5c..b2f098a2d5f7 100644
--- a/drivers/media/dvb/dvb-usb/vp702x.c
+++ b/drivers/media/dvb/dvb-usb/vp702x.c
@@ -75,7 +75,7 @@ int vp702x_usb_inout_op(struct dvb_usb_device *d, u8 *o, int olen, u8 *i, int il
 {
 	int ret;
 
-	if ((ret = down_interruptible(&d->usb_sem)))
+	if ((ret = mutex_lock_interruptible(&d->usb_mutex)))
 		return ret;
 
 	if ((ret = vp702x_usb_out_op(d,REQUEST_OUT,0,0,o,olen)) < 0)
@@ -84,7 +84,7 @@ int vp702x_usb_inout_op(struct dvb_usb_device *d, u8 *o, int olen, u8 *i, int il
 	ret = vp702x_usb_in_op(d,REQUEST_IN,0,0,i,ilen);
 
 unlock:
-	up(&d->usb_sem);
+	mutex_unlock(&d->usb_mutex);
 
 	return ret;
 }
diff --git a/drivers/media/dvb/dvb-usb/vp7045.c b/drivers/media/dvb/dvb-usb/vp7045.c
index 028204956bb0..7fa656b108cc 100644
--- a/drivers/media/dvb/dvb-usb/vp7045.c
+++ b/drivers/media/dvb/dvb-usb/vp7045.c
@@ -38,7 +38,7 @@ int vp7045_usb_op(struct dvb_usb_device *d, u8 cmd, u8 *out, int outlen, u8 *in,
 	deb_xfer("out buffer: ");
 	debug_dump(outbuf,outlen+1,deb_xfer);
 
-	if ((ret = down_interruptible(&d->usb_sem)))
+	if ((ret = mutex_lock_interruptible(&d->usb_mutex)))
 		return ret;
 
 	if (usb_control_msg(d->udev,
@@ -68,7 +68,7 @@ int vp7045_usb_op(struct dvb_usb_device *d, u8 cmd, u8 *out, int outlen, u8 *in,
 		memcpy(in,&inbuf[1],inlen);
 
 unlock:
-	up(&d->usb_sem);
+	mutex_unlock(&d->usb_mutex);
 
 	return ret;
 }
diff --git a/drivers/media/dvb/frontends/bcm3510.c b/drivers/media/dvb/frontends/bcm3510.c
index caaee893ca76..1708a1d4893e 100644
--- a/drivers/media/dvb/frontends/bcm3510.c
+++ b/drivers/media/dvb/frontends/bcm3510.c
@@ -39,6 +39,7 @@
 #include <linux/jiffies.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 
 #include "dvb_frontend.h"
 #include "bcm3510.h"
@@ -52,7 +53,7 @@ struct bcm3510_state {
 	struct dvb_frontend frontend;
 
 	/* demodulator private data */
-	struct semaphore hab_sem;
+	struct mutex hab_mutex;
 	u8 firmware_loaded:1;
 
 	unsigned long next_status_check;
@@ -213,7 +214,7 @@ static int bcm3510_do_hab_cmd(struct bcm3510_state *st, u8 cmd, u8 msgid, u8 *ob
 	dbufout(ob,olen+2,deb_hab);
 	deb_hab("\n");
 
-	if (down_interruptible(&st->hab_sem) < 0)
+	if (mutex_lock_interruptible(&st->hab_mutex) < 0)
 		return -EAGAIN;
 
 	if ((ret = bcm3510_hab_send_request(st, ob, olen+2)) < 0 ||
@@ -226,7 +227,7 @@ static int bcm3510_do_hab_cmd(struct bcm3510_state *st, u8 cmd, u8 msgid, u8 *ob
 
 	memcpy(ibuf,&ib[2],ilen);
 error:
-	up(&st->hab_sem);
+	mutex_unlock(&st->hab_mutex);
 	return ret;
 }
 
@@ -796,7 +797,7 @@ struct dvb_frontend* bcm3510_attach(const struct bcm3510_config *config,
 	state->frontend.ops = &state->ops;
 	state->frontend.demodulator_priv = state;
 
-	sema_init(&state->hab_sem, 1);
+	mutex_init(&state->hab_mutex);
 
 	if ((ret = bcm3510_readB(state,0xe0,&v)) < 0)
 		goto error;
diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c
index d36369e9e88f..1aa61bf29ad6 100644
--- a/drivers/media/dvb/ttpci/av7110.c
+++ b/drivers/media/dvb/ttpci/av7110.c
@@ -54,7 +54,6 @@
 #include <linux/i2c.h>
 
 #include <asm/system.h>
-#include <asm/semaphore.h>
 
 #include <linux/dvb/frontend.h>
 
@@ -242,10 +241,10 @@ static int arm_thread(void *data)
 		if (!av7110->arm_ready)
 			continue;
 
-		if (down_interruptible(&av7110->dcomlock))
+		if (mutex_lock_interruptible(&av7110->dcomlock))
 			break;
 		newloops = rdebi(av7110, DEBINOSWAP, STATUS_LOOPS, 0, 2);
-		up(&av7110->dcomlock);
+		mutex_unlock(&av7110->dcomlock);
 
 		if (newloops == av7110->arm_loops || av7110->arm_errors > 3) {
 			printk(KERN_ERR "dvb-ttpci: ARM crashed @ card %d\n",
@@ -253,10 +252,10 @@ static int arm_thread(void *data)
 
 			recover_arm(av7110);
 
-			if (down_interruptible(&av7110->dcomlock))
+			if (mutex_lock_interruptible(&av7110->dcomlock))
 				break;
 			newloops = rdebi(av7110, DEBINOSWAP, STATUS_LOOPS, 0, 2) - 1;
-			up(&av7110->dcomlock);
+			mutex_unlock(&av7110->dcomlock);
 		}
 		av7110->arm_loops = newloops;
 		av7110->arm_errors = 0;
@@ -741,7 +740,7 @@ int ChangePIDs(struct av7110 *av7110, u16 vpid, u16 apid, u16 ttpid,
 	int ret = 0;
 	dprintk(4, "%p\n", av7110);
 
-	if (down_interruptible(&av7110->pid_mutex))
+	if (mutex_lock_interruptible(&av7110->pid_mutex))
 		return -ERESTARTSYS;
 
 	if (!(vpid & 0x8000))
@@ -760,7 +759,7 @@ int ChangePIDs(struct av7110 *av7110, u16 vpid, u16 apid, u16 ttpid,
 		ret = SetPIDs(av7110, vpid, apid, ttpid, subpid, pcrpid);
 	}
 
-	up(&av7110->pid_mutex);
+	mutex_unlock(&av7110->pid_mutex);
 	return ret;
 }
 
@@ -2096,7 +2095,7 @@ static int av7110_fe_lock_fix(struct av7110* av7110, fe_status_t status)
 	if (av7110->playing)
 		return 0;
 
-	if (down_interruptible(&av7110->pid_mutex))
+	if (mutex_lock_interruptible(&av7110->pid_mutex))
 		return -ERESTARTSYS;
 
 	if (synced) {
@@ -2118,7 +2117,7 @@ static int av7110_fe_lock_fix(struct av7110* av7110, fe_status_t status)
 	if (!ret)
 		av7110->fe_synced = synced;
 
-	up(&av7110->pid_mutex);
+	mutex_unlock(&av7110->pid_mutex);
 	return ret;
 }
 
@@ -2713,16 +2712,16 @@ static int av7110_attach(struct saa7146_dev* dev, struct saa7146_pci_extension_d
 	tasklet_init (&av7110->debi_tasklet, debiirq, (unsigned long) av7110);
 	tasklet_init (&av7110->gpio_tasklet, gpioirq, (unsigned long) av7110);
 
-	sema_init(&av7110->pid_mutex, 1);
+	mutex_init(&av7110->pid_mutex);
 
 	/* locks for data transfers from/to AV7110 */
 	spin_lock_init(&av7110->debilock);
-	sema_init(&av7110->dcomlock, 1);
+	mutex_init(&av7110->dcomlock);
 	av7110->debitype = -1;
 
 	/* default OSD window */
 	av7110->osdwin = 1;
-	sema_init(&av7110->osd_sema, 1);
+	mutex_init(&av7110->osd_mutex);
 
 	/* ARM "watchdog" */
 	init_waitqueue_head(&av7110->arm_wait);
diff --git a/drivers/media/dvb/ttpci/av7110.h b/drivers/media/dvb/ttpci/av7110.h
index fafd25fab835..3e2e12124bae 100644
--- a/drivers/media/dvb/ttpci/av7110.h
+++ b/drivers/media/dvb/ttpci/av7110.h
@@ -16,6 +16,7 @@
 #include <linux/dvb/ca.h>
 #include <linux/dvb/osd.h>
 #include <linux/dvb/net.h>
+#include <linux/mutex.h>
 
 #include "dvbdev.h"
 #include "demux.h"
@@ -127,7 +128,7 @@ struct av7110 {
 	/* DEBI and polled command interface */
 
 	spinlock_t		debilock;
-	struct semaphore	dcomlock;
+	struct mutex		dcomlock;
 	volatile int		debitype;
 	volatile int		debilen;
 
@@ -146,7 +147,7 @@ struct av7110 {
 
 	int			osdwin;      /* currently active window */
 	u16			osdbpp[8];
-	struct semaphore	osd_sema;
+	struct mutex		osd_mutex;
 
 	/* CA */
 
@@ -172,7 +173,7 @@ struct av7110 {
 	struct tasklet_struct   vpe_tasklet;
 
 	int			fe_synced;
-	struct semaphore	pid_mutex;
+	struct mutex		pid_mutex;
 
 	int			video_blank;
 	struct video_status	videostate;
diff --git a/drivers/media/dvb/ttpci/av7110_hw.c b/drivers/media/dvb/ttpci/av7110_hw.c
index b2e63e9fc053..3c5366d15a8a 100644
--- a/drivers/media/dvb/ttpci/av7110_hw.c
+++ b/drivers/media/dvb/ttpci/av7110_hw.c
@@ -324,10 +324,10 @@ int av7110_wait_msgstate(struct av7110 *av7110, u16 flags)
 	start = jiffies;
 	for (;;) {
 		err = time_after(jiffies, start + ARM_WAIT_FREE);
-		if (down_interruptible(&av7110->dcomlock))
+		if (mutex_lock_interruptible(&av7110->dcomlock))
 			return -ERESTARTSYS;
 		stat = rdebi(av7110, DEBINOSWAP, MSGSTATE, 0, 2);
-		up(&av7110->dcomlock);
+		mutex_unlock(&av7110->dcomlock);
 		if ((stat & flags) == 0)
 			break;
 		if (err) {
@@ -484,11 +484,11 @@ static int av7110_send_fw_cmd(struct av7110 *av7110, u16* buf, int length)
 		dprintk(1, "arm not ready.\n");
 		return -1;
 	}
-	if (down_interruptible(&av7110->dcomlock))
+	if (mutex_lock_interruptible(&av7110->dcomlock))
 		return -ERESTARTSYS;
 
 	ret = __av7110_send_fw_cmd(av7110, buf, length);
-	up(&av7110->dcomlock);
+	mutex_unlock(&av7110->dcomlock);
 	if (ret && ret!=-ERESTARTSYS)
 		printk(KERN_ERR "dvb-ttpci: %s(): av7110_send_fw_cmd error %d\n",
 		       __FUNCTION__, ret);
@@ -560,11 +560,11 @@ int av7110_fw_request(struct av7110 *av7110, u16 *request_buf,
 		return -1;
 	}
 
-	if (down_interruptible(&av7110->dcomlock))
+	if (mutex_lock_interruptible(&av7110->dcomlock))
 		return -ERESTARTSYS;
 
 	if ((err = __av7110_send_fw_cmd(av7110, request_buf, request_buf_len)) < 0) {
-		up(&av7110->dcomlock);
+		mutex_unlock(&av7110->dcomlock);
 		printk(KERN_ERR "dvb-ttpci: av7110_fw_request error %d\n", err);
 		return err;
 	}
@@ -576,7 +576,7 @@ int av7110_fw_request(struct av7110 *av7110, u16 *request_buf,
 			break;
 		if (err) {
 			printk(KERN_ERR "%s: timeout waiting for COMMAND to complete\n", __FUNCTION__);
-			up(&av7110->dcomlock);
+			mutex_unlock(&av7110->dcomlock);
 			return -ETIMEDOUT;
 		}
 #ifdef _NOHANDSHAKE
@@ -592,7 +592,7 @@ int av7110_fw_request(struct av7110 *av7110, u16 *request_buf,
 			break;
 		if (err) {
 			printk(KERN_ERR "%s: timeout waiting for HANDSHAKE_REG\n", __FUNCTION__);
-			up(&av7110->dcomlock);
+			mutex_unlock(&av7110->dcomlock);
 			return -ETIMEDOUT;
 		}
 		msleep(1);
@@ -603,12 +603,12 @@ int av7110_fw_request(struct av7110 *av7110, u16 *request_buf,
 	stat = rdebi(av7110, DEBINOSWAP, MSGSTATE, 0, 2);
 	if (stat & GPMQOver) {
 		printk(KERN_ERR "%s: GPMQOver\n", __FUNCTION__);
-		up(&av7110->dcomlock);
+		mutex_unlock(&av7110->dcomlock);
 		return -1;
 	}
 	else if (stat & OSDQOver) {
 		printk(KERN_ERR "%s: OSDQOver\n", __FUNCTION__);
-		up(&av7110->dcomlock);
+		mutex_unlock(&av7110->dcomlock);
 		return -1;
 	}
 #endif
@@ -616,7 +616,7 @@ int av7110_fw_request(struct av7110 *av7110, u16 *request_buf,
 	for (i = 0; i < reply_buf_len; i++)
 		reply_buf[i] = rdebi(av7110, DEBINOSWAP, COM_BUFF + 2 * i, 0, 2);
 
-	up(&av7110->dcomlock);
+	mutex_unlock(&av7110->dcomlock);
 	return 0;
 }
 
@@ -732,7 +732,7 @@ static int FlushText(struct av7110 *av7110)
 	unsigned long start;
 	int err;
 
-	if (down_interruptible(&av7110->dcomlock))
+	if (mutex_lock_interruptible(&av7110->dcomlock))
 		return -ERESTARTSYS;
 	start = jiffies;
 	while (1) {
@@ -742,12 +742,12 @@ static int FlushText(struct av7110 *av7110)
 		if (err) {
 			printk(KERN_ERR "dvb-ttpci: %s(): timeout waiting for BUFF1_BASE == 0\n",
 			       __FUNCTION__);
-			up(&av7110->dcomlock);
+			mutex_unlock(&av7110->dcomlock);
 			return -ETIMEDOUT;
 		}
 		msleep(1);
 	}
-	up(&av7110->dcomlock);
+	mutex_unlock(&av7110->dcomlock);
 	return 0;
 }
 
@@ -758,7 +758,7 @@ static int WriteText(struct av7110 *av7110, u8 win, u16 x, u16 y, u8* buf)
 	int length = strlen(buf) + 1;
 	u16 cbuf[5] = { (COMTYPE_OSD << 8) + DText, 3, win, x, y };
 
-	if (down_interruptible(&av7110->dcomlock))
+	if (mutex_lock_interruptible(&av7110->dcomlock))
 		return -ERESTARTSYS;
 
 	start = jiffies;
@@ -769,7 +769,7 @@ static int WriteText(struct av7110 *av7110, u8 win, u16 x, u16 y, u8* buf)
 		if (ret) {
 			printk(KERN_ERR "dvb-ttpci: %s: timeout waiting for BUFF1_BASE == 0\n",
 			       __FUNCTION__);
-			up(&av7110->dcomlock);
+			mutex_unlock(&av7110->dcomlock);
 			return -ETIMEDOUT;
 		}
 		msleep(1);
@@ -783,7 +783,7 @@ static int WriteText(struct av7110 *av7110, u8 win, u16 x, u16 y, u8* buf)
 		if (ret) {
 			printk(KERN_ERR "dvb-ttpci: %s: timeout waiting for HANDSHAKE_REG\n",
 			       __FUNCTION__);
-			up(&av7110->dcomlock);
+			mutex_unlock(&av7110->dcomlock);
 			return -ETIMEDOUT;
 		}
 		msleep(1);
@@ -795,7 +795,7 @@ static int WriteText(struct av7110 *av7110, u8 win, u16 x, u16 y, u8* buf)
 	if (length & 1)
 		wdebi(av7110, DEBINOSWAP, BUFF1_BASE + i * 2, 0, 2);
 	ret = __av7110_send_fw_cmd(av7110, cbuf, 5);
-	up(&av7110->dcomlock);
+	mutex_unlock(&av7110->dcomlock);
 	if (ret && ret!=-ERESTARTSYS)
 		printk(KERN_ERR "dvb-ttpci: WriteText error %d\n", ret);
 	return ret;
@@ -1059,7 +1059,7 @@ int av7110_osd_cmd(struct av7110 *av7110, osd_cmd_t *dc)
 {
 	int ret;
 
-	if (down_interruptible(&av7110->osd_sema))
+	if (mutex_lock_interruptible(&av7110->osd_mutex))
 		return -ERESTARTSYS;
 
 	switch (dc->cmd) {
@@ -1195,7 +1195,7 @@ int av7110_osd_cmd(struct av7110 *av7110, osd_cmd_t *dc)
 		break;
 	}
 
-	up(&av7110->osd_sema);
+	mutex_unlock(&av7110->osd_mutex);
 	if (ret==-ERESTARTSYS)
 		dprintk(1, "av7110_osd_cmd(%d) returns with -ERESTARTSYS\n",dc->cmd);
 	else if (ret)
diff --git a/drivers/media/dvb/ttpci/budget.h b/drivers/media/dvb/ttpci/budget.h
index c7bb63c4d98d..4ac0f4d08025 100644
--- a/drivers/media/dvb/ttpci/budget.h
+++ b/drivers/media/dvb/ttpci/budget.h
@@ -10,6 +10,8 @@
 #include "dvb_net.h"
 
 #include <linux/module.h>
+#include <linux/mutex.h>
+
 #include <media/saa7146.h>
 
 extern int budget_debug;
@@ -51,7 +53,7 @@ struct budget {
 	struct dmx_frontend mem_frontend;
 
 	int fe_synced;
-	struct semaphore pid_mutex;
+	struct mutex pid_mutex;
 
 	int ci_present;
 	int video_port;
diff --git a/drivers/media/dvb/ttusb-budget/dvb-ttusb-budget.c b/drivers/media/dvb/ttusb-budget/dvb-ttusb-budget.c
index 5a13c4744f61..ecb15d4ecf81 100644
--- a/drivers/media/dvb/ttusb-budget/dvb-ttusb-budget.c
+++ b/drivers/media/dvb/ttusb-budget/dvb-ttusb-budget.c
@@ -19,7 +19,7 @@
 #include <linux/time.h>
 #include <linux/errno.h>
 #include <linux/jiffies.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #include "dvb_frontend.h"
 #include "dmxdev.h"
@@ -35,7 +35,6 @@
 #include <linux/dvb/dmx.h>
 #include <linux/pci.h>
 
-
 /*
   TTUSB_HWSECTIONS:
     the DSP supports filtering in hardware, however, since the "muxstream"
@@ -83,8 +82,8 @@ struct ttusb {
 	struct dvb_net dvbnet;
 
 	/* and one for USB access. */
-	struct semaphore semi2c;
-	struct semaphore semusb;
+	struct mutex semi2c;
+	struct mutex semusb;
 
 	struct dvb_adapter adapter;
 	struct usb_device *dev;
@@ -150,7 +149,7 @@ static int ttusb_cmd(struct ttusb *ttusb,
 	printk("\n");
 #endif
 
-	if (down_interruptible(&ttusb->semusb) < 0)
+	if (mutex_lock_interruptible(&ttusb->semusb) < 0)
 		return -EAGAIN;
 
 	err = usb_bulk_msg(ttusb->dev, ttusb->bulk_out_pipe,
@@ -158,13 +157,13 @@ static int ttusb_cmd(struct ttusb *ttusb,
 	if (err != 0) {
 		dprintk("%s: usb_bulk_msg(send) failed, err == %i!\n",
 			__FUNCTION__, err);
-		up(&ttusb->semusb);
+		mutex_unlock(&ttusb->semusb);
 		return err;
 	}
 	if (actual_len != len) {
 		dprintk("%s: only wrote %d of %d bytes\n", __FUNCTION__,
 			actual_len, len);
-		up(&ttusb->semusb);
+		mutex_unlock(&ttusb->semusb);
 		return -1;
 	}
 
@@ -174,7 +173,7 @@ static int ttusb_cmd(struct ttusb *ttusb,
 	if (err != 0) {
 		printk("%s: failed, receive error %d\n", __FUNCTION__,
 		       err);
-		up(&ttusb->semusb);
+		mutex_unlock(&ttusb->semusb);
 		return err;
 	}
 #if DEBUG >= 3
@@ -185,14 +184,14 @@ static int ttusb_cmd(struct ttusb *ttusb,
 	printk("\n");
 #endif
 	if (!needresult)
-		up(&ttusb->semusb);
+		mutex_unlock(&ttusb->semusb);
 	return 0;
 }
 
 static int ttusb_result(struct ttusb *ttusb, u8 * data, int len)
 {
 	memcpy(data, ttusb->last_result, len);
-	up(&ttusb->semusb);
+	mutex_unlock(&ttusb->semusb);
 	return 0;
 }
 
@@ -250,7 +249,7 @@ static int master_xfer(struct i2c_adapter* adapter, struct i2c_msg *msg, int num
 	int i = 0;
 	int inc;
 
-	if (down_interruptible(&ttusb->semi2c) < 0)
+	if (mutex_lock_interruptible(&ttusb->semi2c) < 0)
 		return -EAGAIN;
 
 	while (i < num) {
@@ -284,7 +283,7 @@ static int master_xfer(struct i2c_adapter* adapter, struct i2c_msg *msg, int num
 		i += inc;
 	}
 
-	up(&ttusb->semi2c);
+	mutex_unlock(&ttusb->semi2c);
 	return i;
 }
 
@@ -1495,8 +1494,11 @@ static int ttusb_probe(struct usb_interface *intf, const struct usb_device_id *i
 	ttusb->dev = udev;
 	ttusb->c = 0;
 	ttusb->mux_state = 0;
-	sema_init(&ttusb->semi2c, 0);
-	sema_init(&ttusb->semusb, 1);
+	mutex_init(&ttusb->semi2c);
+
+	mutex_lock(&ttusb->semi2c);
+
+	mutex_init(&ttusb->semusb);
 
 	ttusb_setup_interfaces(ttusb);
 
@@ -1504,7 +1506,7 @@ static int ttusb_probe(struct usb_interface *intf, const struct usb_device_id *i
 	if (ttusb_init_controller(ttusb))
 		printk("ttusb_init_controller: error\n");
 
-	up(&ttusb->semi2c);
+	mutex_unlock(&ttusb->semi2c);
 
 	dvb_register_adapter(&ttusb->adapter, "Technotrend/Hauppauge Nova-USB", THIS_MODULE);
 	ttusb->adapter.priv = ttusb;
diff --git a/drivers/media/dvb/ttusb-dec/ttusb_dec.c b/drivers/media/dvb/ttusb-dec/ttusb_dec.c
index df831171e03c..44dea3211848 100644
--- a/drivers/media/dvb/ttusb-dec/ttusb_dec.c
+++ b/drivers/media/dvb/ttusb-dec/ttusb_dec.c
@@ -20,7 +20,8 @@
  *
  */
 
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
+
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
@@ -115,7 +116,7 @@ struct ttusb_dec {
 	unsigned int			out_pipe;
 	unsigned int			irq_pipe;
 	enum ttusb_dec_interface	interface;
-	struct semaphore		usb_sem;
+	struct mutex			usb_mutex;
 
 	void			*irq_buffer;
 	struct urb		*irq_urb;
@@ -124,7 +125,7 @@ struct ttusb_dec {
 	dma_addr_t		iso_dma_handle;
 	struct urb		*iso_urb[ISO_BUF_COUNT];
 	int			iso_stream_count;
-	struct semaphore	iso_sem;
+	struct mutex		iso_mutex;
 
 	u8				packet[MAX_PVA_LENGTH + 4];
 	enum ttusb_dec_packet_type	packet_type;
@@ -273,9 +274,9 @@ static int ttusb_dec_send_command(struct ttusb_dec *dec, const u8 command,
 	if (!b)
 		return -ENOMEM;
 
-	if ((result = down_interruptible(&dec->usb_sem))) {
+	if ((result = mutex_lock_interruptible(&dec->usb_mutex))) {
 		kfree(b);
-		printk("%s: Failed to down usb semaphore.\n", __FUNCTION__);
+		printk("%s: Failed to lock usb mutex.\n", __FUNCTION__);
 		return result;
 	}
 
@@ -300,7 +301,7 @@ static int ttusb_dec_send_command(struct ttusb_dec *dec, const u8 command,
 	if (result) {
 		printk("%s: command bulk message failed: error %d\n",
 		       __FUNCTION__, result);
-		up(&dec->usb_sem);
+		mutex_unlock(&dec->usb_mutex);
 		kfree(b);
 		return result;
 	}
@@ -311,7 +312,7 @@ static int ttusb_dec_send_command(struct ttusb_dec *dec, const u8 command,
 	if (result) {
 		printk("%s: result bulk message failed: error %d\n",
 		       __FUNCTION__, result);
-		up(&dec->usb_sem);
+		mutex_unlock(&dec->usb_mutex);
 		kfree(b);
 		return result;
 	} else {
@@ -327,7 +328,7 @@ static int ttusb_dec_send_command(struct ttusb_dec *dec, const u8 command,
 		if (cmd_result && b[3] > 0)
 			memcpy(cmd_result, &b[4], b[3]);
 
-		up(&dec->usb_sem);
+		mutex_unlock(&dec->usb_mutex);
 
 		kfree(b);
 		return 0;
@@ -835,7 +836,7 @@ static void ttusb_dec_stop_iso_xfer(struct ttusb_dec *dec)
 
 	dprintk("%s\n", __FUNCTION__);
 
-	if (down_interruptible(&dec->iso_sem))
+	if (mutex_lock_interruptible(&dec->iso_mutex))
 		return;
 
 	dec->iso_stream_count--;
@@ -845,7 +846,7 @@ static void ttusb_dec_stop_iso_xfer(struct ttusb_dec *dec)
 			usb_kill_urb(dec->iso_urb[i]);
 	}
 
-	up(&dec->iso_sem);
+	mutex_unlock(&dec->iso_mutex);
 }
 
 /* Setting the interface of the DEC tends to take down the USB communications
@@ -890,7 +891,7 @@ static int ttusb_dec_start_iso_xfer(struct ttusb_dec *dec)
 
 	dprintk("%s\n", __FUNCTION__);
 
-	if (down_interruptible(&dec->iso_sem))
+	if (mutex_lock_interruptible(&dec->iso_mutex))
 		return -EAGAIN;
 
 	if (!dec->iso_stream_count) {
@@ -911,7 +912,7 @@ static int ttusb_dec_start_iso_xfer(struct ttusb_dec *dec)
 					i--;
 				}
 
-				up(&dec->iso_sem);
+				mutex_unlock(&dec->iso_mutex);
 				return result;
 			}
 		}
@@ -919,7 +920,7 @@ static int ttusb_dec_start_iso_xfer(struct ttusb_dec *dec)
 
 	dec->iso_stream_count++;
 
-	up(&dec->iso_sem);
+	mutex_unlock(&dec->iso_mutex);
 
 	return 0;
 }
@@ -1229,8 +1230,8 @@ static int ttusb_dec_init_usb(struct ttusb_dec *dec)
 {
 	dprintk("%s\n", __FUNCTION__);
 
-	sema_init(&dec->usb_sem, 1);
-	sema_init(&dec->iso_sem, 1);
+	mutex_init(&dec->usb_mutex);
+	mutex_init(&dec->iso_mutex);
 
 	dec->command_pipe = usb_sndbulkpipe(dec->udev, COMMAND_PIPE);
 	dec->result_pipe = usb_rcvbulkpipe(dec->udev, RESULT_PIPE);
diff --git a/drivers/media/radio/miropcm20-rds-core.c b/drivers/media/radio/miropcm20-rds-core.c
index a917a90cb5dc..b602c73e2309 100644
--- a/drivers/media/radio/miropcm20-rds-core.c
+++ b/drivers/media/radio/miropcm20-rds-core.c
@@ -18,14 +18,15 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
+
 #include <asm/io.h>
 #include "../../../sound/oss/aci.h"
 #include "miropcm20-rds-core.h"
 
 #define DEBUG 0
 
-static struct semaphore aci_rds_sem;
+static struct mutex aci_rds_mutex;
 
 #define RDS_DATASHIFT          2   /* Bit 2 */
 #define RDS_DATAMASK        (1 << RDS_DATASHIFT)
@@ -181,7 +182,7 @@ int aci_rds_cmd(unsigned char cmd, unsigned char databuffer[], int datasize)
 {
 	int ret;
 
-	if (down_interruptible(&aci_rds_sem))
+	if (mutex_lock_interruptible(&aci_rds_mutex))
 		return -EINTR;
 
 	rds_write(cmd);
@@ -192,7 +193,7 @@ int aci_rds_cmd(unsigned char cmd, unsigned char databuffer[], int datasize)
 	else
 		ret = 0;
 
-	up(&aci_rds_sem);
+	mutex_unlock(&aci_rds_mutex);
 	
 	return ret;
 }
@@ -200,7 +201,7 @@ EXPORT_SYMBOL(aci_rds_cmd);
 
 int __init attach_aci_rds(void)
 {
-	init_MUTEX(&aci_rds_sem);
+	mutex_init(&aci_rds_mutex);
 	return 0;
 }
 
diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c
index 914deab4e044..557fb5c4af38 100644
--- a/drivers/media/radio/radio-aimslab.c
+++ b/drivers/media/radio/radio-aimslab.c
@@ -43,7 +43,7 @@
 
 static int io = CONFIG_RADIO_RTRACK_PORT; 
 static int radio_nr = -1;
-static struct semaphore lock;
+static struct mutex lock;
 
 struct rt_device
 {
@@ -83,23 +83,23 @@ static void rt_incvol(void)
 static void rt_mute(struct rt_device *dev)
 {
 	dev->muted = 1;
-	down(&lock);
+	mutex_lock(&lock);
 	outb(0xd0, io);			/* volume steady, off		*/
-	up(&lock);
+	mutex_unlock(&lock);
 }
 
 static int rt_setvol(struct rt_device *dev, int vol)
 {
 	int i;
 
-	down(&lock);
+	mutex_lock(&lock);
 	
 	if(vol == dev->curvol) {	/* requested volume = current */
 		if (dev->muted) {	/* user is unmuting the card  */
 			dev->muted = 0;
 			outb (0xd8, io);	/* enable card */
 		}	
-		up(&lock);
+		mutex_unlock(&lock);
 		return 0;
 	}
 
@@ -108,7 +108,7 @@ static int rt_setvol(struct rt_device *dev, int vol)
 		sleep_delay(2000000);	/* make sure it's totally down	*/
 		outb(0xd0, io);		/* volume steady, off		*/
 		dev->curvol = 0;	/* track the volume state!	*/
-		up(&lock);
+		mutex_unlock(&lock);
 		return 0;
 	}
 
@@ -121,7 +121,7 @@ static int rt_setvol(struct rt_device *dev, int vol)
 			rt_decvol();
 
 	dev->curvol = vol;
-	up(&lock);
+	mutex_unlock(&lock);
 	return 0;
 }
 
@@ -168,7 +168,7 @@ static int rt_setfreq(struct rt_device *dev, unsigned long freq)
 	freq += 171200;			/* Add 10.7 MHz IF 		*/
 	freq /= 800;			/* Convert to 50 kHz units	*/
 	
-	down(&lock);			/* Stop other ops interfering */
+	mutex_lock(&lock);			/* Stop other ops interfering */
 	 
 	send_0_byte (io, dev);		/*  0: LSB of frequency		*/
 
@@ -196,7 +196,7 @@ static int rt_setfreq(struct rt_device *dev, unsigned long freq)
 	else
 		outb (0xd8, io);	/* volume steady + sigstr + on */
 		
-	up(&lock);
+	mutex_unlock(&lock);
 
 	return 0;
 }
@@ -337,7 +337,7 @@ static int __init rtrack_init(void)
 
 	/* Set up the I/O locking */
 	
-	init_MUTEX(&lock);
+	mutex_init(&lock);
 	
  	/* mute card - prevents noisy bootups */
 
diff --git a/drivers/media/radio/radio-aztech.c b/drivers/media/radio/radio-aztech.c
index 523be820f9c6..83bdae23417d 100644
--- a/drivers/media/radio/radio-aztech.c
+++ b/drivers/media/radio/radio-aztech.c
@@ -42,7 +42,7 @@
 static int io = CONFIG_RADIO_AZTECH_PORT; 
 static int radio_nr = -1;
 static int radio_wait_time = 1000;
-static struct semaphore lock;
+static struct mutex lock;
 
 struct az_device
 {
@@ -87,9 +87,9 @@ static void send_1_byte (struct az_device *dev)
 
 static int az_setvol(struct az_device *dev, int vol)
 {
-	down(&lock);
+	mutex_lock(&lock);
 	outb (volconvert(vol), io);
-	up(&lock);
+	mutex_unlock(&lock);
 	return 0;
 }
 
@@ -122,7 +122,7 @@ static int az_setfreq(struct az_device *dev, unsigned long frequency)
 	frequency += 171200;		/* Add 10.7 MHz IF		*/
 	frequency /= 800;		/* Convert to 50 kHz units	*/
 					
-	down(&lock);
+	mutex_lock(&lock);
 	
 	send_0_byte (dev);		/*  0: LSB of frequency       */
 
@@ -152,7 +152,7 @@ static int az_setfreq(struct az_device *dev, unsigned long frequency)
 	udelay (radio_wait_time);
 	outb_p(128+64+volconvert(dev->curvol), io);
 	
-	up(&lock);
+	mutex_unlock(&lock);
 
 	return 0;
 }
@@ -283,7 +283,7 @@ static int __init aztech_init(void)
 		return -EBUSY;
 	}
 
-	init_MUTEX(&lock);
+	mutex_init(&lock);
 	aztech_radio.priv=&aztech_unit;
 	
 	if(video_register_device(&aztech_radio, VFL_TYPE_RADIO, radio_nr)==-1)
diff --git a/drivers/media/radio/radio-maestro.c b/drivers/media/radio/radio-maestro.c
index 36c9f5bf8cdd..39c1d9118636 100644
--- a/drivers/media/radio/radio-maestro.c
+++ b/drivers/media/radio/radio-maestro.c
@@ -23,10 +23,11 @@
 #include <linux/sched.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/videodev.h>
 
+
 #define DRIVER_VERSION	"0.05"
 
 #define GPIO_DATA	0x60   /* port offset from ESS_IO_BASE */
@@ -104,7 +105,7 @@ struct radio_device {
 		muted,	/* VIDEO_AUDIO_MUTE */
 		stereo,	/* VIDEO_TUNER_STEREO_ON */	
 		tuned;	/* signal strength (0 or 0xffff) */
-	struct	semaphore lock;
+	struct mutex lock;
 };
 
 static u32 radio_bits_get(struct radio_device *dev)
@@ -258,9 +259,9 @@ static int radio_ioctl(struct inode *inode, struct file *file,
 	struct radio_device *card = video_get_drvdata(dev);
 	int ret;
 
-	down(&card->lock);
+	mutex_lock(&card->lock);
 	ret = video_usercopy(inode, file, cmd, arg, radio_function);
-	up(&card->lock);
+	mutex_unlock(&card->lock);
 
 	return ret;
 }
@@ -311,7 +312,7 @@ static int __devinit maestro_probe(struct pci_dev *pdev,
 	}
 
 	radio_unit->io = pci_resource_start(pdev, 0) + GPIO_DATA;
-	init_MUTEX(&radio_unit->lock);
+	mutex_init(&radio_unit->lock);
 
 	maestro_radio_inst = video_device_alloc();
 	if (maestro_radio_inst == NULL) {
diff --git a/drivers/media/radio/radio-maxiradio.c b/drivers/media/radio/radio-maxiradio.c
index c975ddd86cd5..f0bf47bcb64c 100644
--- a/drivers/media/radio/radio-maxiradio.c
+++ b/drivers/media/radio/radio-maxiradio.c
@@ -37,7 +37,8 @@
 #include <linux/sched.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
+
 #include <linux/pci.h>
 #include <linux/videodev.h>
 
@@ -101,7 +102,7 @@ static struct radio_device
 		
 	unsigned long freq;
 	
-	struct  semaphore lock;
+	struct mutex lock;
 } radio_unit = {0, 0, 0, 0, };
 
 
@@ -267,9 +268,9 @@ static int radio_ioctl(struct inode *inode, struct file *file,
 	struct radio_device *card=dev->priv;
 	int ret;
 	
-	down(&card->lock);
+	mutex_lock(&card->lock);
 	ret = video_usercopy(inode, file, cmd, arg, radio_function);
-	up(&card->lock);
+	mutex_unlock(&card->lock);
 	return ret;
 }
 
@@ -290,7 +291,7 @@ static int __devinit maxiradio_init_one(struct pci_dev *pdev, const struct pci_d
 	        goto err_out_free_region;
 
 	radio_unit.io = pci_resource_start(pdev, 0);
-	init_MUTEX(&radio_unit.lock);
+	mutex_init(&radio_unit.lock);
 	maxiradio_radio.priv = &radio_unit;
 
 	if(video_register_device(&maxiradio_radio, VFL_TYPE_RADIO, radio_nr)==-1) {
diff --git a/drivers/media/radio/radio-sf16fmi.c b/drivers/media/radio/radio-sf16fmi.c
index 0229f792a059..53073b424107 100644
--- a/drivers/media/radio/radio-sf16fmi.c
+++ b/drivers/media/radio/radio-sf16fmi.c
@@ -24,7 +24,7 @@
 #include <linux/isapnp.h>
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 struct fmi_device
 {
@@ -37,7 +37,7 @@ struct fmi_device
 static int io = -1; 
 static int radio_nr = -1;
 static struct pnp_dev *dev = NULL;
-static struct semaphore lock;
+static struct mutex lock;
 
 /* freq is in 1/16 kHz to internal number, hw precision is 50 kHz */
 /* It is only useful to give freq in intervall of 800 (=0.05Mhz),
@@ -68,16 +68,16 @@ static void outbits(int bits, unsigned int data, int port)
 
 static inline void fmi_mute(int port)
 {
-	down(&lock);
+	mutex_lock(&lock);
 	outb(0x00, port);
-	up(&lock);
+	mutex_unlock(&lock);
 }
 
 static inline void fmi_unmute(int port)
 {
-	down(&lock);
+	mutex_lock(&lock);
 	outb(0x08, port);
-	up(&lock);
+	mutex_unlock(&lock);
 }
 
 static inline int fmi_setfreq(struct fmi_device *dev)
@@ -85,12 +85,12 @@ static inline int fmi_setfreq(struct fmi_device *dev)
 	int myport = dev->port;
 	unsigned long freq = dev->curfreq;
 
-	down(&lock);
+	mutex_lock(&lock);
 
 	outbits(16, RSF16_ENCODE(freq), myport);
 	outbits(8, 0xC0, myport);
 	msleep(143);		/* was schedule_timeout(HZ/7) */
-	up(&lock);
+	mutex_unlock(&lock);
 	if (dev->curvol) fmi_unmute(myport);
 	return 0;
 }
@@ -102,7 +102,7 @@ static inline int fmi_getsigstr(struct fmi_device *dev)
 	int myport = dev->port;
 
 	
-	down(&lock);
+	mutex_lock(&lock);
 	val = dev->curvol ? 0x08 : 0x00;	/* unmute/mute */
 	outb(val, myport);
 	outb(val | 0x10, myport);
@@ -110,7 +110,7 @@ static inline int fmi_getsigstr(struct fmi_device *dev)
 	res = (int)inb(myport+1);
 	outb(val, myport);
 	
-	up(&lock);
+	mutex_unlock(&lock);
 	return (res & 2) ? 0 : 0xFFFF;
 }
 
@@ -296,7 +296,7 @@ static int __init fmi_init(void)
 	fmi_unit.flags = VIDEO_TUNER_LOW;
 	fmi_radio.priv = &fmi_unit;
 	
-	init_MUTEX(&lock);
+	mutex_init(&lock);
 	
 	if (video_register_device(&fmi_radio, VFL_TYPE_RADIO, radio_nr) == -1) {
 		release_region(io, 2);
diff --git a/drivers/media/radio/radio-sf16fmr2.c b/drivers/media/radio/radio-sf16fmr2.c
index 099ffb3b9c71..bcebd8cb19ad 100644
--- a/drivers/media/radio/radio-sf16fmr2.c
+++ b/drivers/media/radio/radio-sf16fmr2.c
@@ -19,9 +19,9 @@
 #include <asm/io.h>		/* outb, outb_p			*/
 #include <asm/uaccess.h>	/* copy to/from user		*/
 #include <linux/videodev.h>	/* kernel radio structs		*/
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
-static struct semaphore lock;
+static struct mutex lock;
 
 #undef DEBUG
 //#define DEBUG 1
@@ -238,9 +238,9 @@ static int fmr2_do_ioctl(struct inode *inode, struct file *file,
 			if (fmr2->mute)
 				v->flags |= VIDEO_AUDIO_MUTE;
 			v->mode=VIDEO_MODE_AUTO;
-			down(&lock);
+			mutex_lock(&lock);
 			v->signal = fmr2_getsigstr(fmr2);
-			up(&lock);
+			mutex_unlock(&lock);
 			return 0;
 		}
 		case VIDIOCSTUNER:
@@ -274,9 +274,9 @@ static int fmr2_do_ioctl(struct inode *inode, struct file *file,
 			/* set card freq (if not muted) */
 			if (fmr2->curvol && !fmr2->mute)
 			{
-				down(&lock);
+				mutex_lock(&lock);
 				fmr2_setfreq(fmr2);
-				up(&lock);
+				mutex_unlock(&lock);
 			}
 			return 0;
 		}
@@ -318,14 +318,14 @@ static int fmr2_do_ioctl(struct inode *inode, struct file *file,
 			else
 				printk(KERN_DEBUG "mute\n");
 #endif
-			down(&lock);
+			mutex_lock(&lock);
 			if (fmr2->curvol && !fmr2->mute)
 			{
 				fmr2_setvolume(fmr2);
 				fmr2_setfreq(fmr2);
 			}
 			else fmr2_mute(fmr2->port);
-			up(&lock);
+			mutex_unlock(&lock);
 			return 0;
 		}
 		case VIDIOCGUNIT:
@@ -380,7 +380,7 @@ static int __init fmr2_init(void)
 	fmr2_unit.card_type = 0;
 	fmr2_radio.priv = &fmr2_unit;
 
-	init_MUTEX(&lock);
+	mutex_init(&lock);
 
 	if (request_region(io, 2, "sf16fmr2"))
 	{
@@ -397,10 +397,10 @@ static int __init fmr2_init(void)
 	printk(KERN_INFO "SF16FMR2 radio card driver at 0x%x.\n", io);
 	debug_print((KERN_DEBUG "Mute %d Low %d\n",VIDEO_AUDIO_MUTE,VIDEO_TUNER_LOW));
 	/* mute card - prevents noisy bootups */
-	down(&lock);
+	mutex_lock(&lock);
 	fmr2_mute(io);
 	fmr2_product_info(&fmr2_unit);
-	up(&lock);
+	mutex_unlock(&lock);
 	debug_print((KERN_DEBUG "card_type %d\n", fmr2_unit.card_type));
 	return 0;
 }
diff --git a/drivers/media/radio/radio-typhoon.c b/drivers/media/radio/radio-typhoon.c
index 8ac9a8ef9094..e50955836d6b 100644
--- a/drivers/media/radio/radio-typhoon.c
+++ b/drivers/media/radio/radio-typhoon.c
@@ -59,7 +59,7 @@ struct typhoon_device {
 	int muted;
 	unsigned long curfreq;
 	unsigned long mutefreq;
-	struct semaphore lock;
+	struct mutex lock;
 };
 
 static void typhoon_setvol_generic(struct typhoon_device *dev, int vol);
@@ -77,12 +77,12 @@ static int typhoon_get_info(char *buf, char **start, off_t offset, int len);
 
 static void typhoon_setvol_generic(struct typhoon_device *dev, int vol)
 {
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	vol >>= 14;				/* Map 16 bit to 2 bit */
 	vol &= 3;
 	outb_p(vol / 2, dev->iobase);		/* Set the volume, high bit. */
 	outb_p(vol % 2, dev->iobase + 2);	/* Set the volume, low bit. */
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 }
 
 static int typhoon_setfreq_generic(struct typhoon_device *dev,
@@ -102,7 +102,7 @@ static int typhoon_setfreq_generic(struct typhoon_device *dev,
 	 *
 	 */
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	x = frequency / 160;
 	outval = (x * x + 2500) / 5000;
 	outval = (outval * x + 5000) / 10000;
@@ -112,7 +112,7 @@ static int typhoon_setfreq_generic(struct typhoon_device *dev,
 	outb_p((outval >> 8) & 0x01, dev->iobase + 4);
 	outb_p(outval >> 9, dev->iobase + 6);
 	outb_p(outval & 0xff, dev->iobase + 8);
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 
 	return 0;
 }
@@ -337,7 +337,7 @@ static int __init typhoon_init(void)
 #endif /* MODULE */
 
 	printk(KERN_INFO BANNER);
-	init_MUTEX(&typhoon_unit.lock);
+	mutex_init(&typhoon_unit.lock);
 	io = typhoon_unit.iobase;
 	if (!request_region(io, 8, "typhoon")) {
 		printk(KERN_ERR "radio-typhoon: port 0x%x already in use\n",
diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c
index d590e80c922e..7bf1a4264891 100644
--- a/drivers/media/radio/radio-zoltrix.c
+++ b/drivers/media/radio/radio-zoltrix.c
@@ -48,7 +48,7 @@ struct zol_device {
 	unsigned long curfreq;
 	int muted;
 	unsigned int stereo;
-	struct semaphore lock;
+	struct mutex lock;
 };
 
 static int zol_setvol(struct zol_device *dev, int vol)
@@ -57,30 +57,30 @@ static int zol_setvol(struct zol_device *dev, int vol)
 	if (dev->muted)
 		return 0;
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	if (vol == 0) {
 		outb(0, io);
 		outb(0, io);
 		inb(io + 3);    /* Zoltrix needs to be read to confirm */
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	}
 
 	outb(dev->curvol-1, io);
 	msleep(10);
 	inb(io + 2);
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 	return 0;
 }
 
 static void zol_mute(struct zol_device *dev)
 {
 	dev->muted = 1;
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	outb(0, io);
 	outb(0, io);
 	inb(io + 3);            /* Zoltrix needs to be read to confirm */
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 }
 
 static void zol_unmute(struct zol_device *dev)
@@ -104,7 +104,7 @@ static int zol_setfreq(struct zol_device *dev, unsigned long freq)
 	bitmask = 0xc480402c10080000ull;
 	i = 45;
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	
 	outb(0, io);
 	outb(0, io);
@@ -149,7 +149,7 @@ static int zol_setfreq(struct zol_device *dev, unsigned long freq)
 		udelay(1000);
 	}
 	
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 	
 	if(!dev->muted)
 	{
@@ -164,7 +164,7 @@ static int zol_getsigstr(struct zol_device *dev)
 {
 	int a, b;
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	outb(0x00, io);         /* This stuff I found to do nothing */
 	outb(dev->curvol, io);
 	msleep(20);
@@ -173,7 +173,7 @@ static int zol_getsigstr(struct zol_device *dev)
 	msleep(10);
 	b = inb(io);
 
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 	
 	if (a != b)
 		return (0);
@@ -188,7 +188,7 @@ static int zol_is_stereo (struct zol_device *dev)
 {
 	int x1, x2;
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	
 	outb(0x00, io);
 	outb(dev->curvol, io);
@@ -198,7 +198,7 @@ static int zol_is_stereo (struct zol_device *dev)
 	msleep(10);
 	x2 = inb(io);
 
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 	
 	if ((x1 == x2) && (x1 == 0xcf))
 		return 1;
@@ -350,7 +350,7 @@ static int __init zoltrix_init(void)
 	}
 	printk(KERN_INFO "Zoltrix Radio Plus card driver.\n");
 
-	init_MUTEX(&zoltrix_unit.lock);
+	mutex_init(&zoltrix_unit.lock);
 	
 	/* mute card - prevents noisy bootups */
 
diff --git a/drivers/media/video/arv.c b/drivers/media/video/arv.c
index 994b75fe165a..c586f64b6b7f 100644
--- a/drivers/media/video/arv.c
+++ b/drivers/media/video/arv.c
@@ -31,8 +31,8 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
+#include <linux/mutex.h>
 
-#include <asm/semaphore.h>
 #include <asm/uaccess.h>
 #include <asm/m32r.h>
 #include <asm/io.h>
@@ -117,7 +117,7 @@ struct ar_device {
 	int width, height;
 	int frame_bytes, line_bytes;
 	wait_queue_head_t wait;
-	struct semaphore lock;
+	struct mutex lock;
 };
 
 static int video_nr = -1;	/* video device number (first free) */
@@ -288,7 +288,7 @@ static ssize_t ar_read(struct file *file, char *buf, size_t count, loff_t *ppos)
 	if (ar->mode == AR_MODE_NORMAL)
 		arvcr1 |= ARVCR1_NORMAL;
 
-	down(&ar->lock);
+	mutex_lock(&ar->lock);
 
 #if USE_INT
 	local_irq_save(flags);
@@ -392,7 +392,7 @@ static ssize_t ar_read(struct file *file, char *buf, size_t count, loff_t *ppos)
 	}
 	DEBUG(1, "ret = %d\n", ret);
 out_up:
-	up(&ar->lock);
+	mutex_unlock(&ar->lock);
 	return ret;
 }
 
@@ -456,7 +456,7 @@ static int ar_do_ioctl(struct inode *inode, struct file *file,
 		    (w->width != AR_WIDTH_QVGA || w->height != AR_HEIGHT_QVGA))
 				return -EINVAL;
 
-		down(&ar->lock);
+		mutex_lock(&ar->lock);
 		ar->width = w->width;
 		ar->height = w->height;
 		if (ar->width == AR_WIDTH_VGA) {
@@ -473,7 +473,7 @@ static int ar_do_ioctl(struct inode *inode, struct file *file,
 			ar->line_bytes = AR_LINE_BYTES_QVGA;
 			ar->mode = AR_MODE_INTERLACE;
 		}
-		up(&ar->lock);
+		mutex_unlock(&ar->lock);
 		return 0;
 	}
 	case VIDIOCGFBUF:
@@ -734,7 +734,7 @@ static int ar_initialize(struct video_device *dev)
 void ar_release(struct video_device *vfd)
 {
 	struct ar_device *ar = vfd->priv;
-	down(&ar->lock);
+	mutex_lock(&ar->lock);
 	video_device_release(vfd);
 }
 
@@ -824,7 +824,7 @@ static int __init ar_init(void)
 		ar->line_bytes	= AR_LINE_BYTES_QVGA;
 		ar->mode	= AR_MODE_INTERLACE;
 	}
-	init_MUTEX(&ar->lock);
+	mutex_init(&ar->lock);
 	init_waitqueue_head(&ar->wait);
 
 #if USE_INT
diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c
index 578b20085082..c0415d6e7fee 100644
--- a/drivers/media/video/bttv-driver.c
+++ b/drivers/media/video/bttv-driver.c
@@ -1965,7 +1965,7 @@ static int setup_window(struct bttv_fh *fh, struct bttv *btv,
 		BUG();
 	}
 
-	down(&fh->cap.lock);
+	mutex_lock(&fh->cap.lock);
 		kfree(fh->ov.clips);
 	fh->ov.clips    = clips;
 	fh->ov.nclips   = n;
@@ -1986,7 +1986,7 @@ static int setup_window(struct bttv_fh *fh, struct bttv *btv,
 		bttv_overlay_risc(btv, &fh->ov, fh->ovfmt, new);
 		retval = bttv_switch_overlay(btv,fh,new);
 	}
-	up(&fh->cap.lock);
+	mutex_unlock(&fh->cap.lock);
 	return retval;
 }
 
@@ -2166,7 +2166,7 @@ static int bttv_s_fmt(struct bttv_fh *fh, struct bttv *btv,
 		fmt = format_by_fourcc(f->fmt.pix.pixelformat);
 
 		/* update our state informations */
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		fh->fmt              = fmt;
 		fh->cap.field        = f->fmt.pix.field;
 		fh->cap.last         = V4L2_FIELD_NONE;
@@ -2175,7 +2175,7 @@ static int bttv_s_fmt(struct bttv_fh *fh, struct bttv *btv,
 		btv->init.fmt        = fmt;
 		btv->init.width      = f->fmt.pix.width;
 		btv->init.height     = f->fmt.pix.height;
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 
 		return 0;
 	}
@@ -2282,7 +2282,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		fmt = format_by_palette(pic->palette);
 		if (NULL == fmt)
 			return -EINVAL;
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		if (fmt->depth != pic->depth) {
 			retval = -EINVAL;
 			goto fh_unlock_and_return;
@@ -2313,7 +2313,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		bt848_contrast(btv,pic->contrast);
 		bt848_hue(btv,pic->hue);
 		bt848_sat(btv,pic->colour);
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		return 0;
 	}
 
@@ -2379,7 +2379,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 			return -EPERM;
 		end = (unsigned long)fbuf->base +
 			fbuf->height * fbuf->bytesperline;
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		retval = -EINVAL;
 
 		switch (fbuf->depth) {
@@ -2417,7 +2417,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 			btv->fbuf.fmt.bytesperline = fbuf->bytesperline;
 		else
 			btv->fbuf.fmt.bytesperline = btv->fbuf.fmt.width*fbuf->depth/8;
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		return 0;
 	}
 
@@ -2440,7 +2440,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		if (!check_alloc_btres(btv,fh,RESOURCE_OVERLAY))
 			return -EBUSY;
 
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		if (*on) {
 			fh->ov.tvnorm = btv->tvnorm;
 			new = videobuf_alloc(sizeof(*new));
@@ -2451,7 +2451,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 
 		/* switch over */
 		retval = bttv_switch_overlay(btv,fh,new);
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		return retval;
 	}
 
@@ -2460,7 +2460,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		struct video_mbuf *mbuf = arg;
 		unsigned int i;
 
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		retval = videobuf_mmap_setup(&fh->cap,gbuffers,gbufsize,
 					     V4L2_MEMORY_MMAP);
 		if (retval < 0)
@@ -2470,7 +2470,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		mbuf->size   = gbuffers * gbufsize;
 		for (i = 0; i < gbuffers; i++)
 			mbuf->offsets[i] = i * gbufsize;
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		return 0;
 	}
 	case VIDIOCMCAPTURE:
@@ -2482,7 +2482,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		if (vm->frame >= VIDEO_MAX_FRAME)
 			return -EINVAL;
 
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		retval = -EINVAL;
 		buf = (struct bttv_buffer *)fh->cap.bufs[vm->frame];
 		if (NULL == buf)
@@ -2504,7 +2504,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		spin_lock_irqsave(&btv->s_lock,flags);
 		buffer_queue(&fh->cap,&buf->vb);
 		spin_unlock_irqrestore(&btv->s_lock,flags);
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		return 0;
 	}
 	case VIDIOCSYNC:
@@ -2515,7 +2515,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		if (*frame >= VIDEO_MAX_FRAME)
 			return -EINVAL;
 
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		retval = -EINVAL;
 		buf = (struct bttv_buffer *)fh->cap.bufs[*frame];
 		if (NULL == buf)
@@ -2535,7 +2535,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 			retval = -EINVAL;
 			break;
 		}
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		return retval;
 	}
 
@@ -2719,7 +2719,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 		if (0 == (fmt->flags & FORMAT_FLAGS_PACKED))
 			return -EINVAL;
 
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		retval = -EINVAL;
 		if (fb->flags & V4L2_FBUF_FLAG_OVERLAY) {
 			if (fb->fmt.width > bttv_tvnorms[btv->tvnorm].swidth)
@@ -2759,7 +2759,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 				retval = bttv_switch_overlay(btv,fh,new);
 			}
 		}
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		return retval;
 	}
 
@@ -2890,7 +2890,7 @@ static int bttv_do_ioctl(struct inode *inode, struct file *file,
 	return 0;
 
  fh_unlock_and_return:
-	up(&fh->cap.lock);
+	mutex_unlock(&fh->cap.lock);
 	return retval;
 }
 
@@ -2957,16 +2957,16 @@ static unsigned int bttv_poll(struct file *file, poll_table *wait)
 		buf = list_entry(fh->cap.stream.next,struct bttv_buffer,vb.stream);
 	} else {
 		/* read() capture */
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		if (NULL == fh->cap.read_buf) {
 			/* need to capture a new frame */
 			if (locked_btres(fh->btv,RESOURCE_VIDEO)) {
-				up(&fh->cap.lock);
+				mutex_unlock(&fh->cap.lock);
 				return POLLERR;
 			}
 			fh->cap.read_buf = videobuf_alloc(fh->cap.msize);
 			if (NULL == fh->cap.read_buf) {
-				up(&fh->cap.lock);
+				mutex_unlock(&fh->cap.lock);
 				return POLLERR;
 			}
 			fh->cap.read_buf->memory = V4L2_MEMORY_USERPTR;
@@ -2974,13 +2974,13 @@ static unsigned int bttv_poll(struct file *file, poll_table *wait)
 			if (0 != fh->cap.ops->buf_prepare(&fh->cap,fh->cap.read_buf,field)) {
 				kfree (fh->cap.read_buf);
 				fh->cap.read_buf = NULL;
-				up(&fh->cap.lock);
+				mutex_unlock(&fh->cap.lock);
 				return POLLERR;
 			}
 			fh->cap.ops->buf_queue(&fh->cap,fh->cap.read_buf);
 			fh->cap.read_off = 0;
 		}
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		buf = (struct bttv_buffer*)fh->cap.read_buf;
 	}
 
diff --git a/drivers/media/video/bw-qcam.c b/drivers/media/video/bw-qcam.c
index 6bad93ef969f..d97b7d8ac33d 100644
--- a/drivers/media/video/bw-qcam.c
+++ b/drivers/media/video/bw-qcam.c
@@ -73,7 +73,7 @@ OTHER DEALINGS IN THE SOFTWARE.
 #include <linux/parport.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 
 #include "bw-qcam.h"
@@ -168,7 +168,7 @@ static struct qcam_device *qcam_init(struct parport *port)
 	
 	memcpy(&q->vdev, &qcam_template, sizeof(qcam_template));
 	
-	init_MUTEX(&q->lock);
+	mutex_init(&q->lock);
 
 	q->port_mode = (QC_ANY | QC_NOTSET);
 	q->width = 320;
@@ -772,9 +772,9 @@ static int qcam_do_ioctl(struct inode *inode, struct file *file,
 			qcam->whitebal = p->whiteness>>8;
 			qcam->bpp = p->depth;
 
-			down(&qcam->lock);			
+			mutex_lock(&qcam->lock);
 			qc_setscanmode(qcam);
-			up(&qcam->lock);
+			mutex_unlock(&qcam->lock);
 			qcam->status |= QC_PARAM_CHANGE;
 
 			return 0;
@@ -805,9 +805,9 @@ static int qcam_do_ioctl(struct inode *inode, struct file *file,
 				qcam->height = 240;
 				qcam->transfer_scale = 1;
 			}
-			down(&qcam->lock);
+			mutex_lock(&qcam->lock);
 			qc_setscanmode(qcam);
-			up(&qcam->lock);
+			mutex_unlock(&qcam->lock);
 			
 			/* We must update the camera before we grab. We could
 			   just have changed the grab size */
@@ -854,7 +854,7 @@ static ssize_t qcam_read(struct file *file, char __user *buf,
 	int len;
 	parport_claim_or_block(qcam->pdev);
 	
-	down(&qcam->lock);
+	mutex_lock(&qcam->lock);
 	
 	qc_reset(qcam);
 
@@ -864,7 +864,7 @@ static ssize_t qcam_read(struct file *file, char __user *buf,
 
 	len=qc_capture(qcam, buf,count);
 	
-	up(&qcam->lock);
+	mutex_unlock(&qcam->lock);
 	
 	parport_release(qcam->pdev);
 	return len;
diff --git a/drivers/media/video/bw-qcam.h b/drivers/media/video/bw-qcam.h
index 723e8ad9e56a..6701dafbc0da 100644
--- a/drivers/media/video/bw-qcam.h
+++ b/drivers/media/video/bw-qcam.h
@@ -55,7 +55,7 @@ struct qcam_device {
 	struct video_device vdev;
 	struct pardevice *pdev;
 	struct parport *pport;
-	struct semaphore lock;
+	struct mutex lock;
 	int width, height;
 	int bpp;
 	int mode;
diff --git a/drivers/media/video/c-qcam.c b/drivers/media/video/c-qcam.c
index 9976db4f6da8..8211fd8d7cbf 100644
--- a/drivers/media/video/c-qcam.c
+++ b/drivers/media/video/c-qcam.c
@@ -34,7 +34,8 @@
 #include <linux/parport.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
+
 #include <asm/uaccess.h>
 
 struct qcam_device {
@@ -47,7 +48,7 @@ struct qcam_device {
 	int contrast, brightness, whitebal;
 	int top, left;
 	unsigned int bidirectional;
-	struct semaphore lock;
+	struct mutex lock;
 };
 
 /* cameras maximum */
@@ -581,11 +582,11 @@ static int qcam_do_ioctl(struct inode *inode, struct file *file,
 			qcam->contrast = p->contrast>>8;
 			qcam->whitebal = p->whiteness>>8;
 
-			down(&qcam->lock);			
+			mutex_lock(&qcam->lock);
 			parport_claim_or_block(qcam->pdev);
 			qc_setup(qcam); 
 			parport_release(qcam->pdev);
-			up(&qcam->lock);
+			mutex_unlock(&qcam->lock);
 			return 0;
 		}
 		case VIDIOCSWIN:
@@ -628,11 +629,11 @@ static int qcam_do_ioctl(struct inode *inode, struct file *file,
 #endif
 			/* Ok we figured out what to use from our 
 			   wide choice */
-			down(&qcam->lock);
+			mutex_lock(&qcam->lock);
 			parport_claim_or_block(qcam->pdev);
 			qc_setup(qcam);
 			parport_release(qcam->pdev);
-			up(&qcam->lock);
+			mutex_unlock(&qcam->lock);
 			return 0;
 		}
 		case VIDIOCGWIN:
@@ -672,12 +673,12 @@ static ssize_t qcam_read(struct file *file, char __user *buf,
 	struct qcam_device *qcam=(struct qcam_device *)v;
 	int len;
 
-	down(&qcam->lock);
+	mutex_lock(&qcam->lock);
 	parport_claim_or_block(qcam->pdev);
 	/* Probably should have a semaphore against multiple users */
 	len = qc_capture(qcam, buf,count); 
 	parport_release(qcam->pdev);
-	up(&qcam->lock);
+	mutex_unlock(&qcam->lock);
 	return len;
 }
 
@@ -727,7 +728,7 @@ static struct qcam_device *qcam_init(struct parport *port)
 	
 	memcpy(&q->vdev, &qcam_template, sizeof(qcam_template));
 
-	init_MUTEX(&q->lock);
+	mutex_init(&q->lock);
 	q->width = q->ccd_width = 320;
 	q->height = q->ccd_height = 240;
 	q->mode = QC_MILLIONS | QC_DECIMATION_1;
diff --git a/drivers/media/video/cpia.c b/drivers/media/video/cpia.c
index 9f59541155d9..c2405029a720 100644
--- a/drivers/media/video/cpia.c
+++ b/drivers/media/video/cpia.c
@@ -39,7 +39,7 @@
 #include <linux/pagemap.h>
 #include <linux/delay.h>
 #include <asm/io.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
@@ -622,7 +622,7 @@ static int cpia_write_proc(struct file *file, const char __user *buf,
 	
 	buffer = page;
 	
-	if (down_interruptible(&cam->param_lock))
+	if (mutex_lock_interruptible(&cam->param_lock))
 		return -ERESTARTSYS;
 	
 	/*
@@ -1350,7 +1350,7 @@ static int cpia_write_proc(struct file *file, const char __user *buf,
 	} else
 		DBG("error: %d\n", retval);
 	
-	up(&cam->param_lock);
+	mutex_unlock(&cam->param_lock);
 	
 out:
 	free_page((unsigned long)page);
@@ -1664,7 +1664,7 @@ static int do_command(struct cam_data *cam, u16 command, u8 a, u8 b, u8 c, u8 d)
 	case CPIA_COMMAND_GetColourParams:
 	case CPIA_COMMAND_GetColourBalance:
 	case CPIA_COMMAND_GetExposure:
-		down(&cam->param_lock);
+		mutex_lock(&cam->param_lock);
 		datasize=8;
 		break;
 	case CPIA_COMMAND_ReadMCPorts: 
@@ -1691,7 +1691,7 @@ static int do_command(struct cam_data *cam, u16 command, u8 a, u8 b, u8 c, u8 d)
 		if (command == CPIA_COMMAND_GetColourParams ||
 		    command == CPIA_COMMAND_GetColourBalance ||
 		    command == CPIA_COMMAND_GetExposure)
-			up(&cam->param_lock);
+			mutex_unlock(&cam->param_lock);
 	} else {
 		switch(command) {
 		case CPIA_COMMAND_GetCPIAVersion:
@@ -1726,13 +1726,13 @@ static int do_command(struct cam_data *cam, u16 command, u8 a, u8 b, u8 c, u8 d)
 			cam->params.colourParams.brightness = data[0];
 			cam->params.colourParams.contrast = data[1];
 			cam->params.colourParams.saturation = data[2];
-			up(&cam->param_lock);
+			mutex_unlock(&cam->param_lock);
 			break;
 		case CPIA_COMMAND_GetColourBalance:
 			cam->params.colourBalance.redGain = data[0];
 			cam->params.colourBalance.greenGain = data[1];
 			cam->params.colourBalance.blueGain = data[2];
-			up(&cam->param_lock);
+			mutex_unlock(&cam->param_lock);
 			break;
 		case CPIA_COMMAND_GetExposure:
 			cam->params.exposure.gain = data[0];
@@ -1743,7 +1743,7 @@ static int do_command(struct cam_data *cam, u16 command, u8 a, u8 b, u8 c, u8 d)
 			cam->params.exposure.green1Comp = data[5];
 			cam->params.exposure.green2Comp = data[6];
 			cam->params.exposure.blueComp = data[7];
-			up(&cam->param_lock);
+			mutex_unlock(&cam->param_lock);
 			break;
 
 		case CPIA_COMMAND_ReadMCPorts: 
@@ -2059,7 +2059,7 @@ static int parse_picture(struct cam_data *cam, int size)
 	int rows, cols, linesize, subsample_422;
 
 	/* make sure params don't change while we are decoding */
-	down(&cam->param_lock);
+	mutex_lock(&cam->param_lock);
 
 	obuf = cam->decompressed_frame.data;
 	end_obuf = obuf+CPIA_MAX_FRAME_SIZE;
@@ -2069,26 +2069,26 @@ static int parse_picture(struct cam_data *cam, int size)
 
 	if ((ibuf[0] != MAGIC_0) || (ibuf[1] != MAGIC_1)) {
 		LOG("header not found\n");
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return -1;
 	}
 
 	if ((ibuf[16] != VIDEOSIZE_QCIF) && (ibuf[16] != VIDEOSIZE_CIF)) {
 		LOG("wrong video size\n");
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return -1;
 	}
 	
 	if (ibuf[17] != SUBSAMPLE_420 && ibuf[17] != SUBSAMPLE_422) {
 		LOG("illegal subtype %d\n",ibuf[17]);
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return -1;
 	}
 	subsample_422 = ibuf[17] == SUBSAMPLE_422;
 	
 	if (ibuf[18] != YUVORDER_YUYV && ibuf[18] != YUVORDER_UYVY) {
 		LOG("illegal yuvorder %d\n",ibuf[18]);
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return -1;
 	}
 	in_uyvy = ibuf[18] == YUVORDER_UYVY;
@@ -2098,7 +2098,7 @@ static int parse_picture(struct cam_data *cam, int size)
 	    (ibuf[26] != cam->params.roi.rowStart) ||
 	    (ibuf[27] != cam->params.roi.rowEnd)) {
 		LOG("ROI mismatch\n");
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return -1;
 	}
 	cols = 8*(ibuf[25] - ibuf[24]);
@@ -2107,14 +2107,14 @@ static int parse_picture(struct cam_data *cam, int size)
 	
 	if ((ibuf[28] != NOT_COMPRESSED) && (ibuf[28] != COMPRESSED)) {
 		LOG("illegal compression %d\n",ibuf[28]);
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return -1;
 	}
 	compressed = (ibuf[28] == COMPRESSED);
 	
 	if (ibuf[29] != NO_DECIMATION && ibuf[29] != DECIMATION_ENAB) {
 		LOG("illegal decimation %d\n",ibuf[29]);
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return -1;
 	}
 	decimation = (ibuf[29] == DECIMATION_ENAB);	
@@ -2130,7 +2130,7 @@ static int parse_picture(struct cam_data *cam, int size)
 	cam->params.status.vpStatus = ibuf[38];
 	cam->params.status.errorCode = ibuf[39];
 	cam->fps = ibuf[41];
-	up(&cam->param_lock);
+	mutex_unlock(&cam->param_lock);
 	
 	linesize = skipcount(cols, out_fmt);
 	ibuf += FRAME_HEADER_SIZE;
@@ -2271,9 +2271,9 @@ static int find_over_exposure(int brightness)
 /* update various camera modes and settings */
 static void dispatch_commands(struct cam_data *cam)
 {
-	down(&cam->param_lock);
+	mutex_lock(&cam->param_lock);
 	if (cam->cmd_queue==COMMAND_NONE) {
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return;
 	}
 	DEB_BYTE(cam->cmd_queue);
@@ -2415,7 +2415,7 @@ static void dispatch_commands(struct cam_data *cam)
 	  }
 
 	cam->cmd_queue = COMMAND_NONE;
-	up(&cam->param_lock);
+	mutex_unlock(&cam->param_lock);
 	return;
 }
 
@@ -2562,7 +2562,7 @@ static void monitor_exposure(struct cam_data *cam)
 	gain = data[2];
 	coarseL = data[3];
 
-	down(&cam->param_lock);
+	mutex_lock(&cam->param_lock);
 	light_exp = cam->params.colourParams.brightness +
 	            TC - 50 + EXP_ACC_LIGHT;
 	if(light_exp > 255)
@@ -2762,7 +2762,7 @@ static void monitor_exposure(struct cam_data *cam)
 			LOG("Automatically increasing sensor_fps\n");
 		}
 	}
-	up(&cam->param_lock);
+	mutex_unlock(&cam->param_lock);
 }
 
 /*-----------------------------------------------------------------*/
@@ -2778,10 +2778,10 @@ static void restart_flicker(struct cam_data *cam)
 	int cam_exposure, old_exp;
 	if(!FIRMWARE_VERSION(1,2))
 		return;
-	down(&cam->param_lock);
+	mutex_lock(&cam->param_lock);
 	if(cam->params.flickerControl.flickerMode == 0 ||
 	   cam->raw_image[39] == 0) {
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		return;
 	}
 	cam_exposure = cam->raw_image[39]*2;
@@ -2810,7 +2810,7 @@ static void restart_flicker(struct cam_data *cam)
 			cam->exposure_status = EXPOSURE_NORMAL;
 
 	}
-	up(&cam->param_lock);
+	mutex_unlock(&cam->param_lock);
 }
 #undef FIRMWARE_VERSION
 
@@ -3186,7 +3186,7 @@ static int cpia_open(struct inode *inode, struct file *file)
 	if (!try_module_get(cam->ops->owner))
 		return -ENODEV;
 
-	down(&cam->busy_lock);
+	mutex_lock(&cam->busy_lock);
 	err = -ENOMEM;
 	if (!cam->raw_image) {
 		cam->raw_image = rvmalloc(CPIA_MAX_IMAGE_SIZE);
@@ -3227,7 +3227,7 @@ static int cpia_open(struct inode *inode, struct file *file)
 	
 	++cam->open_count;
 	file->private_data = dev;
-	up(&cam->busy_lock);
+	mutex_unlock(&cam->busy_lock);
 	return 0;
 
  oops:
@@ -3239,7 +3239,7 @@ static int cpia_open(struct inode *inode, struct file *file)
 		rvfree(cam->raw_image, CPIA_MAX_IMAGE_SIZE);
 		cam->raw_image = NULL;
 	}
-	up(&cam->busy_lock);
+	mutex_unlock(&cam->busy_lock);
 	put_cam(cam->ops);
 	return err;
 }
@@ -3303,24 +3303,24 @@ static ssize_t cpia_read(struct file *file, char __user *buf,
 	int err;
 
 	/* make this _really_ smp and multithread-safe */
-	if (down_interruptible(&cam->busy_lock))
+	if (mutex_lock_interruptible(&cam->busy_lock))
 		return -EINTR;
 
 	if (!buf) {
 		DBG("buf NULL\n");
-		up(&cam->busy_lock);
+		mutex_unlock(&cam->busy_lock);
 		return -EINVAL;
 	}
 
 	if (!count) {
 		DBG("count 0\n");
-		up(&cam->busy_lock);
+		mutex_unlock(&cam->busy_lock);
 		return 0;
 	}
 
 	if (!cam->ops) {
 		DBG("ops NULL\n");
-		up(&cam->busy_lock);
+		mutex_unlock(&cam->busy_lock);
 		return -ENODEV;
 	}
 
@@ -3329,7 +3329,7 @@ static ssize_t cpia_read(struct file *file, char __user *buf,
 	cam->mmap_kludge=0;
 	if((err = fetch_frame(cam)) != 0) {
 		DBG("ERROR from fetch_frame: %d\n", err);
-		up(&cam->busy_lock);
+		mutex_unlock(&cam->busy_lock);
 		return err;
 	}
 	cam->decompressed_frame.state = FRAME_UNUSED;
@@ -3338,17 +3338,17 @@ static ssize_t cpia_read(struct file *file, char __user *buf,
 	if (cam->decompressed_frame.count > count) {
 		DBG("count wrong: %d, %lu\n", cam->decompressed_frame.count,
 		    (unsigned long) count);
-		up(&cam->busy_lock);
+		mutex_unlock(&cam->busy_lock);
 		return -EFAULT;
 	}
 	if (copy_to_user(buf, cam->decompressed_frame.data,
 	                cam->decompressed_frame.count)) {
 		DBG("copy_to_user failed\n");
-		up(&cam->busy_lock);
+		mutex_unlock(&cam->busy_lock);
 		return -EFAULT;
 	}
 
-	up(&cam->busy_lock);
+	mutex_unlock(&cam->busy_lock);
 	return cam->decompressed_frame.count;
 }
 
@@ -3363,7 +3363,7 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 		return -ENODEV;
 	
 	/* make this _really_ smp-safe */
-	if (down_interruptible(&cam->busy_lock))
+	if (mutex_lock_interruptible(&cam->busy_lock))
 		return -EINTR;
 
 	//DBG("cpia_ioctl: %u\n", ioctlnr);
@@ -3439,7 +3439,7 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 			break;
 		}
 
-		down(&cam->param_lock);
+		mutex_lock(&cam->param_lock);
 		/* brightness, colour, contrast need no check 0-65535 */
 		cam->vp = *vp;
 		/* update cam->params.colourParams */
@@ -3466,7 +3466,7 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 
 		/* queue command to update camera */
 		cam->cmd_queue |= COMMAND_SETCOLOURPARAMS;
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 		DBG("VIDIOCSPICT: %d / %d // %d / %d / %d / %d\n",
 		    vp->depth, vp->palette, vp->brightness, vp->hue, vp->colour,
 		    vp->contrast);
@@ -3501,13 +3501,13 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 		/* we set the video window to something smaller or equal to what
 		* is requested by the user???
 		*/
-		down(&cam->param_lock);
+		mutex_lock(&cam->param_lock);
 		if (vw->width != cam->vw.width || vw->height != cam->vw.height) {
 			int video_size = match_videosize(vw->width, vw->height);
 
 			if (video_size < 0) {
 				retval = -EINVAL;
-				up(&cam->param_lock);
+				mutex_unlock(&cam->param_lock);
 				break;
 			}
 			cam->video_size = video_size;
@@ -3520,7 +3520,7 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 			cam->cmd_queue |= COMMAND_SETFORMAT;
 		}
 
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 
 		/* setformat ignored by camera during streaming,
 		 * so stop/dispatch/start */
@@ -3682,7 +3682,7 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 
 		DBG("%d,%d/%dx%d\n", vc->x,vc->y,vc->width, vc->height);
 		
-		down(&cam->param_lock);
+		mutex_lock(&cam->param_lock);
 		
 		cam->vc.x      = vc->x;
 		cam->vc.y      = vc->y;
@@ -3692,7 +3692,7 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 		set_vw_size(cam);
 		cam->cmd_queue |= COMMAND_SETFORMAT;
 
-		up(&cam->param_lock);
+		mutex_unlock(&cam->param_lock);
 
 		/* setformat ignored by camera during streaming,
 		 * so stop/dispatch/start */
@@ -3736,7 +3736,7 @@ static int cpia_do_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
-	up(&cam->busy_lock);
+	mutex_unlock(&cam->busy_lock);
 	return retval;
 } 
 
@@ -3769,12 +3769,12 @@ static int cpia_mmap(struct file *file, struct vm_area_struct *vma)
 		return -ENODEV;
 	
 	/* make this _really_ smp-safe */
-	if (down_interruptible(&cam->busy_lock))
+	if (mutex_lock_interruptible(&cam->busy_lock))
 		return -EINTR;
 
 	if (!cam->frame_buf) {	/* we do lazy allocation */
 		if ((retval = allocate_frame_buf(cam))) {
-			up(&cam->busy_lock);
+			mutex_unlock(&cam->busy_lock);
 			return retval;
 		}
 	}
@@ -3783,7 +3783,7 @@ static int cpia_mmap(struct file *file, struct vm_area_struct *vma)
 	while (size > 0) {
 		page = vmalloc_to_pfn((void *)pos);
 		if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED)) {
-			up(&cam->busy_lock);
+			mutex_unlock(&cam->busy_lock);
 			return -EAGAIN;
 		}
 		start += PAGE_SIZE;
@@ -3795,7 +3795,7 @@ static int cpia_mmap(struct file *file, struct vm_area_struct *vma)
 	}
 
 	DBG("cpia_mmap: %ld\n", size);
-	up(&cam->busy_lock);
+	mutex_unlock(&cam->busy_lock);
 
 	return 0;
 }
@@ -3936,8 +3936,8 @@ static void init_camera_struct(struct cam_data *cam,
 	memset(cam, 0, sizeof(struct cam_data));
 
 	cam->ops = ops;
-	init_MUTEX(&cam->param_lock);
-	init_MUTEX(&cam->busy_lock);
+	mutex_init(&cam->param_lock);
+	mutex_init(&cam->busy_lock);
 
 	reset_camera_struct(cam);
 
diff --git a/drivers/media/video/cpia.h b/drivers/media/video/cpia.h
index f629b693ee65..de6678200a57 100644
--- a/drivers/media/video/cpia.h
+++ b/drivers/media/video/cpia.h
@@ -47,6 +47,7 @@
 #include <linux/videodev.h>
 #include <linux/list.h>
 #include <linux/smp_lock.h>
+#include <linux/mutex.h>
 
 struct cpia_camera_ops
 {
@@ -246,7 +247,7 @@ enum v4l_camstates {
 struct cam_data {
 	struct list_head cam_data_list;
 
-        struct semaphore busy_lock;     /* guard against SMP multithreading */
+        struct mutex busy_lock;     /* guard against SMP multithreading */
 	struct cpia_camera_ops *ops;	/* lowlevel driver operations */
 	void *lowlevel_data;		/* private data for lowlevel driver */
 	u8 *raw_image;			/* buffer for raw image data */
@@ -261,7 +262,7 @@ struct cam_data {
 	u8 mainsFreq;			/* for flicker control */
 
 				/* proc interface */
-	struct semaphore param_lock;	/* params lock for this camera */
+	struct mutex param_lock;	/* params lock for this camera */
 	struct cam_params params;	/* camera settings */
 	struct proc_dir_entry *proc_entry;	/* /proc/cpia/videoX */
 	
diff --git a/drivers/media/video/cx88/cx88-core.c b/drivers/media/video/cx88/cx88-core.c
index 3720f24a25cf..eda7cd8b2d4a 100644
--- a/drivers/media/video/cx88/cx88-core.c
+++ b/drivers/media/video/cx88/cx88-core.c
@@ -1061,7 +1061,7 @@ struct cx88_core* cx88_core_get(struct pci_dev *pci)
 	core->pci_bus  = pci->bus->number;
 	core->pci_slot = PCI_SLOT(pci->devfn);
 	core->pci_irqmask = 0x00fc00;
-	init_MUTEX(&core->lock);
+	mutex_init(&core->lock);
 
 	core->nr = cx88_devcount++;
 	sprintf(core->name,"cx88[%d]",core->nr);
diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c
index 073494ceab0f..d15ef158ac8f 100644
--- a/drivers/media/video/cx88/cx88-video.c
+++ b/drivers/media/video/cx88/cx88-video.c
@@ -336,17 +336,17 @@ static int res_get(struct cx8800_dev *dev, struct cx8800_fh *fh, unsigned int bi
 		return 1;
 
 	/* is it free? */
-	down(&core->lock);
+	mutex_lock(&core->lock);
 	if (dev->resources & bit) {
 		/* no, someone else uses it */
-		up(&core->lock);
+		mutex_unlock(&core->lock);
 		return 0;
 	}
 	/* it's free, grab it */
 	fh->resources  |= bit;
 	dev->resources |= bit;
 	dprintk(1,"res: get %d\n",bit);
-	up(&core->lock);
+	mutex_unlock(&core->lock);
 	return 1;
 }
 
@@ -369,11 +369,11 @@ void res_free(struct cx8800_dev *dev, struct cx8800_fh *fh, unsigned int bits)
 	if ((fh->resources & bits) != bits)
 		BUG();
 
-	down(&core->lock);
+	mutex_lock(&core->lock);
 	fh->resources  &= ~bits;
 	dev->resources &= ~bits;
 	dprintk(1,"res: put %d\n",bits);
-	up(&core->lock);
+	mutex_unlock(&core->lock);
 }
 
 /* ------------------------------------------------------------------ */
@@ -1291,9 +1291,9 @@ int cx88_do_ioctl(struct inode *inode, struct file *file, int radio,
 		if (i == ARRAY_SIZE(tvnorms))
 			return -EINVAL;
 
-		down(&core->lock);
+		mutex_lock(&core->lock);
 		cx88_set_tvnorm(core,&tvnorms[i]);
-		up(&core->lock);
+		mutex_unlock(&core->lock);
 		return 0;
 	}
 
@@ -1343,10 +1343,10 @@ int cx88_do_ioctl(struct inode *inode, struct file *file, int radio,
 
 		if (*i >= 4)
 			return -EINVAL;
-		down(&core->lock);
+		mutex_lock(&core->lock);
 		cx88_newstation(core);
 		video_mux(core,*i);
-		up(&core->lock);
+		mutex_unlock(&core->lock);
 		return 0;
 	}
 
@@ -1438,7 +1438,7 @@ int cx88_do_ioctl(struct inode *inode, struct file *file, int radio,
 			return -EINVAL;
 		if (1 == radio && f->type != V4L2_TUNER_RADIO)
 			return -EINVAL;
-		down(&core->lock);
+		mutex_lock(&core->lock);
 		core->freq = f->frequency;
 		cx88_newstation(core);
 		cx88_call_i2c_clients(core,VIDIOC_S_FREQUENCY,f);
@@ -1447,7 +1447,7 @@ int cx88_do_ioctl(struct inode *inode, struct file *file, int radio,
 		msleep (10);
 		cx88_set_tvaudio(core);
 
-		up(&core->lock);
+		mutex_unlock(&core->lock);
 		return 0;
 	}
 
@@ -1921,11 +1921,11 @@ static int __devinit cx8800_initdev(struct pci_dev *pci_dev,
 	pci_set_drvdata(pci_dev,dev);
 
 	/* initial device configuration */
-	down(&core->lock);
+	mutex_lock(&core->lock);
 	cx88_set_tvnorm(core,tvnorms);
 	init_controls(core);
 	video_mux(core,0);
-	up(&core->lock);
+	mutex_unlock(&core->lock);
 
 	/* start tvaudio thread */
 	if (core->tuner_type != TUNER_ABSENT)
diff --git a/drivers/media/video/cx88/cx88.h b/drivers/media/video/cx88/cx88.h
index 31a688a3fb77..a4cf2473eacf 100644
--- a/drivers/media/video/cx88/cx88.h
+++ b/drivers/media/video/cx88/cx88.h
@@ -35,6 +35,7 @@
 #include "cx88-reg.h"
 
 #include <linux/version.h>
+#include <linux/mutex.h>
 #define CX88_VERSION_CODE KERNEL_VERSION(0,0,5)
 
 #ifndef TRUE
@@ -309,8 +310,7 @@ struct cx88_core {
 	/* IR remote control state */
 	struct cx88_IR             *ir;
 
-	struct semaphore           lock;
-
+	struct mutex               lock;
 	/* various v4l controls */
 	u32                        freq;
 
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index 1c1557d9bace..671fc52b6a88 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -362,12 +362,12 @@ static int em28xx_v4l2_open(struct inode *inode, struct file *filp)
 		return -EBUSY;
 	}
 
-	init_MUTEX(&dev->fileop_lock);	/* to 1 == available */
+	mutex_init(&dev->fileop_lock);	/* to 1 == available */
 	spin_lock_init(&dev->queue_lock);
 	init_waitqueue_head(&dev->wait_frame);
 	init_waitqueue_head(&dev->wait_stream);
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 
 	if (dev->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) {
 		em28xx_set_alternate(dev);
@@ -404,8 +404,8 @@ static int em28xx_v4l2_open(struct inode *inode, struct file *filp)
 
 	dev->state |= DEV_INITIALIZED;
 
-      err:
-	up(&dev->lock);
+err:
+	mutex_unlock(&dev->lock);
 	up_read(&em28xx_disconnect);
 	return errCode;
 }
@@ -447,7 +447,7 @@ static int em28xx_v4l2_close(struct inode *inode, struct file *filp)
 
 	em28xx_videodbg("users=%d\n", dev->users);
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 
 	em28xx_uninit_isoc(dev);
 
@@ -456,7 +456,7 @@ static int em28xx_v4l2_close(struct inode *inode, struct file *filp)
 	/* the device is already disconnect, free the remaining resources */
 	if (dev->state & DEV_DISCONNECTED) {
 		em28xx_release_resources(dev);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		kfree(dev);
 		return 0;
 	}
@@ -472,7 +472,7 @@ static int em28xx_v4l2_close(struct inode *inode, struct file *filp)
 
 	dev->users--;
 	wake_up_interruptible_nr(&dev->open, 1);
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 	return 0;
 }
 
@@ -496,7 +496,7 @@ em28xx_v4l2_read(struct file *filp, char __user * buf, size_t count,
 		em28xx_videodbg("V4L2_BUF_TYPE_VBI_CAPTURE is set\n");
 		em28xx_videodbg("not supported yet! ...\n");
 		if (copy_to_user(buf, "", 1)) {
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 			return -EFAULT;
 		}
 		return (1);
@@ -505,38 +505,38 @@ em28xx_v4l2_read(struct file *filp, char __user * buf, size_t count,
 		em28xx_videodbg("V4L2_BUF_TYPE_SLICED_VBI_CAPTURE is set\n");
 		em28xx_videodbg("not supported yet! ...\n");
 		if (copy_to_user(buf, "", 1)) {
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 			return -EFAULT;
 		}
 		return (1);
 	}
 
-	if (down_interruptible(&dev->fileop_lock))
+	if (mutex_lock_interruptible(&dev->fileop_lock))
 		return -ERESTARTSYS;
 
 	if (dev->state & DEV_DISCONNECTED) {
 		em28xx_videodbg("device not present\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -ENODEV;
 	}
 
 	if (dev->state & DEV_MISCONFIGURED) {
 		em28xx_videodbg("device misconfigured; close and open it again\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -EIO;
 	}
 
 	if (dev->io == IO_MMAP) {
 		em28xx_videodbg ("IO method is set to mmap; close and open"
 				" the device again to choose the read method\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -EINVAL;
 	}
 
 	if (dev->io == IO_NONE) {
 		if (!em28xx_request_buffers(dev, EM28XX_NUM_READ_FRAMES)) {
 			em28xx_errdev("read failed, not enough memory\n");
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 			return -ENOMEM;
 		}
 		dev->io = IO_READ;
@@ -545,13 +545,13 @@ em28xx_v4l2_read(struct file *filp, char __user * buf, size_t count,
 	}
 
 	if (!count) {
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return 0;
 	}
 
 	if (list_empty(&dev->outqueue)) {
 		if (filp->f_flags & O_NONBLOCK) {
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 			return -EAGAIN;
 		}
 		ret = wait_event_interruptible
@@ -559,11 +559,11 @@ em28xx_v4l2_read(struct file *filp, char __user * buf, size_t count,
 		     (!list_empty(&dev->outqueue)) ||
 		     (dev->state & DEV_DISCONNECTED));
 		if (ret) {
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 			return ret;
 		}
 		if (dev->state & DEV_DISCONNECTED) {
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 			return -ENODEV;
 		}
 	}
@@ -582,12 +582,12 @@ em28xx_v4l2_read(struct file *filp, char __user * buf, size_t count,
 		count = f->buf.length;
 
 	if (copy_to_user(buf, f->bufmem, count)) {
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -EFAULT;
 	}
 	*f_pos += count;
 
-	up(&dev->fileop_lock);
+	mutex_unlock(&dev->fileop_lock);
 
 	return count;
 }
@@ -601,7 +601,7 @@ static unsigned int em28xx_v4l2_poll(struct file *filp, poll_table * wait)
 	unsigned int mask = 0;
 	struct em28xx *dev = filp->private_data;
 
-	if (down_interruptible(&dev->fileop_lock))
+	if (mutex_lock_interruptible(&dev->fileop_lock))
 		return POLLERR;
 
 	if (dev->state & DEV_DISCONNECTED) {
@@ -627,13 +627,13 @@ static unsigned int em28xx_v4l2_poll(struct file *filp, poll_table * wait)
 			if (!list_empty(&dev->outqueue))
 				mask |= POLLIN | POLLRDNORM;
 
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 
 			return mask;
 		}
 	}
 
-	up(&dev->fileop_lock);
+	mutex_unlock(&dev->fileop_lock);
 	return POLLERR;
 }
 
@@ -673,25 +673,25 @@ static int em28xx_v4l2_mmap(struct file *filp, struct vm_area_struct *vma)
 
 	struct em28xx *dev = filp->private_data;
 
-	if (down_interruptible(&dev->fileop_lock))
+	if (mutex_lock_interruptible(&dev->fileop_lock))
 		return -ERESTARTSYS;
 
 	if (dev->state & DEV_DISCONNECTED) {
 		em28xx_videodbg("mmap: device not present\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -ENODEV;
 	}
 
 	if (dev->state & DEV_MISCONFIGURED) {
 		em28xx_videodbg ("mmap: Device is misconfigured; close and "
 						"open it again\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -EIO;
 	}
 
 	if (dev->io != IO_MMAP || !(vma->vm_flags & VM_WRITE) ||
 	    size != PAGE_ALIGN(dev->frame[0].buf.length)) {
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -EINVAL;
 	}
 
@@ -701,7 +701,7 @@ static int em28xx_v4l2_mmap(struct file *filp, struct vm_area_struct *vma)
 	}
 	if (i == dev->num_frames) {
 		em28xx_videodbg("mmap: user supplied mapping address is out of range\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -EINVAL;
 	}
 
@@ -713,7 +713,7 @@ static int em28xx_v4l2_mmap(struct file *filp, struct vm_area_struct *vma)
 	while (size > 0) {	/* size is page-aligned */
 		if (vm_insert_page(vma, start, vmalloc_to_page(pos))) {
 			em28xx_videodbg("mmap: vm_insert_page failed\n");
-			up(&dev->fileop_lock);
+			mutex_unlock(&dev->fileop_lock);
 			return -EAGAIN;
 		}
 		start += PAGE_SIZE;
@@ -725,7 +725,7 @@ static int em28xx_v4l2_mmap(struct file *filp, struct vm_area_struct *vma)
 	vma->vm_private_data = &dev->frame[i];
 
 	em28xx_vm_open(vma);
-	up(&dev->fileop_lock);
+	mutex_unlock(&dev->fileop_lock);
 	return 0;
 }
 
@@ -1125,7 +1125,7 @@ static int em28xx_do_ioctl(struct inode *inode, struct file *filp,
 		if (i == TVNORMS)
 			return -EINVAL;
 
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		dev->tvnorm = &tvnorms[i];
 
 		em28xx_set_norm(dev, dev->width, dev->height);
@@ -1135,7 +1135,7 @@ static int em28xx_do_ioctl(struct inode *inode, struct file *filp,
 		em28xx_i2c_call_clients(dev, VIDIOC_S_STD,
 					&dev->tvnorm->id);
 
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 
 		return 0;
 	}
@@ -1189,9 +1189,9 @@ static int em28xx_do_ioctl(struct inode *inode, struct file *filp,
 		if (0 == INPUT(*index)->type)
 			return -EINVAL;
 
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		video_mux(dev, *index);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 
 		return 0;
 	}
@@ -1338,10 +1338,10 @@ static int em28xx_do_ioctl(struct inode *inode, struct file *filp,
 /*		t->signal = 0xffff;*/
 /*		em28xx_i2c_call_clients(dev,VIDIOC_G_TUNER,t);*/
 		/* No way to get signal strength? */
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		em28xx_i2c_call_clients(dev, DECODER_GET_STATUS,
 					&status);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		t->signal =
 			(status & DECODER_STATUS_GOOD) != 0 ? 0xffff : 0;
 
@@ -1363,10 +1363,10 @@ static int em28xx_do_ioctl(struct inode *inode, struct file *filp,
 		t->rangehigh = 0xffffffffUL;	/* FIXME: set correct range */
 /*		t->signal = 0xffff; */
 		/* No way to get signal strength? */
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		em28xx_i2c_call_clients(dev, DECODER_GET_STATUS,
 					&status);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		t->signal =
 			(status & DECODER_STATUS_GOOD) != 0 ? 0xffff : 0;
 
@@ -1394,10 +1394,10 @@ static int em28xx_do_ioctl(struct inode *inode, struct file *filp,
 		if (V4L2_TUNER_ANALOG_TV != f->type)
 			return -EINVAL;
 
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		dev->ctl_freq = f->frequency;
 		em28xx_i2c_call_clients(dev, VIDIOC_S_FREQUENCY, f);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	}
 	case VIDIOC_CROPCAP:
@@ -1660,25 +1660,25 @@ static int em28xx_v4l2_ioctl(struct inode *inode, struct file *filp,
 	int ret = 0;
 	struct em28xx *dev = filp->private_data;
 
-	if (down_interruptible(&dev->fileop_lock))
+	if (mutex_lock_interruptible(&dev->fileop_lock))
 		return -ERESTARTSYS;
 
 	if (dev->state & DEV_DISCONNECTED) {
 		em28xx_errdev("v4l2 ioctl: device not present\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -ENODEV;
 	}
 
 	if (dev->state & DEV_MISCONFIGURED) {
 		em28xx_errdev
 		    ("v4l2 ioctl: device is misconfigured; close and open it again\n");
-		up(&dev->fileop_lock);
+		mutex_unlock(&dev->fileop_lock);
 		return -EIO;
 	}
 
 	ret = video_usercopy(inode, filp, cmd, arg, em28xx_video_do_ioctl);
 
-	up(&dev->fileop_lock);
+	mutex_unlock(&dev->fileop_lock);
 
 	return ret;
 }
@@ -1712,7 +1712,7 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
 
 	dev->udev = udev;
 	dev->model = model;
-	init_MUTEX(&dev->lock);
+	mutex_init(&dev->lock);
 	init_waitqueue_head(&dev->open);
 
 	dev->em28xx_write_regs = em28xx_write_regs;
@@ -1788,7 +1788,7 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
 		return -ENOMEM;
 	}
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	/* register i2c bus */
 	em28xx_i2c_register(dev);
 
@@ -1798,7 +1798,7 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
 	/* configure the device */
 	em28xx_config_i2c(dev);
 
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 
 	errCode = em28xx_config(dev);
 
@@ -1849,12 +1849,12 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
 	list_add_tail(&dev->devlist,&em28xx_devlist);
 
 	/* register v4l2 device */
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	if ((retval = video_register_device(dev->vdev, VFL_TYPE_GRABBER,
 					 video_nr[dev->devno]))) {
 		em28xx_errdev("unable to register video device (error=%i).\n",
 			      retval);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		list_del(&dev->devlist);
 		video_device_release(dev->vdev);
 		kfree(dev);
@@ -1865,7 +1865,7 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
 	if (video_register_device(dev->vbi_dev, VFL_TYPE_VBI,
 					vbi_nr[dev->devno]) < 0) {
 		printk("unable to register vbi device\n");
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		list_del(&dev->devlist);
 		video_device_release(dev->vbi_dev);
 		video_device_release(dev->vdev);
@@ -1886,7 +1886,7 @@ static int em28xx_init_dev(struct em28xx **devhandle, struct usb_device *udev,
 	}
 	video_mux(dev, 0);
 
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 
 	em28xx_info("V4L2 device registered as /dev/video%d and /dev/vbi%d\n",
 				dev->vdev->minor-MINOR_VFL_TYPE_GRABBER_MIN,
@@ -2035,7 +2035,7 @@ static void em28xx_usb_disconnect(struct usb_interface *interface)
 
 	down_write(&em28xx_disconnect);
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 
 	em28xx_info("disconnecting %s\n", dev->vdev->name);
 
@@ -2057,7 +2057,7 @@ static void em28xx_usb_disconnect(struct usb_interface *interface)
 		em28xx_release_resources(dev);
 	}
 
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 
 	if (!dev->users) {
 		kfree(dev->alt_max_pkt_size);
diff --git a/drivers/media/video/em28xx/em28xx.h b/drivers/media/video/em28xx/em28xx.h
index ddf06c520e99..4e2fe62b7350 100644
--- a/drivers/media/video/em28xx/em28xx.h
+++ b/drivers/media/video/em28xx/em28xx.h
@@ -27,6 +27,7 @@
 
 #include <linux/videodev.h>
 #include <linux/i2c.h>
+#include <linux/mutex.h>
 #include <media/ir-kbd-i2c.h>
 
 /* Boards supported by driver */
@@ -260,7 +261,7 @@ struct em28xx {
 	enum em28xx_stream_state stream;
 	enum em28xx_io_method io;
 	/* locks */
-	struct semaphore lock, fileop_lock;
+	struct mutex lock, fileop_lock;
 	spinlock_t queue_lock;
 	struct list_head inqueue, outqueue;
 	wait_queue_head_t open, wait_frame, wait_stream;
diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c
index 2869464aee0d..850bee97090c 100644
--- a/drivers/media/video/meye.c
+++ b/drivers/media/video/meye.c
@@ -925,7 +925,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		if (p->palette != VIDEO_PALETTE_YUV422)
 			return -EINVAL;
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		sonypi_camera_command(SONYPI_COMMAND_SETCAMERABRIGHTNESS,
 				      p->brightness >> 10);
 		sonypi_camera_command(SONYPI_COMMAND_SETCAMERAHUE,
@@ -935,7 +935,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		sonypi_camera_command(SONYPI_COMMAND_SETCAMERACONTRAST,
 				      p->contrast >> 10);
 		meye.picture = *p;
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -946,21 +946,21 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		if (*i < 0 || *i >= gbuffers)
 			return -EINVAL;
 
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 
 		switch (meye.grab_buffer[*i].state) {
 
 		case MEYE_BUF_UNUSED:
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINVAL;
 		case MEYE_BUF_USING:
 			if (file->f_flags & O_NONBLOCK) {
-				up(&meye.lock);
+				mutex_unlock(&meye.lock);
 				return -EAGAIN;
 			}
 			if (wait_event_interruptible(meye.proc_list,
 						     (meye.grab_buffer[*i].state != MEYE_BUF_USING))) {
-				up(&meye.lock);
+				mutex_unlock(&meye.lock);
 				return -EINTR;
 			}
 			/* fall through */
@@ -968,7 +968,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			meye.grab_buffer[*i].state = MEYE_BUF_UNUSED;
 			kfifo_get(meye.doneq, (unsigned char *)&unused, sizeof(int));
 		}
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -987,7 +987,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		if (meye.grab_buffer[vm->frame].state != MEYE_BUF_UNUSED)
 			return -EBUSY;
 
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		if (vm->width == 640 && vm->height == 480) {
 			if (meye.params.subsample) {
 				meye.params.subsample = 0;
@@ -999,7 +999,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 				restart = 1;
 			}
 		} else {
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINVAL;
 		}
 
@@ -1007,7 +1007,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			mchip_continuous_start();
 		meye.grab_buffer[vm->frame].state = MEYE_BUF_USING;
 		kfifo_put(meye.grabq, (unsigned char *)&vm->frame, sizeof(int));
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1039,7 +1039,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		if (jp->framerate > 31)
 			return -EINVAL;
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		if (meye.params.subsample != jp->subsample ||
 		    meye.params.quality != jp->quality)
 			mchip_hic_stop();	/* need restart */
@@ -1050,7 +1050,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 				      meye.params.agc);
 		sonypi_camera_command(SONYPI_COMMAND_SETCAMERAPICTURE,
 				      meye.params.picture);
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1068,12 +1068,12 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		}
 		if (meye.grab_buffer[*nb].state != MEYE_BUF_UNUSED)
 			return -EBUSY;
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		if (meye.mchip_mode != MCHIP_HIC_MODE_CONT_COMP)
 			mchip_cont_compression_start();
 		meye.grab_buffer[*nb].state = MEYE_BUF_USING;
 		kfifo_put(meye.grabq, (unsigned char *)nb, sizeof(int));
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1084,20 +1084,20 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		if (*i < 0 || *i >= gbuffers)
 			return -EINVAL;
 
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		switch (meye.grab_buffer[*i].state) {
 
 		case MEYE_BUF_UNUSED:
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINVAL;
 		case MEYE_BUF_USING:
 			if (file->f_flags & O_NONBLOCK) {
-				up(&meye.lock);
+				mutex_unlock(&meye.lock);
 				return -EAGAIN;
 			}
 			if (wait_event_interruptible(meye.proc_list,
 						     (meye.grab_buffer[*i].state != MEYE_BUF_USING))) {
-				up(&meye.lock);
+				mutex_unlock(&meye.lock);
 				return -EINTR;
 			}
 			/* fall through */
@@ -1106,7 +1106,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			kfifo_get(meye.doneq, (unsigned char *)&unused, sizeof(int));
 		}
 		*i = meye.grab_buffer[*i].size;
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1116,14 +1116,14 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		if (meye.grab_buffer[0].state != MEYE_BUF_UNUSED)
 			return -EBUSY;
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		meye.grab_buffer[0].state = MEYE_BUF_USING;
 		mchip_take_picture();
 		mchip_get_picture(
 			meye.grab_fbuffer,
 			mchip_hsize() * mchip_vsize() * 2);
 		meye.grab_buffer[0].state = MEYE_BUF_DONE;
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1134,7 +1134,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		if (meye.grab_buffer[0].state != MEYE_BUF_UNUSED)
 			return -EBUSY;
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		meye.grab_buffer[0].state = MEYE_BUF_USING;
 		*len = -1;
 		while (*len == -1) {
@@ -1142,7 +1142,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			*len = mchip_compress_frame(meye.grab_fbuffer, gbufsize);
 		}
 		meye.grab_buffer[0].state = MEYE_BUF_DONE;
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1285,7 +1285,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 	case VIDIOC_S_CTRL: {
 		struct v4l2_control *c = arg;
 
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		switch (c->id) {
 		case V4L2_CID_BRIGHTNESS:
 			sonypi_camera_command(
@@ -1329,17 +1329,17 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			meye.params.framerate = c->value;
 			break;
 		default:
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINVAL;
 		}
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
 	case VIDIOC_G_CTRL: {
 		struct v4l2_control *c = arg;
 
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		switch (c->id) {
 		case V4L2_CID_BRIGHTNESS:
 			c->value = meye.picture.brightness >> 10;
@@ -1369,10 +1369,10 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			c->value = meye.params.framerate;
 			break;
 		default:
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINVAL;
 		}
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1469,7 +1469,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		    f->fmt.pix.field != V4L2_FIELD_NONE)
 			return -EINVAL;
 		f->fmt.pix.field = V4L2_FIELD_NONE;
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		if (f->fmt.pix.width <= 320) {
 			f->fmt.pix.width = 320;
 			f->fmt.pix.height = 240;
@@ -1487,7 +1487,7 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			meye.mchip_mode = MCHIP_HIC_MODE_CONT_COMP;
 			break;
 		}
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		f->fmt.pix.bytesperline = f->fmt.pix.width * 2;
 		f->fmt.pix.sizeimage = f->fmt.pix.height *
 				       f->fmt.pix.bytesperline;
@@ -1509,11 +1509,11 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			/* already allocated, no modifications */
 			break;
 		}
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		if (meye.grab_fbuffer) {
 			for (i = 0; i < gbuffers; i++)
 				if (meye.vma_use_count[i]) {
-					up(&meye.lock);
+					mutex_unlock(&meye.lock);
 					return -EINVAL;
 				}
 			rvfree(meye.grab_fbuffer, gbuffers * gbufsize);
@@ -1525,12 +1525,12 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		if (!meye.grab_fbuffer) {
 			printk(KERN_ERR "meye: v4l framebuffer allocation"
 					" failed\n");
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -ENOMEM;
 		}
 		for (i = 0; i < gbuffers; i++)
 			meye.vma_use_count[i] = 0;
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1569,12 +1569,12 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		if (meye.grab_buffer[buf->index].state != MEYE_BUF_UNUSED)
 			return -EINVAL;
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		buf->flags |= V4L2_BUF_FLAG_QUEUED;
 		buf->flags &= ~V4L2_BUF_FLAG_DONE;
 		meye.grab_buffer[buf->index].state = MEYE_BUF_USING;
 		kfifo_put(meye.grabq, (unsigned char *)&buf->index, sizeof(int));
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1587,23 +1587,23 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		if (buf->memory != V4L2_MEMORY_MMAP)
 			return -EINVAL;
 
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		if (kfifo_len(meye.doneq) == 0 && file->f_flags & O_NONBLOCK) {
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EAGAIN;
 		}
 		if (wait_event_interruptible(meye.proc_list,
 					     kfifo_len(meye.doneq) != 0) < 0) {
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINTR;
 		}
 		if (!kfifo_get(meye.doneq, (unsigned char *)&reqnr,
 			       sizeof(int))) {
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EBUSY;
 		}
 		if (meye.grab_buffer[reqnr].state != MEYE_BUF_DONE) {
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINVAL;
 		}
 		buf->index = reqnr;
@@ -1616,12 +1616,12 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 		buf->m.offset = reqnr * gbufsize;
 		buf->length = gbufsize;
 		meye.grab_buffer[reqnr].state = MEYE_BUF_UNUSED;
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
 	case VIDIOC_STREAMON: {
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		switch (meye.mchip_mode) {
 		case MCHIP_HIC_MODE_CONT_OUT:
 			mchip_continuous_start();
@@ -1630,23 +1630,23 @@ static int meye_do_ioctl(struct inode *inode, struct file *file,
 			mchip_cont_compression_start();
 			break;
 		default:
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EINVAL;
 		}
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
 	case VIDIOC_STREAMOFF: {
 		int i;
 
-		down(&meye.lock);
+		mutex_lock(&meye.lock);
 		mchip_hic_stop();
 		kfifo_reset(meye.grabq);
 		kfifo_reset(meye.doneq);
 		for (i = 0; i < MEYE_MAX_BUFNBRS; i++)
 			meye.grab_buffer[i].state = MEYE_BUF_UNUSED;
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		break;
 	}
 
@@ -1672,11 +1672,11 @@ static unsigned int meye_poll(struct file *file, poll_table *wait)
 {
 	unsigned int res = 0;
 
-	down(&meye.lock);
+	mutex_lock(&meye.lock);
 	poll_wait(file, &meye.proc_list, wait);
 	if (kfifo_len(meye.doneq))
 		res = POLLIN | POLLRDNORM;
-	up(&meye.lock);
+	mutex_unlock(&meye.lock);
 	return res;
 }
 
@@ -1704,9 +1704,9 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long page, pos;
 
-	down(&meye.lock);
+	mutex_lock(&meye.lock);
 	if (size > gbuffers * gbufsize) {
-		up(&meye.lock);
+		mutex_unlock(&meye.lock);
 		return -EINVAL;
 	}
 	if (!meye.grab_fbuffer) {
@@ -1716,7 +1716,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma)
 		meye.grab_fbuffer = rvmalloc(gbuffers*gbufsize);
 		if (!meye.grab_fbuffer) {
 			printk(KERN_ERR "meye: v4l framebuffer allocation failed\n");
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -ENOMEM;
 		}
 		for (i = 0; i < gbuffers; i++)
@@ -1727,7 +1727,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma)
 	while (size > 0) {
 		page = vmalloc_to_pfn((void *)pos);
 		if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED)) {
-			up(&meye.lock);
+			mutex_unlock(&meye.lock);
 			return -EAGAIN;
 		}
 		start += PAGE_SIZE;
@@ -1744,7 +1744,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_private_data = (void *) (offset / gbufsize);
 	meye_vm_open(vma);
 
-	up(&meye.lock);
+	mutex_unlock(&meye.lock);
 	return 0;
 }
 
@@ -1913,7 +1913,7 @@ static int __devinit meye_probe(struct pci_dev *pcidev,
 		goto outvideoreg;
 	}
 
-	init_MUTEX(&meye.lock);
+	mutex_init(&meye.lock);
 	init_waitqueue_head(&meye.proc_list);
 	meye.picture.depth = 16;
 	meye.picture.palette = VIDEO_PALETTE_YUV422;
diff --git a/drivers/media/video/meye.h b/drivers/media/video/meye.h
index e8cd897b0d20..0d09a0e3803c 100644
--- a/drivers/media/video/meye.h
+++ b/drivers/media/video/meye.h
@@ -260,6 +260,8 @@
 
 /* private API definitions */
 #include <linux/meye.h>
+#include <linux/mutex.h>
+
 
 /* Enable jpg software correction */
 #define MEYE_JPEG_CORRECTION	1
@@ -301,7 +303,7 @@ struct meye {
 					/* list of buffers */
 	struct meye_grab_buffer grab_buffer[MEYE_MAX_BUFNBRS];
 	int vma_use_count[MEYE_MAX_BUFNBRS]; /* mmap count */
-	struct semaphore lock;		/* semaphore for open/mmap... */
+	struct mutex lock;		/* mutex for open/mmap... */
 	struct kfifo *grabq;		/* queue for buffers to be grabbed */
 	spinlock_t grabq_lock;		/* lock protecting the queue */
 	struct kfifo *doneq;		/* queue for grabbed buffers */
diff --git a/drivers/media/video/mxb.c b/drivers/media/video/mxb.c
index 8416ceff524b..69a7eddf3a24 100644
--- a/drivers/media/video/mxb.c
+++ b/drivers/media/video/mxb.c
@@ -625,9 +625,9 @@ static int mxb_ioctl(struct saa7146_fh *fh, unsigned int cmd, void *arg)
 		}
 		
 		/* fixme: locke das setzen des inputs mit hilfe des mutexes
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		video_mux(dev,*i);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		*/
 				
 		/* fixme: check if streaming capture
diff --git a/drivers/media/video/planb.c b/drivers/media/video/planb.c
index f3fc361bec97..15fd85acabda 100644
--- a/drivers/media/video/planb.c
+++ b/drivers/media/video/planb.c
@@ -48,7 +48,7 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/irq.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #include "planb.h"
 #include "saa7196.h"
@@ -329,12 +329,12 @@ static volatile struct dbdma_cmd *cmd_geo_setup(
 
 static inline void planb_lock(struct planb *pb)
 {
-	down(&pb->lock);
+	mutex_lock(&pb->lock);
 }
 
 static inline void planb_unlock(struct planb *pb)
 {
-	up(&pb->lock);
+	mutex_unlock(&pb->lock);
 }
 
 /***************/
@@ -2067,7 +2067,7 @@ static int init_planb(struct planb *pb)
 #endif
 	pb->tab_size = PLANB_MAXLINES + 40;
 	pb->suspend = 0;
-	init_MUTEX(&pb->lock);
+	mutex_init(&pb->lock);
 	pb->ch1_cmd = 0;
 	pb->ch2_cmd = 0;
 	pb->mask = 0;
diff --git a/drivers/media/video/planb.h b/drivers/media/video/planb.h
index 8a0faad16118..79b6b561426e 100644
--- a/drivers/media/video/planb.h
+++ b/drivers/media/video/planb.h
@@ -174,7 +174,7 @@ struct planb {
 	int	user;
 	unsigned int tab_size;
 	int     maxlines;
-	struct semaphore lock;
+	struct mutex lock;
 	unsigned int	irq;			/* interrupt number */
 	volatile unsigned int intr_mask;
 
diff --git a/drivers/media/video/pms.c b/drivers/media/video/pms.c
index 9e6448639480..05ca55939e77 100644
--- a/drivers/media/video/pms.c
+++ b/drivers/media/video/pms.c
@@ -30,6 +30,8 @@
 #include <asm/io.h>
 #include <linux/sched.h>
 #include <linux/videodev.h>
+#include <linux/mutex.h>
+
 #include <asm/uaccess.h>
 
 
@@ -44,7 +46,7 @@ struct pms_device
 	struct video_picture picture;
 	int height;
 	int width;
-	struct semaphore lock;
+	struct mutex lock;
 };
 
 struct i2c_info
@@ -724,10 +726,10 @@ static int pms_do_ioctl(struct inode *inode, struct file *file,
 			struct video_channel *v = arg;
 			if(v->channel<0 || v->channel>3)
 				return -EINVAL;
-			down(&pd->lock);
+			mutex_lock(&pd->lock);
 			pms_videosource(v->channel&1);
 			pms_vcrinput(v->channel>>1);
-			up(&pd->lock);
+			mutex_unlock(&pd->lock);
 			return 0;
 		}
 		case VIDIOCGTUNER:
@@ -761,7 +763,7 @@ static int pms_do_ioctl(struct inode *inode, struct file *file,
 			struct video_tuner *v = arg;
 			if(v->tuner)
 				return -EINVAL;
-			down(&pd->lock);
+			mutex_lock(&pd->lock);
 			switch(v->mode)
 			{
 				case VIDEO_MODE_AUTO:
@@ -785,10 +787,10 @@ static int pms_do_ioctl(struct inode *inode, struct file *file,
 					pms_format(2);
 					break;
 				default:
-					up(&pd->lock);
+					mutex_unlock(&pd->lock);
 					return -EINVAL;
 			}
-			up(&pd->lock);
+			mutex_unlock(&pd->lock);
 			return 0;
 		}
 		case VIDIOCGPICT:
@@ -809,12 +811,12 @@ static int pms_do_ioctl(struct inode *inode, struct file *file,
 			 *	Now load the card.
 			 */
 
-			down(&pd->lock);
+			mutex_lock(&pd->lock);
 			pms_brightness(p->brightness>>8);
 			pms_hue(p->hue>>8);
 			pms_colour(p->colour>>8);
 			pms_contrast(p->contrast>>8);	
-			up(&pd->lock);
+			mutex_unlock(&pd->lock);
 			return 0;
 		}
 		case VIDIOCSWIN:
@@ -830,9 +832,9 @@ static int pms_do_ioctl(struct inode *inode, struct file *file,
 				return -EINVAL;
 			pd->width=vw->width;
 			pd->height=vw->height;
-			down(&pd->lock);
+			mutex_lock(&pd->lock);
 			pms_resolution(pd->width, pd->height);
-			up(&pd->lock);			/* Ok we figured out what to use from our wide choice */
+			mutex_unlock(&pd->lock);			/* Ok we figured out what to use from our wide choice */
 			return 0;
 		}
 		case VIDIOCGWIN:
@@ -872,9 +874,9 @@ static ssize_t pms_read(struct file *file, char __user *buf,
 	struct pms_device *pd=(struct pms_device *)v;
 	int len;
 	
-	down(&pd->lock);
+	mutex_lock(&pd->lock);
 	len=pms_capture(pd, buf, (pd->picture.depth==16)?0:1,count);
-	up(&pd->lock);
+	mutex_unlock(&pd->lock);
 	return len;
 }
 
@@ -1029,7 +1031,7 @@ static int __init init_pms_cards(void)
 		return -ENODEV;
 	}
 	memcpy(&pms_device, &pms_template, sizeof(pms_template));
-	init_MUTEX(&pms_device.lock);
+	mutex_init(&pms_device.lock);
 	pms_device.height=240;
 	pms_device.width=320;
 	pms_swsense(75);
diff --git a/drivers/media/video/saa5246a.c b/drivers/media/video/saa5246a.c
index 2ce010201308..dd830e0e5e96 100644
--- a/drivers/media/video/saa5246a.c
+++ b/drivers/media/video/saa5246a.c
@@ -46,6 +46,8 @@
 #include <linux/i2c.h>
 #include <linux/videotext.h>
 #include <linux/videodev.h>
+#include <linux/mutex.h>
+
 #include "saa5246a.h"
 
 MODULE_AUTHOR("Michael Geng <linux@MichaelGeng.de>");
@@ -57,7 +59,7 @@ struct saa5246a_device
 	u8     pgbuf[NUM_DAUS][VTX_VIRTUALSIZE];
 	int    is_searching[NUM_DAUS];
 	struct i2c_client *client;
-	struct semaphore lock;
+	struct mutex lock;
 };
 
 static struct video_device saa_template;	/* Declared near bottom */
@@ -90,7 +92,7 @@ static int saa5246a_attach(struct i2c_adapter *adap, int addr, int kind)
 		return -ENOMEM;
 	}
 	strlcpy(client->name, IF_NAME, I2C_NAME_SIZE);
-	init_MUTEX(&t->lock);
+	mutex_init(&t->lock);
 
 	/*
 	 *	Now create a video4linux device
@@ -719,9 +721,9 @@ static int saa5246a_ioctl(struct inode *inode, struct file *file,
 	int err;
 
 	cmd = vtx_fix_command(cmd);
-	down(&t->lock);
+	mutex_lock(&t->lock);
 	err = video_usercopy(inode, file, cmd, arg, do_saa5246a_ioctl);
-	up(&t->lock);
+	mutex_unlock(&t->lock);
 	return err;
 }
 
diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c
index 5694eb58c3a1..a9f3cf0b1e3c 100644
--- a/drivers/media/video/saa5249.c
+++ b/drivers/media/video/saa5249.c
@@ -56,6 +56,8 @@
 #include <linux/i2c.h>
 #include <linux/videotext.h>
 #include <linux/videodev.h>
+#include <linux/mutex.h>
+
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
@@ -105,7 +107,7 @@ struct saa5249_device
 	int disp_mode;
 	int virtual_mode;
 	struct i2c_client *client;
-	struct semaphore lock;
+	struct mutex lock;
 };
 
 
@@ -158,7 +160,7 @@ static int saa5249_attach(struct i2c_adapter *adap, int addr, int kind)
 		return -ENOMEM;
 	}
 	strlcpy(client->name, IF_NAME, I2C_NAME_SIZE);
-	init_MUTEX(&t->lock);
+	mutex_init(&t->lock);
 	
 	/*
 	 *	Now create a video4linux device
@@ -619,9 +621,9 @@ static int saa5249_ioctl(struct inode *inode, struct file *file,
 	int err;
 	
 	cmd = vtx_fix_command(cmd);
-	down(&t->lock);
+	mutex_lock(&t->lock);
 	err = video_usercopy(inode,file,cmd,arg,do_saa5249_ioctl);
-	up(&t->lock);
+	mutex_unlock(&t->lock);
 	return err;
 }
 
diff --git a/drivers/media/video/saa7134/saa7134-alsa.c b/drivers/media/video/saa7134/saa7134-alsa.c
index a7a6ab9298a9..d3c7345a5d22 100644
--- a/drivers/media/video/saa7134/saa7134-alsa.c
+++ b/drivers/media/video/saa7134/saa7134-alsa.c
@@ -609,12 +609,12 @@ static int snd_card_saa7134_capture_open(snd_pcm_substream_t * substream)
 	struct saa7134_dev *dev = saa7134->dev;
 	int err;
 
-	down(&dev->dmasound.lock);
+	mutex_lock(&dev->dmasound.lock);
 
 	dev->dmasound.read_count  = 0;
 	dev->dmasound.read_offset = 0;
 
-	up(&dev->dmasound.lock);
+	mutex_unlock(&dev->dmasound.lock);
 
 	pcm = kzalloc(sizeof(*pcm), GFP_KERNEL);
 	if (pcm == NULL)
@@ -932,7 +932,7 @@ static int alsa_card_saa7134_create(struct saa7134_dev *dev, int devnum)
 
 	chip->irq = dev->pci->irq;
 
-	init_MUTEX(&dev->dmasound.lock);
+	mutex_init(&dev->dmasound.lock);
 
 	if ((err = snd_card_saa7134_new_mixer(chip)) < 0)
 		goto __nodev;
diff --git a/drivers/media/video/saa7134/saa7134-core.c b/drivers/media/video/saa7134/saa7134-core.c
index 028904bd94a2..e7c30665739e 100644
--- a/drivers/media/video/saa7134/saa7134-core.c
+++ b/drivers/media/video/saa7134/saa7134-core.c
@@ -613,7 +613,7 @@ static int saa7134_hwinit1(struct saa7134_dev *dev)
 
 	saa_writel(SAA7134_IRQ1, 0);
 	saa_writel(SAA7134_IRQ2, 0);
-	init_MUTEX(&dev->lock);
+	mutex_init(&dev->lock);
 	spin_lock_init(&dev->slock);
 
 	saa7134_track_gpio(dev,"pre-init");
diff --git a/drivers/media/video/saa7134/saa7134-empress.c b/drivers/media/video/saa7134/saa7134-empress.c
index bd4c389d4c37..1d972edb3be6 100644
--- a/drivers/media/video/saa7134/saa7134-empress.c
+++ b/drivers/media/video/saa7134/saa7134-empress.c
@@ -89,7 +89,7 @@ static int ts_open(struct inode *inode, struct file *file)
 
 	dprintk("open minor=%d\n",minor);
 	err = -EBUSY;
-	if (down_trylock(&dev->empress_tsq.lock))
+	if (!mutex_trylock(&dev->empress_tsq.lock))
 		goto done;
 	if (dev->empress_users)
 		goto done_up;
@@ -99,7 +99,7 @@ static int ts_open(struct inode *inode, struct file *file)
 	err = 0;
 
 done_up:
-	up(&dev->empress_tsq.lock);
+	mutex_unlock(&dev->empress_tsq.lock);
 done:
 	return err;
 }
@@ -110,7 +110,7 @@ static int ts_release(struct inode *inode, struct file *file)
 
 	if (dev->empress_tsq.streaming)
 		videobuf_streamoff(&dev->empress_tsq);
-	down(&dev->empress_tsq.lock);
+	mutex_lock(&dev->empress_tsq.lock);
 	if (dev->empress_tsq.reading)
 		videobuf_read_stop(&dev->empress_tsq);
 	videobuf_mmap_free(&dev->empress_tsq);
@@ -119,7 +119,7 @@ static int ts_release(struct inode *inode, struct file *file)
 	/* stop the encoder */
 	ts_reset_encoder(dev);
 
-	up(&dev->empress_tsq.lock);
+	mutex_unlock(&dev->empress_tsq.lock);
 	return 0;
 }
 
diff --git a/drivers/media/video/saa7134/saa7134-oss.c b/drivers/media/video/saa7134/saa7134-oss.c
index 7448e386a804..80e34a5fdcc5 100644
--- a/drivers/media/video/saa7134/saa7134-oss.c
+++ b/drivers/media/video/saa7134/saa7134-oss.c
@@ -254,7 +254,7 @@ static int dsp_open(struct inode *inode, struct file *file)
 	if (NULL == dev)
 		return -ENODEV;
 
-	down(&dev->dmasound.lock);
+	mutex_lock(&dev->dmasound.lock);
 	err = -EBUSY;
 	if (dev->dmasound.users_dsp)
 		goto fail1;
@@ -270,13 +270,13 @@ static int dsp_open(struct inode *inode, struct file *file)
 	if (0 != err)
 		goto fail2;
 
-	up(&dev->dmasound.lock);
+	mutex_unlock(&dev->dmasound.lock);
 	return 0;
 
  fail2:
 	dev->dmasound.users_dsp--;
  fail1:
-	up(&dev->dmasound.lock);
+	mutex_unlock(&dev->dmasound.lock);
 	return err;
 }
 
@@ -284,13 +284,13 @@ static int dsp_release(struct inode *inode, struct file *file)
 {
 	struct saa7134_dev *dev = file->private_data;
 
-	down(&dev->dmasound.lock);
+	mutex_lock(&dev->dmasound.lock);
 	if (dev->dmasound.recording_on)
 		dsp_rec_stop(dev);
 	dsp_buffer_free(dev);
 	dev->dmasound.users_dsp--;
 	file->private_data = NULL;
-	up(&dev->dmasound.lock);
+	mutex_unlock(&dev->dmasound.lock);
 	return 0;
 }
 
@@ -304,7 +304,7 @@ static ssize_t dsp_read(struct file *file, char __user *buffer,
 	int err,ret = 0;
 
 	add_wait_queue(&dev->dmasound.wq, &wait);
-	down(&dev->dmasound.lock);
+	mutex_lock(&dev->dmasound.lock);
 	while (count > 0) {
 		/* wait for data if needed */
 		if (0 == dev->dmasound.read_count) {
@@ -328,12 +328,12 @@ static ssize_t dsp_read(struct file *file, char __user *buffer,
 					ret = -EAGAIN;
 				break;
 			}
-			up(&dev->dmasound.lock);
+			mutex_unlock(&dev->dmasound.lock);
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (0 == dev->dmasound.read_count)
 				schedule();
 			set_current_state(TASK_RUNNING);
-			down(&dev->dmasound.lock);
+			mutex_lock(&dev->dmasound.lock);
 			if (signal_pending(current)) {
 				if (0 == ret)
 					ret = -EINTR;
@@ -362,7 +362,7 @@ static ssize_t dsp_read(struct file *file, char __user *buffer,
 		if (dev->dmasound.read_offset == dev->dmasound.bufsize)
 			dev->dmasound.read_offset = 0;
 	}
-	up(&dev->dmasound.lock);
+	mutex_unlock(&dev->dmasound.lock);
 	remove_wait_queue(&dev->dmasound.wq, &wait);
 	return ret;
 }
@@ -435,13 +435,13 @@ static int dsp_ioctl(struct inode *inode, struct file *file,
 	case SNDCTL_DSP_STEREO:
 		if (get_user(val, p))
 			return -EFAULT;
-		down(&dev->dmasound.lock);
+		mutex_lock(&dev->dmasound.lock);
 		dev->dmasound.channels = val ? 2 : 1;
 		if (dev->dmasound.recording_on) {
 			dsp_rec_stop(dev);
 			dsp_rec_start(dev);
 		}
-		up(&dev->dmasound.lock);
+		mutex_unlock(&dev->dmasound.lock);
 		return put_user(dev->dmasound.channels-1, p);
 
 	case SNDCTL_DSP_CHANNELS:
@@ -449,13 +449,13 @@ static int dsp_ioctl(struct inode *inode, struct file *file,
 			return -EFAULT;
 		if (val != 1 && val != 2)
 			return -EINVAL;
-		down(&dev->dmasound.lock);
+		mutex_lock(&dev->dmasound.lock);
 		dev->dmasound.channels = val;
 		if (dev->dmasound.recording_on) {
 			dsp_rec_stop(dev);
 			dsp_rec_start(dev);
 		}
-		up(&dev->dmasound.lock);
+		mutex_unlock(&dev->dmasound.lock);
 		/* fall through */
 	case SOUND_PCM_READ_CHANNELS:
 		return put_user(dev->dmasound.channels, p);
@@ -478,13 +478,13 @@ static int dsp_ioctl(struct inode *inode, struct file *file,
 		case AFMT_U16_BE:
 		case AFMT_S16_LE:
 		case AFMT_S16_BE:
-			down(&dev->dmasound.lock);
+			mutex_lock(&dev->dmasound.lock);
 			dev->dmasound.afmt = val;
 			if (dev->dmasound.recording_on) {
 				dsp_rec_stop(dev);
 				dsp_rec_start(dev);
 			}
-			up(&dev->dmasound.lock);
+			mutex_unlock(&dev->dmasound.lock);
 			return put_user(dev->dmasound.afmt, p);
 		default:
 			return -EINVAL;
@@ -509,10 +509,10 @@ static int dsp_ioctl(struct inode *inode, struct file *file,
 		return 0;
 
 	case SNDCTL_DSP_RESET:
-		down(&dev->dmasound.lock);
+		mutex_lock(&dev->dmasound.lock);
 		if (dev->dmasound.recording_on)
 			dsp_rec_stop(dev);
-		up(&dev->dmasound.lock);
+		mutex_unlock(&dev->dmasound.lock);
 		return 0;
 	case SNDCTL_DSP_GETBLKSIZE:
 		return put_user(dev->dmasound.blksize, p);
@@ -556,10 +556,10 @@ static unsigned int dsp_poll(struct file *file, struct poll_table_struct *wait)
 	poll_wait(file, &dev->dmasound.wq, wait);
 
 	if (0 == dev->dmasound.read_count) {
-		down(&dev->dmasound.lock);
+		mutex_lock(&dev->dmasound.lock);
 		if (!dev->dmasound.recording_on)
 			dsp_rec_start(dev);
-		up(&dev->dmasound.lock);
+		mutex_unlock(&dev->dmasound.lock);
 	} else
 		mask |= (POLLIN | POLLRDNORM);
 	return mask;
@@ -852,7 +852,7 @@ int saa7134_oss_init1(struct saa7134_dev *dev)
 		return -1;
 
 	/* general */
-	init_MUTEX(&dev->dmasound.lock);
+	mutex_init(&dev->dmasound.lock);
 	init_waitqueue_head(&dev->dmasound.wq);
 
 	switch (dev->pci->device) {
diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c
index e97426bc85df..72f389a51a13 100644
--- a/drivers/media/video/saa7134/saa7134-video.c
+++ b/drivers/media/video/saa7134/saa7134-video.c
@@ -460,17 +460,17 @@ static int res_get(struct saa7134_dev *dev, struct saa7134_fh *fh, unsigned int
 		return 1;
 
 	/* is it free? */
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	if (dev->resources & bit) {
 		/* no, someone else uses it */
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	}
 	/* it's free, grab it */
 	fh->resources  |= bit;
 	dev->resources |= bit;
 	dprintk("res: get %d\n",bit);
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 	return 1;
 }
 
@@ -492,11 +492,11 @@ void res_free(struct saa7134_dev *dev, struct saa7134_fh *fh, unsigned int bits)
 	if ((fh->resources & bits) != bits)
 		BUG();
 
-	down(&dev->lock);
+	mutex_lock(&dev->lock);
 	fh->resources  &= ~bits;
 	dev->resources &= ~bits;
 	dprintk("res: put %d\n",bits);
-	up(&dev->lock);
+	mutex_unlock(&dev->lock);
 }
 
 /* ------------------------------------------------------------------ */
@@ -1340,21 +1340,21 @@ video_poll(struct file *file, struct poll_table_struct *wait)
 		if (!list_empty(&fh->cap.stream))
 			buf = list_entry(fh->cap.stream.next, struct videobuf_buffer, stream);
 	} else {
-		down(&fh->cap.lock);
+		mutex_lock(&fh->cap.lock);
 		if (UNSET == fh->cap.read_off) {
 			/* need to capture a new frame */
 			if (res_locked(fh->dev,RESOURCE_VIDEO)) {
-				up(&fh->cap.lock);
+				mutex_unlock(&fh->cap.lock);
 				return POLLERR;
 			}
 			if (0 != fh->cap.ops->buf_prepare(&fh->cap,fh->cap.read_buf,fh->cap.field)) {
-				up(&fh->cap.lock);
+				mutex_unlock(&fh->cap.lock);
 				return POLLERR;
 			}
 			fh->cap.ops->buf_queue(&fh->cap,fh->cap.read_buf);
 			fh->cap.read_off = 0;
 		}
-		up(&fh->cap.lock);
+		mutex_unlock(&fh->cap.lock);
 		buf = fh->cap.read_buf;
 	}
 
@@ -1561,14 +1561,14 @@ static int saa7134_s_fmt(struct saa7134_dev *dev, struct saa7134_fh *fh,
 		if (0 != err)
 			return err;
 
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		fh->win    = f->fmt.win;
 		fh->nclips = f->fmt.win.clipcount;
 		if (fh->nclips > 8)
 			fh->nclips = 8;
 		if (copy_from_user(fh->clips,f->fmt.win.clips,
 				   sizeof(struct v4l2_clip)*fh->nclips)) {
-			up(&dev->lock);
+			mutex_unlock(&dev->lock);
 			return -EFAULT;
 		}
 
@@ -1578,7 +1578,7 @@ static int saa7134_s_fmt(struct saa7134_dev *dev, struct saa7134_fh *fh,
 			start_preview(dev,fh);
 			spin_unlock_irqrestore(&dev->slock,flags);
 		}
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	case V4L2_BUF_TYPE_VBI_CAPTURE:
 		saa7134_vbi_fmt(dev,f);
@@ -1612,9 +1612,9 @@ int saa7134_common_ioctl(struct saa7134_dev *dev,
 		return get_control(dev,arg);
 	case VIDIOC_S_CTRL:
 	{
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		err = set_control(dev,NULL,arg);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return err;
 	}
 	/* --- input switching --------------------------------------- */
@@ -1664,9 +1664,9 @@ int saa7134_common_ioctl(struct saa7134_dev *dev,
 			return -EINVAL;
 		if (NULL == card_in(dev,*i).name)
 			return -EINVAL;
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		video_mux(dev,*i);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	}
 
@@ -1766,7 +1766,7 @@ static int video_do_ioctl(struct inode *inode, struct file *file,
 		if (i == TVNORMS)
 			return -EINVAL;
 
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		if (res_check(fh, RESOURCE_OVERLAY)) {
 			spin_lock_irqsave(&dev->slock,flags);
 			stop_preview(dev,fh);
@@ -1776,7 +1776,7 @@ static int video_do_ioctl(struct inode *inode, struct file *file,
 		} else
 			set_tvnorm(dev,&tvnorms[i]);
 		saa7134_tvaudio_do_scan(dev);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	}
 
@@ -1909,13 +1909,13 @@ static int video_do_ioctl(struct inode *inode, struct file *file,
 			return -EINVAL;
 		if (1 == fh->radio && V4L2_TUNER_RADIO != f->type)
 			return -EINVAL;
-		down(&dev->lock);
+		mutex_lock(&dev->lock);
 		dev->ctl_freq = f->frequency;
 
 		saa7134_i2c_call_clients(dev,VIDIOC_S_FREQUENCY,f);
 
 		saa7134_tvaudio_do_scan(dev);
-		up(&dev->lock);
+		mutex_unlock(&dev->lock);
 		return 0;
 	}
 
diff --git a/drivers/media/video/saa7134/saa7134.h b/drivers/media/video/saa7134/saa7134.h
index ff39a63c7372..691c10be459d 100644
--- a/drivers/media/video/saa7134/saa7134.h
+++ b/drivers/media/video/saa7134/saa7134.h
@@ -29,6 +29,7 @@
 #include <linux/input.h>
 #include <linux/notifier.h>
 #include <linux/delay.h>
+#include <linux/mutex.h>
 
 #include <asm/io.h>
 
@@ -364,7 +365,7 @@ struct saa7134_fh {
 
 /* dmasound dsp status */
 struct saa7134_dmasound {
-	struct semaphore           lock;
+	struct mutex               lock;
 	int                        minor_mixer;
 	int                        minor_dsp;
 	unsigned int               users_dsp;
@@ -428,7 +429,7 @@ struct saa7134_mpeg_ops {
 /* global device status */
 struct saa7134_dev {
 	struct list_head           devlist;
-	struct semaphore           lock;
+	struct mutex               lock;
 	spinlock_t                 slock;
 #ifdef VIDIOC_G_PRIORITY
 	struct v4l2_prio_state     prio;
diff --git a/drivers/media/video/video-buf-dvb.c b/drivers/media/video/video-buf-dvb.c
index 0a4004a4393c..caf3e7e2f219 100644
--- a/drivers/media/video/video-buf-dvb.c
+++ b/drivers/media/video/video-buf-dvb.c
@@ -96,7 +96,7 @@ static int videobuf_dvb_start_feed(struct dvb_demux_feed *feed)
 	if (!demux->dmx.frontend)
 		return -EINVAL;
 
-	down(&dvb->lock);
+	mutex_lock(&dvb->lock);
 	dvb->nfeeds++;
 	rc = dvb->nfeeds;
 
@@ -110,7 +110,7 @@ static int videobuf_dvb_start_feed(struct dvb_demux_feed *feed)
 	}
 
 out:
-	up(&dvb->lock);
+	mutex_unlock(&dvb->lock);
 	return rc;
 }
 
@@ -120,14 +120,14 @@ static int videobuf_dvb_stop_feed(struct dvb_demux_feed *feed)
 	struct videobuf_dvb *dvb = demux->priv;
 	int err = 0;
 
-	down(&dvb->lock);
+	mutex_lock(&dvb->lock);
 	dvb->nfeeds--;
 	if (0 == dvb->nfeeds  &&  NULL != dvb->thread) {
 		// FIXME: cx8802_cancel_buffers(dev);
 		err = kthread_stop(dvb->thread);
 		dvb->thread = NULL;
 	}
-	up(&dvb->lock);
+	mutex_unlock(&dvb->lock);
 	return err;
 }
 
@@ -139,7 +139,7 @@ int videobuf_dvb_register(struct videobuf_dvb *dvb,
 {
 	int result;
 
-	init_MUTEX(&dvb->lock);
+	mutex_init(&dvb->lock);
 
 	/* register adapter */
 	result = dvb_register_adapter(&dvb->adapter, dvb->name, module);
diff --git a/drivers/media/video/video-buf.c b/drivers/media/video/video-buf.c
index 9ef477523d27..cb1c228e29f8 100644
--- a/drivers/media/video/video-buf.c
+++ b/drivers/media/video/video-buf.c
@@ -385,7 +385,7 @@ void videobuf_queue_init(struct videobuf_queue* q,
 	q->ops     = ops;
 	q->priv_data = priv;
 
-	init_MUTEX(&q->lock);
+	mutex_init(&q->lock);
 	INIT_LIST_HEAD(&q->stream);
 }
 
@@ -549,7 +549,7 @@ videobuf_reqbufs(struct videobuf_queue *q,
 	if (!list_empty(&q->stream))
 		return -EBUSY;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	count = req->count;
 	if (count > VIDEO_MAX_FRAME)
 		count = VIDEO_MAX_FRAME;
@@ -566,7 +566,7 @@ videobuf_reqbufs(struct videobuf_queue *q,
 	req->count = count;
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
@@ -592,7 +592,7 @@ videobuf_qbuf(struct videobuf_queue *q,
 	unsigned long flags;
 	int retval;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	retval = -EBUSY;
 	if (q->reading)
 		goto done;
@@ -652,7 +652,7 @@ videobuf_qbuf(struct videobuf_queue *q,
 	retval = 0;
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
@@ -663,7 +663,7 @@ videobuf_dqbuf(struct videobuf_queue *q,
 	struct videobuf_buffer *buf;
 	int retval;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	retval = -EBUSY;
 	if (q->reading)
 		goto done;
@@ -693,7 +693,7 @@ videobuf_dqbuf(struct videobuf_queue *q,
 	videobuf_status(b,buf,q->type);
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
@@ -704,7 +704,7 @@ int videobuf_streamon(struct videobuf_queue *q)
 	unsigned long flags;
 	int retval;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	retval = -EBUSY;
 	if (q->reading)
 		goto done;
@@ -721,7 +721,7 @@ int videobuf_streamon(struct videobuf_queue *q)
 	spin_unlock_irqrestore(q->irqlock,flags);
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
@@ -729,7 +729,7 @@ int videobuf_streamoff(struct videobuf_queue *q)
 {
 	int retval = -EINVAL;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	if (!q->streaming)
 		goto done;
 	videobuf_queue_cancel(q);
@@ -737,7 +737,7 @@ int videobuf_streamoff(struct videobuf_queue *q)
 	retval = 0;
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
@@ -792,7 +792,7 @@ ssize_t videobuf_read_one(struct videobuf_queue *q,
 	unsigned size, nbufs, bytes;
 	int retval;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 
 	nbufs = 1; size = 0;
 	q->ops->buf_setup(q,&nbufs,&size);
@@ -860,7 +860,7 @@ ssize_t videobuf_read_one(struct videobuf_queue *q,
 	}
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
@@ -922,7 +922,7 @@ ssize_t videobuf_read_stream(struct videobuf_queue *q,
 	unsigned long flags;
 
 	dprintk(2,"%s\n",__FUNCTION__);
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	retval = -EBUSY;
 	if (q->streaming)
 		goto done;
@@ -996,7 +996,7 @@ ssize_t videobuf_read_stream(struct videobuf_queue *q,
 	}
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
@@ -1007,7 +1007,7 @@ unsigned int videobuf_poll_stream(struct file *file,
 	struct videobuf_buffer *buf = NULL;
 	unsigned int rc = 0;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	if (q->streaming) {
 		if (!list_empty(&q->stream))
 			buf = list_entry(q->stream.next,
@@ -1035,7 +1035,7 @@ unsigned int videobuf_poll_stream(struct file *file,
 		    buf->state == STATE_ERROR)
 			rc = POLLIN|POLLRDNORM;
 	}
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return rc;
 }
 
@@ -1064,7 +1064,7 @@ videobuf_vm_close(struct vm_area_struct *vma)
 	map->count--;
 	if (0 == map->count) {
 		dprintk(1,"munmap %p q=%p\n",map,q);
-		down(&q->lock);
+		mutex_lock(&q->lock);
 		for (i = 0; i < VIDEO_MAX_FRAME; i++) {
 			if (NULL == q->bufs[i])
 				continue;
@@ -1076,7 +1076,7 @@ videobuf_vm_close(struct vm_area_struct *vma)
 			q->bufs[i]->baddr = 0;
 			q->ops->buf_release(q,q->bufs[i]);
 		}
-		up(&q->lock);
+		mutex_unlock(&q->lock);
 		kfree(map);
 	}
 	return;
@@ -1170,7 +1170,7 @@ int videobuf_mmap_mapper(struct videobuf_queue *q,
 	unsigned int first,last,size,i;
 	int retval;
 
-	down(&q->lock);
+	mutex_lock(&q->lock);
 	retval = -EINVAL;
 	if (!(vma->vm_flags & VM_WRITE)) {
 		dprintk(1,"mmap app bug: PROT_WRITE please\n");
@@ -1238,7 +1238,7 @@ int videobuf_mmap_mapper(struct videobuf_queue *q,
 	retval = 0;
 
  done:
-	up(&q->lock);
+	mutex_unlock(&q->lock);
 	return retval;
 }
 
diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 908fbec776ac..75e3d41382f2 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -224,13 +224,13 @@ int video_exclusive_open(struct inode *inode, struct file *file)
 	struct  video_device *vfl = video_devdata(file);
 	int retval = 0;
 
-	down(&vfl->lock);
+	mutex_lock(&vfl->lock);
 	if (vfl->users) {
 		retval = -EBUSY;
 	} else {
 		vfl->users++;
 	}
-	up(&vfl->lock);
+	mutex_unlock(&vfl->lock);
 	return retval;
 }
 
@@ -328,7 +328,7 @@ int video_register_device(struct video_device *vfd, int type, int nr)
 	sprintf(vfd->devfs_name, "v4l/%s%d", name_base, i - base);
 	devfs_mk_cdev(MKDEV(VIDEO_MAJOR, vfd->minor),
 			S_IFCHR | S_IRUSR | S_IWUSR, vfd->devfs_name);
-	init_MUTEX(&vfd->lock);
+	mutex_init(&vfd->lock);
 
 	/* sysfs class */
 	memset(&vfd->class_dev, 0x00, sizeof(vfd->class_dev));
diff --git a/drivers/media/video/vino.c b/drivers/media/video/vino.c
index c8fd8238904d..0229819d0aac 100644
--- a/drivers/media/video/vino.c
+++ b/drivers/media/video/vino.c
@@ -42,6 +42,7 @@
 #include <linux/videodev.h>
 #include <linux/videodev2.h>
 #include <linux/video_decoder.h>
+#include <linux/mutex.h>
 
 #include <asm/paccess.h>
 #include <asm/io.h>
@@ -245,7 +246,7 @@ struct vino_framebuffer_queue {
 	struct vino_framebuffer *buffer[VINO_FRAMEBUFFER_COUNT_MAX];
 
 	spinlock_t queue_lock;
-	struct semaphore queue_sem;
+	struct mutex queue_mutex;
 	wait_queue_head_t frame_wait_queue;
 };
 
@@ -283,7 +284,7 @@ struct vino_channel_settings {
 	/* the driver is currently processing the queue */
 	int capturing;
 
-	struct semaphore sem;
+	struct mutex mutex;
 	spinlock_t capture_lock;
 
 	unsigned int users;
@@ -1131,11 +1132,11 @@ static void vino_queue_free(struct vino_framebuffer_queue *q)
 	if (q->type != VINO_MEMORY_MMAP)
 		return;
 
-	down(&q->queue_sem);
+	mutex_lock(&q->queue_mutex);
 
 	vino_queue_free_with_count(q, q->length);
 
-	up(&q->queue_sem);
+	mutex_unlock(&q->queue_mutex);
 }
 
 static int vino_queue_init(struct vino_framebuffer_queue *q,
@@ -1159,7 +1160,7 @@ static int vino_queue_init(struct vino_framebuffer_queue *q,
 	if (*length < 1)
 		return -EINVAL;
 
-	down(&q->queue_sem);
+	mutex_lock(&q->queue_mutex);
 
 	if (*length > VINO_FRAMEBUFFER_COUNT_MAX)
 		*length = VINO_FRAMEBUFFER_COUNT_MAX;
@@ -1211,7 +1212,7 @@ static int vino_queue_init(struct vino_framebuffer_queue *q,
 		q->magic = VINO_QUEUE_MAGIC;
 	}
 
-	up(&q->queue_sem);
+	mutex_unlock(&q->queue_mutex);
 
 	return ret;
 }
@@ -4045,7 +4046,7 @@ static int vino_open(struct inode *inode, struct file *file)
 	dprintk("open(): channel = %c\n",
 	       (vcs->channel == VINO_CHANNEL_A) ? 'A' : 'B');
 
-	down(&vcs->sem);
+	mutex_lock(&vcs->mutex);
 
 	if (vcs->users) {
 		dprintk("open(): driver busy\n");
@@ -4062,7 +4063,7 @@ static int vino_open(struct inode *inode, struct file *file)
 	vcs->users++;
 
  out:
-	up(&vcs->sem);
+	mutex_unlock(&vcs->mutex);
 
 	dprintk("open(): %s!\n", ret ? "failed" : "complete");
 
@@ -4075,7 +4076,7 @@ static int vino_close(struct inode *inode, struct file *file)
 	struct vino_channel_settings *vcs = video_get_drvdata(dev);
 	dprintk("close():\n");
 
-	down(&vcs->sem);
+	mutex_lock(&vcs->mutex);
 
 	vcs->users--;
 
@@ -4087,7 +4088,7 @@ static int vino_close(struct inode *inode, struct file *file)
 		vino_queue_free(&vcs->fb_queue);
 	}
 
-	up(&vcs->sem);
+	mutex_unlock(&vcs->mutex);
 
 	return 0;
 }
@@ -4130,7 +4131,7 @@ static int vino_mmap(struct file *file, struct vm_area_struct *vma)
 
 	// TODO: reject mmap if already mapped
 
-	if (down_interruptible(&vcs->sem))
+	if (mutex_lock_interruptible(&vcs->mutex))
 		return -EINTR;
 
 	if (vcs->reading) {
@@ -4214,7 +4215,7 @@ found:
 	vma->vm_ops = &vino_vm_ops;
 
 out:
-	up(&vcs->sem);
+	mutex_unlock(&vcs->mutex);
 
 	return ret;
 }
@@ -4374,12 +4375,12 @@ static int vino_ioctl(struct inode *inode, struct file *file,
 	struct vino_channel_settings *vcs = video_get_drvdata(dev);
 	int ret;
 
-	if (down_interruptible(&vcs->sem))
+	if (mutex_lock_interruptible(&vcs->mutex))
 		return -EINTR;
 
 	ret = video_usercopy(inode, file, cmd, arg, vino_do_ioctl);
 
-	up(&vcs->sem);
+	mutex_unlock(&vcs->mutex);
 
 	return ret;
 }
@@ -4564,10 +4565,10 @@ static int vino_init_channel_settings(struct vino_channel_settings *vcs,
 
 	vcs->capturing = 0;
 
-	init_MUTEX(&vcs->sem);
+	mutex_init(&vcs->mutex);
 	spin_lock_init(&vcs->capture_lock);
 
-	init_MUTEX(&vcs->fb_queue.queue_sem);
+	mutex_init(&vcs->fb_queue.queue_mutex);
 	spin_lock_init(&vcs->fb_queue.queue_lock);
 	init_waitqueue_head(&vcs->fb_queue.frame_wait_queue);
 
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 965c8902fe60..1dd8efeff35a 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -17,6 +17,7 @@
 #include <linux/time.h> /* need struct timeval */
 #include <linux/poll.h>
 #include <linux/device.h>
+#include <linux/mutex.h>
 #endif
 #include <linux/compiler.h> /* need __user */
 
@@ -90,7 +91,7 @@ struct video_device
 
 	/* for videodev.c intenal usage -- please don't touch */
 	int users;                     /* video_exclusive_{open|close} ... */
-	struct semaphore lock;         /* ... helper function uses these   */
+	struct mutex lock;             /* ... helper function uses these   */
 	char devfs_name[64];           /* devfs */
 	struct class_device class_dev; /* sysfs */
 };
diff --git a/include/media/saa7146.h b/include/media/saa7146.h
index 2bc634fcb7bb..fee579f10b32 100644
--- a/include/media/saa7146.h
+++ b/include/media/saa7146.h
@@ -11,6 +11,8 @@
 #include <linux/i2c.h>		/* for i2c subsystem */
 #include <asm/io.h>		/* for accessing devices */
 #include <linux/stringify.h>
+#include <linux/mutex.h>
+
 #include <linux/vmalloc.h>	/* for vmalloc() */
 #include <linux/mm.h>		/* for vmalloc_to_page() */
 
@@ -112,7 +114,7 @@ struct saa7146_dev
 
 	/* different device locks */
 	spinlock_t			slock;
-	struct semaphore		lock;
+	struct mutex			lock;
 
 	unsigned char			__iomem *mem;		/* pointer to mapped IO memory */
 	int				revision;	/* chip revision; needed for bug-workarounds*/
@@ -133,15 +135,16 @@ struct saa7146_dev
 	void (*vv_callback)(struct saa7146_dev *dev, unsigned long status);
 
 	/* i2c-stuff */
-	struct semaphore	i2c_lock;
-	u32			i2c_bitrate;
-	struct saa7146_dma	d_i2c;	/* pointer to i2c memory */
-	wait_queue_head_t	i2c_wq;
-	int			i2c_op;
+	struct mutex			i2c_lock;
+
+	u32				i2c_bitrate;
+	struct saa7146_dma		d_i2c;	/* pointer to i2c memory */
+	wait_queue_head_t		i2c_wq;
+	int				i2c_op;
 
 	/* memories */
-	struct saa7146_dma	d_rps0;
-	struct saa7146_dma	d_rps1;
+	struct saa7146_dma		d_rps0;
+	struct saa7146_dma		d_rps1;
 };
 
 /* from saa7146_i2c.c */
@@ -150,7 +153,7 @@ int saa7146_i2c_transfer(struct saa7146_dev *saa, const struct i2c_msg *msgs, in
 
 /* from saa7146_core.c */
 extern struct list_head saa7146_devices;
-extern struct semaphore saa7146_devices_lock;
+extern struct mutex saa7146_devices_lock;
 int saa7146_register_extension(struct saa7146_extension*);
 int saa7146_unregister_extension(struct saa7146_extension*);
 struct saa7146_format* format_by_fourcc(struct saa7146_dev *dev, int fourcc);
diff --git a/include/media/video-buf-dvb.h b/include/media/video-buf-dvb.h
index ad0a07a3a895..b78d90fe629f 100644
--- a/include/media/video-buf-dvb.h
+++ b/include/media/video-buf-dvb.h
@@ -11,7 +11,7 @@ struct videobuf_dvb {
 	struct videobuf_queue      dvbq;
 
 	/* video-buf-dvb state info */
-	struct semaphore           lock;
+	struct mutex               lock;
 	struct task_struct         *thread;
 	int                        nfeeds;
 
diff --git a/include/media/video-buf.h b/include/media/video-buf.h
index 8ecfd78e0027..d90dec5484ee 100644
--- a/include/media/video-buf.h
+++ b/include/media/video-buf.h
@@ -177,7 +177,7 @@ struct videobuf_queue_ops {
 };
 
 struct videobuf_queue {
-	struct semaphore           lock;
+	struct mutex               lock;
 	spinlock_t                 *irqlock;
 	struct pci_dev             *pci;
 
-- 
cgit v1.2.3


From dde44589bf9fac0168c6ce6d097c99c33b18074f Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Feb 2006 00:56:10 +0900
Subject: [PATCH] libata: implement ATA_FLAG_IN_EH port flag

ATA_FLAG_IN_EH flag is set on entry to EH and cleared on completion.
This patch just sets and clears the flag.  Following patches will
build normal qc execution / EH synchronization aroung this flag.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-scsi.c | 9 +++++++++
 include/linux/libata.h     | 2 ++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 2d328b3af067..3a4f40b251fb 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -735,6 +735,11 @@ int ata_scsi_error(struct Scsi_Host *host)
 
 	DPRINTK("ENTER\n");
 
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	assert(!(ap->flags & ATA_FLAG_IN_EH));
+	ap->flags |= ATA_FLAG_IN_EH;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
 	ap = (struct ata_port *) &host->hostdata[0];
 	ap->ops->eng_timeout(ap);
 
@@ -742,6 +747,10 @@ int ata_scsi_error(struct Scsi_Host *host)
 
 	scsi_eh_flush_done_q(&ap->eh_done_q);
 
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->flags &= ~ATA_FLAG_IN_EH;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
 	DPRINTK("EXIT\n");
 	return 0;
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 474cdfa35d1e..55176df403a5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -162,6 +162,8 @@ enum {
 	ATA_FLAG_PIO_LBA48	= (1 << 13), /* Host DMA engine is LBA28 only */
 	ATA_FLAG_IRQ_MASK	= (1 << 14), /* Mask IRQ in PIO xfers */
 
+	ATA_FLAG_IN_EH		= (1 << 15), /* EH in progress */
+
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
 	ATA_QCFLAG_SINGLE	= (1 << 4), /* no s/g, just a single buffer */
-- 
cgit v1.2.3


From c18d06f89fd09ee0059c4899e615c471d59af66a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Feb 2006 00:56:10 +0900
Subject: [PATCH] libata: EH / pio tasks synchronization

This patch makes sure that pio tasks are flushed before proceeding
with EH.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 56 +++++++++++++++++++++++++++++++++++++++++++---
 include/linux/libata.h     |  3 ++-
 2 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 249e67fab81f..9a785cf0c5b1 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1074,19 +1074,66 @@ static unsigned int ata_pio_modes(const struct ata_device *adev)
 static inline void
 ata_queue_packet_task(struct ata_port *ap)
 {
-	queue_work(ata_wq, &ap->packet_task);
+	if (!(ap->flags & ATA_FLAG_FLUSH_PIO_TASK))
+		queue_work(ata_wq, &ap->packet_task);
 }
 
 static inline void
 ata_queue_pio_task(struct ata_port *ap)
 {
-	queue_work(ata_wq, &ap->pio_task);
+	if (!(ap->flags & ATA_FLAG_FLUSH_PIO_TASK))
+		queue_work(ata_wq, &ap->pio_task);
 }
 
 static inline void
 ata_queue_delayed_pio_task(struct ata_port *ap, unsigned long delay)
 {
-	queue_delayed_work(ata_wq, &ap->pio_task, delay);
+	if (!(ap->flags & ATA_FLAG_FLUSH_PIO_TASK))
+		queue_delayed_work(ata_wq, &ap->pio_task, delay);
+}
+
+/**
+ *	ata_flush_pio_tasks - Flush pio_task and packet_task
+ *	@ap: the target ata_port
+ *
+ *	After this function completes, pio_task and packet_task are
+ *	guranteed not to be running or scheduled.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ */
+
+static void ata_flush_pio_tasks(struct ata_port *ap)
+{
+	int tmp = 0;
+	unsigned long flags;
+
+	DPRINTK("ENTER\n");
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->flags |= ATA_FLAG_FLUSH_PIO_TASK;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	DPRINTK("flush #1\n");
+	flush_workqueue(ata_wq);
+
+	/*
+	 * At this point, if a task is running, it's guaranteed to see
+	 * the FLUSH flag; thus, it will never queue pio tasks again.
+	 * Cancel and flush.
+	 */
+	tmp |= cancel_delayed_work(&ap->pio_task);
+	tmp |= cancel_delayed_work(&ap->packet_task);
+	if (!tmp) {
+		DPRINTK("flush #2\n");
+		flush_workqueue(ata_wq);
+	}
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->flags &= ~ATA_FLAG_FLUSH_PIO_TASK;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	DPRINTK("EXIT\n");
 }
 
 void ata_qc_complete_internal(struct ata_queued_cmd *qc)
@@ -3767,6 +3814,9 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 
 	DPRINTK("ENTER\n");
 
+	ata_flush_pio_tasks(ap);
+	ap->hsm_task_state = HSM_ST_IDLE;
+
 	spin_lock_irqsave(&host_set->lock, flags);
 
 	switch (qc->tf.protocol) {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 55176df403a5..f4cd1eb734a0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -162,7 +162,8 @@ enum {
 	ATA_FLAG_PIO_LBA48	= (1 << 13), /* Host DMA engine is LBA28 only */
 	ATA_FLAG_IRQ_MASK	= (1 << 14), /* Mask IRQ in PIO xfers */
 
-	ATA_FLAG_IN_EH		= (1 << 15), /* EH in progress */
+	ATA_FLAG_FLUSH_PIO_TASK	= (1 << 15), /* Flush PIO task */
+	ATA_FLAG_IN_EH		= (1 << 16), /* EH in progress */
 
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
-- 
cgit v1.2.3


From 7944ea9522ce0ea32d57894b3dc2540b0bdca66e Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Feb 2006 18:20:00 +0900
Subject: [PATCH] libata: add probeinit component operation to
 ata_drive_probe_reset()

This patch adds probeinit component operation to
ata_drive_probe_reset().  If present, this new operation is called
before performing any reset.  The operations's roll is to prepare @ap
for following probe-reset operations.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 9 +++++++--
 include/linux/libata.h     | 2 ++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 99f4bf6022ee..c36c5a9a4617 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2485,7 +2485,8 @@ int ata_std_probe_reset(struct ata_port *ap, unsigned int *classes)
 	if (ap->flags & ATA_FLAG_SATA && ap->ops->scr_read)
 		hardreset = sata_std_hardreset;
 
-	return ata_drive_probe_reset(ap, ata_std_softreset, hardreset,
+	return ata_drive_probe_reset(ap, NULL,
+				     ata_std_softreset, hardreset,
 				     ata_std_postreset, classes);
 }
 
@@ -2524,6 +2525,7 @@ static int do_probe_reset(struct ata_port *ap, ata_reset_fn_t reset,
 /**
  *	ata_drive_probe_reset - Perform probe reset with given methods
  *	@ap: port to reset
+ *	@probeinit: probeinit method (can be NULL)
  *	@softreset: softreset method (can be NULL)
  *	@hardreset: hardreset method (can be NULL)
  *	@postreset: postreset method (can be NULL)
@@ -2552,12 +2554,15 @@ static int do_probe_reset(struct ata_port *ap, ata_reset_fn_t reset,
  *	if classification fails, and any error code from reset
  *	methods.
  */
-int ata_drive_probe_reset(struct ata_port *ap,
+int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 			  ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			  ata_postreset_fn_t postreset, unsigned int *classes)
 {
 	int rc = -EINVAL;
 
+	if (probeinit)
+		probeinit(ap);
+
 	if (softreset) {
 		rc = do_probe_reset(ap, softreset, postreset, classes);
 		if (rc == 0)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index f4cd1eb734a0..e8f29cefc351 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -244,6 +244,7 @@ struct ata_queued_cmd;
 
 /* typedefs */
 typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc);
+typedef void (*ata_probeinit_fn_t)(struct ata_port *);
 typedef int (*ata_reset_fn_t)(struct ata_port *, int, unsigned int *);
 typedef void (*ata_postreset_fn_t)(struct ata_port *ap, unsigned int *);
 
@@ -484,6 +485,7 @@ extern void __sata_phy_reset(struct ata_port *ap);
 extern void sata_phy_reset(struct ata_port *ap);
 extern void ata_bus_reset(struct ata_port *ap);
 extern int ata_drive_probe_reset(struct ata_port *ap,
+			ata_probeinit_fn_t probeinit,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			ata_postreset_fn_t postreset, unsigned int *classes);
 extern int ata_std_softreset(struct ata_port *ap, int verbose,
-- 
cgit v1.2.3


From 8a19ac89edbe9b702c10fd2039b8cb2db4644a5f Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Feb 2006 18:20:00 +0900
Subject: [PATCH] libata: implement ata_std_probeinit()

This patch implements the off-the-shelf probeinit component operation.
Currently, all it does is waking up the PHY if it's a SATA port.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 16 +++++++++++++++-
 include/linux/libata.h     |  1 +
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index c36c5a9a4617..14cdbb336dd5 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2295,6 +2295,19 @@ static int sata_phy_resume(struct ata_port *ap)
 	return -1;
 }
 
+/**
+ *	ata_std_probeinit - initialize probing
+ *	@ap: port to be probed
+ *
+ *	@ap is about to be probed.  Initialize it.  This function is
+ *	to be used as standard callback for ata_drive_probe_reset().
+ */
+extern void ata_std_probeinit(struct ata_port *ap)
+{
+	if (ap->flags & ATA_FLAG_SATA && ap->ops->scr_read)
+		sata_phy_resume(ap);
+}
+
 /**
  *	ata_std_softreset - reset host port via ATA SRST
  *	@ap: port to reset
@@ -2485,7 +2498,7 @@ int ata_std_probe_reset(struct ata_port *ap, unsigned int *classes)
 	if (ap->flags & ATA_FLAG_SATA && ap->ops->scr_read)
 		hardreset = sata_std_hardreset;
 
-	return ata_drive_probe_reset(ap, NULL,
+	return ata_drive_probe_reset(ap, ata_std_probeinit,
 				     ata_std_softreset, hardreset,
 				     ata_std_postreset, classes);
 }
@@ -5535,6 +5548,7 @@ EXPORT_SYMBOL_GPL(ata_port_probe);
 EXPORT_SYMBOL_GPL(sata_phy_reset);
 EXPORT_SYMBOL_GPL(__sata_phy_reset);
 EXPORT_SYMBOL_GPL(ata_bus_reset);
+EXPORT_SYMBOL_GPL(ata_std_probeinit);
 EXPORT_SYMBOL_GPL(ata_std_softreset);
 EXPORT_SYMBOL_GPL(sata_std_hardreset);
 EXPORT_SYMBOL_GPL(ata_std_postreset);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e8f29cefc351..68b3fe6f9a4d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -488,6 +488,7 @@ extern int ata_drive_probe_reset(struct ata_port *ap,
 			ata_probeinit_fn_t probeinit,
 			ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
 			ata_postreset_fn_t postreset, unsigned int *classes);
+extern void ata_std_probeinit(struct ata_port *ap);
 extern int ata_std_softreset(struct ata_port *ap, int verbose,
 			     unsigned int *classes);
 extern int sata_std_hardreset(struct ata_port *ap, int verbose,
-- 
cgit v1.2.3


From 341963b909a01d2f38d86f5db8dd1f8c80bd6dbf Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 10 Feb 2006 15:10:48 +0900
Subject: [PATCH] libata: add ATA_QCFLAG_EH_SCHEDULED

Add ATA_QCFLAG_EH_SCHEDULED.  If this flag is set, the qc is owned by
EH and normal completion path is not allowed to finish it.  This patch
doesn't actually use this flag.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 33 ++++++++++++++++++++-------------
 drivers/scsi/libata-scsi.c |  2 +-
 drivers/scsi/libata.h      |  1 +
 include/linux/libata.h     |  1 +
 4 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 22db73932253..977a53dd1677 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3620,19 +3620,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
 	}
 }
 
-/**
- *	ata_qc_complete - Complete an active ATA command
- *	@qc: Command to complete
- *	@err_mask: ATA Status register contents
- *
- *	Indicate to the mid and upper layers that an ATA
- *	command has completed, with either an ok or not-ok status.
- *
- *	LOCKING:
- *	spin_lock_irqsave(host_set lock)
- */
-
-void ata_qc_complete(struct ata_queued_cmd *qc)
+inline void __ata_qc_complete(struct ata_queued_cmd *qc)
 {
 	assert(qc != NULL);	/* ata_qc_from_tag _might_ return NULL */
 	assert(qc->flags & ATA_QCFLAG_ACTIVE);
@@ -3650,6 +3638,25 @@ void ata_qc_complete(struct ata_queued_cmd *qc)
 	qc->complete_fn(qc);
 }
 
+/**
+ *	ata_qc_complete - Complete an active ATA command
+ *	@qc: Command to complete
+ *	@err_mask: ATA Status register contents
+ *
+ *	Indicate to the mid and upper layers that an ATA
+ *	command has completed, with either an ok or not-ok status.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ */
+void ata_qc_complete(struct ata_queued_cmd *qc)
+{
+	if (unlikely(qc->flags & ATA_QCFLAG_EH_SCHEDULED))
+		return;
+
+	__ata_qc_complete(qc);
+}
+
 static inline int ata_should_dma_map(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index b007bb409382..1df468eb2bf3 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -770,7 +770,7 @@ static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
 	qc->scsidone = ata_eh_scsidone;
-	ata_qc_complete(qc);
+	__ata_qc_complete(qc);
 	assert(!ata_tag_valid(qc->tag));
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 9d76923a2253..1cd071a32e93 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -46,6 +46,7 @@ extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern unsigned int ata_qc_issue(struct ata_queued_cmd *qc);
+extern void __ata_qc_complete(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
 extern void ata_dev_select(struct ata_port *ap, unsigned int device,
                            unsigned int wait, unsigned int can_sleep);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 68b3fe6f9a4d..5c70a57f93ee 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -169,6 +169,7 @@ enum {
 	ATA_QCFLAG_SG		= (1 << 3), /* have s/g table? */
 	ATA_QCFLAG_SINGLE	= (1 << 4), /* no s/g, just a single buffer */
 	ATA_QCFLAG_DMAMAP	= ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
+	ATA_QCFLAG_EH_SCHEDULED = (1 << 5), /* EH scheduled */
 
 	/* various lengths of time */
 	ATA_TMOUT_EDD		= 5 * HZ,	/* heuristic */
-- 
cgit v1.2.3


From f29841e08fa20a7f2c8bc1b70306975299c66ee7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 10 Feb 2006 15:10:48 +0900
Subject: [PATCH] libata: implement ata_scsi_timed_out()

Implement ata_scsi_timed_out(), to be used as
scsi_host_template->eh_timed_out callback for all libata drivers.
Without this function, the following race exists.

If a qc completes after SCSI timer expires but before libata EH kicks
in, the qc gets completed but the scsicmd still gets passed to libata
EH resulting in ->eng_timeout invocation with NULL qc, which none is
handling properly.

This patch makes sure that scmd and qc share the same lifetime.
Original idea from Jeff Garzik <jgarzik@pobox.com>.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c |  1 +
 drivers/scsi/libata-scsi.c | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |  1 +
 3 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 977a53dd1677..d53e0bce2bec 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4913,6 +4913,7 @@ EXPORT_SYMBOL_GPL(ata_ratelimit);
 EXPORT_SYMBOL_GPL(ata_busy_sleep);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
 EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
+EXPORT_SYMBOL_GPL(ata_scsi_timed_out);
 EXPORT_SYMBOL_GPL(ata_scsi_error);
 EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
 EXPORT_SYMBOL_GPL(ata_scsi_release);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 1df468eb2bf3..d67cc2fb5694 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -716,6 +716,47 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
 	return 0;	/* scsi layer doesn't check return value, sigh */
 }
 
+/**
+ *	ata_scsi_timed_out - SCSI layer time out callback
+ *	@cmd: timed out SCSI command
+ *
+ *	Handles SCSI layer timeout.  We race with normal completion of
+ *	the qc for @cmd.  If the qc is already gone, we lose and let
+ *	the scsi command finish (EH_HANDLED).  Otherwise, the qc has
+ *	timed out and EH should be invoked.  Prevent ata_qc_complete()
+ *	from finishing it by setting EH_SCHEDULED and return
+ *	EH_NOT_HANDLED.
+ *
+ *	LOCKING:
+ *	Called from timer context
+ *
+ *	RETURNS:
+ *	EH_HANDLED or EH_NOT_HANDLED
+ */
+enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
+{
+	struct Scsi_Host *host = cmd->device->host;
+	struct ata_port *ap = (struct ata_port *) &host->hostdata[0];
+	unsigned long flags;
+	struct ata_queued_cmd *qc;
+	enum scsi_eh_timer_return ret = EH_HANDLED;
+
+	DPRINTK("ENTER\n");
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	qc = ata_qc_from_tag(ap, ap->active_tag);
+	if (qc) {
+		assert(qc->scsicmd == cmd);
+		qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
+		qc->err_mask |= AC_ERR_TIMEOUT;
+		ret = EH_NOT_HANDLED;
+	}
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	DPRINTK("EXIT, ret=%d\n", ret);
+	return ret;
+}
+
 /**
  *	ata_scsi_error - SCSI layer error handler callback
  *	@host: SCSI host on which error occurred
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5c70a57f93ee..c1e198655bb1 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -509,6 +509,7 @@ extern void ata_host_set_remove(struct ata_host_set *host_set);
 extern int ata_scsi_detect(struct scsi_host_template *sht);
 extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
 extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *));
+extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern int ata_scsi_error(struct Scsi_Host *host);
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 76014427e60f7ecfdc689dfbcb48e9760e1da4fb Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sat, 11 Feb 2006 15:13:49 +0900
Subject: [PATCH] libata: inline ata_qc_complete()

This patch inlines ata_qc_complete() and uninlines __ata_qc_complete()
as suggested by Jeff Garzik.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 23 ++---------------------
 drivers/scsi/libata.h      |  1 -
 include/linux/libata.h     | 20 +++++++++++++++++++-
 3 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 38e72c1dd689..fffbaa9dae76 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3621,7 +3621,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
 	}
 }
 
-inline void __ata_qc_complete(struct ata_queued_cmd *qc)
+void __ata_qc_complete(struct ata_queued_cmd *qc)
 {
 	assert(qc != NULL);	/* ata_qc_from_tag _might_ return NULL */
 	assert(qc->flags & ATA_QCFLAG_ACTIVE);
@@ -3639,25 +3639,6 @@ inline void __ata_qc_complete(struct ata_queued_cmd *qc)
 	qc->complete_fn(qc);
 }
 
-/**
- *	ata_qc_complete - Complete an active ATA command
- *	@qc: Command to complete
- *	@err_mask: ATA Status register contents
- *
- *	Indicate to the mid and upper layers that an ATA
- *	command has completed, with either an ok or not-ok status.
- *
- *	LOCKING:
- *	spin_lock_irqsave(host_set lock)
- */
-void ata_qc_complete(struct ata_queued_cmd *qc)
-{
-	if (unlikely(qc->flags & ATA_QCFLAG_EH_SCHEDULED))
-		return;
-
-	__ata_qc_complete(qc);
-}
-
 static inline int ata_should_dma_map(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
@@ -4877,7 +4858,7 @@ EXPORT_SYMBOL_GPL(ata_device_add);
 EXPORT_SYMBOL_GPL(ata_host_set_remove);
 EXPORT_SYMBOL_GPL(ata_sg_init);
 EXPORT_SYMBOL_GPL(ata_sg_init_one);
-EXPORT_SYMBOL_GPL(ata_qc_complete);
+EXPORT_SYMBOL_GPL(__ata_qc_complete);
 EXPORT_SYMBOL_GPL(ata_qc_issue_prot);
 EXPORT_SYMBOL_GPL(ata_eng_timeout);
 EXPORT_SYMBOL_GPL(ata_tf_load);
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 1cd071a32e93..9d76923a2253 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -46,7 +46,6 @@ extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern unsigned int ata_qc_issue(struct ata_queued_cmd *qc);
-extern void __ata_qc_complete(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
 extern void ata_dev_select(struct ata_port *ap, unsigned int device,
                            unsigned int wait, unsigned int can_sleep);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c1e198655bb1..695d9ae6ec03 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -556,7 +556,7 @@ extern void ata_bmdma_start (struct ata_queued_cmd *qc);
 extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
 extern u8   ata_bmdma_status(struct ata_port *ap);
 extern void ata_bmdma_irq_clear(struct ata_port *ap);
-extern void ata_qc_complete(struct ata_queued_cmd *qc);
+extern void __ata_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eng_timeout(struct ata_port *ap);
 extern void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
 			      struct scsi_cmnd *cmd,
@@ -756,6 +756,24 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
 	ata_tf_init(qc->ap, &qc->tf, qc->dev->devno);
 }
 
+/**
+ *	ata_qc_complete - Complete an active ATA command
+ *	@qc: Command to complete
+ *	@err_mask: ATA Status register contents
+ *
+ *	Indicate to the mid and upper layers that an ATA
+ *	command has completed, with either an ok or not-ok status.
+ *
+ *	LOCKING:
+ *	spin_lock_irqsave(host_set lock)
+ */
+static inline void ata_qc_complete(struct ata_queued_cmd *qc)
+{
+	if (unlikely(qc->flags & ATA_QCFLAG_EH_SCHEDULED))
+		return;
+
+	__ata_qc_complete(qc);
+}
 
 /**
  *	ata_irq_on - Enable interrupts on a port.
-- 
cgit v1.2.3


From bef4a456b8dc8b3638f4d49a25a89e1467da9483 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sat, 11 Feb 2006 19:11:13 +0900
Subject: [PATCH] libata: kill assert() macro

libata assert() now has no user left.  Kill it.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 include/linux/libata.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 695d9ae6ec03..83a1f2ead861 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -62,16 +62,6 @@
 
 #define BPRINTK(fmt, args...) if (ap->flags & ATA_FLAG_DEBUGMSG) printk(KERN_ERR "%s: " fmt, __FUNCTION__, ## args)
 
-#ifdef ATA_NDEBUG
-#define assert(expr)
-#else
-#define assert(expr) \
-        if(unlikely(!(expr))) {                                   \
-        printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
-        #expr,__FILE__,__FUNCTION__,__LINE__);          \
-        }
-#endif
-
 /* NEW: debug levels */
 #define HAVE_LIBATA_MSG 1
 
-- 
cgit v1.2.3


From 0e949ff304a7ca07db38c17fbbf3ead1085d7bbf Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 12 Feb 2006 22:47:04 +0900
Subject: [PATCH] libata: implement ata_dev_id_c_string()

ata_dev_id_c_string() reads ATA string from the specified offset of
the given IDENTIFY PAGE and puts it in the specified buffer in trimmed
and NULL-terminated form.  The caller must supply a buffer which is
one byte larger than the maximum size of the target ID string.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 29 +++++++++++++++++++++++++++++
 include/linux/libata.h     |  2 ++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9ddb568267e1..7e410299d67a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -519,6 +519,34 @@ void ata_dev_id_string(const u16 *id, unsigned char *s,
 	}
 }
 
+/**
+ *	ata_dev_id_c_string - Convert IDENTIFY DEVICE page into C string
+ *	@id: IDENTIFY DEVICE results we will examine
+ *	@s: string into which data is output
+ *	@ofs: offset into identify device page
+ *	@len: length of string to return. must be an odd number.
+ *
+ *	This function is identical to ata_dev_id_string except that it
+ *	trims trailing spaces and terminates the resulting string with
+ *	null.  @len must be actual maximum length (even number) + 1.
+ *
+ *	LOCKING:
+ *	caller.
+ */
+void ata_dev_id_c_string(const u16 *id, unsigned char *s,
+			 unsigned int ofs, unsigned int len)
+{
+	unsigned char *p;
+
+	WARN_ON(!(len & 1));
+
+	ata_dev_id_string(id, s, ofs, len - 1);
+
+	p = s + strnlen(s, len - 1);
+	while (p > s && p[-1] == ' ')
+		p--;
+	*p = '\0';
+}
 
 /**
  *	ata_noop_dev_select - Select device 0/1 on ATA bus
@@ -4905,6 +4933,7 @@ EXPORT_SYMBOL_GPL(ata_scsi_release);
 EXPORT_SYMBOL_GPL(ata_host_intr);
 EXPORT_SYMBOL_GPL(ata_dev_classify);
 EXPORT_SYMBOL_GPL(ata_dev_id_string);
+EXPORT_SYMBOL_GPL(ata_dev_id_c_string);
 EXPORT_SYMBOL_GPL(ata_dev_config);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
 EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 83a1f2ead861..0853032673b7 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -540,6 +540,8 @@ extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
 extern unsigned int ata_dev_classify(const struct ata_taskfile *tf);
 extern void ata_dev_id_string(const u16 *id, unsigned char *s,
 			      unsigned int ofs, unsigned int len);
+extern void ata_dev_id_c_string(const u16 *id, unsigned char *s,
+				unsigned int ofs, unsigned int len);
 extern void ata_dev_config(struct ata_port *ap, unsigned int i);
 extern void ata_bmdma_setup (struct ata_queued_cmd *qc);
 extern void ata_bmdma_start (struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 3d2ca91095f8ab6dc0cb925374eec62fa5336764 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 12 Feb 2006 22:47:04 +0900
Subject: [PATCH] libata: separate out ata_id_major_version()

Separate out ATA major version calculation from ata_dev_identify()
into ata_id_major_version().  It's preparation for splitting
ata_dev_identify().

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c |  6 +-----
 include/linux/ata.h        | 10 ++++++++++
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 4df5024c3888..d87854e199f6 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -927,7 +927,6 @@ static void ata_dev_identify(struct ata_port *ap, unsigned int device)
 {
 	struct ata_device *dev = &ap->device[device];
 	unsigned int major_version;
-	u16 tmp;
 	unsigned long xfer_modes;
 	unsigned int using_edd;
 	struct ata_taskfile tf;
@@ -1030,10 +1029,7 @@ retry:
 			goto err_out_nosup;
 
 		/* get major version */
-		tmp = dev->id[ATA_ID_MAJOR_VER];
-		for (major_version = 14; major_version >= 1; major_version--)
-			if (tmp & (1 << major_version))
-				break;
+		major_version = ata_id_major_version(dev->id);
 
 		/*
 		 * The exact sequence expected by certain pre-ATA4 drives is:
diff --git a/include/linux/ata.h b/include/linux/ata.h
index a8155ca4947f..b02a16c435e7 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -267,6 +267,16 @@ struct ata_taskfile {
 	  ((u64) (id)[(n) + 1] << 16) |	\
 	  ((u64) (id)[(n) + 0]) )
 
+static inline unsigned int ata_id_major_version(const u16 *id)
+{
+	unsigned int mver;
+
+	for (mver = 14; mver >= 1; mver--)
+		if (id[ATA_ID_MAJOR_VER] & (1 << mver))
+			break;
+	return mver;
+}
+
 static inline int ata_id_current_chs_valid(const u16 *id)
 {
 	/* For ATA-1 devices, if the INITIALIZE DEVICE PARAMETERS command 
-- 
cgit v1.2.3


From 6e7846e9c548443c86cfbad9e4defb4bdcfc538b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 12 Feb 2006 23:32:58 +0900
Subject: [PATCH] libata: move cdb_len for host to device

cdb_len is per-device property.  Sharing cdb_len on ap results in
inaccurate configuration on revalidation and hotplugging.  This patch
makes cdb_len per-device.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/ahci.c        |  3 ++-
 drivers/scsi/libata-core.c | 19 ++++++++++++-------
 drivers/scsi/libata-scsi.c |  4 ++--
 drivers/scsi/sata_sil24.c  |  4 ++--
 include/linux/libata.h     |  2 +-
 5 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 24a54a5a91b8..23caa0c4019b 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -617,7 +617,8 @@ static void ahci_qc_prep(struct ata_queued_cmd *qc)
 	ata_tf_to_fis(&qc->tf, pp->cmd_tbl, 0);
 	if (is_atapi) {
 		memset(pp->cmd_tbl + AHCI_CMD_TBL_CDB, 0, 32);
-		memcpy(pp->cmd_tbl + AHCI_CMD_TBL_CDB, qc->cdb, ap->cdb_len);
+		memcpy(pp->cmd_tbl + AHCI_CMD_TBL_CDB, qc->cdb,
+		       qc->dev->cdb_len);
 	}
 
 	n_elem = 0;
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 5e8a32052a1e..1a373b4d0e22 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -931,7 +931,7 @@ static void ata_dev_identify(struct ata_port *ap, unsigned int device)
 	unsigned int using_edd;
 	struct ata_taskfile tf;
 	unsigned int err_mask;
-	int rc;
+	int i, rc;
 
 	if (!ata_dev_present(dev)) {
 		DPRINTK("ENTER/EXIT (host %u, dev %u) -- nodev\n",
@@ -1087,7 +1087,7 @@ retry:
 
 		}
 
-		ap->host->max_cmd_len = 16;
+		dev->cdb_len = 16;
 	}
 
 	/* ATAPI-specific feature tests */
@@ -1100,8 +1100,7 @@ retry:
 			printk(KERN_WARNING "ata%u: unsupported CDB len\n", ap->id);
 			goto err_out_nosup;
 		}
-		ap->cdb_len = (unsigned int) rc;
-		ap->host->max_cmd_len = (unsigned char) ap->cdb_len;
+		dev->cdb_len = (unsigned int) rc;
 
 		/* print device info to dmesg */
 		printk(KERN_INFO "ata%u: dev %u ATAPI, max %s\n",
@@ -1109,6 +1108,12 @@ retry:
 		       ata_mode_string(xfer_modes));
 	}
 
+	ap->host->max_cmd_len = 0;
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		ap->host->max_cmd_len = max_t(unsigned int,
+					      ap->host->max_cmd_len,
+					      ap->device[i].cdb_len);
+
 	DPRINTK("EXIT, drv_stat = 0x%x\n", ata_chk_status(ap));
 	return;
 
@@ -4203,7 +4208,7 @@ static void atapi_packet_task(void *_data)
 
 	/* send SCSI cdb */
 	DPRINTK("send cdb\n");
-	WARN_ON(ap->cdb_len < 12);
+	WARN_ON(qc->dev->cdb_len < 12);
 
 	if (qc->tf.protocol == ATA_PROT_ATAPI_DMA ||
 	    qc->tf.protocol == ATA_PROT_ATAPI_NODATA) {
@@ -4217,12 +4222,12 @@ static void atapi_packet_task(void *_data)
 		 */
 		spin_lock_irqsave(&ap->host_set->lock, flags);
 		ap->flags &= ~ATA_FLAG_NOINTR;
-		ata_data_xfer(ap, qc->cdb, ap->cdb_len, 1);
+		ata_data_xfer(ap, qc->cdb, qc->dev->cdb_len, 1);
 		if (qc->tf.protocol == ATA_PROT_ATAPI_DMA)
 			ap->ops->bmdma_start(qc);	/* initiate bmdma */
 		spin_unlock_irqrestore(&ap->host_set->lock, flags);
 	} else {
-		ata_data_xfer(ap, qc->cdb, ap->cdb_len, 1);
+		ata_data_xfer(ap, qc->cdb, qc->dev->cdb_len, 1);
 
 		/* PIO commands are handled by polling */
 		ap->hsm_task_state = HSM_ST;
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index ebd7cf42550b..3628fedc9865 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2146,7 +2146,7 @@ static void atapi_request_sense(struct ata_queued_cmd *qc)
 	ata_sg_init_one(qc, cmd->sense_buffer, sizeof(cmd->sense_buffer));
 	qc->dma_dir = DMA_FROM_DEVICE;
 
-	memset(&qc->cdb, 0, ap->cdb_len);
+	memset(&qc->cdb, 0, qc->dev->cdb_len);
 	qc->cdb[0] = REQUEST_SENSE;
 	qc->cdb[4] = SCSI_SENSE_BUFFERSIZE;
 
@@ -2248,7 +2248,7 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc, const u8 *scsicmd)
 		if (ata_check_atapi_dma(qc))
 			using_pio = 1;
 
-	memcpy(&qc->cdb, scsicmd, qc->ap->cdb_len);
+	memcpy(&qc->cdb, scsicmd, dev->cdb_len);
 
 	qc->complete_fn = atapi_qc_complete;
 
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 228a7fabffff..24020c68903a 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -371,7 +371,7 @@ static void sil24_dev_config(struct ata_port *ap, struct ata_device *dev)
 {
 	void __iomem *port = (void __iomem *)ap->ioaddr.cmd_addr;
 
-	if (ap->cdb_len == 16)
+	if (dev->cdb_len == 16)
 		writel(PORT_CS_CDB16, port + PORT_CTRL_STAT);
 	else
 		writel(PORT_CS_CDB16, port + PORT_CTRL_CLR);
@@ -543,7 +543,7 @@ static void sil24_qc_prep(struct ata_queued_cmd *qc)
 		prb = &cb->atapi.prb;
 		sge = cb->atapi.sge;
 		memset(cb->atapi.cdb, 0, 32);
-		memcpy(cb->atapi.cdb, qc->cdb, ap->cdb_len);
+		memcpy(cb->atapi.cdb, qc->cdb, qc->dev->cdb_len);
 
 		if (qc->tf.protocol != ATA_PROT_ATAPI_NODATA) {
 			if (qc->tf.flags & ATA_TFLAG_WRITE)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0853032673b7..853c98859a9f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -348,6 +348,7 @@ struct ata_device {
 
 	unsigned int		multi_count;	/* sectors count for
 						   READ/WRITE MULTIPLE */
+	unsigned int		cdb_len;
 
 	/* for CHS addressing */
 	u16			cylinders;	/* Number of cylinders */
@@ -377,7 +378,6 @@ struct ata_port {
 	unsigned int		mwdma_mask;
 	unsigned int		udma_mask;
 	unsigned int		cbl;	/* cable type; ATA_CBL_xxx */
-	unsigned int		cdb_len;
 
 	struct ata_device	device[ATA_MAX_DEVICES];
 
-- 
cgit v1.2.3


From b00eec1d58ee71131375bfeb86e64bceec3f5618 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 12 Feb 2006 23:32:59 +0900
Subject: [PATCH] libata: add per-device max_sectors

If a low level driver wants to control max_sectors, it had to adjust
ap->host->max_sectors and set ATA_DFLAG_LOCK_SECTORS to tell
ata_scsi_slave_config not to override the limit.  This is not only
cumbersome but also incorrect for hosts which support more than one
devices per port.

This patch adds per-device ->max_sectors.  If the field is unset
(zero), libata core layer will adjust ->max_sectors according to
default rules.  If the field is set, libata honors the setting.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c |  4 +---
 drivers/scsi/libata-scsi.c | 18 +++++++++---------
 drivers/scsi/sata_sil.c    |  4 +---
 include/linux/libata.h     |  4 ++--
 4 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 1a373b4d0e22..61cba39a6834 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1147,9 +1147,7 @@ void ata_dev_config(struct ata_port *ap, unsigned int i)
 		printk(KERN_INFO "ata%u(%u): applying bridge limits\n",
 		       ap->id, i);
 		ap->udma_mask &= ATA_UDMA5;
-		ap->host->max_sectors = ATA_MAX_SECTORS;
-		ap->host->hostt->max_sectors = ATA_MAX_SECTORS;
-		ap->device[i].flags |= ATA_DFLAG_LOCK_SECTORS;
+		ap->device[i].max_sectors = ATA_MAX_SECTORS;
 	}
 
 	if (ap->ops->dev_config)
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 3628fedc9865..86da46502b3e 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -684,23 +684,23 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
 	if (sdev->id < ATA_MAX_DEVICES) {
 		struct ata_port *ap;
 		struct ata_device *dev;
+		unsigned int max_sectors;
 
 		ap = (struct ata_port *) &sdev->host->hostdata[0];
 		dev = &ap->device[sdev->id];
 
-		/* TODO: 1024 is an arbitrary number, not the
+		/* TODO: 2048 is an arbitrary number, not the
 		 * hardware maximum.  This should be increased to
 		 * 65534 when Jens Axboe's patch for dynamically
 		 * determining max_sectors is merged.
 		 */
-		if ((dev->flags & ATA_DFLAG_LBA48) &&
-		    ((dev->flags & ATA_DFLAG_LOCK_SECTORS) == 0)) {
-			/*
-			 * do not overwrite sdev->host->max_sectors, since
-			 * other drives on this host may not support LBA48
-			 */
-			blk_queue_max_sectors(sdev->request_queue, 2048);
-		}
+		max_sectors = ATA_MAX_SECTORS;
+		if (dev->flags & ATA_DFLAG_LBA48)
+			max_sectors = 2048;
+		if (dev->max_sectors)
+			max_sectors = dev->max_sectors;
+
+		blk_queue_max_sectors(sdev->request_queue, max_sectors);
 
 		/*
 		 * SATA DMA transfers must be multiples of 4 byte, so
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index 61c4ac7ff9db..6c482c8be254 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -354,9 +354,7 @@ static void sil_dev_config(struct ata_port *ap, struct ata_device *dev)
 	     (quirks & SIL_QUIRK_MOD15WRITE))) {
 		printk(KERN_INFO "ata%u(%u): applying Seagate errata fix (mod15write workaround)\n",
 		       ap->id, dev->devno);
-		ap->host->max_sectors = 15;
-		ap->host->hostt->max_sectors = 15;
-		dev->flags |= ATA_DFLAG_LOCK_SECTORS;
+		dev->max_sectors = 15;
 		return;
 	}
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 853c98859a9f..afe46457124e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -122,8 +122,7 @@ enum {
 	/* struct ata_device stuff */
 	ATA_DFLAG_LBA48		= (1 << 0), /* device supports LBA48 */
 	ATA_DFLAG_PIO		= (1 << 1), /* device currently in PIO mode */
-	ATA_DFLAG_LOCK_SECTORS	= (1 << 2), /* don't adjust max_sectors */
-	ATA_DFLAG_LBA		= (1 << 3), /* device supports LBA */
+	ATA_DFLAG_LBA		= (1 << 2), /* device supports LBA */
 
 	ATA_DEV_UNKNOWN		= 0,	/* unknown device */
 	ATA_DEV_ATA		= 1,	/* ATA device */
@@ -348,6 +347,7 @@ struct ata_device {
 
 	unsigned int		multi_count;	/* sectors count for
 						   READ/WRITE MULTIPLE */
+	unsigned int		max_sectors;	/* per-device max sectors */
 	unsigned int		cdb_len;
 
 	/* for CHS addressing */
-- 
cgit v1.2.3


From 6a62a04d4705df4f9f9bee39e889b9e920eeca47 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 13 Feb 2006 10:02:46 +0900
Subject: [PATCH] libata: rename ata_dev_id_[c_]string()

This patch renames ata_dev_id_[c_]string() to ata_id_[c_]string().
All other functions which read data from ATA ID data start with ata_id
and those two function names were getting too long.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/scsi/libata-core.c | 23 +++++++++++------------
 drivers/scsi/libata-scsi.c | 12 ++++++------
 drivers/scsi/sata_sil.c    |  3 +--
 include/linux/libata.h     |  8 ++++----
 4 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index adc5b440c9bc..d694b20e81bf 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -486,7 +486,7 @@ ata_dev_try_classify(struct ata_port *ap, unsigned int device, u8 *r_err)
 }
 
 /**
- *	ata_dev_id_string - Convert IDENTIFY DEVICE page into string
+ *	ata_id_string - Convert IDENTIFY DEVICE page into string
  *	@id: IDENTIFY DEVICE results we will examine
  *	@s: string into which data is output
  *	@ofs: offset into identify device page
@@ -500,8 +500,8 @@ ata_dev_try_classify(struct ata_port *ap, unsigned int device, u8 *r_err)
  *	caller.
  */
 
-void ata_dev_id_string(const u16 *id, unsigned char *s,
-		       unsigned int ofs, unsigned int len)
+void ata_id_string(const u16 *id, unsigned char *s,
+		   unsigned int ofs, unsigned int len)
 {
 	unsigned int c;
 
@@ -520,27 +520,27 @@ void ata_dev_id_string(const u16 *id, unsigned char *s,
 }
 
 /**
- *	ata_dev_id_c_string - Convert IDENTIFY DEVICE page into C string
+ *	ata_id_c_string - Convert IDENTIFY DEVICE page into C string
  *	@id: IDENTIFY DEVICE results we will examine
  *	@s: string into which data is output
  *	@ofs: offset into identify device page
  *	@len: length of string to return. must be an odd number.
  *
- *	This function is identical to ata_dev_id_string except that it
+ *	This function is identical to ata_id_string except that it
  *	trims trailing spaces and terminates the resulting string with
  *	null.  @len must be actual maximum length (even number) + 1.
  *
  *	LOCKING:
  *	caller.
  */
-void ata_dev_id_c_string(const u16 *id, unsigned char *s,
-			 unsigned int ofs, unsigned int len)
+void ata_id_c_string(const u16 *id, unsigned char *s,
+		     unsigned int ofs, unsigned int len)
 {
 	unsigned char *p;
 
 	WARN_ON(!(len & 1));
 
-	ata_dev_id_string(id, s, ofs, len - 1);
+	ata_id_string(id, s, ofs, len - 1);
 
 	p = s + strnlen(s, len - 1);
 	while (p > s && p[-1] == ' ')
@@ -2315,8 +2315,7 @@ static int ata_dma_blacklisted(const struct ata_device *dev)
 	unsigned char model_num[41];
 	int i;
 
-	ata_dev_id_c_string(dev->id, model_num, ATA_ID_PROD_OFS,
-			    sizeof(model_num));
+	ata_id_c_string(dev->id, model_num, ATA_ID_PROD_OFS, sizeof(model_num));
 
 	for (i = 0; i < ARRAY_SIZE(ata_dma_blacklist); i++)
 		if (!strcmp(ata_dma_blacklist[i], model_num))
@@ -4934,8 +4933,8 @@ EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
 EXPORT_SYMBOL_GPL(ata_scsi_release);
 EXPORT_SYMBOL_GPL(ata_host_intr);
 EXPORT_SYMBOL_GPL(ata_dev_classify);
-EXPORT_SYMBOL_GPL(ata_dev_id_string);
-EXPORT_SYMBOL_GPL(ata_dev_id_c_string);
+EXPORT_SYMBOL_GPL(ata_id_string);
+EXPORT_SYMBOL_GPL(ata_id_c_string);
 EXPORT_SYMBOL_GPL(ata_dev_config);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
 EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 86da46502b3e..538784e65cdc 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1567,8 +1567,8 @@ unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf,
 
 	if (buflen > 35) {
 		memcpy(&rbuf[8], "ATA     ", 8);
-		ata_dev_id_string(args->id, &rbuf[16], ATA_ID_PROD_OFS, 16);
-		ata_dev_id_string(args->id, &rbuf[32], ATA_ID_FW_REV_OFS, 4);
+		ata_id_string(args->id, &rbuf[16], ATA_ID_PROD_OFS, 16);
+		ata_id_string(args->id, &rbuf[32], ATA_ID_FW_REV_OFS, 4);
 		if (rbuf[32] == 0 || rbuf[32] == ' ')
 			memcpy(&rbuf[32], "n/a ", 4);
 	}
@@ -1642,8 +1642,8 @@ unsigned int ata_scsiop_inq_80(struct ata_scsi_args *args, u8 *rbuf,
 	memcpy(rbuf, hdr, sizeof(hdr));
 
 	if (buflen > (ATA_SERNO_LEN + 4 - 1))
-		ata_dev_id_string(args->id, (unsigned char *) &rbuf[4],
-				  ATA_ID_SERNO_OFS, ATA_SERNO_LEN);
+		ata_id_string(args->id, (unsigned char *) &rbuf[4],
+			      ATA_ID_SERNO_OFS, ATA_SERNO_LEN);
 
 	return 0;
 }
@@ -1806,8 +1806,8 @@ static int ata_dev_supports_fua(u16 *id)
 	if (!ata_id_has_fua(id))
 		return 0;
 
-	ata_dev_id_c_string(id, model, ATA_ID_PROD_OFS, sizeof(model));
-	ata_dev_id_c_string(id, fw, ATA_ID_FW_REV_OFS, sizeof(fw));
+	ata_id_c_string(id, model, ATA_ID_PROD_OFS, sizeof(model));
+	ata_id_c_string(id, fw, ATA_ID_FW_REV_OFS, sizeof(fw));
 
 	if (strcmp(model, "Maxtor"))
 		return 1;
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index 15346888faf2..e14ed4ebbeed 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -338,8 +338,7 @@ static void sil_dev_config(struct ata_port *ap, struct ata_device *dev)
 	unsigned int n, quirks = 0;
 	unsigned char model_num[41];
 
-	ata_dev_id_c_string(dev->id, model_num, ATA_ID_PROD_OFS,
-			    sizeof(model_num));
+	ata_id_c_string(dev->id, model_num, ATA_ID_PROD_OFS, sizeof(model_num));
 
 	for (n = 0; sil_blacklist[n].product; n++)
 		if (!strcmp(sil_blacklist[n].product, model_num)) {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index afe46457124e..0d6bf50ad029 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -538,10 +538,10 @@ extern void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf,
 extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
 		 unsigned int n_elem);
 extern unsigned int ata_dev_classify(const struct ata_taskfile *tf);
-extern void ata_dev_id_string(const u16 *id, unsigned char *s,
-			      unsigned int ofs, unsigned int len);
-extern void ata_dev_id_c_string(const u16 *id, unsigned char *s,
-				unsigned int ofs, unsigned int len);
+extern void ata_id_string(const u16 *id, unsigned char *s,
+			  unsigned int ofs, unsigned int len);
+extern void ata_id_c_string(const u16 *id, unsigned char *s,
+			    unsigned int ofs, unsigned int len);
 extern void ata_dev_config(struct ata_port *ap, unsigned int i);
 extern void ata_bmdma_setup (struct ata_queued_cmd *qc);
 extern void ata_bmdma_start (struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 749eef857948a3de789b7d0e3b96d92199d723cf Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@infradead.org>
Date: Mon, 27 Feb 2006 00:07:40 -0300
Subject: V4L/DVB (3334): Added ET61X251 fourcc type

Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/videodev2.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 1dd8efeff35a..3f1504353472 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -326,6 +326,7 @@ struct v4l2_pix_format
 #define V4L2_PIX_FMT_SN9C10X  v4l2_fourcc('S','9','1','0') /* SN9C10x compression */
 #define V4L2_PIX_FMT_PWC1     v4l2_fourcc('P','W','C','1') /* pwc older webcam */
 #define V4L2_PIX_FMT_PWC2     v4l2_fourcc('P','W','C','2') /* pwc newer webcam */
+#define V4L2_PIX_FMT_ET61X251 v4l2_fourcc('E','6','2','5') /* ET61X251 compression */
 
 /*
  *	F O R M A T   E N U M E R A T I O N
-- 
cgit v1.2.3


From f05cce863fa399dd79c5aa3896d608b8b86d8030 Mon Sep 17 00:00:00 2001
From: Andreas Oberritter <obi@linuxtv.org>
Date: Mon, 27 Feb 2006 00:09:00 -0300
Subject: V4L/DVB (3375): Add AUDIO_GET_PTS and VIDEO_GET_PTS ioctls

Add two new ioctls to read the 33 bit presentation time stamp from audio
and video devices as defined in ITU T-REC-H.222.0 and ISO/IEC 13818-1.
Acked-by: Johannes Stezenbach <js@linuxtv.org>

Signed-off-by: Andreas Oberritter <obi@linuxtv.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/dvb/audio.h | 13 +++++++++++++
 include/linux/dvb/video.h | 13 +++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dvb/audio.h b/include/linux/dvb/audio.h
index 2b8797084685..0874a67c6b92 100644
--- a/include/linux/dvb/audio.h
+++ b/include/linux/dvb/audio.h
@@ -121,4 +121,17 @@ typedef uint16_t audio_attributes_t;
 #define AUDIO_SET_ATTRIBUTES       _IOW('o', 17, audio_attributes_t)
 #define AUDIO_SET_KARAOKE          _IOW('o', 18, audio_karaoke_t)
 
+/**
+ * AUDIO_GET_PTS
+ *
+ * Read the 33 bit presentation time stamp as defined
+ * in ITU T-REC-H.222.0 / ISO/IEC 13818-1.
+ *
+ * The PTS should belong to the currently played
+ * frame if possible, but may also be a value close to it
+ * like the PTS of the last decoded frame or the last PTS
+ * extracted by the PES parser.
+ */
+#define AUDIO_GET_PTS              _IOR('o', 19, __u64)
+
 #endif /* _DVBAUDIO_H_ */
diff --git a/include/linux/dvb/video.h b/include/linux/dvb/video.h
index b1999bfeaa56..1f7fa0351daf 100644
--- a/include/linux/dvb/video.h
+++ b/include/linux/dvb/video.h
@@ -200,4 +200,17 @@ typedef uint16_t video_attributes_t;
 #define VIDEO_GET_SIZE             _IOR('o', 55, video_size_t)
 #define VIDEO_GET_FRAME_RATE       _IOR('o', 56, unsigned int)
 
+/**
+ * VIDEO_GET_PTS
+ *
+ * Read the 33 bit presentation time stamp as defined
+ * in ITU T-REC-H.222.0 / ISO/IEC 13818-1.
+ *
+ * The PTS should belong to the currently played
+ * frame if possible, but may also be a value close to it
+ * like the PTS of the last decoded frame or the last PTS
+ * extracted by the PES parser.
+ */
+#define VIDEO_GET_PTS              _IOR('o', 57, __u64)
+
 #endif /*_DVBVIDEO_H_*/
-- 
cgit v1.2.3


From 1fa44ecad2b86475e038aed81b0bf333fa484f8b Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Thu, 23 Feb 2006 12:43:43 -0600
Subject: [SCSI] add execute_in_process_context() API

We have several points in the SCSI stack (primarily for our device
functions) where we need to guarantee process context, but (given the
place where the last reference was released) we cannot guarantee this.

This API gets around the issue by executing the function directly if
the caller has process context, but scheduling a workqueue to execute
in process context if the caller doesn't have it.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 include/linux/workqueue.h |  6 ++++++
 kernel/workqueue.c        | 29 +++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 86b111300231..957c21c16d62 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -20,6 +20,10 @@ struct work_struct {
 	struct timer_list timer;
 };
 
+struct execute_work {
+	struct work_struct work;
+};
+
 #define __WORK_INITIALIZER(n, f, d) {				\
         .entry	= { &(n).entry, &(n).entry },			\
 	.func = (f),						\
@@ -74,6 +78,8 @@ extern void init_workqueues(void);
 void cancel_rearming_delayed_work(struct work_struct *work);
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
 				       struct work_struct *);
+int execute_in_process_context(void (*fn)(void *), void *,
+			       struct execute_work *);
 
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b052e2c4c710..e9e464a90376 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -27,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/kthread.h>
+#include <linux/hardirq.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_work);
 
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn:		the function to execute
+ * @data:	data to pass to the function
+ * @ew:		guaranteed storage for the execute work structure (must
+ *		be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ *
+ * Returns:	0 - function was executed
+ *		1 - function was scheduled for execution
+ */
+int execute_in_process_context(void (*fn)(void *data), void *data,
+			       struct execute_work *ew)
+{
+	if (!in_interrupt()) {
+		fn(data);
+		return 0;
+	}
+
+	INIT_WORK(&ew->work, fn, data);
+	schedule_work(&ew->work);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(execute_in_process_context);
+
 int keventd_up(void)
 {
 	return keventd_wq != NULL;
-- 
cgit v1.2.3


From 597afd21401c85bdf9441830abf431c2be6fd45f Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 1 Mar 2006 01:25:38 +0900
Subject: [PATCH] libata: seperate out ata_class_present()

Seperate out ata_class_present() from ata_dev_present().  This is
useful because new reset mechanism deals with classes[] directly.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/libata.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 66b6847225df..22e86cb2d166 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -645,10 +645,14 @@ static inline unsigned int ata_tag_valid(unsigned int tag)
 	return (tag < ATA_MAX_QUEUE) ? 1 : 0;
 }
 
+static inline unsigned int ata_class_present(unsigned int class)
+{
+	return class == ATA_DEV_ATA || class == ATA_DEV_ATAPI;
+}
+
 static inline unsigned int ata_dev_present(const struct ata_device *dev)
 {
-	return ((dev->class == ATA_DEV_ATA) ||
-		(dev->class == ATA_DEV_ATAPI));
+	return ata_class_present(dev->class);
 }
 
 static inline u8 ata_chk_status(struct ata_port *ap)
-- 
cgit v1.2.3


From d9572b1d5e60b63e27e17f1f7771c5a26dd5d81e Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 1 Mar 2006 16:09:35 +0900
Subject: [PATCH] libata: convert dev->id to pointer

Convert dev->id from array to pointer.  This is to accomodate
revalidation.  During revalidation, both old and new IDENTIFY pages
should be accessible and single ->id array doesn't cut it.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 20 +++++++++++++++++---
 include/linux/libata.h     |  2 +-
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index e34b421eb2a3..fecda706d85f 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -911,7 +911,7 @@ unsigned int ata_pio_need_iordy(const struct ata_device *adev)
  *	@dev: target device
  *	@p_class: pointer to class of the target device (may be changed)
  *	@post_reset: is this read ID post-reset?
- *	@id: buffer to fill IDENTIFY page into
+ *	@p_id: read IDENTIFY page (newly allocated)
  *
  *	Read ID data from the specified device.  ATA_CMD_ID_ATA is
  *	performed on ATA devices and ATA_CMD_ID_ATAPI on ATAPI
@@ -926,12 +926,13 @@ unsigned int ata_pio_need_iordy(const struct ata_device *adev)
  *	0 on success, -errno otherwise.
  */
 static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
-			   unsigned int *p_class, int post_reset, u16 *id)
+			   unsigned int *p_class, int post_reset, u16 **p_id)
 {
 	unsigned int class = *p_class;
 	unsigned int using_edd;
 	struct ata_taskfile tf;
 	unsigned int err_mask = 0;
+	u16 *id;
 	const char *reason;
 	int rc;
 
@@ -945,6 +946,13 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 
 	ata_dev_select(ap, dev->devno, 1, 1); /* select device 0/1 */
 
+	id = kmalloc(sizeof(id[0]) * ATA_ID_WORDS, GFP_KERNEL);
+	if (id == NULL) {
+		rc = -ENOMEM;
+		reason = "out of memory";
+		goto err_out;
+	}
+
  retry:
 	ata_tf_init(ap, &tf, dev->devno);
 
@@ -1035,11 +1043,13 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 	}
 
 	*p_class = class;
+	*p_id = id;
 	return 0;
 
  err_out:
 	printk(KERN_WARNING "ata%u: dev %u failed to IDENTIFY (%s)\n",
 	       ap->id, dev->devno, reason);
+	kfree(id);
 	return rc;
 }
 
@@ -1079,7 +1089,8 @@ static void ata_dev_identify(struct ata_port *ap, unsigned int device)
 
 	DPRINTK("ENTER, host %u, dev %u\n", ap->id, device);
 
-	rc = ata_dev_read_id(ap, dev, &dev->class, 1, dev->id);
+	WARN_ON(dev->id != NULL);
+	rc = ata_dev_read_id(ap, dev, &dev->class, 1, &dev->id);
 	if (rc)
 		goto err_out;
 
@@ -4732,11 +4743,14 @@ void ata_host_set_remove(struct ata_host_set *host_set)
 int ata_scsi_release(struct Scsi_Host *host)
 {
 	struct ata_port *ap = (struct ata_port *) &host->hostdata[0];
+	int i;
 
 	DPRINTK("ENTER\n");
 
 	ap->ops->port_disable(ap);
 	ata_host_remove(ap, 0);
+	for (i = 0; i < ATA_MAX_DEVICES; i++)
+		kfree(ap->device[i].id);
 
 	DPRINTK("EXIT\n");
 	return 1;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 22e86cb2d166..9f273dd1958c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -339,7 +339,7 @@ struct ata_device {
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
 	unsigned int		class;		/* ATA_DEV_xxx */
 	unsigned int		devno;		/* 0 or 1 */
-	u16			id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */
+	u16			*id;		/* IDENTIFY xxx DEVICE data */
 	u8			pio_mode;
 	u8			dma_mode;
 	u8			xfer_mode;
-- 
cgit v1.2.3


From 4b2f3ededc035525038a7a9247074243dac6b351 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 1 Mar 2006 16:09:36 +0900
Subject: [PATCH] libata: fold ata_dev_config() into ata_dev_configure()

ata_dev_config() needs to be done everytime a device is configured.
Fold it into ata_dev_configure().

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 48 ++++++++++++++++------------------------------
 include/linux/libata.h     |  1 -
 2 files changed, 17 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index ba26c66cc11b..10803f72c57d 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1053,6 +1053,12 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 	return rc;
 }
 
+static inline u8 ata_dev_knobble(const struct ata_port *ap,
+				 struct ata_device *dev)
+{
+	return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
+}
+
 /**
  *	ata_dev_configure - Configure the specified ATA/ATAPI device
  *	@ap: Port on which target device resides
@@ -1167,6 +1173,17 @@ static int ata_dev_configure(struct ata_port *ap, struct ata_device *dev)
 					      ap->host->max_cmd_len,
 					      ap->device[i].cdb_len);
 
+	/* limit bridge transfers to udma5, 200 sectors */
+	if (ata_dev_knobble(ap, dev)) {
+		printk(KERN_INFO "ata%u(%u): applying bridge limits\n",
+		       ap->id, dev->devno);
+		ap->udma_mask &= ATA_UDMA5;
+		dev->max_sectors = ATA_MAX_SECTORS;
+	}
+
+	if (ap->ops->dev_config)
+		ap->ops->dev_config(ap, dev);
+
 	DPRINTK("EXIT, drv_stat = 0x%x\n", ata_chk_status(ap));
 	return 0;
 
@@ -1177,35 +1194,6 @@ err_out_nosup:
 	return rc;
 }
 
-
-static inline u8 ata_dev_knobble(const struct ata_port *ap,
-				 struct ata_device *dev)
-{
-	return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
-}
-
-/**
- * ata_dev_config - Run device specific handlers & check for SATA->PATA bridges
- * @ap: Bus
- * @i:  Device
- *
- * LOCKING:
- */
-
-void ata_dev_config(struct ata_port *ap, unsigned int i)
-{
-	/* limit bridge transfers to udma5, 200 sectors */
-	if (ata_dev_knobble(ap, &ap->device[i])) {
-		printk(KERN_INFO "ata%u(%u): applying bridge limits\n",
-		       ap->id, i);
-		ap->udma_mask &= ATA_UDMA5;
-		ap->device[i].max_sectors = ATA_MAX_SECTORS;
-	}
-
-	if (ap->ops->dev_config)
-		ap->ops->dev_config(ap, &ap->device[i]);
-}
-
 /**
  *	ata_bus_probe - Reset and probe ATA bus
  *	@ap: Bus to probe
@@ -1266,7 +1254,6 @@ static int ata_bus_probe(struct ata_port *ap)
 			continue;
 		}
 
-		ata_dev_config(ap, i);
 		found = 1;
 	}
 
@@ -4967,7 +4954,6 @@ EXPORT_SYMBOL_GPL(ata_host_intr);
 EXPORT_SYMBOL_GPL(ata_dev_classify);
 EXPORT_SYMBOL_GPL(ata_id_string);
 EXPORT_SYMBOL_GPL(ata_id_c_string);
-EXPORT_SYMBOL_GPL(ata_dev_config);
 EXPORT_SYMBOL_GPL(ata_scsi_simulate);
 EXPORT_SYMBOL_GPL(ata_eh_qc_complete);
 EXPORT_SYMBOL_GPL(ata_eh_qc_retry);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9f273dd1958c..86a504f0ef06 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -542,7 +542,6 @@ extern void ata_id_string(const u16 *id, unsigned char *s,
 			  unsigned int ofs, unsigned int len);
 extern void ata_id_c_string(const u16 *id, unsigned char *s,
 			    unsigned int ofs, unsigned int len);
-extern void ata_dev_config(struct ata_port *ap, unsigned int i);
 extern void ata_bmdma_setup (struct ata_queued_cmd *qc);
 extern void ata_bmdma_start (struct ata_queued_cmd *qc);
 extern void ata_bmdma_stop(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 8f903c708fcc2b579ebf16542bf6109bad593a1d Mon Sep 17 00:00:00 2001
From: Jay Vosburgh <fubar@us.ibm.com>
Date: Tue, 21 Feb 2006 16:36:44 -0800
Subject: [PATCH] bonding: suppress duplicate packets

	Originally submitted by Kenzo Iwami; his original description is:

The current bonding driver receives duplicate packets when broadcast/
multicast packets are sent by other devices or packets are flooded by the
switch. In this patch, new flags are added in priv_flags of net_device
structure to let the bonding driver discard duplicate packets in
dev.c:skb_bond().

	Modified by Jay Vosburgh to change a define name, update some
comments, rearrange the new skb_bond() for clarity, clear all bonding
priv_flags on slave release, and update the driver version.

Signed-off-by: Kenzo Iwami <k-iwami@cj.jp.nec.com>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/net/bonding/bond_main.c  | 43 ++++++++++++++++++++--------------------
 drivers/net/bonding/bond_sysfs.c |  6 ++++++
 drivers/net/bonding/bonding.h    | 33 +++++++++++++++++++++++++-----
 include/linux/if.h               |  3 +++
 include/linux/if_ether.h         |  1 +
 net/core/dev.c                   | 26 +++++++++++++++++++++++-
 6 files changed, 85 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index bcf9f17daf0d..623c87a83615 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1040,6 +1040,10 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
 	if ((bond->params.mode == BOND_MODE_TLB) ||
 	    (bond->params.mode == BOND_MODE_ALB)) {
 		bond_alb_handle_active_change(bond, new_active);
+		if (old_active)
+			bond_set_slave_inactive_flags(old_active);
+		if (new_active)
+			bond_set_slave_active_flags(new_active);
 	} else {
 		bond->curr_active_slave = new_active;
 	}
@@ -1443,15 +1447,16 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 
 	switch (bond->params.mode) {
 	case BOND_MODE_ACTIVEBACKUP:
-		/* if we're in active-backup mode, we need one and only one active
-		 * interface. The backup interfaces will have their NOARP flag set
-		 * because we need them to be completely deaf and not to respond to
-		 * any ARP request on the network to avoid fooling a switch. Thus,
-		 * since we guarantee that curr_active_slave always point to the last
-		 * usable interface, we just have to verify this interface's flag.
+		/* if we're in active-backup mode, we need one and
+		 * only one active interface. The backup interfaces
+		 * will have their SLAVE_INACTIVE flag set because we
+		 * need them to be drop all packets. Thus, since we
+		 * guarantee that curr_active_slave always point to
+		 * the last usable interface, we just have to verify
+		 * this interface's flag.
 		 */
 		if (((!bond->curr_active_slave) ||
-		     (bond->curr_active_slave->dev->flags & IFF_NOARP)) &&
+		     (bond->curr_active_slave->dev->priv_flags & IFF_SLAVE_INACTIVE)) &&
 		    (new_slave->link != BOND_LINK_DOWN)) {
 			dprintk("This is the first active slave\n");
 			/* first slave or no active slave yet, and this link
@@ -1492,6 +1497,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 			 * is OK, so make this interface the active one
 			 */
 			bond_change_active_slave(bond, new_slave);
+		} else {
+			bond_set_slave_inactive_flags(new_slave);
 		}
 		break;
 	default:
@@ -1724,13 +1731,8 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
 	addr.sa_family = slave_dev->type;
 	dev_set_mac_address(slave_dev, &addr);
 
-	/* restore the original state of the
-	 * IFF_NOARP flag that might have been
-	 * set by bond_set_slave_inactive_flags()
-	 */
-	if ((slave->original_flags & IFF_NOARP) == 0) {
-		slave_dev->flags &= ~IFF_NOARP;
-	}
+	slave_dev->priv_flags &= ~(IFF_MASTER_8023AD | IFF_MASTER_ALB |
+				   IFF_SLAVE_INACTIVE);
 
 	kfree(slave);
 
@@ -1816,12 +1818,8 @@ static int bond_release_all(struct net_device *bond_dev)
 		addr.sa_family = slave_dev->type;
 		dev_set_mac_address(slave_dev, &addr);
 
-		/* restore the original state of the IFF_NOARP flag that might have
-		 * been set by bond_set_slave_inactive_flags()
-		 */
-		if ((slave->original_flags & IFF_NOARP) == 0) {
-			slave_dev->flags &= ~IFF_NOARP;
-		}
+		slave_dev->priv_flags &= ~(IFF_MASTER_8023AD | IFF_MASTER_ALB |
+					   IFF_SLAVE_INACTIVE);
 
 		kfree(slave);
 
@@ -4061,14 +4059,17 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
 		bond_dev->hard_start_xmit = bond_xmit_broadcast;
 		break;
 	case BOND_MODE_8023AD:
+		bond_set_master_3ad_flags(bond);
 		bond_dev->hard_start_xmit = bond_3ad_xmit_xor;
 		if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34)
 			bond->xmit_hash_policy = bond_xmit_hash_policy_l34;
 		else
 			bond->xmit_hash_policy = bond_xmit_hash_policy_l2;
 		break;
-	case BOND_MODE_TLB:
 	case BOND_MODE_ALB:
+		bond_set_master_alb_flags(bond);
+		/* FALLTHRU */
+	case BOND_MODE_TLB:
 		bond_dev->hard_start_xmit = bond_alb_xmit;
 		bond_dev->set_mac_address = bond_alb_set_mac_address;
 		break;
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 041bcc583557..5a9bd95884be 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -424,6 +424,12 @@ static ssize_t bonding_store_mode(struct class_device *cd, const char *buf, size
 		ret = -EINVAL;
 		goto out;
 	} else {
+		if (bond->params.mode == BOND_MODE_8023AD)
+			bond_unset_master_3ad_flags(bond);
+
+		if (bond->params.mode == BOND_MODE_ALB)
+			bond_unset_master_alb_flags(bond);
+
 		bond->params.mode = new_value;
 		bond_set_mode_ops(bond, bond->params.mode);
 		printk(KERN_INFO DRV_NAME ": %s: setting mode to %s (%d).\n",
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 3dd78d048c3e..ce9dc9b4e2dc 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -22,8 +22,8 @@
 #include "bond_3ad.h"
 #include "bond_alb.h"
 
-#define DRV_VERSION	"3.0.1"
-#define DRV_RELDATE	"January 9, 2006"
+#define DRV_VERSION	"3.0.2"
+#define DRV_RELDATE	"February 21, 2006"
 #define DRV_NAME	"bonding"
 #define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
 
@@ -230,14 +230,37 @@ static inline struct bonding *bond_get_bond_by_slave(struct slave *slave)
 
 static inline void bond_set_slave_inactive_flags(struct slave *slave)
 {
-	slave->state = BOND_STATE_BACKUP;
-	slave->dev->flags |= IFF_NOARP;
+	struct bonding *bond = slave->dev->master->priv;
+	if (bond->params.mode != BOND_MODE_TLB &&
+	    bond->params.mode != BOND_MODE_ALB)
+		slave->state = BOND_STATE_BACKUP;
+	slave->dev->priv_flags |= IFF_SLAVE_INACTIVE;
 }
 
 static inline void bond_set_slave_active_flags(struct slave *slave)
 {
 	slave->state = BOND_STATE_ACTIVE;
-	slave->dev->flags &= ~IFF_NOARP;
+	slave->dev->priv_flags &= ~IFF_SLAVE_INACTIVE;
+}
+
+static inline void bond_set_master_3ad_flags(struct bonding *bond)
+{
+	bond->dev->priv_flags |= IFF_MASTER_8023AD;
+}
+
+static inline void bond_unset_master_3ad_flags(struct bonding *bond)
+{
+	bond->dev->priv_flags &= ~IFF_MASTER_8023AD;
+}
+
+static inline void bond_set_master_alb_flags(struct bonding *bond)
+{
+	bond->dev->priv_flags |= IFF_MASTER_ALB;
+}
+
+static inline void bond_unset_master_alb_flags(struct bonding *bond)
+{
+	bond->dev->priv_flags &= ~IFF_MASTER_ALB;
 }
 
 struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr);
diff --git a/include/linux/if.h b/include/linux/if.h
index ce627d9092ef..12c6f6d157c3 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -52,6 +52,9 @@
 /* Private (from user) interface flags (netdevice->priv_flags). */
 #define IFF_802_1Q_VLAN 0x1             /* 802.1Q VLAN device.          */
 #define IFF_EBRIDGE	0x2		/* Ethernet bridging device.	*/
+#define IFF_SLAVE_INACTIVE	0x4	/* bonding slave not the curr. active */
+#define IFF_MASTER_8023AD	0x8	/* bonding master, 802.3ad. 	*/
+#define IFF_MASTER_ALB	0x10		/* bonding master, balance-alb.	*/
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index 7a92c1ce1457..ab08f35cbc35 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -61,6 +61,7 @@
 #define ETH_P_8021Q	0x8100          /* 802.1Q VLAN Extended Header  */
 #define ETH_P_IPX	0x8137		/* IPX over DIX			*/
 #define ETH_P_IPV6	0x86DD		/* IPv6 over bluebook		*/
+#define ETH_P_SLOW	0x8809		/* Slow Protocol. See 802.3ad 43B */
 #define ETH_P_WCCP	0x883E		/* Web-cache coordination protocol
 					 * defined in draft-wilson-wrec-wccp-v2-00.txt */
 #define ETH_P_PPP_DISC	0x8863		/* PPPoE discovery messages     */
diff --git a/net/core/dev.c b/net/core/dev.c
index 225e38ff57c4..ef56c035d44e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1446,8 +1446,29 @@ static inline struct net_device *skb_bond(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
 
-	if (dev->master)
+	if (dev->master) {
+		/*
+		 * On bonding slaves other than the currently active
+		 * slave, suppress duplicates except for 802.3ad
+		 * ETH_P_SLOW and alb non-mcast/bcast.
+		 */
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if (dev->master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					goto keep;
+			}
+
+			if (dev->master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __constant_htons(ETH_P_SLOW))
+				goto keep;
+		
+			kfree_skb(skb);
+			return NULL;
+		}
+keep:
 		skb->dev = dev->master;
+	}
 
 	return dev;
 }
@@ -1591,6 +1612,9 @@ int netif_receive_skb(struct sk_buff *skb)
 
 	orig_dev = skb_bond(skb);
 
+	if (!orig_dev)
+		return NET_RX_DROP;
+
 	__get_cpu_var(netdev_rx_stat).total++;
 
 	skb->h.raw = skb->nh.raw = skb->data;
-- 
cgit v1.2.3


From 623a3128aa2b86caa8e06e762e9e444177e4fa47 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 5 Mar 2006 17:55:58 +0900
Subject: [PATCH] libata: implement ata_dev_revalidate()

ata_dev_revalidate() re-reads IDENTIFY PAGE of the given device and
makes sure it's the same device as the configured one.  Once it's
verified that it's the same device, @dev is configured according to
newly read IDENTIFY PAGE.  Note that revalidation currently doesn't
invoke transfer mode reconfiguration.

Criteria for 'same device'

* same class (of course)
* same model string
* same serial string
* if ATA, same n_sectors (to catch geometry parameter changes)

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 115 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h     |   2 +
 2 files changed, 117 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index d65aa86ddbb2..5d0adfa4610a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2345,6 +2345,120 @@ int ata_drive_probe_reset(struct ata_port *ap, ata_probeinit_fn_t probeinit,
 	return rc;
 }
 
+/**
+ *	ata_dev_same_device - Determine whether new ID matches configured device
+ *	@ap: port on which the device to compare against resides
+ *	@dev: device to compare against
+ *	@new_class: class of the new device
+ *	@new_id: IDENTIFY page of the new device
+ *
+ *	Compare @new_class and @new_id against @dev and determine
+ *	whether @dev is the device indicated by @new_class and
+ *	@new_id.
+ *
+ *	LOCKING:
+ *	None.
+ *
+ *	RETURNS:
+ *	1 if @dev matches @new_class and @new_id, 0 otherwise.
+ */
+static int ata_dev_same_device(struct ata_port *ap, struct ata_device *dev,
+			       unsigned int new_class, const u16 *new_id)
+{
+	const u16 *old_id = dev->id;
+	unsigned char model[2][41], serial[2][21];
+	u64 new_n_sectors;
+
+	if (dev->class != new_class) {
+		printk(KERN_INFO
+		       "ata%u: dev %u class mismatch %d != %d\n",
+		       ap->id, dev->devno, dev->class, new_class);
+		return 0;
+	}
+
+	ata_id_c_string(old_id, model[0], ATA_ID_PROD_OFS, sizeof(model[0]));
+	ata_id_c_string(new_id, model[1], ATA_ID_PROD_OFS, sizeof(model[1]));
+	ata_id_c_string(old_id, serial[0], ATA_ID_SERNO_OFS, sizeof(serial[0]));
+	ata_id_c_string(new_id, serial[1], ATA_ID_SERNO_OFS, sizeof(serial[1]));
+	new_n_sectors = ata_id_n_sectors(new_id);
+
+	if (strcmp(model[0], model[1])) {
+		printk(KERN_INFO
+		       "ata%u: dev %u model number mismatch '%s' != '%s'\n",
+		       ap->id, dev->devno, model[0], model[1]);
+		return 0;
+	}
+
+	if (strcmp(serial[0], serial[1])) {
+		printk(KERN_INFO
+		       "ata%u: dev %u serial number mismatch '%s' != '%s'\n",
+		       ap->id, dev->devno, serial[0], serial[1]);
+		return 0;
+	}
+
+	if (dev->class == ATA_DEV_ATA && dev->n_sectors != new_n_sectors) {
+		printk(KERN_INFO
+		       "ata%u: dev %u n_sectors mismatch %llu != %llu\n",
+		       ap->id, dev->devno, (unsigned long long)dev->n_sectors,
+		       (unsigned long long)new_n_sectors);
+		return 0;
+	}
+
+	return 1;
+}
+
+/**
+ *	ata_dev_revalidate - Revalidate ATA device
+ *	@ap: port on which the device to revalidate resides
+ *	@dev: device to revalidate
+ *	@post_reset: is this revalidation after reset?
+ *
+ *	Re-read IDENTIFY page and make sure @dev is still attached to
+ *	the port.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ *
+ *	RETURNS:
+ *	0 on success, negative errno otherwise
+ */
+int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
+		       int post_reset)
+{
+	unsigned int class;
+	u16 *id;
+	int rc;
+
+	if (!ata_dev_present(dev))
+		return -ENODEV;
+
+	class = dev->class;
+	id = NULL;
+
+	/* allocate & read ID data */
+	rc = ata_dev_read_id(ap, dev, &class, post_reset, &id);
+	if (rc)
+		goto fail;
+
+	/* is the device still there? */
+	if (!ata_dev_same_device(ap, dev, class, id)) {
+		rc = -ENODEV;
+		goto fail;
+	}
+
+	kfree(dev->id);
+	dev->id = id;
+
+	/* configure device according to the new ID */
+	return ata_dev_configure(ap, dev, 0);
+
+ fail:
+	printk(KERN_ERR "ata%u: dev %u revalidation failed (errno=%d)\n",
+	       ap->id, dev->devno, rc);
+	kfree(id);
+	return rc;
+}
+
 static void ata_pr_blacklisted(const struct ata_port *ap,
 			       const struct ata_device *dev)
 {
@@ -4964,6 +5078,7 @@ EXPORT_SYMBOL_GPL(sata_std_hardreset);
 EXPORT_SYMBOL_GPL(ata_std_postreset);
 EXPORT_SYMBOL_GPL(ata_std_probe_reset);
 EXPORT_SYMBOL_GPL(ata_drive_probe_reset);
+EXPORT_SYMBOL_GPL(ata_dev_revalidate);
 EXPORT_SYMBOL_GPL(ata_port_disable);
 EXPORT_SYMBOL_GPL(ata_ratelimit);
 EXPORT_SYMBOL_GPL(ata_busy_sleep);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 86a504f0ef06..66dce58f1941 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -485,6 +485,8 @@ extern int ata_std_softreset(struct ata_port *ap, int verbose,
 extern int sata_std_hardreset(struct ata_port *ap, int verbose,
 			      unsigned int *class);
 extern void ata_std_postreset(struct ata_port *ap, unsigned int *classes);
+extern int ata_dev_revalidate(struct ata_port *ap, struct ata_device *dev,
+			      int post_reset);
 extern void ata_port_disable(struct ata_port *);
 extern void ata_std_ports(struct ata_ioports *ioaddr);
 #ifdef CONFIG_PCI
-- 
cgit v1.2.3


From 86e45b6bd6900c4a0b3666fb18b46e215f775c4f Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 5 Mar 2006 15:29:09 +0900
Subject: [PATCH] libata: implement port_task

Implement port_task.  LLDD's can schedule a function to be executed
with context after specified delay.  libata core takes care of
synchronization against EH.  This is generalized form of pio_task and
packet_task which are tied to PIO hsm implementation.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/scsi/libata-scsi.c |  2 ++
 drivers/scsi/libata.h      |  1 +
 include/linux/libata.h     |  4 +++
 4 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 5dbcf0cf4a10..d59d462130c1 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -721,6 +721,81 @@ static unsigned int ata_pio_modes(const struct ata_device *adev)
 	   timing API will get this right anyway */
 }
 
+/**
+ *	ata_port_queue_task - Queue port_task
+ *	@ap: The ata_port to queue port_task for
+ *
+ *	Schedule @fn(@data) for execution after @delay jiffies using
+ *	port_task.  There is one port_task per port and it's the
+ *	user(low level driver)'s responsibility to make sure that only
+ *	one task is active at any given time.
+ *
+ *	libata core layer takes care of synchronization between
+ *	port_task and EH.  ata_port_queue_task() may be ignored for EH
+ *	synchronization.
+ *
+ *	LOCKING:
+ *	Inherited from caller.
+ */
+void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *), void *data,
+			 unsigned long delay)
+{
+	int rc;
+
+	if (ap->flags & ATA_FLAG_FLUSH_PIO_TASK)
+		return;
+
+	PREPARE_WORK(&ap->port_task, fn, data);
+
+	if (!delay)
+		rc = queue_work(ata_wq, &ap->port_task);
+	else
+		rc = queue_delayed_work(ata_wq, &ap->port_task, delay);
+
+	/* rc == 0 means that another user is using port task */
+	WARN_ON(rc == 0);
+}
+
+/**
+ *	ata_port_flush_task - Flush port_task
+ *	@ap: The ata_port to flush port_task for
+ *
+ *	After this function completes, port_task is guranteed not to
+ *	be running or scheduled.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep)
+ */
+void ata_port_flush_task(struct ata_port *ap)
+{
+	unsigned long flags;
+
+	DPRINTK("ENTER\n");
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->flags |= ATA_FLAG_FLUSH_PIO_TASK;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	DPRINTK("flush #1\n");
+	flush_workqueue(ata_wq);
+
+	/*
+	 * At this point, if a task is running, it's guaranteed to see
+	 * the FLUSH flag; thus, it will never queue pio tasks again.
+	 * Cancel and flush.
+	 */
+	if (!cancel_delayed_work(&ap->port_task)) {
+		DPRINTK("flush #2\n");
+		flush_workqueue(ata_wq);
+	}
+
+	spin_lock_irqsave(&ap->host_set->lock, flags);
+	ap->flags &= ~ATA_FLAG_FLUSH_PIO_TASK;
+	spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+	DPRINTK("EXIT\n");
+}
+
 static inline void
 ata_queue_packet_task(struct ata_port *ap)
 {
@@ -4617,6 +4692,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->active_tag = ATA_TAG_POISON;
 	ap->last_ctl = 0xFF;
 
+	INIT_WORK(&ap->port_task, NULL, NULL);
 	INIT_WORK(&ap->packet_task, atapi_packet_task, ap);
 	INIT_WORK(&ap->pio_task, ata_pio_task, ap);
 	INIT_LIST_HEAD(&ap->eh_done_q);
@@ -5088,6 +5164,7 @@ EXPORT_SYMBOL_GPL(ata_dev_revalidate);
 EXPORT_SYMBOL_GPL(ata_port_disable);
 EXPORT_SYMBOL_GPL(ata_ratelimit);
 EXPORT_SYMBOL_GPL(ata_busy_sleep);
+EXPORT_SYMBOL_GPL(ata_port_queue_task);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
 EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
 EXPORT_SYMBOL_GPL(ata_scsi_timed_out);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index d0bd94abb413..ccedb4536977 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -785,6 +785,8 @@ int ata_scsi_error(struct Scsi_Host *host)
 	WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
+	ata_port_flush_task(ap);
+
 	ap->ops->eng_timeout(ap);
 
 	WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index d822eba05f3c..f4c48c91b63d 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -45,6 +45,7 @@ extern int libata_fua;
 extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
 				      struct ata_device *dev);
 extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
+extern void ata_port_flush_task(struct ata_port *ap);
 extern void ata_qc_free(struct ata_queued_cmd *qc);
 extern unsigned int ata_qc_issue(struct ata_queued_cmd *qc);
 extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 66dce58f1941..3ad2570f663b 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -388,6 +388,8 @@ struct ata_port {
 	struct ata_host_stats	stats;
 	struct ata_host_set	*host_set;
 
+	struct work_struct	port_task;
+
 	struct work_struct	packet_task;
 
 	struct work_struct	pio_task;
@@ -515,6 +517,8 @@ extern int ata_ratelimit(void);
 extern unsigned int ata_busy_sleep(struct ata_port *ap,
 				   unsigned long timeout_pat,
 				   unsigned long timeout);
+extern void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *),
+				void *data, unsigned long delay);
 
 /*
  * Default driver ops implementations
-- 
cgit v1.2.3


From 507ceda00302c071029277652d9faa5a0a55419a Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 5 Mar 2006 15:29:09 +0900
Subject: [PATCH] libata: kill unused pio_task and packet_task

Kill unused pio_task and packet_task.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 68 ----------------------------------------------
 include/linux/libata.h     |  3 --
 2 files changed, 71 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 16a108d2f9a9..2974438d3bb7 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -796,71 +796,6 @@ void ata_port_flush_task(struct ata_port *ap)
 	DPRINTK("EXIT\n");
 }
 
-static inline void
-ata_queue_packet_task(struct ata_port *ap)
-{
-	if (!(ap->flags & ATA_FLAG_FLUSH_PIO_TASK))
-		queue_work(ata_wq, &ap->packet_task);
-}
-
-static inline void
-ata_queue_pio_task(struct ata_port *ap)
-{
-	if (!(ap->flags & ATA_FLAG_FLUSH_PIO_TASK))
-		queue_work(ata_wq, &ap->pio_task);
-}
-
-static inline void
-ata_queue_delayed_pio_task(struct ata_port *ap, unsigned long delay)
-{
-	if (!(ap->flags & ATA_FLAG_FLUSH_PIO_TASK))
-		queue_delayed_work(ata_wq, &ap->pio_task, delay);
-}
-
-/**
- *	ata_flush_pio_tasks - Flush pio_task and packet_task
- *	@ap: the target ata_port
- *
- *	After this function completes, pio_task and packet_task are
- *	guranteed not to be running or scheduled.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep)
- */
-
-static void ata_flush_pio_tasks(struct ata_port *ap)
-{
-	int tmp = 0;
-	unsigned long flags;
-
-	DPRINTK("ENTER\n");
-
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-	ap->flags |= ATA_FLAG_FLUSH_PIO_TASK;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
-	DPRINTK("flush #1\n");
-	flush_workqueue(ata_wq);
-
-	/*
-	 * At this point, if a task is running, it's guaranteed to see
-	 * the FLUSH flag; thus, it will never queue pio tasks again.
-	 * Cancel and flush.
-	 */
-	tmp |= cancel_delayed_work(&ap->pio_task);
-	tmp |= cancel_delayed_work(&ap->packet_task);
-	if (!tmp) {
-		DPRINTK("flush #2\n");
-		flush_workqueue(ata_wq);
-	}
-
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-	ap->flags &= ~ATA_FLAG_FLUSH_PIO_TASK;
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
-	DPRINTK("EXIT\n");
-}
-
 void ata_qc_complete_internal(struct ata_queued_cmd *qc)
 {
 	struct completion *waiting = qc->private_data;
@@ -3814,7 +3749,6 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
 
 	DPRINTK("ENTER\n");
 
-	ata_flush_pio_tasks(ap);
 	ap->hsm_task_state = HSM_ST_IDLE;
 
 	spin_lock_irqsave(&host_set->lock, flags);
@@ -4693,8 +4627,6 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->last_ctl = 0xFF;
 
 	INIT_WORK(&ap->port_task, NULL, NULL);
-	INIT_WORK(&ap->packet_task, atapi_packet_task, ap);
-	INIT_WORK(&ap->pio_task, ata_pio_task, ap);
 	INIT_LIST_HEAD(&ap->eh_done_q);
 
 	for (i = 0; i < ATA_MAX_DEVICES; i++)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 3ad2570f663b..9ad020ac8591 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -390,9 +390,6 @@ struct ata_port {
 
 	struct work_struct	port_task;
 
-	struct work_struct	packet_task;
-
-	struct work_struct	pio_task;
 	unsigned int		hsm_task_state;
 	unsigned long		pio_task_timeout;
 
-- 
cgit v1.2.3


From 2e755f68ee23b03484fde18d978f910cc5479cb8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sun, 5 Mar 2006 15:29:09 +0900
Subject: [PATCH] libata: rename ATA_FLAG_FLUSH_PIO_TASK to
 ATA_FLAG_FLUSH_PORT_TASK

Rename ATA_FLAG_FLUSH_PIO_TASK to ATA_FLAG_FLUSH_PORT_TASK.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 6 +++---
 include/linux/libata.h     | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 2974438d3bb7..6d8aa86f2f6b 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -742,7 +742,7 @@ void ata_port_queue_task(struct ata_port *ap, void (*fn)(void *), void *data,
 {
 	int rc;
 
-	if (ap->flags & ATA_FLAG_FLUSH_PIO_TASK)
+	if (ap->flags & ATA_FLAG_FLUSH_PORT_TASK)
 		return;
 
 	PREPARE_WORK(&ap->port_task, fn, data);
@@ -773,7 +773,7 @@ void ata_port_flush_task(struct ata_port *ap)
 	DPRINTK("ENTER\n");
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
-	ap->flags |= ATA_FLAG_FLUSH_PIO_TASK;
+	ap->flags |= ATA_FLAG_FLUSH_PORT_TASK;
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
 	DPRINTK("flush #1\n");
@@ -790,7 +790,7 @@ void ata_port_flush_task(struct ata_port *ap)
 	}
 
 	spin_lock_irqsave(&ap->host_set->lock, flags);
-	ap->flags &= ~ATA_FLAG_FLUSH_PIO_TASK;
+	ap->flags &= ~ATA_FLAG_FLUSH_PORT_TASK;
 	spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
 	DPRINTK("EXIT\n");
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9ad020ac8591..15674923cc84 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -151,7 +151,7 @@ enum {
 	ATA_FLAG_PIO_LBA48	= (1 << 13), /* Host DMA engine is LBA28 only */
 	ATA_FLAG_IRQ_MASK	= (1 << 14), /* Mask IRQ in PIO xfers */
 
-	ATA_FLAG_FLUSH_PIO_TASK	= (1 << 15), /* Flush PIO task */
+	ATA_FLAG_FLUSH_PORT_TASK = (1 << 15), /* Flush port task */
 	ATA_FLAG_IN_EH		= (1 << 16), /* EH in progress */
 
 	ATA_QCFLAG_ACTIVE	= (1 << 1), /* cmd not yet ack'd to scsi lyer */
-- 
cgit v1.2.3


From 1da7b0d01b20bf21f3263d8d2f17fa49a214d773 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 6 Mar 2006 04:31:56 +0900
Subject: [PATCH] libata: improve xfer mask constants and update
 ata_mode_string()

Add ATA_BITS_*, ATA_MASK_* macros and reorder xfer_mask fields such
that higher transfer mode is placed at higher order bit.  As thie
reordering breaks ata_mode_string(), this patch also rewrites
ata_mode_string().

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 44 +++++++++++++++++---------------------------
 include/linux/libata.h     | 16 ++++++++++++----
 2 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 6d8aa86f2f6b..18418c82d6f6 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -232,6 +232,14 @@ int ata_rwcmd_protocol(struct ata_queued_cmd *qc)
 }
 
 static const char * const xfer_mode_str[] = {
+	"PIO0",
+	"PIO1",
+	"PIO2",
+	"PIO3",
+	"PIO4",
+	"MWDMA0",
+	"MWDMA1",
+	"MWDMA2",
 	"UDMA/16",
 	"UDMA/25",
 	"UDMA/33",
@@ -240,49 +248,31 @@ static const char * const xfer_mode_str[] = {
 	"UDMA/100",
 	"UDMA/133",
 	"UDMA7",
-	"MWDMA0",
-	"MWDMA1",
-	"MWDMA2",
-	"PIO0",
-	"PIO1",
-	"PIO2",
-	"PIO3",
-	"PIO4",
 };
 
 /**
- *	ata_udma_string - convert UDMA bit offset to string
- *	@mask: mask of bits supported; only highest bit counts.
+ *	ata_mode_string - convert xfer_mask to string
+ *	@xfer_mask: mask of bits supported; only highest bit counts.
  *
  *	Determine string which represents the highest speed
- *	(highest bit in @udma_mask).
+ *	(highest bit in @modemask).
  *
  *	LOCKING:
  *	None.
  *
  *	RETURNS:
  *	Constant C string representing highest speed listed in
- *	@udma_mask, or the constant C string "<n/a>".
+ *	@mode_mask, or the constant C string "<n/a>".
  */
 
-static const char *ata_mode_string(unsigned int mask)
+static const char *ata_mode_string(unsigned int xfer_mask)
 {
-	int i;
-
-	for (i = 7; i >= 0; i--)
-		if (mask & (1 << i))
-			goto out;
-	for (i = ATA_SHIFT_MWDMA + 2; i >= ATA_SHIFT_MWDMA; i--)
-		if (mask & (1 << i))
-			goto out;
-	for (i = ATA_SHIFT_PIO + 4; i >= ATA_SHIFT_PIO; i--)
-		if (mask & (1 << i))
-			goto out;
+	int highbit;
 
+	highbit = fls(xfer_mask) - 1;
+	if (highbit >= 0 && highbit < ARRAY_SIZE(xfer_mode_str))
+		return xfer_mode_str[highbit];
 	return "<n/a>";
-
-out:
-	return xfer_mode_str[i];
 }
 
 /**
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 15674923cc84..239408ecfddf 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -188,11 +188,19 @@ enum {
 	PORT_DISABLED		= 2,
 
 	/* encoding various smaller bitmaps into a single
-	 * unsigned long bitmap
+	 * unsigned int bitmap
 	 */
-	ATA_SHIFT_UDMA		= 0,
-	ATA_SHIFT_MWDMA		= 8,
-	ATA_SHIFT_PIO		= 11,
+	ATA_BITS_PIO		= 5,
+	ATA_BITS_MWDMA		= 3,
+	ATA_BITS_UDMA		= 8,
+
+	ATA_SHIFT_PIO		= 0,
+	ATA_SHIFT_MWDMA		= ATA_SHIFT_PIO + ATA_BITS_PIO,
+	ATA_SHIFT_UDMA		= ATA_SHIFT_MWDMA + ATA_BITS_MWDMA,
+
+	ATA_MASK_PIO		= ((1 << ATA_BITS_PIO) - 1) << ATA_SHIFT_PIO,
+	ATA_MASK_MWDMA		= ((1 << ATA_BITS_MWDMA) - 1) << ATA_SHIFT_MWDMA,
+	ATA_MASK_UDMA		= ((1 << ATA_BITS_UDMA) - 1) << ATA_SHIFT_UDMA,
 
 	/* size of buffer to pad xfers ending on unaligned boundaries */
 	ATA_DMA_PAD_SZ		= 4,
-- 
cgit v1.2.3


From 044cc6c8ec311c4ddeebfcc31c53dea282de70b7 Mon Sep 17 00:00:00 2001
From: "andrew.vasquez@qlogic.com" <andrew.vasquez@qlogic.com>
Date: Thu, 9 Mar 2006 14:27:13 -0800
Subject: [SCSI] qla2xxx: Add ISP54xx support.

Chip is similar in form to our ISP24xx offering.

Signed-off-by: Andrew Vasquez <andrew.vasquez@qlogic.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/qla2xxx/ql2400.c     | 27 +++++++++++++++++++++++++
 drivers/scsi/qla2xxx/qla_attr.c   |  8 ++++----
 drivers/scsi/qla2xxx/qla_def.h    | 12 +++++------
 drivers/scsi/qla2xxx/qla_gs.c     | 10 ++++------
 drivers/scsi/qla2xxx/qla_init.c   | 14 ++++++-------
 drivers/scsi/qla2xxx/qla_inline.h |  2 +-
 drivers/scsi/qla2xxx/qla_iocb.c   |  6 +++---
 drivers/scsi/qla2xxx/qla_isr.c    | 16 +++++++--------
 drivers/scsi/qla2xxx/qla_mbx.c    | 42 +++++++++++++++++++--------------------
 drivers/scsi/qla2xxx/qla_os.c     | 18 ++++++++++-------
 include/linux/pci_ids.h           |  2 ++
 11 files changed, 94 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla2xxx/ql2400.c b/drivers/scsi/qla2xxx/ql2400.c
index 6c7165f47e29..77914fcfa2bc 100644
--- a/drivers/scsi/qla2xxx/ql2400.c
+++ b/drivers/scsi/qla2xxx/ql2400.c
@@ -49,6 +49,18 @@ static struct qla_board_info qla_board_tbl[] = {
 		.fw_info	= qla_fw_tbl,
 		.fw_fname	= "ql2400_fw.bin",
 	},
+	{
+		.drv_name	= qla_driver_name,
+		.isp_name	= "ISP5422",
+		.fw_info	= qla_fw_tbl,
+		.fw_fname	= "ql2400_fw.bin",
+	},
+	{
+		.drv_name	= qla_driver_name,
+		.isp_name	= "ISP5432",
+		.fw_info	= qla_fw_tbl,
+		.fw_fname	= "ql2400_fw.bin",
+	},
 };
 
 static struct pci_device_id qla24xx_pci_tbl[] = {
@@ -66,6 +78,21 @@ static struct pci_device_id qla24xx_pci_tbl[] = {
 		.subdevice	= PCI_ANY_ID,
 		.driver_data	= (unsigned long)&qla_board_tbl[1],
 	},
+	{
+		.vendor		= PCI_VENDOR_ID_QLOGIC,
+		.device		= PCI_DEVICE_ID_QLOGIC_ISP5422,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.driver_data	= (unsigned long)&qla_board_tbl[2],
+	},
+	{
+		.vendor		= PCI_VENDOR_ID_QLOGIC,
+		.device		= PCI_DEVICE_ID_QLOGIC_ISP5432,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.driver_data	= (unsigned long)&qla_board_tbl[3],
+	},
+
 	{0, 0},
 };
 MODULE_DEVICE_TABLE(pci, qla24xx_pci_tbl);
diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c
index 92b3e13e9061..2b9e329a240c 100644
--- a/drivers/scsi/qla2xxx/qla_attr.c
+++ b/drivers/scsi/qla2xxx/qla_attr.c
@@ -50,7 +50,7 @@ qla2x00_sysfs_write_fw_dump(struct kobject *kobj, char *buf, loff_t off,
 			    ha->host_no);
 
 			vfree(ha->fw_dump_buffer);
-			if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha))
+			if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha))
 				free_pages((unsigned long)ha->fw_dump,
 				    ha->fw_dump_order);
 
@@ -64,7 +64,7 @@ qla2x00_sysfs_write_fw_dump(struct kobject *kobj, char *buf, loff_t off,
 		if ((ha->fw_dump || ha->fw_dumped) && !ha->fw_dump_reading) {
 			ha->fw_dump_reading = 1;
 
-			if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+			if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 				dump_size = FW_DUMP_SIZE_24XX;
 			else {
 				dump_size = FW_DUMP_SIZE_1M;
@@ -138,7 +138,7 @@ qla2x00_sysfs_write_nvram(struct kobject *kobj, char *buf, loff_t off,
 		return 0;
 
 	/* Checksum NVRAM. */
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		uint32_t *iter;
 		uint32_t chksum;
 
@@ -750,7 +750,7 @@ qla2x00_get_fc_host_stats(struct Scsi_Host *shost)
 	pfc_host_stat = &ha->fc_host_stat;
 	memset(pfc_host_stat, -1, sizeof(struct fc_host_statistics));
 
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		rval = qla24xx_get_isp_stats(ha, (uint32_t *)&stat_buf,
 		    sizeof(stat_buf) / 4, mb_stat);
 	} else {
diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 00b7e82b99b3..e1a7769008ee 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -2234,9 +2234,9 @@ typedef struct scsi_qla_host {
 #define DT_ISP6322			BIT_6
 #define DT_ISP2422			BIT_7
 #define DT_ISP2432			BIT_8
-#define DT_ISP2512			BIT_9
-#define DT_ISP2522			BIT_10
-#define DT_ISP_LAST			(DT_ISP2522 << 1)
+#define DT_ISP5422			BIT_9
+#define DT_ISP5432			BIT_10
+#define DT_ISP_LAST			(DT_ISP5432 << 1)
 
 #define DT_OEM_001			BIT_29
 #define DT_ISP2200A			BIT_30
@@ -2252,13 +2252,13 @@ typedef struct scsi_qla_host {
 #define IS_QLA6322(ha)	(DT_MASK(ha) & DT_ISP6322)
 #define IS_QLA2422(ha)	(DT_MASK(ha) & DT_ISP2422)
 #define IS_QLA2432(ha)	(DT_MASK(ha) & DT_ISP2432)
-#define IS_QLA2512(ha)	(DT_MASK(ha) & DT_ISP2512)
-#define IS_QLA2522(ha)	(DT_MASK(ha) & DT_ISP2522)
+#define IS_QLA5422(ha)	(DT_MASK(ha) & DT_ISP5422)
+#define IS_QLA5432(ha)	(DT_MASK(ha) & DT_ISP5432)
 
 #define IS_QLA23XX(ha)	(IS_QLA2300(ha) || IS_QLA2312(ha) || IS_QLA2322(ha) || \
     			 IS_QLA6312(ha) || IS_QLA6322(ha))
 #define IS_QLA24XX(ha)	(IS_QLA2422(ha) || IS_QLA2432(ha))
-#define IS_QLA25XX(ha)	(IS_QLA2512(ha) || IS_QLA2522(ha))
+#define IS_QLA54XX(ha)	(IS_QLA5422(ha) || IS_QLA5432(ha))
 
 #define IS_OEM_001(ha)		((ha)->device_type & DT_OEM_001)
 #define HAS_EXTENDED_IDS(ha)	((ha)->device_type & DT_EXTENDED_IDS)
diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c
index d620a8e8a614..2ebf259fccb2 100644
--- a/drivers/scsi/qla2xxx/qla_gs.c
+++ b/drivers/scsi/qla2xxx/qla_gs.c
@@ -126,7 +126,7 @@ qla2x00_chk_ms_status(scsi_qla_host_t *ha, ms_iocb_entry_t *ms_pkt,
 		DEBUG2_3(printk("scsi(%ld): %s failed, error status (%x).\n",
 		    ha->host_no, routine, ms_pkt->entry_status));
 	} else {
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 			comp_status =
 			    ((struct ct_entry_24xx *)ms_pkt)->comp_status;
 		else
@@ -1200,7 +1200,7 @@ qla2x00_update_ms_fdmi_iocb(scsi_qla_host_t *ha, uint32_t req_size)
 	ms_iocb_entry_t *ms_pkt = ha->ms_iocb;
 	struct ct_entry_24xx *ct_pkt = (struct ct_entry_24xx *)ha->ms_iocb;
 
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		ct_pkt->cmd_byte_count = cpu_to_le32(req_size);
 		ct_pkt->dseg_0_len = ct_pkt->cmd_byte_count;
 	} else {
@@ -1529,9 +1529,7 @@ qla2x00_fdmi_rpa(scsi_qla_host_t *ha)
 	eiter = (struct ct_fdmi_port_attr *) (entries + size);
 	eiter->type = __constant_cpu_to_be16(FDMI_PORT_SUPPORT_SPEED);
 	eiter->len = __constant_cpu_to_be16(4 + 4);
-	if (IS_QLA25XX(ha))
-		eiter->a.sup_speed = __constant_cpu_to_be32(8);
-	else if (IS_QLA24XX(ha))
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 		eiter->a.sup_speed = __constant_cpu_to_be32(4);
 	else if (IS_QLA23XX(ha))
 		eiter->a.sup_speed = __constant_cpu_to_be32(2);
@@ -1566,7 +1564,7 @@ qla2x00_fdmi_rpa(scsi_qla_host_t *ha)
 	eiter = (struct ct_fdmi_port_attr *) (entries + size);
 	eiter->type = __constant_cpu_to_be16(FDMI_PORT_MAX_FRAME_SIZE);
 	eiter->len = __constant_cpu_to_be16(4 + 4);
-	max_frame_size = IS_QLA24XX(ha) || IS_QLA25XX(ha) ?
+	max_frame_size = IS_QLA24XX(ha) || IS_QLA54XX(ha) ?
 		(uint32_t) icb24->frame_payload_size:
 		(uint32_t) ha->init_cb->frame_payload_size;
 	eiter->a.max_frame_size = cpu_to_be32(max_frame_size);
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
index f49eb06d0dbd..e6a2292a2892 100644
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -387,7 +387,7 @@ qla2x00_isp_firmware(scsi_qla_host_t *ha)
 
 		/* Verify checksum of loaded RISC code. */
 		rval = qla2x00_verify_checksum(ha,
-		    IS_QLA24XX(ha) || IS_QLA25XX(ha) ? RISC_SADDRESS :
+		    IS_QLA24XX(ha) || IS_QLA54XX(ha) ? RISC_SADDRESS :
 		    *ha->brd_info->fw_info[0].fwstart);
 	}
 
@@ -822,7 +822,7 @@ qla2x00_resize_request_q(scsi_qla_host_t *ha)
 	if (IS_QLA2100(ha) || IS_QLA2200(ha))
 		return;
 
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 		qla2x00_alloc_fw_dump(ha);
 
 	/* Retrieve IOCB counts available to the firmware. */
@@ -2123,7 +2123,7 @@ qla2x00_configure_fabric(scsi_qla_host_t *ha)
 	LIST_HEAD(new_fcports);
 
 	/* If FL port exists, then SNS is present */
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 		loop_id = NPH_F_PORT;
 	else
 		loop_id = SNS_FL_PORT;
@@ -2149,7 +2149,7 @@ qla2x00_configure_fabric(scsi_qla_host_t *ha)
 			qla2x00_fdmi_register(ha);
 
 		/* Ensure we are logged into the SNS. */
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 			loop_id = NPH_SNS;
 		else
 			loop_id = SIMPLE_NAME_SERVER;
@@ -2640,7 +2640,7 @@ qla2x00_device_resync(scsi_qla_host_t *ha)
 			if (ql2xprocessrscn &&
 			    !IS_QLA2100(ha) && !IS_QLA2200(ha) &&
 			    !IS_QLA6312(ha) && !IS_QLA6322(ha) &&
-			    !IS_QLA24XX(ha) && !IS_QLA25XX(ha) &&
+			    !IS_QLA24XX(ha) && !IS_QLA54XX(ha) &&
 			    ha->flags.init_done) {
 				/* Handle port RSCN via asyncronous IOCBs */
 				rval2 = qla2x00_handle_port_rscn(ha, rscn_entry,
@@ -3130,7 +3130,7 @@ qla2x00_restart_isp(scsi_qla_host_t *ha)
 
 			spin_lock_irqsave(&ha->hardware_lock, flags);
 
-			if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha)) {
+			if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha)) {
 				/*
 				 * Disable SRAM, Instruction RAM and GP RAM
 				 * parity.
@@ -3146,7 +3146,7 @@ qla2x00_restart_isp(scsi_qla_host_t *ha)
 
 			spin_lock_irqsave(&ha->hardware_lock, flags);
 
-			if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha)) {
+			if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha)) {
 				/* Enable proper parity */
 				if (IS_QLA2300(ha))
 					/* SRAM parity */
diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
index ecc3741a452e..45007ee58067 100644
--- a/drivers/scsi/qla2xxx/qla_inline.h
+++ b/drivers/scsi/qla2xxx/qla_inline.h
@@ -163,7 +163,7 @@ static inline int qla2x00_is_reserved_id(scsi_qla_host_t *, uint16_t);
 static inline int
 qla2x00_is_reserved_id(scsi_qla_host_t *ha, uint16_t loop_id)
 {
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 		return (loop_id > NPH_LAST_HANDLE);
 
 	return ((loop_id > ha->last_loop_id && loop_id < SNS_FIRST_LOOP_ID) ||
diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c
index 6544b6d0891d..8f0f4a298357 100644
--- a/drivers/scsi/qla2xxx/qla_iocb.c
+++ b/drivers/scsi/qla2xxx/qla_iocb.c
@@ -466,7 +466,7 @@ __qla2x00_marker(scsi_qla_host_t *ha, uint16_t loop_id, uint16_t lun,
 	mrk->entry_type = MARKER_TYPE;
 	mrk->modifier = type;
 	if (type != MK_SYNC_ALL) {
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 			mrk24 = (struct mrk_entry_24xx *) mrk;
 			mrk24->nport_handle = cpu_to_le16(loop_id);
 			mrk24->lun[1] = LSB(lun);
@@ -519,7 +519,7 @@ qla2x00_req_pkt(scsi_qla_host_t *ha)
 	for (timer = HZ; timer; timer--) {
 		if ((req_cnt + 2) >= ha->req_q_cnt) {
 			/* Calculate number of free request entries. */
-			if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+			if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 				cnt = (uint16_t)RD_REG_DWORD(
 				    &reg->isp24.req_q_out);
 			else
@@ -593,7 +593,7 @@ qla2x00_isp_cmd(scsi_qla_host_t *ha)
 		ha->request_ring_ptr++;
 
 	/* Set chip new ring index. */
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		WRT_REG_DWORD(&reg->isp24.req_q_in, ha->req_ring_index);
 		RD_REG_DWORD_RELAXED(&reg->isp24.req_q_in);
 	} else {
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index c15458c2bf32..2003dbb70579 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -343,7 +343,7 @@ qla2x00_async_event(scsi_qla_host_t *ha, uint16_t *mb)
 
 		ha->isp_ops.fw_dump(ha, 1);
 
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 			if (mb[1] == 0 && mb[2] == 0) {
 				qla_printk(KERN_ERR, ha,
 				    "Unrecoverable Hardware Error: adapter "
@@ -521,7 +521,7 @@ qla2x00_async_event(scsi_qla_host_t *ha, uint16_t *mb)
 		 */
 		if (ql2xprocessrscn &&
 		    !IS_QLA2100(ha) && !IS_QLA2200(ha) && !IS_QLA6312(ha) &&
-		    !IS_QLA6322(ha) && !IS_QLA24XX(ha) && !IS_QLA25XX(ha) &&
+		    !IS_QLA6322(ha) && !IS_QLA24XX(ha) && !IS_QLA54XX(ha) &&
 		    ha->flags.init_done && mb[1] != 0xffff &&
 		    ((ha->operating_mode == P2P && mb[1] != 0) ||
 		    (ha->operating_mode != P2P && mb[1] !=
@@ -638,7 +638,7 @@ qla2x00_async_event(scsi_qla_host_t *ha, uint16_t *mb)
 		    "scsi(%ld): [R|Z]IO update completion.\n",
 		    ha->host_no));
 
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 			qla24xx_process_response_queue(ha);
 		else
 			qla2x00_process_response_queue(ha);
@@ -810,7 +810,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void *pkt)
 
 	sts = (sts_entry_t *) pkt;
 	sts24 = (struct sts_entry_24xx *) pkt;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		comp_status = le16_to_cpu(sts24->comp_status);
 		scsi_status = le16_to_cpu(sts24->scsi_status) & SS_MASK;
 	} else {
@@ -860,7 +860,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void *pkt)
 	fcport = sp->fcport;
 
 	sense_len = rsp_info_len = resid_len = 0;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		sense_len = le32_to_cpu(sts24->sense_len);
 		rsp_info_len = le32_to_cpu(sts24->rsp_data_len);
 		resid_len = le32_to_cpu(sts24->rsp_residual_count);
@@ -878,7 +878,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void *pkt)
 	/* Check for any FCP transport errors. */
 	if (scsi_status & SS_RESPONSE_INFO_LEN_VALID) {
 		/* Sense data lies beyond any FCP RESPONSE data. */
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 			sense_data += rsp_info_len;
 		if (rsp_info_len > 3 && rsp_info[3]) {
 			DEBUG2(printk("scsi(%ld:%d:%d:%d) FCP I/O protocol "
@@ -1117,7 +1117,7 @@ qla2x00_status_entry(scsi_qla_host_t *ha, void *pkt)
 	case CS_TIMEOUT:
 		cp->result = DID_BUS_BUSY << 16;
 
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 			DEBUG2(printk(KERN_INFO
 			    "scsi(%ld:%d:%d:%d): TIMEOUT status detected "
 			    "0x%x-0x%x\n", ha->host_no, cp->device->channel,
@@ -1197,7 +1197,7 @@ qla2x00_status_cont_entry(scsi_qla_host_t *ha, sts_cont_entry_t *pkt)
 		}
 
 		/* Move sense data. */
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 			host_to_fcp_swap(pkt->data, sizeof(pkt->data));
 		memcpy(sp->request_sense_ptr, pkt->data, sense_sz);
 		DEBUG5(qla2x00_dump_buffer(sp->request_sense_ptr, sense_sz));
diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
index 584cc2f6dd35..267435f17482 100644
--- a/drivers/scsi/qla2xxx/qla_mbx.c
+++ b/drivers/scsi/qla2xxx/qla_mbx.c
@@ -91,7 +91,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp)
 	spin_lock_irqsave(&ha->hardware_lock, flags);
 
 	/* Load mailbox registers. */
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 		optr = (uint16_t __iomem *)&reg->isp24.mailbox0;
 	else
 		optr = (uint16_t __iomem *)MAILBOX_REG(ha, &reg->isp, 0);
@@ -155,7 +155,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp)
 
 		set_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
 
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 			WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_SET_HOST_INT);
 		else
 			WRT_REG_WORD(&reg->isp.hccr, HCCR_SET_HOST_INT);
@@ -179,7 +179,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp)
 		DEBUG3_11(printk("%s(%ld): cmd=%x POLLING MODE.\n", __func__,
 		    ha->host_no, command);)
 
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha))
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha))
 			WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_SET_HOST_INT);
 		else
 			WRT_REG_WORD(&reg->isp.hccr, HCCR_SET_HOST_INT);
@@ -237,7 +237,7 @@ qla2x00_mailbox_command(scsi_qla_host_t *ha, mbx_cmd_t *mcp)
 		uint16_t mb0;
 		uint32_t ictrl;
 
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 			mb0 = RD_REG_WORD(&reg->isp24.mailbox0);
 			ictrl = RD_REG_DWORD(&reg->isp24.ictrl);
 		} else {
@@ -334,7 +334,7 @@ qla2x00_load_ram(scsi_qla_host_t *ha, dma_addr_t req_dma, uint32_t risc_addr,
 
 	DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no));
 
-	if (MSW(risc_addr) || IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (MSW(risc_addr) || IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[0] = MBC_LOAD_RISC_RAM_EXTENDED;
 		mcp->mb[8] = MSW(risc_addr);
 		mcp->out_mb = MBX_8|MBX_0;
@@ -348,7 +348,7 @@ qla2x00_load_ram(scsi_qla_host_t *ha, dma_addr_t req_dma, uint32_t risc_addr,
 	mcp->mb[6] = MSW(MSD(req_dma));
 	mcp->mb[7] = LSW(MSD(req_dma));
 	mcp->out_mb |= MBX_7|MBX_6|MBX_3|MBX_2|MBX_1;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[4] = MSW(risc_code_size);
 		mcp->mb[5] = LSW(risc_code_size);
 		mcp->out_mb |= MBX_5|MBX_4;
@@ -399,7 +399,7 @@ qla2x00_execute_fw(scsi_qla_host_t *ha, uint32_t risc_addr)
 	mcp->mb[0] = MBC_EXECUTE_FIRMWARE;
 	mcp->out_mb = MBX_0;
 	mcp->in_mb = MBX_0;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[1] = MSW(risc_addr);
 		mcp->mb[2] = LSW(risc_addr);
 		mcp->mb[3] = 0;
@@ -422,7 +422,7 @@ qla2x00_execute_fw(scsi_qla_host_t *ha, uint32_t risc_addr)
 		DEBUG2_3_11(printk("%s(%ld): failed=%x mb[0]=%x.\n", __func__,
 		    ha->host_no, rval, mcp->mb[0]));
 	} else {
-		if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+		if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 			DEBUG11(printk("%s(%ld): done exchanges=%x.\n",
 			    __func__, ha->host_no, mcp->mb[1]);)
 		} else {
@@ -563,7 +563,7 @@ qla2x00_set_fw_options(scsi_qla_host_t *ha, uint16_t *fwopts)
 	mcp->mb[3] = fwopts[3];
 	mcp->out_mb = MBX_3|MBX_2|MBX_1|MBX_0;
 	mcp->in_mb = MBX_0;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->in_mb |= MBX_1;
 	} else {
 		mcp->mb[10] = fwopts[10];
@@ -676,7 +676,7 @@ qla2x00_verify_checksum(scsi_qla_host_t *ha, uint32_t risc_addr)
 	mcp->mb[0] = MBC_VERIFY_CHECKSUM;
 	mcp->out_mb = MBX_0;
 	mcp->in_mb = MBX_0;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[1] = MSW(risc_addr);
 		mcp->mb[2] = LSW(risc_addr);
 		mcp->out_mb |= MBX_2|MBX_1;
@@ -693,7 +693,7 @@ qla2x00_verify_checksum(scsi_qla_host_t *ha, uint32_t risc_addr)
 
 	if (rval != QLA_SUCCESS) {
 		DEBUG2_3_11(printk("%s(%ld): failed=%x chk sum=%x.\n", __func__,
-		    ha->host_no, rval, (IS_QLA24XX(ha) || IS_QLA25XX(ha) ?
+		    ha->host_no, rval, (IS_QLA24XX(ha) || IS_QLA54XX(ha) ?
 		    (mcp->mb[2] << 16) | mcp->mb[1]: mcp->mb[1]));)
 	} else {
 		DEBUG11(printk("%s(%ld): done.\n", __func__, ha->host_no);)
@@ -751,7 +751,7 @@ qla2x00_issue_iocb(scsi_qla_host_t *ha, void*  buffer, dma_addr_t phys_addr,
 
 		/* Mask reserved bits. */
 		sts_entry->entry_status &=
-		    IS_QLA24XX(ha) || IS_QLA25XX(ha) ? RF_MASK_24XX :RF_MASK;
+		    IS_QLA24XX(ha) || IS_QLA54XX(ha) ? RF_MASK_24XX :RF_MASK;
 	}
 
 	return rval;
@@ -1091,7 +1091,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt)
 	memset(pd, 0, max(PORT_DATABASE_SIZE, PORT_DATABASE_24XX_SIZE));
 
 	mcp->mb[0] = MBC_GET_PORT_DATABASE;
-	if (opt != 0 && !IS_QLA24XX(ha) && !IS_QLA25XX(ha))
+	if (opt != 0 && !IS_QLA24XX(ha) && !IS_QLA54XX(ha))
 		mcp->mb[0] = MBC_ENHANCED_GET_PORT_DATABASE;
 	mcp->mb[2] = MSW(pd_dma);
 	mcp->mb[3] = LSW(pd_dma);
@@ -1099,7 +1099,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt)
 	mcp->mb[7] = LSW(MSD(pd_dma));
 	mcp->out_mb = MBX_7|MBX_6|MBX_3|MBX_2|MBX_0;
 	mcp->in_mb = MBX_0;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[1] = fcport->loop_id;
 		mcp->mb[10] = opt;
 		mcp->out_mb |= MBX_10|MBX_1;
@@ -1112,7 +1112,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt)
 		mcp->mb[1] = fcport->loop_id << 8 | opt;
 		mcp->out_mb |= MBX_1;
 	}
-	mcp->buf_size = (IS_QLA24XX(ha) || IS_QLA25XX(ha) ?
+	mcp->buf_size = (IS_QLA24XX(ha) || IS_QLA54XX(ha) ?
 	    PORT_DATABASE_24XX_SIZE : PORT_DATABASE_SIZE);
 	mcp->flags = MBX_DMA_IN;
 	mcp->tov = (ha->login_timeout * 2) + (ha->login_timeout / 2);
@@ -1120,7 +1120,7 @@ qla2x00_get_port_database(scsi_qla_host_t *ha, fc_port_t *fcport, uint8_t opt)
 	if (rval != QLA_SUCCESS)
 		goto gpd_error_out;
 
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		pd24 = (struct port_database_24xx *) pd;
 
 		/* Check for logged in state. */
@@ -1337,7 +1337,7 @@ qla2x00_lip_reset(scsi_qla_host_t *ha)
 
 	DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no);)
 
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[0] = MBC_LIP_FULL_LOGIN;
 		mcp->mb[1] = BIT_0;
 		mcp->mb[2] = 0xff;
@@ -1866,7 +1866,7 @@ qla2x00_get_id_list(scsi_qla_host_t *ha, void *id_list, dma_addr_t id_list_dma,
 
 	mcp->mb[0] = MBC_GET_ID_LIST;
 	mcp->out_mb = MBX_0;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[2] = MSW(id_list_dma);
 		mcp->mb[3] = LSW(id_list_dma);
 		mcp->mb[6] = MSW(MSD(id_list_dma));
@@ -2057,7 +2057,7 @@ qla2x00_get_link_status(scsi_qla_host_t *ha, uint16_t loop_id,
 	mcp->mb[7] = LSW(MSD(stat_buf_dma));
 	mcp->out_mb = MBX_7|MBX_6|MBX_3|MBX_2|MBX_0;
 	mcp->in_mb = MBX_0;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		mcp->mb[1] = loop_id;
 		mcp->mb[4] = 0;
 		mcp->mb[10] = 0;
@@ -2324,7 +2324,7 @@ qla2x00_system_error(scsi_qla_host_t *ha)
 	mbx_cmd_t mc;
 	mbx_cmd_t *mcp = &mc;
 
-	if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha))
+	if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha))
 		return QLA_FUNCTION_FAILED;
 
 	DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no));
@@ -2434,7 +2434,7 @@ qla2x00_stop_firmware(scsi_qla_host_t *ha)
 	mbx_cmd_t mc;
 	mbx_cmd_t *mcp = &mc;
 
-	if (!IS_QLA24XX(ha) && !IS_QLA25XX(ha))
+	if (!IS_QLA24XX(ha) && !IS_QLA54XX(ha))
 		return QLA_FUNCTION_FAILED;
 
 	DEBUG11(printk("%s(%ld): entered.\n", __func__, ha->host_no));
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 757c4c43c453..131614751196 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1183,11 +1183,11 @@ qla2x00_set_isp_flags(scsi_qla_host_t *ha)
 	case PCI_DEVICE_ID_QLOGIC_ISP2432:
 		ha->device_type |= DT_ISP2432;
 		break;
-	case PCI_DEVICE_ID_QLOGIC_ISP2512:
-		ha->device_type |= DT_ISP2512;
+	case PCI_DEVICE_ID_QLOGIC_ISP5422:
+		ha->device_type |= DT_ISP5422;
 		break;
-	case PCI_DEVICE_ID_QLOGIC_ISP2522:
-		ha->device_type |= DT_ISP2522;
+	case PCI_DEVICE_ID_QLOGIC_ISP5432:
+		ha->device_type |= DT_ISP5432;
 		break;
 	}
 }
@@ -1433,7 +1433,7 @@ int qla2x00_probe_one(struct pci_dev *pdev, struct qla_board_info *brd_info)
 		ha->gid_list_info_size = 6;
 		if (IS_QLA2322(ha) || IS_QLA6322(ha))
 			ha->optrom_size = OPTROM_SIZE_2322;
-	} else if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	} else if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		host->max_id = MAX_TARGETS_2200;
 		ha->mbx_count = MAILBOX_REGISTER_COUNT;
 		ha->request_q_length = REQUEST_ENTRY_CNT_24XX;
@@ -1559,7 +1559,7 @@ int qla2x00_probe_one(struct pci_dev *pdev, struct qla_board_info *brd_info)
 
 	spin_lock_irqsave(&ha->hardware_lock, flags);
 	reg = ha->iobase;
-	if (IS_QLA24XX(ha) || IS_QLA25XX(ha)) {
+	if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_CLR_HOST_INT);
 		WRT_REG_DWORD(&reg->isp24.hccr, HCCRX_CLR_RISC_INT);
 	} else {
@@ -2631,7 +2631,7 @@ qla2x00_request_firmware(scsi_qla_host_t *ha)
 		blob = &qla_fw_blobs[FW_ISP2322];
 	} else if (IS_QLA6312(ha) || IS_QLA6322(ha)) {
 		blob = &qla_fw_blobs[FW_ISP63XX];
-	} else if (IS_QLA24XX(ha)) {
+	} else if (IS_QLA24XX(ha) || IS_QLA54XX(ha)) {
 		blob = &qla_fw_blobs[FW_ISP24XX];
 	}
 
@@ -2687,6 +2687,10 @@ static struct pci_device_id qla2xxx_pci_tbl[] = {
 		PCI_ANY_ID, PCI_ANY_ID, },
 	{ PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP2432,
 		PCI_ANY_ID, PCI_ANY_ID, },
+	{ PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP5422,
+		PCI_ANY_ID, PCI_ANY_ID, },
+	{ PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_ISP5432,
+		PCI_ANY_ID, PCI_ANY_ID, },
 	{ 0 },
 };
 MODULE_DEVICE_TABLE(pci, qla2xxx_pci_tbl);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 82b83da25d77..1afac931351e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -852,6 +852,8 @@
 #define PCI_DEVICE_ID_QLOGIC_ISP2432	0x2432
 #define PCI_DEVICE_ID_QLOGIC_ISP2512	0x2512
 #define PCI_DEVICE_ID_QLOGIC_ISP2522	0x2522
+#define PCI_DEVICE_ID_QLOGIC_ISP5422	0x5422
+#define PCI_DEVICE_ID_QLOGIC_ISP5432	0x5432
 
 #define PCI_VENDOR_ID_CYRIX		0x1078
 #define PCI_DEVICE_ID_CYRIX_5510	0x0000
-- 
cgit v1.2.3


From e935d5da8e5d12fabe5b632736c50eae0427e8c8 Mon Sep 17 00:00:00 2001
From: "Moore, Eric" <Eric.Moore@lsil.com>
Date: Tue, 14 Mar 2006 09:18:18 -0700
Subject: [SCSI] drivers/base/bus.c - export reprobe

Adding support for exposing hidden raid components for sg
interface. The sdev->no_uld_attach flag will set set accordingly.

The sas module supports adding/removing raid volumes using online
storage management application interface.

This patch was provided to me by Christoph Hellwig.

Signed-off-by: Eric Moore <Eric.Moore@lsil.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/base/bus.c     | 22 ++++++++++++++++++++++
 include/linux/device.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index c3141565d59d..48718b7f4fa0 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -536,6 +536,28 @@ void bus_rescan_devices(struct bus_type * bus)
 	bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper);
 }
 
+/**
+ * device_reprobe - remove driver for a device and probe for a new driver
+ * @dev: the device to reprobe
+ *
+ * This function detaches the attached driver (if any) for the given
+ * device and restarts the driver probing process.  It is intended
+ * to use if probing criteria changed during a devices lifetime and
+ * driver attachment should change accordingly.
+ */
+void device_reprobe(struct device *dev)
+{
+	if (dev->driver) {
+		if (dev->parent)        /* Needed for USB */
+			down(&dev->parent->sem);
+		device_release_driver(dev);
+		if (dev->parent)
+			up(&dev->parent->sem);
+	}
+
+	bus_rescan_devices_helper(dev, NULL);
+}
+EXPORT_SYMBOL_GPL(device_reprobe);
 
 struct bus_type * get_bus(struct bus_type * bus)
 {
diff --git a/include/linux/device.h b/include/linux/device.h
index 58df18d9cd3e..e8ac5bcfbec7 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -378,6 +378,7 @@ extern void device_bind_driver(struct device * dev);
 extern void device_release_driver(struct device * dev);
 extern int  device_attach(struct device * dev);
 extern void driver_attach(struct device_driver * drv);
+extern void device_reprobe(struct device *dev);
 
 
 /*
-- 
cgit v1.2.3


From 243f196d572822214bb86522f28b30e096d67414 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 16 Mar 2006 14:10:19 +0000
Subject: [ARM] 3366/1: Allow the 16bpp mode configuration in the CLCD control
 register

Patch from Catalin Marinas

Starting with PL111, the 5551 or 565 modes can be configured in the
primecell's control register directly. This patch detects the required mode
and sets the correct value.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/amba/clcd.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h
index 6b8d73dc1ab0..9cf64b1b688b 100644
--- a/include/linux/amba/clcd.h
+++ b/include/linux/amba/clcd.h
@@ -54,6 +54,7 @@
 #define CNTL_LCDBPP4		(2 << 1)
 #define CNTL_LCDBPP8		(3 << 1)
 #define CNTL_LCDBPP16		(4 << 1)
+#define CNTL_LCDBPP16_565	(6 << 1)
 #define CNTL_LCDBPP24		(5 << 1)
 #define CNTL_LCDBW		(1 << 4)
 #define CNTL_LCDTFT		(1 << 5)
@@ -209,7 +210,16 @@ static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs)
 		val |= CNTL_LCDBPP8;
 		break;
 	case 16:
-		val |= CNTL_LCDBPP16;
+		/*
+		 * PL110 cannot choose between 5551 and 565 modes in
+		 * its control register
+		 */
+		if ((fb->dev->periphid & 0x000fffff) == 0x00041110)
+			val |= CNTL_LCDBPP16;
+		else if (fb->fb.var.green.length == 5)
+			val |= CNTL_LCDBPP16;
+		else
+			val |= CNTL_LCDBPP16_565;
 		break;
 	case 32:
 		val |= CNTL_LCDBPP24;
-- 
cgit v1.2.3


From 12a057321529df2fb650ac5f34dfd7abcca667df Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Mar 2006 13:38:01 -0500
Subject: [PATCH] keep sync and async cfq_queue separate

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/cfq-iosched.c    | 36 ++++++++++++++++++++++++++----------
 include/linux/blkdev.h |  2 +-
 2 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 42f990f2763d..63bfe4b494ba 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1219,11 +1219,20 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 
 	spin_lock(q->queue_lock);
 
-	if (unlikely(cic->cfqq == cfqd->active_queue))
-		__cfq_slice_expired(cfqd, cic->cfqq, 0);
+	if (cic->cfqq[ASYNC]) {
+		if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
+			__cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
+		cfq_put_queue(cic->cfqq[ASYNC]);
+		cic->cfqq[ASYNC] = NULL;
+	}
+
+	if (cic->cfqq[SYNC]) {
+		if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
+			__cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
+		cfq_put_queue(cic->cfqq[SYNC]);
+		cic->cfqq[SYNC] = NULL;
+	}
 
-	cfq_put_queue(cic->cfqq);
-	cic->cfqq = NULL;
 	cic->key = NULL;
 	spin_unlock(q->queue_lock);
 }
@@ -1259,7 +1268,8 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 
 	if (cic) {
 		INIT_LIST_HEAD(&cic->list);
-		cic->cfqq = NULL;
+		cic->cfqq[ASYNC] = NULL;
+		cic->cfqq[SYNC] = NULL;
 		cic->key = NULL;
 		cic->last_end_request = jiffies;
 		cic->ttime_total = 0;
@@ -1325,7 +1335,12 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
 	struct cfq_queue *cfqq;
 	if (cfqd) {
 		spin_lock(cfqd->queue->queue_lock);
-		cfqq = cic->cfqq;
+		cfqq = cic->cfqq[ASYNC];
+		if (cfqq) {
+			cfq_mark_cfqq_prio_changed(cfqq);
+			cfq_init_prio_data(cfqq);
+		}
+		cfqq = cic->cfqq[SYNC];
 		if (cfqq) {
 			cfq_mark_cfqq_prio_changed(cfqq);
 			cfq_init_prio_data(cfqq);
@@ -1892,6 +1907,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 	struct cfq_queue *cfqq;
 	struct cfq_rq *crq;
 	unsigned long flags;
+	int is_sync = key != CFQ_KEY_ASYNC;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
@@ -1902,14 +1918,14 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 	if (!cic)
 		goto queue_fail;
 
-	if (!cic->cfqq) {
+	if (!cic->cfqq[is_sync]) {
 		cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
 		if (!cfqq)
 			goto queue_fail;
 
-		cic->cfqq = cfqq;
+		cic->cfqq[is_sync] = cfqq;
 	} else
-		cfqq = cic->cfqq;
+		cfqq = cic->cfqq[is_sync];
 
 	cfqq->allocated[rw]++;
 	cfq_clear_cfqq_must_alloc(cfqq);
@@ -1926,7 +1942,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 		crq->cfq_queue = cfqq;
 		crq->io_context = cic;
 
-		if (rw == READ || process_sync(tsk))
+		if (is_sync)
 			cfq_mark_crq_is_sync(crq);
 		else
 			cfq_clear_crq_is_sync(crq);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 860e7a485a5f..e19cb631084e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -58,7 +58,7 @@ struct cfq_io_context {
 	 * circular list of cfq_io_contexts belonging to a process io context
 	 */
 	struct list_head list;
-	struct cfq_queue *cfqq;
+	struct cfq_queue *cfqq[2];
 	void *key;
 
 	struct io_context *ioc;
-- 
cgit v1.2.3


From d9ff41879364cfca7c15abc20ae398e35de3f883 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Mar 2006 13:51:22 -0500
Subject: [PATCH] make cfq_exit_queue() prune the cfq_io_context for that queue

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/cfq-iosched.c    | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/blkdev.h |  2 ++
 2 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3bacf4bb7dd2..3fc6e505e9c8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -177,6 +177,8 @@ struct cfq_data {
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
 	unsigned int cfq_max_depth;
+
+	struct list_head cic_list;
 };
 
 /*
@@ -1215,7 +1217,12 @@ static void cfq_free_io_context(struct cfq_io_context *cic)
 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic->key;
-	request_queue_t *q = cfqd->queue;
+	request_queue_t *q;
+
+	if (!cfqd)
+		return;
+
+	q = cfqd->queue;
 
 	WARN_ON(!irqs_disabled());
 
@@ -1236,6 +1243,7 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 	}
 
 	cic->key = NULL;
+	list_del_init(&cic->queue_list);
 	spin_unlock(q->queue_lock);
 }
 
@@ -1254,12 +1262,14 @@ static void cfq_exit_io_context(struct cfq_io_context *cic)
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
+	read_lock(&cfq_exit_lock);
 	list_for_each(entry, &cic->list) {
 		__cic = list_entry(entry, struct cfq_io_context, list);
 		cfq_exit_single_io_context(__cic);
 	}
 
 	cfq_exit_single_io_context(cic);
+	read_unlock(&cfq_exit_lock);
 	local_irq_restore(flags);
 }
 
@@ -1279,6 +1289,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		cic->ttime_mean = 0;
 		cic->dtor = cfq_free_io_context;
 		cic->exit = cfq_exit_io_context;
+		INIT_LIST_HEAD(&cic->queue_list);
 	}
 
 	return cic;
@@ -1446,6 +1457,7 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 	if (!ioc)
 		return NULL;
 
+restart:
 	if ((cic = ioc->cic) == NULL) {
 		cic = cfq_alloc_io_context(cfqd, gfp_mask);
 
@@ -1461,6 +1473,7 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 		read_lock(&cfq_exit_lock);
 		ioc->set_ioprio = cfq_ioc_set_ioprio;
 		ioc->cic = cic;
+		list_add(&cic->queue_list, &cfqd->cic_list);
 		read_unlock(&cfq_exit_lock);
 	} else {
 		struct cfq_io_context *__cic;
@@ -1471,6 +1484,19 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 		if (cic->key == cfqd)
 			goto out;
 
+		if (unlikely(!cic->key)) {
+			read_lock(&cfq_exit_lock);
+			if (list_empty(&cic->list))
+				ioc->cic = NULL;
+			else
+				ioc->cic = list_entry(cic->list.next,
+						      struct cfq_io_context,
+						      list);
+			read_unlock(&cfq_exit_lock);
+			kmem_cache_free(cfq_ioc_pool, cic);
+			goto restart;
+		}
+
 		/*
 		 * cic exists, check if we already are there. linear search
 		 * should be ok here, the list will usually not be more than
@@ -1485,6 +1511,13 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 				cic = __cic;
 				goto out;
 			}
+			if (unlikely(!__cic->key)) {
+				read_lock(&cfq_exit_lock);
+				list_del(&__cic->list);
+				read_unlock(&cfq_exit_lock);
+				kmem_cache_free(cfq_ioc_pool, __cic);
+				goto restart;
+			}
 		}
 
 		/*
@@ -1499,6 +1532,7 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 		__cic->key = cfqd;
 		read_lock(&cfq_exit_lock);
 		list_add(&__cic->list, &cic->list);
+		list_add(&__cic->queue_list, &cfqd->cic_list);
 		read_unlock(&cfq_exit_lock);
 		cic = __cic;
 	}
@@ -2104,8 +2138,30 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
 static void cfq_exit_queue(elevator_t *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
+	request_queue_t *q = cfqd->queue;
 
 	cfq_shutdown_timer_wq(cfqd);
+	write_lock(&cfq_exit_lock);
+	spin_lock_irq(q->queue_lock);
+	if (cfqd->active_queue)
+		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
+	while(!list_empty(&cfqd->cic_list)) {
+		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
+							struct cfq_io_context,
+							queue_list);
+		if (cic->cfqq[ASYNC]) {
+			cfq_put_queue(cic->cfqq[ASYNC]);
+			cic->cfqq[ASYNC] = NULL;
+		}
+		if (cic->cfqq[SYNC]) {
+			cfq_put_queue(cic->cfqq[SYNC]);
+			cic->cfqq[SYNC] = NULL;
+		}
+		cic->key = NULL;
+		list_del_init(&cic->queue_list);
+	}
+	spin_unlock_irq(q->queue_lock);
+	write_unlock(&cfq_exit_lock);
 	cfq_put_cfqd(cfqd);
 }
 
@@ -2127,6 +2183,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->empty_list);
+	INIT_LIST_HEAD(&cfqd->cic_list);
 
 	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
 	if (!cfqd->crq_hash)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e19cb631084e..80518f703538 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -69,6 +69,8 @@ struct cfq_io_context {
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
 
+	struct list_head queue_list;
+
 	void (*dtor)(struct cfq_io_context *);
 	void (*exit)(struct cfq_io_context *);
 };
-- 
cgit v1.2.3


From e17a9489b4a686bb5e9615e1d375c67619cb99c5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Mar 2006 13:21:20 -0500
Subject: [PATCH] stop elv_unregister() from rogering other iosched's data, fix
 locking

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/as-iosched.c       |  7 +++++++
 block/cfq-iosched.c      |  8 ++++++++
 block/elevator.c         | 24 +++++++++---------------
 include/linux/elevator.h |  1 +
 4 files changed, 25 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 8da3cf66894c..d2ee2af44b58 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -195,6 +195,12 @@ static void free_as_io_context(struct as_io_context *aic)
 	kfree(aic);
 }
 
+static void as_trim(struct io_context *ioc)
+{
+	kfree(ioc->aic);
+	ioc->aic = NULL;
+}
+
 /* Called when the task exits */
 static void exit_as_io_context(struct as_io_context *aic)
 {
@@ -1860,6 +1866,7 @@ static struct elevator_type iosched_as = {
 		.elevator_may_queue_fn =	as_may_queue,
 		.elevator_init_fn =		as_init_queue,
 		.elevator_exit_fn =		as_exit_queue,
+		.trim =				as_trim,
 	},
 
 	.elevator_ktype = &as_ktype,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 521c56d4fdbc..7102bafc98b3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1211,6 +1211,13 @@ static void cfq_free_io_context(struct cfq_io_context *cic)
 	kmem_cache_free(cfq_ioc_pool, cic);
 }
 
+static void cfq_trim(struct io_context *ioc)
+{
+	ioc->set_ioprio = NULL;
+	if (ioc->cic)
+		cfq_free_io_context(ioc->cic);
+}
+
 /*
  * Called with interrupts disabled
  */
@@ -2472,6 +2479,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
+		.trim =				cfq_trim,
 	},
 	.elevator_ktype =	&cfq_ktype,
 	.elevator_name =	"cfq",
diff --git a/block/elevator.c b/block/elevator.c
index 24b702d649a9..0232df2b16e6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -675,21 +675,15 @@ void elv_unregister(struct elevator_type *e)
 	/*
 	 * Iterate every thread in the process to remove the io contexts.
 	 */
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		struct io_context *ioc = p->io_context;
-		if (ioc && ioc->cic) {
-			ioc->cic->exit(ioc->cic);
-			ioc->cic->dtor(ioc->cic);
-			ioc->cic = NULL;
-		}
-		if (ioc && ioc->aic) {
-			ioc->aic->exit(ioc->aic);
-			ioc->aic->dtor(ioc->aic);
-			ioc->aic = NULL;
-		}
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
+	if (e->ops.trim) {
+		read_lock(&tasklist_lock);
+		do_each_thread(g, p) {
+			task_lock(p);
+			e->ops.trim(p->io_context);
+			task_unlock(p);
+		} while_each_thread(g, p);
+		read_unlock(&tasklist_lock);
+	}
 
 	spin_lock_irq(&elv_list_lock);
 	list_del_init(&e->list);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 18cf1f3e1184..f65766ef0532 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -48,6 +48,7 @@ struct elevator_ops
 
 	elevator_init_fn *elevator_init_fn;
 	elevator_exit_fn *elevator_exit_fn;
+	void (*trim)(struct io_context *);
 };
 
 #define ELV_NAME_MAX	(16)
-- 
cgit v1.2.3


From 483f4afc421435b7cfe5e88f74eea0b73a476d75 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Mar 2006 18:34:37 -0500
Subject: [PATCH] fix sysfs interaction and lifetime rules handling for queues

---
 block/ll_rw_blk.c      | 83 +++++++++++++++++++++++++++++++++++---------------
 include/linux/blkdev.h |  6 ++--
 2 files changed, 61 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 6dc769182052..6c793b196aa9 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1740,16 +1740,11 @@ EXPORT_SYMBOL(blk_run_queue);
  *     Hopefully the low level driver will have finished any
  *     outstanding requests first...
  **/
-void blk_cleanup_queue(request_queue_t * q)
+static void blk_release_queue(struct kobject *kobj)
 {
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
 	struct request_list *rl = &q->rq;
 
-	if (!atomic_dec_and_test(&q->refcnt))
-		return;
-
-	if (q->elevator)
-		elevator_exit(q->elevator);
-
 	blk_sync_queue(q);
 
 	if (rl->rq_pool)
@@ -1761,6 +1756,24 @@ void blk_cleanup_queue(request_queue_t * q)
 	kmem_cache_free(requestq_cachep, q);
 }
 
+void blk_put_queue(request_queue_t *q)
+{
+	kobject_put(&q->kobj);
+}
+EXPORT_SYMBOL(blk_put_queue);
+
+void blk_cleanup_queue(request_queue_t * q)
+{
+	mutex_lock(&q->sysfs_lock);
+	set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
+	mutex_unlock(&q->sysfs_lock);
+
+	if (q->elevator)
+		elevator_exit(q->elevator);
+
+	blk_put_queue(q);
+}
+
 EXPORT_SYMBOL(blk_cleanup_queue);
 
 static int blk_init_free_list(request_queue_t *q)
@@ -1788,6 +1801,8 @@ request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
+static struct kobj_type queue_ktype;
+
 request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	request_queue_t *q;
@@ -1798,11 +1813,16 @@ request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
-	atomic_set(&q->refcnt, 1);
+
+	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
+	q->kobj.ktype = &queue_ktype;
+	kobject_init(&q->kobj);
 
 	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
 	q->backing_dev_info.unplug_io_data = q;
 
+	mutex_init(&q->sysfs_lock);
+
 	return q;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -1901,7 +1921,7 @@ EXPORT_SYMBOL(blk_init_queue_node);
 int blk_get_queue(request_queue_t *q)
 {
 	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		atomic_inc(&q->refcnt);
+		kobject_get(&q->kobj);
 		return 0;
 	}
 
@@ -3764,13 +3784,19 @@ static ssize_t
 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
 	struct queue_sysfs_entry *entry = to_queue(attr);
-	struct request_queue *q;
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
+	ssize_t res;
 
-	q = container_of(kobj, struct request_queue, kobj);
 	if (!entry->show)
 		return -EIO;
-
-	return entry->show(q, page);
+	mutex_lock(&q->sysfs_lock);
+	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+		mutex_unlock(&q->sysfs_lock);
+		return -ENOENT;
+	}
+	res = entry->show(q, page);
+	mutex_unlock(&q->sysfs_lock);
+	return res;
 }
 
 static ssize_t
@@ -3778,13 +3804,20 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 		    const char *page, size_t length)
 {
 	struct queue_sysfs_entry *entry = to_queue(attr);
-	struct request_queue *q;
+	request_queue_t *q = container_of(kobj, struct request_queue, kobj);
+
+	ssize_t res;
 
-	q = container_of(kobj, struct request_queue, kobj);
 	if (!entry->store)
 		return -EIO;
-
-	return entry->store(q, page, length);
+	mutex_lock(&q->sysfs_lock);
+	if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+		mutex_unlock(&q->sysfs_lock);
+		return -ENOENT;
+	}
+	res = entry->store(q, page, length);
+	mutex_unlock(&q->sysfs_lock);
+	return res;
 }
 
 static struct sysfs_ops queue_sysfs_ops = {
@@ -3795,6 +3828,7 @@ static struct sysfs_ops queue_sysfs_ops = {
 static struct kobj_type queue_ktype = {
 	.sysfs_ops	= &queue_sysfs_ops,
 	.default_attrs	= default_attrs,
+	.release	= blk_release_queue,
 };
 
 int blk_register_queue(struct gendisk *disk)
@@ -3807,19 +3841,17 @@ int blk_register_queue(struct gendisk *disk)
 		return -ENXIO;
 
 	q->kobj.parent = kobject_get(&disk->kobj);
-	if (!q->kobj.parent)
-		return -EBUSY;
 
-	snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
-	q->kobj.ktype = &queue_ktype;
-
-	ret = kobject_register(&q->kobj);
+	ret = kobject_add(&q->kobj);
 	if (ret < 0)
 		return ret;
 
+	kobject_uevent(&q->kobj, KOBJ_ADD);
+
 	ret = elv_register_queue(q);
 	if (ret) {
-		kobject_unregister(&q->kobj);
+		kobject_uevent(&q->kobj, KOBJ_REMOVE);
+		kobject_del(&q->kobj);
 		return ret;
 	}
 
@@ -3833,7 +3865,8 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (q && q->request_fn) {
 		elv_unregister_queue(q);
 
-		kobject_unregister(&q->kobj);
+		kobject_uevent(&q->kobj, KOBJ_REMOVE);
+		kobject_del(&q->kobj);
 		kobject_put(&disk->kobj);
 	}
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 80518f703538..56bb6a4e15f3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -406,8 +406,6 @@ struct request_queue
 
 	struct blk_queue_tag	*queue_tags;
 
-	atomic_t		refcnt;
-
 	unsigned int		nr_sorted;
 	unsigned int		in_flight;
 
@@ -426,6 +424,8 @@ struct request_queue
 	struct request		pre_flush_rq, bar_rq, post_flush_rq;
 	struct request		*orig_bar_rq;
 	unsigned int		bi_size;
+
+	struct mutex		sysfs_lock;
 };
 
 #define RQ_INACTIVE		(-1)
@@ -727,7 +727,7 @@ extern long nr_blockdev_pages(void);
 int blk_get_queue(request_queue_t *);
 request_queue_t *blk_alloc_queue(gfp_t);
 request_queue_t *blk_alloc_queue_node(gfp_t, int);
-#define blk_put_queue(q) blk_cleanup_queue((q))
+extern void blk_put_queue(request_queue_t *);
 
 /*
  * tag stuff
-- 
cgit v1.2.3


From 3d1ab40f4c20767afbd361b258a531d73e3e6fc2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Mar 2006 18:35:43 -0500
Subject: [PATCH] elevator_t lifetime rules and sysfs fixes

---
 block/as-iosched.c       |  69 +++++-----------------
 block/cfq-iosched.c      |  74 ++++++------------------
 block/deadline-iosched.c |  64 ++++-----------------
 block/elevator.c         | 146 +++++++++++++++++++++++++++++++++++------------
 include/linux/elevator.h |   9 ++-
 5 files changed, 160 insertions(+), 202 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 55a997fc4bb4..3fb60eb7093b 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1709,11 +1709,6 @@ static int as_init_queue(request_queue_t *q, elevator_t *e)
 /*
  * sysfs parts below
  */
-struct as_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct as_data *, char *);
-	ssize_t (*store)(struct as_data *, const char *, size_t);
-};
 
 static ssize_t
 as_var_show(unsigned int var, char *page)
@@ -1730,8 +1725,9 @@ as_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t as_est_show(struct as_data *ad, char *page)
+static ssize_t as_est_show(elevator_t *e, char *page)
 {
+	struct as_data *ad = e->elevator_data;
 	int pos = 0;
 
 	pos += sprintf(page+pos, "%lu %% exit probability\n",
@@ -1747,8 +1743,9 @@ static ssize_t as_est_show(struct as_data *ad, char *page)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR)				\
-static ssize_t __FUNC(struct as_data *ad, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)		\
 {								\
+	struct as_data *ad = e->elevator_data;			\
 	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
 }
 SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
@@ -1759,9 +1756,10 @@ SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
-static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
-	int ret = as_var_store(__PTR, (page), count);		\
+	struct as_data *ad = e->elevator_data;				\
+	int ret = as_var_store(__PTR, (page), count);			\
 	if (*(__PTR) < (MIN))						\
 		*(__PTR) = (MIN);					\
 	else if (*(__PTR) > (MAX))					\
@@ -1778,37 +1776,37 @@ STORE_FUNCTION(as_write_batchexpire_store,
 			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
 #undef STORE_FUNCTION
 
-static struct as_fs_entry as_est_entry = {
+static struct elv_fs_entry as_est_entry = {
 	.attr = {.name = "est_time", .mode = S_IRUGO },
 	.show = as_est_show,
 };
-static struct as_fs_entry as_readexpire_entry = {
+static struct elv_fs_entry as_readexpire_entry = {
 	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = as_readexpire_show,
 	.store = as_readexpire_store,
 };
-static struct as_fs_entry as_writeexpire_entry = {
+static struct elv_fs_entry as_writeexpire_entry = {
 	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = as_writeexpire_show,
 	.store = as_writeexpire_store,
 };
-static struct as_fs_entry as_anticexpire_entry = {
+static struct elv_fs_entry as_anticexpire_entry = {
 	.attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = as_anticexpire_show,
 	.store = as_anticexpire_store,
 };
-static struct as_fs_entry as_read_batchexpire_entry = {
+static struct elv_fs_entry as_read_batchexpire_entry = {
 	.attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = as_read_batchexpire_show,
 	.store = as_read_batchexpire_store,
 };
-static struct as_fs_entry as_write_batchexpire_entry = {
+static struct elv_fs_entry as_write_batchexpire_entry = {
 	.attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = as_write_batchexpire_show,
 	.store = as_write_batchexpire_store,
 };
 
-static struct attribute *default_attrs[] = {
+static struct attribute *as_attrs[] = {
 	&as_est_entry.attr,
 	&as_readexpire_entry.attr,
 	&as_writeexpire_entry.attr,
@@ -1818,43 +1816,6 @@ static struct attribute *default_attrs[] = {
 	NULL,
 };
 
-#define to_as(atr) container_of((atr), struct as_fs_entry, attr)
-
-static ssize_t
-as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct as_fs_entry *entry = to_as(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-as_attr_store(struct kobject *kobj, struct attribute *attr,
-		    const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct as_fs_entry *entry = to_as(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops as_sysfs_ops = {
-	.show	= as_attr_show,
-	.store	= as_attr_store,
-};
-
-static struct kobj_type as_ktype = {
-	.sysfs_ops	= &as_sysfs_ops,
-	.default_attrs	= default_attrs,
-};
-
 static struct elevator_type iosched_as = {
 	.ops = {
 		.elevator_merge_fn = 		as_merge,
@@ -1876,7 +1837,7 @@ static struct elevator_type iosched_as = {
 		.trim =				as_trim,
 	},
 
-	.elevator_ktype = &as_ktype,
+	.elevator_attrs = as_attrs,
 	.elevator_name = "anticipatory",
 	.elevator_owner = THIS_MODULE,
 };
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d1f2ae1629f7..6dc156e1c29b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2275,11 +2275,6 @@ fail:
 /*
  * sysfs parts below -->
  */
-struct cfq_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct cfq_data *, char *);
-	ssize_t (*store)(struct cfq_data *, const char *, size_t);
-};
 
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
@@ -2297,8 +2292,9 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct cfq_data *cfqd, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)			\
 {									\
+	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data = __VAR;					\
 	if (__CONV)							\
 		__data = jiffies_to_msecs(__data);			\
@@ -2318,8 +2314,9 @@ SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
+	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data;						\
 	int ret = cfq_var_store(&__data, (page), count);		\
 	if (__data < (MIN))						\
@@ -2345,63 +2342,63 @@ STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX,
 STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
 #undef STORE_FUNCTION
 
-static struct cfq_fs_entry cfq_quantum_entry = {
+static struct elv_fs_entry cfq_quantum_entry = {
 	.attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_quantum_show,
 	.store = cfq_quantum_store,
 };
-static struct cfq_fs_entry cfq_queued_entry = {
+static struct elv_fs_entry cfq_queued_entry = {
 	.attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_queued_show,
 	.store = cfq_queued_store,
 };
-static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
+static struct elv_fs_entry cfq_fifo_expire_sync_entry = {
 	.attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_fifo_expire_sync_show,
 	.store = cfq_fifo_expire_sync_store,
 };
-static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
+static struct elv_fs_entry cfq_fifo_expire_async_entry = {
 	.attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_fifo_expire_async_show,
 	.store = cfq_fifo_expire_async_store,
 };
-static struct cfq_fs_entry cfq_back_max_entry = {
+static struct elv_fs_entry cfq_back_max_entry = {
 	.attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_back_max_show,
 	.store = cfq_back_max_store,
 };
-static struct cfq_fs_entry cfq_back_penalty_entry = {
+static struct elv_fs_entry cfq_back_penalty_entry = {
 	.attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_back_penalty_show,
 	.store = cfq_back_penalty_store,
 };
-static struct cfq_fs_entry cfq_slice_sync_entry = {
+static struct elv_fs_entry cfq_slice_sync_entry = {
 	.attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_slice_sync_show,
 	.store = cfq_slice_sync_store,
 };
-static struct cfq_fs_entry cfq_slice_async_entry = {
+static struct elv_fs_entry cfq_slice_async_entry = {
 	.attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_slice_async_show,
 	.store = cfq_slice_async_store,
 };
-static struct cfq_fs_entry cfq_slice_async_rq_entry = {
+static struct elv_fs_entry cfq_slice_async_rq_entry = {
 	.attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_slice_async_rq_show,
 	.store = cfq_slice_async_rq_store,
 };
-static struct cfq_fs_entry cfq_slice_idle_entry = {
+static struct elv_fs_entry cfq_slice_idle_entry = {
 	.attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_slice_idle_show,
 	.store = cfq_slice_idle_store,
 };
-static struct cfq_fs_entry cfq_max_depth_entry = {
+static struct elv_fs_entry cfq_max_depth_entry = {
 	.attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
 	.show = cfq_max_depth_show,
 	.store = cfq_max_depth_store,
 };
 
-static struct attribute *default_attrs[] = {
+static struct attribute *cfq_attrs[] = {
 	&cfq_quantum_entry.attr,
 	&cfq_queued_entry.attr,
 	&cfq_fifo_expire_sync_entry.attr,
@@ -2416,43 +2413,6 @@ static struct attribute *default_attrs[] = {
 	NULL,
 };
 
-#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
-
-static ssize_t
-cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct cfq_fs_entry *entry = to_cfq(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-cfq_attr_store(struct kobject *kobj, struct attribute *attr,
-	       const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct cfq_fs_entry *entry = to_cfq(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops cfq_sysfs_ops = {
-	.show	= cfq_attr_show,
-	.store	= cfq_attr_store,
-};
-
-static struct kobj_type cfq_ktype = {
-	.sysfs_ops	= &cfq_sysfs_ops,
-	.default_attrs	= default_attrs,
-};
-
 static struct elevator_type iosched_cfq = {
 	.ops = {
 		.elevator_merge_fn = 		cfq_merge,
@@ -2473,7 +2433,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_exit_fn =		cfq_exit_queue,
 		.trim =				cfq_trim,
 	},
-	.elevator_ktype =	&cfq_ktype,
+	.elevator_attrs =	cfq_attrs,
 	.elevator_name =	"cfq",
 	.elevator_owner =	THIS_MODULE,
 };
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 27e494b1bf97..a3e3ff1e0c65 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -694,11 +694,6 @@ deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 /*
  * sysfs parts below
  */
-struct deadline_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(struct deadline_data *, char *);
-	ssize_t (*store)(struct deadline_data *, const char *, size_t);
-};
 
 static ssize_t
 deadline_var_show(int var, char *page)
@@ -716,9 +711,10 @@ deadline_var_store(int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct deadline_data *dd, char *page)		\
+static ssize_t __FUNC(elevator_t *e, char *page)			\
 {									\
-	int __data = __VAR;					\
+	struct deadline_data *dd = e->elevator_data;			\
+	int __data = __VAR;						\
 	if (__CONV)							\
 		__data = jiffies_to_msecs(__data);			\
 	return deadline_var_show(__data, (page));			\
@@ -731,8 +727,9 @@ SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct deadline_data *dd, const char *page, size_t count)	\
+static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 {									\
+	struct deadline_data *dd = e->elevator_data;			\
 	int __data;							\
 	int ret = deadline_var_store(&__data, (page), count);		\
 	if (__data < (MIN))						\
@@ -752,33 +749,33 @@ STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0);
 STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
-static struct deadline_fs_entry deadline_readexpire_entry = {
+static struct elv_fs_entry deadline_readexpire_entry = {
 	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = deadline_readexpire_show,
 	.store = deadline_readexpire_store,
 };
-static struct deadline_fs_entry deadline_writeexpire_entry = {
+static struct elv_fs_entry deadline_writeexpire_entry = {
 	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = deadline_writeexpire_show,
 	.store = deadline_writeexpire_store,
 };
-static struct deadline_fs_entry deadline_writesstarved_entry = {
+static struct elv_fs_entry deadline_writesstarved_entry = {
 	.attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR },
 	.show = deadline_writesstarved_show,
 	.store = deadline_writesstarved_store,
 };
-static struct deadline_fs_entry deadline_frontmerges_entry = {
+static struct elv_fs_entry deadline_frontmerges_entry = {
 	.attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR },
 	.show = deadline_frontmerges_show,
 	.store = deadline_frontmerges_store,
 };
-static struct deadline_fs_entry deadline_fifobatch_entry = {
+static struct elv_fs_entry deadline_fifobatch_entry = {
 	.attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR },
 	.show = deadline_fifobatch_show,
 	.store = deadline_fifobatch_store,
 };
 
-static struct attribute *default_attrs[] = {
+static struct attribute *deadline_attrs[] = {
 	&deadline_readexpire_entry.attr,
 	&deadline_writeexpire_entry.attr,
 	&deadline_writesstarved_entry.attr,
@@ -787,43 +784,6 @@ static struct attribute *default_attrs[] = {
 	NULL,
 };
 
-#define to_deadline(atr) container_of((atr), struct deadline_fs_entry, attr)
-
-static ssize_t
-deadline_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct deadline_fs_entry *entry = to_deadline(attr);
-
-	if (!entry->show)
-		return -EIO;
-
-	return entry->show(e->elevator_data, page);
-}
-
-static ssize_t
-deadline_attr_store(struct kobject *kobj, struct attribute *attr,
-		    const char *page, size_t length)
-{
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
-	struct deadline_fs_entry *entry = to_deadline(attr);
-
-	if (!entry->store)
-		return -EIO;
-
-	return entry->store(e->elevator_data, page, length);
-}
-
-static struct sysfs_ops deadline_sysfs_ops = {
-	.show	= deadline_attr_show,
-	.store	= deadline_attr_store,
-};
-
-static struct kobj_type deadline_ktype = {
-	.sysfs_ops	= &deadline_sysfs_ops,
-	.default_attrs	= default_attrs,
-};
-
 static struct elevator_type iosched_deadline = {
 	.ops = {
 		.elevator_merge_fn = 		deadline_merge,
@@ -840,7 +800,7 @@ static struct elevator_type iosched_deadline = {
 		.elevator_exit_fn =		deadline_exit_queue,
 	},
 
-	.elevator_ktype = &deadline_ktype,
+	.elevator_attrs = deadline_attrs,
 	.elevator_name = "deadline",
 	.elevator_owner = THIS_MODULE,
 };
diff --git a/block/elevator.c b/block/elevator.c
index 0232df2b16e6..0d2db536c6a7 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -120,15 +120,10 @@ static struct elevator_type *elevator_get(const char *name)
 	return e;
 }
 
-static int elevator_attach(request_queue_t *q, struct elevator_type *e,
-			   struct elevator_queue *eq)
+static int elevator_attach(request_queue_t *q, struct elevator_queue *eq)
 {
 	int ret = 0;
 
-	memset(eq, 0, sizeof(*eq));
-	eq->ops = &e->ops;
-	eq->elevator_type = e;
-
 	q->elevator = eq;
 
 	if (eq->ops->elevator_init_fn)
@@ -154,6 +149,32 @@ static int __init elevator_setup(char *str)
 
 __setup("elevator=", elevator_setup);
 
+static struct kobj_type elv_ktype;
+
+static elevator_t *elevator_alloc(struct elevator_type *e)
+{
+	elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL);
+	if (eq) {
+		memset(eq, 0, sizeof(*eq));
+		eq->ops = &e->ops;
+		eq->elevator_type = e;
+		kobject_init(&eq->kobj);
+		snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
+		eq->kobj.ktype = &elv_ktype;
+		mutex_init(&eq->sysfs_lock);
+	} else {
+		elevator_put(e);
+	}
+	return eq;
+}
+
+static void elevator_release(struct kobject *kobj)
+{
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	elevator_put(e->elevator_type);
+	kfree(e);
+}
+
 int elevator_init(request_queue_t *q, char *name)
 {
 	struct elevator_type *e = NULL;
@@ -176,29 +197,26 @@ int elevator_init(request_queue_t *q, char *name)
 		e = elevator_get("noop");
 	}
 
-	eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
-	if (!eq) {
-		elevator_put(e);
+	eq = elevator_alloc(e);
+	if (!eq)
 		return -ENOMEM;
-	}
 
-	ret = elevator_attach(q, e, eq);
-	if (ret) {
-		kfree(eq);
-		elevator_put(e);
-	}
+	ret = elevator_attach(q, eq);
+	if (ret)
+		kobject_put(&eq->kobj);
 
 	return ret;
 }
 
 void elevator_exit(elevator_t *e)
 {
+	mutex_lock(&e->sysfs_lock);
 	if (e->ops->elevator_exit_fn)
 		e->ops->elevator_exit_fn(e);
+	e->ops = NULL;
+	mutex_unlock(&e->sysfs_lock);
 
-	elevator_put(e->elevator_type);
-	e->elevator_type = NULL;
-	kfree(e);
+	kobject_put(&e->kobj);
 }
 
 /*
@@ -627,26 +645,78 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 	}
 }
 
-int elv_register_queue(struct request_queue *q)
+#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
+
+static ssize_t
+elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
-	elevator_t *e = q->elevator;
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elv_fs_entry *entry = to_elv(attr);
+	ssize_t error;
+
+	if (!entry->show)
+		return -EIO;
+
+	mutex_lock(&e->sysfs_lock);
+	error = e->ops ? entry->show(e, page) : -ENOENT;
+	mutex_unlock(&e->sysfs_lock);
+	return error;
+}
 
-	e->kobj.parent = kobject_get(&q->kobj);
-	if (!e->kobj.parent)
-		return -EBUSY;
+static ssize_t
+elv_attr_store(struct kobject *kobj, struct attribute *attr,
+	       const char *page, size_t length)
+{
+	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elv_fs_entry *entry = to_elv(attr);
+	ssize_t error;
 
-	snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
-	e->kobj.ktype = e->elevator_type->elevator_ktype;
+	if (!entry->store)
+		return -EIO;
 
-	return kobject_register(&e->kobj);
+	mutex_lock(&e->sysfs_lock);
+	error = e->ops ? entry->store(e, page, length) : -ENOENT;
+	mutex_unlock(&e->sysfs_lock);
+	return error;
+}
+
+static struct sysfs_ops elv_sysfs_ops = {
+	.show	= elv_attr_show,
+	.store	= elv_attr_store,
+};
+
+static struct kobj_type elv_ktype = {
+	.sysfs_ops	= &elv_sysfs_ops,
+	.release	= elevator_release,
+};
+
+int elv_register_queue(struct request_queue *q)
+{
+	elevator_t *e = q->elevator;
+	int error;
+
+	e->kobj.parent = &q->kobj;
+
+	error = kobject_add(&e->kobj);
+	if (!error) {
+		struct attribute **attr = e->elevator_type->elevator_attrs;
+		if (attr) {
+			while (*attr) {
+				if (sysfs_create_file(&e->kobj,*attr++))
+					break;
+			}
+		}
+		kobject_uevent(&e->kobj, KOBJ_ADD);
+	}
+	return error;
 }
 
 void elv_unregister_queue(struct request_queue *q)
 {
 	if (q) {
 		elevator_t *e = q->elevator;
-		kobject_unregister(&e->kobj);
-		kobject_put(&q->kobj);
+		kobject_uevent(&e->kobj, KOBJ_REMOVE);
+		kobject_del(&e->kobj);
 	}
 }
 
@@ -697,16 +767,16 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  * need for the new one. this way we have a chance of going back to the old
  * one, if the new one fails init for some reason.
  */
-static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
+static int elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
 	elevator_t *old_elevator, *e;
 
 	/*
 	 * Allocate new elevator
 	 */
-	e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
+	e = elevator_alloc(new_e);
 	if (!e)
-		goto error;
+		return 0;
 
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data
@@ -737,7 +807,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 	/*
 	 * attach and start new elevator
 	 */
-	if (elevator_attach(q, new_e, e))
+	if (elevator_attach(q, e))
 		goto fail;
 
 	if (elv_register_queue(q))
@@ -748,7 +818,7 @@ static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 	 */
 	elevator_exit(old_elevator);
 	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	return;
+	return 1;
 
 fail_register:
 	/*
@@ -761,10 +831,9 @@ fail:
 	q->elevator = old_elevator;
 	elv_register_queue(q);
 	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-	kfree(e);
-error:
-	elevator_put(new_e);
-	printk(KERN_ERR "elevator: switch to %s failed\n",new_e->elevator_name);
+	if (e)
+		kobject_put(&e->kobj);
+	return 0;
 }
 
 ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
@@ -791,7 +860,8 @@ ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
 		return count;
 	}
 
-	elevator_switch(q, e);
+	if (!elevator_switch(q, e))
+		printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name);
 	return count;
 }
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index f65766ef0532..4d0a80f13ee0 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -61,7 +61,7 @@ struct elevator_type
 	struct list_head list;
 	struct elevator_ops ops;
 	struct elevator_type *elevator_type;
-	struct kobj_type *elevator_ktype;
+	struct attribute **elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
 };
@@ -75,6 +75,7 @@ struct elevator_queue
 	void *elevator_data;
 	struct kobject kobj;
 	struct elevator_type *elevator_type;
+	struct mutex sysfs_lock;
 };
 
 /*
@@ -141,6 +142,12 @@ enum {
 	ELV_MQUEUE_MUST,
 };
 
+struct elv_fs_entry {
+	struct attribute attr;
+	ssize_t (*show)(elevator_t *, char *);
+	ssize_t (*store)(elevator_t *, const char *, size_t);
+};
+
 #define rq_end_sector(rq)	((rq)->sector + (rq)->nr_sectors)
 
 #endif
-- 
cgit v1.2.3


From e572ec7e4e432de7ecf7bd2e62117646fa64e518 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Mar 2006 22:27:18 -0500
Subject: [PATCH] fix rmmod problems with elevator attributes, clean them up

---
 block/as-iosched.c       | 71 ++++++++++++------------------------
 block/cfq-iosched.c      | 93 +++++++++++-------------------------------------
 block/deadline-iosched.c | 64 +++++++++++----------------------
 block/elevator.c         |  7 ++--
 include/linux/elevator.h | 14 ++++----
 5 files changed, 74 insertions(+), 175 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 3fb60eb7093b..296708ceceb2 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1725,7 +1725,7 @@ as_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t as_est_show(elevator_t *e, char *page)
+static ssize_t est_time_show(elevator_t *e, char *page)
 {
 	struct as_data *ad = e->elevator_data;
 	int pos = 0;
@@ -1748,11 +1748,11 @@ static ssize_t __FUNC(elevator_t *e, char *page)		\
 	struct as_data *ad = e->elevator_data;			\
 	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
 }
-SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
-SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
-SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
-SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[REQ_ASYNC]);
+SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
+SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[REQ_SYNC]);
+SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
@@ -1767,53 +1767,26 @@ static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 	*(__PTR) = msecs_to_jiffies(*(__PTR));				\
 	return ret;							\
 }
-STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
-STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
-STORE_FUNCTION(as_read_batchexpire_store,
+STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
+STORE_FUNCTION(as_write_expire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
+STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
+STORE_FUNCTION(as_read_batch_expire_store,
 			&ad->batch_expire[REQ_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_write_batchexpire_store,
+STORE_FUNCTION(as_write_batch_expire_store,
 			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
 #undef STORE_FUNCTION
 
-static struct elv_fs_entry as_est_entry = {
-	.attr = {.name = "est_time", .mode = S_IRUGO },
-	.show = as_est_show,
-};
-static struct elv_fs_entry as_readexpire_entry = {
-	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_readexpire_show,
-	.store = as_readexpire_store,
-};
-static struct elv_fs_entry as_writeexpire_entry = {
-	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_writeexpire_show,
-	.store = as_writeexpire_store,
-};
-static struct elv_fs_entry as_anticexpire_entry = {
-	.attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_anticexpire_show,
-	.store = as_anticexpire_store,
-};
-static struct elv_fs_entry as_read_batchexpire_entry = {
-	.attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_read_batchexpire_show,
-	.store = as_read_batchexpire_store,
-};
-static struct elv_fs_entry as_write_batchexpire_entry = {
-	.attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = as_write_batchexpire_show,
-	.store = as_write_batchexpire_store,
-};
-
-static struct attribute *as_attrs[] = {
-	&as_est_entry.attr,
-	&as_readexpire_entry.attr,
-	&as_writeexpire_entry.attr,
-	&as_anticexpire_entry.attr,
-	&as_read_batchexpire_entry.attr,
-	&as_write_batchexpire_entry.attr,
-	NULL,
+#define AS_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
+
+static struct elv_fs_entry as_attrs[] = {
+	__ATTR_RO(est_time),
+	AS_ATTR(read_expire),
+	AS_ATTR(write_expire),
+	AS_ATTR(antic_expire),
+	AS_ATTR(read_batch_expire),
+	AS_ATTR(write_batch_expire),
+	__ATTR_NULL
 };
 
 static struct elevator_type iosched_as = {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6dc156e1c29b..c4a0d5d8d7f0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2304,8 +2304,8 @@ SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
-SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
-SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
+SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
+SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
@@ -2333,8 +2333,8 @@ STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
-STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
-STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
+STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
+STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
@@ -2342,75 +2342,22 @@ STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX,
 STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
 #undef STORE_FUNCTION
 
-static struct elv_fs_entry cfq_quantum_entry = {
-	.attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_quantum_show,
-	.store = cfq_quantum_store,
-};
-static struct elv_fs_entry cfq_queued_entry = {
-	.attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_queued_show,
-	.store = cfq_queued_store,
-};
-static struct elv_fs_entry cfq_fifo_expire_sync_entry = {
-	.attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_sync_show,
-	.store = cfq_fifo_expire_sync_store,
-};
-static struct elv_fs_entry cfq_fifo_expire_async_entry = {
-	.attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_async_show,
-	.store = cfq_fifo_expire_async_store,
-};
-static struct elv_fs_entry cfq_back_max_entry = {
-	.attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_back_max_show,
-	.store = cfq_back_max_store,
-};
-static struct elv_fs_entry cfq_back_penalty_entry = {
-	.attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_back_penalty_show,
-	.store = cfq_back_penalty_store,
-};
-static struct elv_fs_entry cfq_slice_sync_entry = {
-	.attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_sync_show,
-	.store = cfq_slice_sync_store,
-};
-static struct elv_fs_entry cfq_slice_async_entry = {
-	.attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_async_show,
-	.store = cfq_slice_async_store,
-};
-static struct elv_fs_entry cfq_slice_async_rq_entry = {
-	.attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_async_rq_show,
-	.store = cfq_slice_async_rq_store,
-};
-static struct elv_fs_entry cfq_slice_idle_entry = {
-	.attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_slice_idle_show,
-	.store = cfq_slice_idle_store,
-};
-static struct elv_fs_entry cfq_max_depth_entry = {
-	.attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_max_depth_show,
-	.store = cfq_max_depth_store,
-};
-
-static struct attribute *cfq_attrs[] = {
-	&cfq_quantum_entry.attr,
-	&cfq_queued_entry.attr,
-	&cfq_fifo_expire_sync_entry.attr,
-	&cfq_fifo_expire_async_entry.attr,
-	&cfq_back_max_entry.attr,
-	&cfq_back_penalty_entry.attr,
-	&cfq_slice_sync_entry.attr,
-	&cfq_slice_async_entry.attr,
-	&cfq_slice_async_rq_entry.attr,
-	&cfq_slice_idle_entry.attr,
-	&cfq_max_depth_entry.attr,
-	NULL,
+#define CFQ_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
+
+static struct elv_fs_entry cfq_attrs[] = {
+	CFQ_ATTR(quantum),
+	CFQ_ATTR(queued),
+	CFQ_ATTR(fifo_expire_sync),
+	CFQ_ATTR(fifo_expire_async),
+	CFQ_ATTR(back_seek_max),
+	CFQ_ATTR(back_seek_penalty),
+	CFQ_ATTR(slice_sync),
+	CFQ_ATTR(slice_async),
+	CFQ_ATTR(slice_async_rq),
+	CFQ_ATTR(slice_idle),
+	CFQ_ATTR(max_depth),
+	__ATTR_NULL
 };
 
 static struct elevator_type iosched_cfq = {
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index a3e3ff1e0c65..399fa1e60e1f 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -719,11 +719,11 @@ static ssize_t __FUNC(elevator_t *e, char *page)			\
 		__data = jiffies_to_msecs(__data);			\
 	return deadline_var_show(__data, (page));			\
 }
-SHOW_FUNCTION(deadline_readexpire_show, dd->fifo_expire[READ], 1);
-SHOW_FUNCTION(deadline_writeexpire_show, dd->fifo_expire[WRITE], 1);
-SHOW_FUNCTION(deadline_writesstarved_show, dd->writes_starved, 0);
-SHOW_FUNCTION(deadline_frontmerges_show, dd->front_merges, 0);
-SHOW_FUNCTION(deadline_fifobatch_show, dd->fifo_batch, 0);
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -742,46 +742,24 @@ static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
 		*(__PTR) = __data;					\
 	return ret;							\
 }
-STORE_FUNCTION(deadline_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writesstarved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
-STORE_FUNCTION(deadline_frontmerges_store, &dd->front_merges, 0, 1, 0);
-STORE_FUNCTION(deadline_fifobatch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
-static struct elv_fs_entry deadline_readexpire_entry = {
-	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_readexpire_show,
-	.store = deadline_readexpire_store,
-};
-static struct elv_fs_entry deadline_writeexpire_entry = {
-	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_writeexpire_show,
-	.store = deadline_writeexpire_store,
-};
-static struct elv_fs_entry deadline_writesstarved_entry = {
-	.attr = {.name = "writes_starved", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_writesstarved_show,
-	.store = deadline_writesstarved_store,
-};
-static struct elv_fs_entry deadline_frontmerges_entry = {
-	.attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_frontmerges_show,
-	.store = deadline_frontmerges_store,
-};
-static struct elv_fs_entry deadline_fifobatch_entry = {
-	.attr = {.name = "fifo_batch", .mode = S_IRUGO | S_IWUSR },
-	.show = deadline_fifobatch_show,
-	.store = deadline_fifobatch_store,
-};
-
-static struct attribute *deadline_attrs[] = {
-	&deadline_readexpire_entry.attr,
-	&deadline_writeexpire_entry.attr,
-	&deadline_writesstarved_entry.attr,
-	&deadline_frontmerges_entry.attr,
-	&deadline_fifobatch_entry.attr,
-	NULL,
+#define DD_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+				      deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+	DD_ATTR(read_expire),
+	DD_ATTR(write_expire),
+	DD_ATTR(writes_starved),
+	DD_ATTR(front_merges),
+	DD_ATTR(fifo_batch),
+	__ATTR_NULL
 };
 
 static struct elevator_type iosched_deadline = {
diff --git a/block/elevator.c b/block/elevator.c
index 0d2db536c6a7..db3d0d8296a0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -699,11 +699,12 @@ int elv_register_queue(struct request_queue *q)
 
 	error = kobject_add(&e->kobj);
 	if (!error) {
-		struct attribute **attr = e->elevator_type->elevator_attrs;
+		struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
 		if (attr) {
-			while (*attr) {
-				if (sysfs_create_file(&e->kobj,*attr++))
+			while (attr->attr.name) {
+				if (sysfs_create_file(&e->kobj, &attr->attr))
 					break;
+				attr++;
 			}
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4d0a80f13ee0..ad133fcfb239 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -53,6 +53,12 @@ struct elevator_ops
 
 #define ELV_NAME_MAX	(16)
 
+struct elv_fs_entry {
+	struct attribute attr;
+	ssize_t (*show)(elevator_t *, char *);
+	ssize_t (*store)(elevator_t *, const char *, size_t);
+};
+
 /*
  * identifies an elevator type, such as AS or deadline
  */
@@ -61,7 +67,7 @@ struct elevator_type
 	struct list_head list;
 	struct elevator_ops ops;
 	struct elevator_type *elevator_type;
-	struct attribute **elevator_attrs;
+	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
 };
@@ -142,12 +148,6 @@ enum {
 	ELV_MQUEUE_MUST,
 };
 
-struct elv_fs_entry {
-	struct attribute attr;
-	ssize_t (*show)(elevator_t *, char *);
-	ssize_t (*store)(elevator_t *, const char *, size_t);
-};
-
 #define rq_end_sector(rq)	((rq)->sector + (rq)->nr_sectors)
 
 #endif
-- 
cgit v1.2.3


From 02fd473bd4844befc74f7ca67cd60891e0a2d890 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sat, 11 Feb 2006 02:25:21 -0800
Subject: [SPARC64]: Add SUN4V Hypervisor Console driver.

Since it can do things like BREAK and HUP, we implement
this as a serial uart driver.

This still needs interrupt probing code, as I haven't figured
out how interrupts will work or be probed for on SUN4V yet.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/serial/Kconfig      |   7 +
 drivers/serial/Makefile     |   1 +
 drivers/serial/sunhv.c      | 481 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/serial_core.h |   3 +
 4 files changed, 492 insertions(+)
 create mode 100644 drivers/serial/sunhv.c

(limited to 'include/linux')

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index b3c561abe3f6..89e5413cc2a3 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -582,6 +582,13 @@ config SERIAL_SUNSAB_CONSOLE
 	  on your Sparc system as the console, you can do so by answering
 	  Y to this option.
 
+config SERIAL_SUNHV
+	bool "Sun4v Hypervisor Console support"
+	depends on SPARC64
+	help
+	  This driver supports the console device found on SUN4V Sparc
+	  systems.  Say Y if you want to be able to use this device.
+
 config SERIAL_IP22_ZILOG
 	tristate "IP22 Zilog8530 serial support"
 	depends on SGI_IP22
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index eaf8e01db198..0f7f8d73d3ca 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_SERIAL_SUNZILOG) += sunzilog.o
 obj-$(CONFIG_SERIAL_IP22_ZILOG) += ip22zilog.o
 obj-$(CONFIG_SERIAL_SUNSU) += sunsu.o
 obj-$(CONFIG_SERIAL_SUNSAB) += sunsab.o
+obj-$(CONFIG_SERIAL_SUNHV) += sunhv.o
 obj-$(CONFIG_SERIAL_MUX) += mux.o
 obj-$(CONFIG_SERIAL_68328) += 68328serial.o
 obj-$(CONFIG_SERIAL_68360) += 68360serial.o
diff --git a/drivers/serial/sunhv.c b/drivers/serial/sunhv.c
new file mode 100644
index 000000000000..2ba716eeb0bf
--- /dev/null
+++ b/drivers/serial/sunhv.c
@@ -0,0 +1,481 @@
+/* sunhv.c: Serial driver for SUN4V hypervisor console.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/major.h>
+#include <linux/circ_buf.h>
+#include <linux/serial.h>
+#include <linux/sysrq.h>
+#include <linux/console.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+
+#include <asm/hypervisor.h>
+#include <asm/spitfire.h>
+
+#if defined(CONFIG_MAGIC_SYSRQ)
+#define SUPPORT_SYSRQ
+#endif
+
+#include <linux/serial_core.h>
+
+#include "suncore.h"
+
+#define CON_BREAK	((long)-1)
+#define CON_HUP		((long)-2)
+
+static inline long hypervisor_con_getchar(long *status)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+	register unsigned long arg1 asm("%o1");
+
+	func = HV_FAST_CONS_GETCHAR;
+	arg0 = 0;
+	arg1 = 0;
+	__asm__ __volatile__("ta	%6"
+			     : "=&r" (func), "=&r" (arg0), "=&r" (arg1)
+			     : "0" (func), "1" (arg0), "2" (arg1),
+			       "i" (HV_FAST_TRAP));
+
+	*status = arg0;
+
+	return (long) arg1;
+}
+
+static inline long hypervisor_con_putchar(long ch)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+
+	func = HV_FAST_CONS_PUTCHAR;
+	arg0 = ch;
+	__asm__ __volatile__("ta	%4"
+			     : "=&r" (func), "=&r" (arg0)
+			     : "0" (func), "1" (arg0), "i" (HV_FAST_TRAP));
+
+	return (long) arg0;
+}
+
+#define IGNORE_BREAK	0x1
+#define IGNORE_ALL	0x2
+
+static int hung_up = 0;
+
+static struct tty_struct *receive_chars(struct uart_port *port, struct pt_regs *regs)
+{
+	struct tty_struct *tty = NULL;
+	int saw_console_brk = 0;
+	int limit = 10000;
+
+	if (port->info != NULL)		/* Unopened serial console */
+		tty = port->info->tty;
+
+	while (limit-- > 0) {
+		long status;
+		long c = hypervisor_con_getchar(&status);
+		unsigned char flag;
+
+		if (status == HV_EWOULDBLOCK)
+			break;
+
+		if (c == CON_BREAK) {
+			saw_console_brk = 1;
+			c = 0;
+		}
+
+		if (c == CON_HUP) {
+			hung_up = 1;
+			uart_handle_dcd_change(port, 0);
+		} else if (hung_up) {
+			hung_up = 0;
+			uart_handle_dcd_change(port, 1);
+		}
+
+		if (tty == NULL) {
+			uart_handle_sysrq_char(port, c, regs);
+			continue;
+		}
+
+		flag = TTY_NORMAL;
+		port->icount.rx++;
+		if (c == CON_BREAK) {
+			port->icount.brk++;
+			if (uart_handle_break(port))
+				continue;
+			flag = TTY_BREAK;
+		}
+
+		if (uart_handle_sysrq_char(port, c, regs))
+			continue;
+
+		if ((port->ignore_status_mask & IGNORE_ALL) ||
+		    ((port->ignore_status_mask & IGNORE_BREAK) &&
+		     (c == CON_BREAK)))
+			continue;
+
+		tty_insert_flip_char(tty, c, flag);
+	}
+
+	if (saw_console_brk)
+		sun_do_break();
+
+	return tty;
+}
+
+static void transmit_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->info->xmit;
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+		return;
+
+	while (!uart_circ_empty(xmit)) {
+		long status = hypervisor_con_putchar(xmit->buf[xmit->tail]);
+
+		if (status != HV_EOK)
+			break;
+
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+}
+
+static irqreturn_t sunhv_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct uart_port *port = dev_id;
+	struct tty_struct *tty;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	tty = receive_chars(port, regs);
+	transmit_chars(port);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	if (tty)
+		tty_flip_buffer_push(tty);
+
+	return IRQ_HANDLED;
+}
+
+/* port->lock is not held.  */
+static unsigned int sunhv_tx_empty(struct uart_port *port)
+{
+	/* Transmitter is always empty for us.  If the circ buffer
+	 * is non-empty or there is an x_char pending, our caller
+	 * will do the right thing and ignore what we return here.
+	 */
+	return TIOCSER_TEMT;
+}
+
+/* port->lock held by caller.  */
+static void sunhv_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	return;
+}
+
+/* port->lock is held by caller and interrupts are disabled.  */
+static unsigned int sunhv_get_mctrl(struct uart_port *port)
+{
+	return TIOCM_DSR | TIOCM_CAR | TIOCM_CTS;
+}
+
+/* port->lock held by caller.  */
+static void sunhv_stop_tx(struct uart_port *port)
+{
+	return;
+}
+
+/* port->lock held by caller.  */
+static void sunhv_start_tx(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->info->xmit;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	while (!uart_circ_empty(xmit)) {
+		long status = hypervisor_con_putchar(xmit->buf[xmit->tail]);
+
+		if (status != HV_EOK)
+			break;
+
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/* port->lock is not held.  */
+static void sunhv_send_xchar(struct uart_port *port, char ch)
+{
+	unsigned long flags;
+	int limit = 10000;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	while (limit-- > 0) {
+		long status = hypervisor_con_putchar(ch);
+		if (status == HV_EOK)
+			break;
+	}
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/* port->lock held by caller.  */
+static void sunhv_stop_rx(struct uart_port *port)
+{
+}
+
+/* port->lock held by caller.  */
+static void sunhv_enable_ms(struct uart_port *port)
+{
+}
+
+/* port->lock is not held.  */
+static void sunhv_break_ctl(struct uart_port *port, int break_state)
+{
+	if (break_state) {
+		unsigned long flags;
+		int limit = 10000;
+
+		spin_lock_irqsave(&port->lock, flags);
+
+		while (limit-- > 0) {
+			long status = hypervisor_con_putchar(CON_BREAK);
+			if (status == HV_EOK)
+				break;
+		}
+
+		spin_unlock_irqrestore(&port->lock, flags);
+	}
+}
+
+/* port->lock is not held.  */
+static int sunhv_startup(struct uart_port *port)
+{
+	return 0;
+}
+
+/* port->lock is not held.  */
+static void sunhv_shutdown(struct uart_port *port)
+{
+}
+
+/* port->lock is not held.  */
+static void sunhv_set_termios(struct uart_port *port, struct termios *termios,
+			      struct termios *old)
+{
+	unsigned int baud = uart_get_baud_rate(port, termios, old, 0, 4000000);
+	unsigned int quot = uart_get_divisor(port, baud);
+	unsigned int iflag, cflag;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	iflag = termios->c_iflag;
+	cflag = termios->c_cflag;
+
+	port->ignore_status_mask = 0;
+	if (iflag & IGNBRK)
+		port->ignore_status_mask |= IGNORE_BREAK;
+	if ((cflag & CREAD) == 0)
+		port->ignore_status_mask |= IGNORE_ALL;
+
+	/* XXX */
+	uart_update_timeout(port, cflag,
+			    (port->uartclk / (16 * quot)));
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static const char *sunhv_type(struct uart_port *port)
+{
+	return "SUN4V HCONS";
+}
+
+static void sunhv_release_port(struct uart_port *port)
+{
+}
+
+static int sunhv_request_port(struct uart_port *port)
+{
+	return 0;
+}
+
+static void sunhv_config_port(struct uart_port *port, int flags)
+{
+}
+
+static int sunhv_verify_port(struct uart_port *port, struct serial_struct *ser)
+{
+	return -EINVAL;
+}
+
+static struct uart_ops sunhv_pops = {
+	.tx_empty	= sunhv_tx_empty,
+	.set_mctrl	= sunhv_set_mctrl,
+	.get_mctrl	= sunhv_get_mctrl,
+	.stop_tx	= sunhv_stop_tx,
+	.start_tx	= sunhv_start_tx,
+	.send_xchar	= sunhv_send_xchar,
+	.stop_rx	= sunhv_stop_rx,
+	.enable_ms	= sunhv_enable_ms,
+	.break_ctl	= sunhv_break_ctl,
+	.startup	= sunhv_startup,
+	.shutdown	= sunhv_shutdown,
+	.set_termios	= sunhv_set_termios,
+	.type		= sunhv_type,
+	.release_port	= sunhv_release_port,
+	.request_port	= sunhv_request_port,
+	.config_port	= sunhv_config_port,
+	.verify_port	= sunhv_verify_port,
+};
+
+static struct uart_driver sunhv_reg = {
+	.owner			= THIS_MODULE,
+	.driver_name		= "serial",
+	.devfs_name		= "tts/",
+	.dev_name		= "ttyS",
+	.major			= TTY_MAJOR,
+};
+
+static struct uart_port *sunhv_port;
+
+static inline void sunhv_console_putchar(struct uart_port *port, char c)
+{
+	unsigned long flags;
+	int limit = 10000;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	while (limit-- > 0) {
+		long status = hypervisor_con_putchar(c);
+		if (status == HV_EOK)
+			break;
+	}
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void sunhv_console_write(struct console *con, const char *s, unsigned n)
+{
+	struct uart_port *port = sunhv_port;
+	int i;
+
+	for (i = 0; i < n; i++) {
+		if (*s == '\n')
+			sunhv_console_putchar(port, '\r');
+		sunhv_console_putchar(port, *s++);
+	}
+}
+
+static int sunhv_console_setup(struct console *con, char *options)
+{
+	return 0;
+}
+
+static struct console sunhv_console = {
+	.name	=	"ttyS",
+	.write	=	sunhv_console_write,
+	.device	=	uart_console_device,
+	.setup	=	sunhv_console_setup,
+	.flags	=	CON_PRINTBUFFER,
+	.index	=	-1,
+	.data	=	&sunhv_reg,
+};
+
+static void __init sunhv_console_init(void)
+{
+	if (con_is_present())
+		return;
+
+	sunhv_console.index = 0;
+	register_console(&sunhv_console);
+}
+
+static int __init sunhv_init(void)
+{
+	struct uart_port *port;
+	int ret;
+
+	if (tlb_type != hypervisor)
+		return -ENODEV;
+
+	port = kmalloc(sizeof(struct uart_port), GFP_KERNEL);
+	if (unlikely(!port))
+		return -ENOMEM;
+
+	port->line = 0;
+	port->ops = &sunhv_pops;
+	port->type = PORT_SUNHV;
+	port->uartclk = ( 29491200 / 16 ); /* arbitrary */
+
+	/* XXX Get interrupt. XXX */
+	if (request_irq(0 /* XXX */, sunhv_interrupt,
+			SA_SHIRQ, "serial(sunhv)", port)) {
+		printk("sunhv: Cannot get IRQ %x\n",
+		       0 /* XXX */);
+		kfree(port);
+		return -ENODEV;
+	}
+
+	sunhv_reg.minor = sunserial_current_minor;
+	sunhv_reg.nr = 1;
+	sunhv_reg.cons = &sunhv_console;
+
+	ret = uart_register_driver(&sunhv_reg);
+	if (ret < 0) {
+		free_irq(0 /* XXX */, up);
+		kfree(port);
+
+		return ret;
+	}
+
+	sunhv_port = port;
+
+	sunserial_current_minor += 1;
+
+	sunhv_console_init();
+
+	uart_add_one_port(&sunhv_reg, port);
+
+	return 0;
+}
+
+static void __exit sunhv_exit(void)
+{
+	struct uart_port *port = sunhv_port;
+
+	BUG_ON(!port);
+
+	uart_remove_one_port(&sunhv_reg, port);
+	free_irq(0 /* XXX */, port);
+
+	sunserial_current_minor -= 1;
+
+	uart_unregister_driver(&sunhv_reg);
+
+	kfree(sunhv_port);
+	sunhv_port = NULL;
+}
+
+module_init(sunhv_init);
+module_exit(sunhv_exit);
+
+MODULE_AUTHOR("David S. Miller");
+MODULE_DESCRIPTION("SUN4V Hypervisor console driver")
+MODULE_LICENSE("GPL");
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 4041122dabfc..57abcea1cb5d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -127,6 +127,9 @@
 /* Hilscher netx */
 #define PORT_NETX	71
 
+/* SUN4V Hypervisor Console */
+#define PORT_SUNHV	72
+
 #ifdef __KERNEL__
 
 #include <linux/config.h>
-- 
cgit v1.2.3


From b92dccf65bab3b6b7deb79ff3321dc256eb0f53b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:03 -0500
Subject: NFS: Fix a busy inodes issue...

The nfs_open_context may live longer than the file descriptor that spawned
it, so it needs to carry a reference to the vfsmount. If not, then
generic_shutdown_super() may end up being called before reads and writes
have been flushed out.

Make a couple of functions static while we're at it...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 10 ++++++----
 include/linux/nfs_fs.h |  4 +---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a77ee95b7efb..8d5b6691ae3d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -973,7 +973,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	return err;
 }
 
-struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred)
+static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
 {
 	struct nfs_open_context *ctx;
 
@@ -981,6 +981,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rp
 	if (ctx != NULL) {
 		atomic_set(&ctx->count, 1);
 		ctx->dentry = dget(dentry);
+		ctx->vfsmnt = mntget(mnt);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
 		ctx->lockowner = current->files;
@@ -1011,6 +1012,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 		if (ctx->cred != NULL)
 			put_rpccred(ctx->cred);
 		dput(ctx->dentry);
+		mntput(ctx->vfsmnt);
 		kfree(ctx);
 	}
 }
@@ -1019,7 +1021,7 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
  */
-void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -1051,7 +1053,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 	return ctx;
 }
 
-void nfs_file_clear_open_context(struct file *filp)
+static void nfs_file_clear_open_context(struct file *filp)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data;
@@ -1076,7 +1078,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 	cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_context(filp->f_dentry, cred);
+	ctx = alloc_nfs_open_context(filp->f_vfsmnt, filp->f_dentry, cred);
 	put_rpccred(cred);
 	if (ctx == NULL)
 		return -ENOMEM;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b4dc6e2e10c9..1161725d75e1 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -78,6 +78,7 @@ struct nfs_access_entry {
 struct nfs4_state;
 struct nfs_open_context {
 	atomic_t count;
+	struct vfsmount *vfsmnt;
 	struct dentry *dentry;
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
@@ -311,12 +312,9 @@ extern void nfs_begin_attr_update(struct inode *);
 extern void nfs_end_attr_update(struct inode *);
 extern void nfs_begin_data_update(struct inode *);
 extern void nfs_end_data_update(struct inode *);
-extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
-extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
-extern void nfs_file_clear_open_context(struct file *filp);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern u32 root_nfs_parse_addr(char *name); /*__init*/
-- 
cgit v1.2.3


From 7bab377fcb495ee2e5a1cd69d235f8d84c76e3af Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:06 -0500
Subject: lockd: Don't expose the process pid to the NLM server

Instead we use the nlm_lockowner->pid.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c       | 10 +++++++++-
 fs/lockd/clntproc.c       |  7 +++++--
 fs/lockd/svclock.c        |  1 +
 fs/lockd/xdr.c            | 13 ++++++++-----
 fs/lockd/xdr4.c           | 17 ++++++++++-------
 include/linux/lockd/xdr.h |  1 +
 6 files changed, 34 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index da6354baa0b8..8ae79ae4b998 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -125,7 +125,15 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		struct file_lock *fl_blocked = block->b_lock;
 
-		if (!nlm_compare_locks(fl_blocked, fl))
+		if (fl_blocked->fl_start != fl->fl_start)
+			continue;
+		if (fl_blocked->fl_end != fl->fl_end)
+			continue;
+		/*
+		 * Careful! The NLM server will return the 32-bit "pid" that
+		 * we put on the wire: in this case the lockowner "pid".
+		 */
+		if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
 			continue;
 		if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
 			continue;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 615a988a92a7..acc3eb13a02b 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -132,8 +132,10 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh));
 	lock->caller  = system_utsname.nodename;
 	lock->oh.data = req->a_owner;
-	lock->oh.len  = sprintf(req->a_owner, "%d@%s",
-				current->pid, system_utsname.nodename);
+	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
+				(unsigned int)fl->fl_u.nfs_fl.owner->pid,
+				system_utsname.nodename);
+	lock->svid = fl->fl_u.nfs_fl.owner->pid;
 	locks_copy_lock(&lock->fl, fl);
 }
 
@@ -159,6 +161,7 @@ nlmclnt_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
 
 	/* set default data area */
 	call->a_args.lock.oh.data = call->a_owner;
+	call->a_args.lock.svid = lock->fl.fl_pid;
 
 	if (lock->oh.len > NLMCLNT_OHSIZE) {
 		void *data = kmalloc(lock->oh.len, GFP_KERNEL);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 9cfced65d4a2..a525a141dd3b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -397,6 +397,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
 				(long long)fl->fl_end);
 		conflock->caller = "somehost";	/* FIXME */
 		conflock->oh.len = 0;		/* don't return OH info */
+		conflock->svid = fl->fl_pid;
 		conflock->fl = *fl;
 		return nlm_lck_denied;
 	}
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 200fbda2c6d1..1e984ab14d3f 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -131,10 +131,11 @@ nlm_decode_lock(u32 *p, struct nlm_lock *lock)
 	 || !(p = nlm_decode_fh(p, &lock->fh))
 	 || !(p = nlm_decode_oh(p, &lock->oh)))
 		return NULL;
+	lock->svid  = ntohl(*p++);
 
 	locks_init_lock(fl);
 	fl->fl_owner = current->files;
-	fl->fl_pid   = ntohl(*p++);
+	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	start = ntohl(*p++);
@@ -174,7 +175,7 @@ nlm_encode_lock(u32 *p, struct nlm_lock *lock)
 	else
 		len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
 
-	*p++ = htonl(fl->fl_pid);
+	*p++ = htonl(lock->svid);
 	*p++ = htonl(start);
 	*p++ = htonl(len);
 
@@ -197,7 +198,7 @@ nlm_encode_testres(u32 *p, struct nlm_res *resp)
 		struct file_lock	*fl = &resp->lock.fl;
 
 		*p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
-		*p++ = htonl(fl->fl_pid);
+		*p++ = htonl(resp->lock.svid);
 
 		/* Encode owner handle. */
 		if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
@@ -298,7 +299,8 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
-	lock->fl.fl_pid = ~(u32) 0;
+	lock->svid = ~(u32) 0;
+	lock->fl.fl_pid = (pid_t)lock->svid;
 
 	if (!(p = nlm_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
@@ -415,7 +417,8 @@ nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 		memset(&resp->lock, 0, sizeof(resp->lock));
 		locks_init_lock(fl);
 		excl = ntohl(*p++);
-		fl->fl_pid = ntohl(*p++);
+		resp->lock.svid = ntohl(*p++);
+		fl->fl_pid = (pid_t)resp->lock.svid;
 		if (!(p = nlm_decode_oh(p, &resp->lock.oh)))
 			return -EIO;
 
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index fdcf105a5303..906ddc203186 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -130,10 +130,11 @@ nlm4_decode_lock(u32 *p, struct nlm_lock *lock)
 	 || !(p = nlm4_decode_fh(p, &lock->fh))
 	 || !(p = nlm4_decode_oh(p, &lock->oh)))
 		return NULL;
+	lock->svid  = ntohl(*p++);
 
 	locks_init_lock(fl);
 	fl->fl_owner = current->files;
-	fl->fl_pid   = ntohl(*p++);
+	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	p = xdr_decode_hyper(p, &start);
@@ -167,7 +168,7 @@ nlm4_encode_lock(u32 *p, struct nlm_lock *lock)
 	 || (fl->fl_end > NLM4_OFFSET_MAX && fl->fl_end != OFFSET_MAX))
 		return NULL;
 
-	*p++ = htonl(fl->fl_pid);
+	*p++ = htonl(lock->svid);
 
 	start = loff_t_to_s64(fl->fl_start);
 	if (fl->fl_end == OFFSET_MAX)
@@ -198,7 +199,7 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp)
 		struct file_lock	*fl = &resp->lock.fl;
 
 		*p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
-		*p++ = htonl(fl->fl_pid);
+		*p++ = htonl(resp->lock.svid);
 
 		/* Encode owner handle. */
 		if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
@@ -212,8 +213,8 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp)
 		
 		p = xdr_encode_hyper(p, start);
 		p = xdr_encode_hyper(p, len);
-		dprintk("xdr: encode_testres (status %d pid %d type %d start %Ld end %Ld)\n",
-			resp->status, fl->fl_pid, fl->fl_type,
+		dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n",
+			resp->status, (int)resp->lock.svid, fl->fl_type,
 			(long long)fl->fl_start,  (long long)fl->fl_end);
 	}
 
@@ -303,7 +304,8 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
-	lock->fl.fl_pid = ~(u32) 0;
+	lock->svid = ~(u32) 0;
+	lock->fl.fl_pid = (pid_t)lock->svid;
 
 	if (!(p = nlm4_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
@@ -420,7 +422,8 @@ nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 		memset(&resp->lock, 0, sizeof(resp->lock));
 		locks_init_lock(fl);
 		excl = ntohl(*p++);
-		fl->fl_pid = ntohl(*p++);
+		resp->lock.svid = ntohl(*p++);
+		fl->fl_pid = (pid_t)resp->lock.svid;
 		if (!(p = nlm4_decode_oh(p, &resp->lock.oh)))
 			return -EIO;
 
diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index d7a5cc4cfa97..bb0a0f1caa91 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -28,6 +28,7 @@ struct nlm_lock {
 	int			len; 	/* length of "caller" */
 	struct nfs_fh		fh;
 	struct xdr_netobj	oh;
+	u32			svid;
 	struct file_lock	fl;
 };
 
-- 
cgit v1.2.3


From 24c5d9d7ea5a64fb5f157d17aa2c67a3300f8a08 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:08 -0500
Subject: SUNRPC: Run rpci->queue_timeout on the rpciod workqueue instead of
 generic

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h | 1 +
 net/sunrpc/rpc_pipe.c        | 5 +++--
 net/sunrpc/sched.c           | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 8b25629accd8..a390c9b8a01e 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -276,6 +276,7 @@ void		rpc_show_tasks(void);
 #endif
 int		rpc_init_mempool(void);
 void		rpc_destroy_mempool(void);
+extern struct workqueue_struct *rpciod_workqueue;
 
 static inline void rpc_exit(struct rpc_task *task, int status)
 {
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index a5c0c7b6e151..567abbe25bc8 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -91,7 +91,8 @@ rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
 		res = 0;
 	} else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) {
 		if (list_empty(&rpci->pipe))
-			schedule_delayed_work(&rpci->queue_timeout,
+			queue_delayed_work(rpciod_workqueue,
+					&rpci->queue_timeout,
 					RPC_UPCALL_TIMEOUT);
 		list_add_tail(&msg->list, &rpci->pipe);
 		rpci->pipelen += msg->len;
@@ -132,7 +133,7 @@ rpc_close_pipes(struct inode *inode)
 		if (ops->release_pipe)
 			ops->release_pipe(inode);
 		cancel_delayed_work(&rpci->queue_timeout);
-		flush_scheduled_work();
+		flush_workqueue(rpciod_workqueue);
 	}
 	rpc_inode_setowner(inode, NULL);
 	mutex_unlock(&inode->i_mutex);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index e838d042f7f5..1b74420d1603 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -64,7 +64,7 @@ static LIST_HEAD(all_tasks);
  */
 static DECLARE_MUTEX(rpciod_sema);
 static unsigned int		rpciod_users;
-static struct workqueue_struct *rpciod_workqueue;
+struct workqueue_struct *rpciod_workqueue;
 
 /*
  * Spinlock for other critical sections of code.
-- 
cgit v1.2.3


From 24bd68f46b1ad08d69bf32779f860df867780a7a Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Date: Mon, 20 Mar 2006 13:44:11 -0500
Subject: NFS: Code comments update in NFS

read_cache_mtime is no longer used in nfs_inode. This patch removes
references of read_cache_mtime in the code comments.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1161725d75e1..b71da4d4b137 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -119,8 +119,7 @@ struct nfs_inode {
 	unsigned long		cache_validity;		/* bit mask */
 
 	/*
-	 * read_cache_jiffies is when we started read-caching this inode,
-	 * and read_cache_mtime is the mtime of the inode at that time.
+	 * read_cache_jiffies is when we started read-caching this inode.
 	 * attrtimeo is for how long the cached information is assumed
 	 * to be valid. A successful attribute revalidation doubles
 	 * attrtimeo (up to acregmax/acdirmax), a failure resets it to
@@ -129,11 +128,6 @@ struct nfs_inode {
 	 * We need to revalidate the cached attrs for this inode if
 	 *
 	 *	jiffies - read_cache_jiffies > attrtimeo
-	 *
-	 * and invalidate any cached data/flush out any dirty pages if
-	 * we find that
-	 *
-	 *	mtime != read_cache_mtime
 	 */
 	unsigned long		read_cache_jiffies;
 	unsigned long		attrtimeo;
-- 
cgit v1.2.3


From b4629fe2f094b719847f31be1ee5ab38300038b2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:12 -0500
Subject: VFS: New /proc file /proc/self/mountstats

Create a new file under /proc/self, called mountstats, where mounted file
systems can export information (configuration options, performance counters,
and so on).  Use a mechanism similar to /proc/mounts and s_ops->show_options.

This mechanism does not violate namespace security, and is safe to use while
other processes are unmounting file systems.

Thanks to Mike Waychison for his review and comments.

Test-plan:
Test concurrent mount/unmount operations while cat'ing /proc/self/mountstats.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/namespace.c     | 38 ++++++++++++++++++++++++++++++++++++++
 fs/proc/base.c     | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  1 +
 3 files changed, 78 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 39c81a8d6316..71e75bcf4d28 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -399,6 +399,44 @@ struct seq_operations mounts_op = {
 	.show	= show_vfsmnt
 };
 
+static int show_vfsstat(struct seq_file *m, void *v)
+{
+	struct vfsmount *mnt = v;
+	int err = 0;
+
+	/* device */
+	if (mnt->mnt_devname) {
+		seq_puts(m, "device ");
+		mangle(m, mnt->mnt_devname);
+	} else
+		seq_puts(m, "no device");
+
+	/* mount point */
+	seq_puts(m, " mounted on ");
+	seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
+	seq_putc(m, ' ');
+
+	/* file system type */
+	seq_puts(m, "with fstype ");
+	mangle(m, mnt->mnt_sb->s_type->name);
+
+	/* optional statistics */
+	if (mnt->mnt_sb->s_op->show_stats) {
+		seq_putc(m, ' ');
+		err = mnt->mnt_sb->s_op->show_stats(m, mnt);
+	}
+
+	seq_putc(m, '\n');
+	return err;
+}
+
+struct seq_operations mountstats_op = {
+	.start	= m_start,
+	.next	= m_next,
+	.stop	= m_stop,
+	.show	= show_vfsstat,
+};
+
 /**
  * may_umount_tree - check if a mount tree is busy
  * @mnt: root of mount tree
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 20feb7568deb..8f1f49ceebec 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -104,6 +104,7 @@ enum pid_directory_inos {
 	PROC_TGID_MAPS,
 	PROC_TGID_NUMA_MAPS,
 	PROC_TGID_MOUNTS,
+	PROC_TGID_MOUNTSTATS,
 	PROC_TGID_WCHAN,
 #ifdef CONFIG_MMU
 	PROC_TGID_SMAPS,
@@ -144,6 +145,7 @@ enum pid_directory_inos {
 	PROC_TID_MAPS,
 	PROC_TID_NUMA_MAPS,
 	PROC_TID_MOUNTS,
+	PROC_TID_MOUNTSTATS,
 	PROC_TID_WCHAN,
 #ifdef CONFIG_MMU
 	PROC_TID_SMAPS,
@@ -201,6 +203,7 @@ static struct pid_entry tgid_base_stuff[] = {
 	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_MOUNTS,    "mounts",  S_IFREG|S_IRUGO),
+	E(PROC_TGID_MOUNTSTATS, "mountstats", S_IFREG|S_IRUSR),
 #ifdef CONFIG_MMU
 	E(PROC_TGID_SMAPS,     "smaps",   S_IFREG|S_IRUGO),
 #endif
@@ -732,6 +735,38 @@ static struct file_operations proc_mounts_operations = {
 	.poll		= mounts_poll,
 };
 
+extern struct seq_operations mountstats_op;
+static int mountstats_open(struct inode *inode, struct file *file)
+{
+	struct task_struct *task = proc_task(inode);
+	int ret = seq_open(file, &mountstats_op);
+
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		struct namespace *namespace;
+		task_lock(task);
+		namespace = task->namespace;
+		if (namespace)
+			get_namespace(namespace);
+		task_unlock(task);
+
+		if (namespace)
+			m->private = namespace;
+		else {
+			seq_release(inode, file);
+			ret = -EINVAL;
+		}
+	}
+	return ret;
+}
+
+static struct file_operations proc_mountstats_operations = {
+	.open		= mountstats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= mounts_release,
+};
+
 #define PROC_BLOCK_SIZE	(3*1024)		/* 4K page size but our output routines use some slack for overruns */
 
 static ssize_t proc_info_read(struct file * file, char __user * buf,
@@ -1730,6 +1765,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 			inode->i_fop = &proc_smaps_operations;
 			break;
 #endif
+		case PROC_TID_MOUNTSTATS:
+		case PROC_TGID_MOUNTSTATS:
+			inode->i_fop = &proc_mountstats_operations;
+			break;
 #ifdef CONFIG_SECURITY
 		case PROC_TID_ATTR:
 			inode->i_nlink = 2;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 128d0082522c..be21e860a9f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1086,6 +1086,7 @@ struct super_operations {
 	void (*umount_begin) (struct super_block *);
 
 	int (*show_options)(struct seq_file *, struct vfsmount *);
+	int (*show_stats)(struct seq_file *, struct vfsmount *);
 
 	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
-- 
cgit v1.2.3


From 7a480e250c7ca9187275d8574ae9e48a6b602cb9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:12 -0500
Subject: NFS: show retransmit settings when displaying mount options

Sometimes it's important to know the exact RPC retransmit settings the
kernel is using for an NFS mount point.  Add this facility to the NFS
client's show_options method.

Test plan:
Set various retransmit settings via the mount command, and check that the
settings are reflected in /proc/mounts.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c            | 8 ++++++++
 include/linux/nfs_fs_sb.h | 2 ++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 521d1dcb4cf0..48683d4bf0f6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -396,6 +396,9 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 
 	nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
 
+	server->retrans_timeo = timeparms.to_initval;
+	server->retrans_count = timeparms.to_retries;
+
 	/* create transport and client */
 	xprt = xprt_create_proto(proto, &server->addr, &timeparms);
 	if (IS_ERR(xprt)) {
@@ -629,6 +632,8 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 			proto = buf;
 	}
 	seq_printf(m, ",proto=%s", proto);
+	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
+	seq_printf(m, ",retrans=%u", nfss->retrans_count);
 	seq_puts(m, ",addr=");
 	seq_escape(m, nfss->hostname, " \t\n\\");
 	return 0;
@@ -1800,6 +1805,9 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 
 	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
 
+	server->retrans_timeo = timeparms.to_initval;
+	server->retrans_count = timeparms.to_retries;
+
 	clp = nfs4_get_client(&server->addr.sin_addr);
 	if (!clp) {
 		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 3d3a305488cf..a522ab97358d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -26,6 +26,8 @@ struct nfs_server {
 	unsigned int		acregmax;
 	unsigned int		acdirmin;
 	unsigned int		acdirmax;
+	unsigned long		retrans_timeo;	/* retransmit timeout */
+	unsigned int		retrans_count;	/* number of retransmit tries */
 	unsigned int		namelen;
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
-- 
cgit v1.2.3


From d9ef5a8c26aab09762afce43df64736720b4860e Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:13 -0500
Subject: NFS: introduce mechanism for tracking NFS client metrics

Add a per-superblock performance counter facility to the NFS client.  This
facility mimics the counters available for block devices and for
networking.  Expose these new counters via the new /proc/self/mountstats
interface.

Thanks to Andrew Morton and Trond Myklebust for their review and comments.

Test plan:
fsx and iozone on UP and SMP systems, with and without pre-emption.  Watch
for memory overwrite bugs, and performance loss (significantly more CPU
required per op).

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c            | 103 +++++++++++++++++++++++++++++--
 fs/nfs/iostat.h           | 152 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_fs_sb.h |   3 +
 3 files changed, 252 insertions(+), 6 deletions(-)
 create mode 100644 fs/nfs/iostat.h

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 827d69255b1b..86b756f44e27 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -42,6 +42,7 @@
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
+#include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 #define NFS_PARANOIA 1
@@ -65,6 +66,7 @@ static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct super_block *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
+static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static void nfs_zap_acl_cache(struct inode *);
 
 static struct rpc_program	nfs_program;
@@ -78,6 +80,7 @@ static struct super_operations nfs_sops = {
 	.clear_inode	= nfs_clear_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
+	.show_stats	= nfs_show_stats,
 };
 
 /*
@@ -290,6 +293,12 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 	}
 	sb->s_root->d_op = server->rpc_ops->dentry_ops;
 
+	server->io_stats = nfs_alloc_iostats();
+	if (!server->io_stats) {
+		no_root_error = -ENOMEM;
+		goto out_no_root;
+	}
+
 	/* Get some general file system info */
 	if (server->namelen == 0 &&
 	    server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
@@ -582,7 +591,7 @@ nfs_statfs(struct super_block *sb, struct kstatfs *buf)
 
 }
 
-static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
 {
 	static struct proc_nfs_info {
 		int flag;
@@ -598,20 +607,19 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 		{ 0, NULL, NULL }
 	};
 	struct proc_nfs_info *nfs_infop;
-	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
 	char buf[12];
 	char *proto;
 
 	seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
 	seq_printf(m, ",rsize=%d", nfss->rsize);
 	seq_printf(m, ",wsize=%d", nfss->wsize);
-	if (nfss->acregmin != 3*HZ)
+	if (nfss->acregmin != 3*HZ || showdefaults)
 		seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
-	if (nfss->acregmax != 60*HZ)
+	if (nfss->acregmax != 60*HZ || showdefaults)
 		seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
-	if (nfss->acdirmin != 30*HZ)
+	if (nfss->acdirmin != 30*HZ || showdefaults)
 		seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
-	if (nfss->acdirmax != 60*HZ)
+	if (nfss->acdirmax != 60*HZ || showdefaults)
 		seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
 	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
 		if (nfss->flags & nfs_infop->flag)
@@ -633,8 +641,89 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",proto=%s", proto);
 	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
 	seq_printf(m, ",retrans=%u", nfss->retrans_count);
+}
+
+static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+
+	nfs_show_mount_options(m, nfss, 0);
+
 	seq_puts(m, ",addr=");
 	seq_escape(m, nfss->hostname, " \t\n\\");
+
+	return 0;
+}
+
+static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
+{
+	int i, cpu;
+	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+	struct rpc_auth *auth = nfss->client->cl_auth;
+	struct nfs_iostats totals = { };
+
+	seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
+
+	/*
+	 * Display all mount option settings
+	 */
+	seq_printf(m, "\n\topts:\t");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+	nfs_show_mount_options(m, nfss, 1);
+
+	seq_printf(m, "\n\tcaps:\t");
+	seq_printf(m, "caps=0x%x", nfss->caps);
+	seq_printf(m, ",wtmult=%d", nfss->wtmult);
+	seq_printf(m, ",dtsize=%d", nfss->dtsize);
+	seq_printf(m, ",bsize=%d", nfss->bsize);
+	seq_printf(m, ",namelen=%d", nfss->namelen);
+
+#ifdef CONFIG_NFS_V4
+	if (nfss->rpc_ops->version == 4) {
+		seq_printf(m, "\n\tnfsv4:\t");
+		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+	}
+#endif
+
+	/*
+	 * Display security flavor in effect for this mount
+	 */
+	seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
+	if (auth->au_flavor)
+		seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
+
+	/*
+	 * Display superblock I/O counters
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		struct nfs_iostats *stats;
+
+		if (!cpu_possible(cpu))
+			continue;
+
+		preempt_disable();
+		stats = per_cpu_ptr(nfss->io_stats, cpu);
+
+		for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+			totals.events[i] += stats->events[i];
+		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+			totals.bytes[i] += stats->bytes[i];
+
+		preempt_enable();
+	}
+
+	seq_printf(m, "\n\tevents:\t");
+	for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+		seq_printf(m, "%lu ", totals.events[i]);
+	seq_printf(m, "\n\tbytes:\t");
+	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+		seq_printf(m, "%Lu ", totals.bytes[i]);
+
 	return 0;
 }
 
@@ -1742,6 +1831,7 @@ static struct super_operations nfs4_sops = {
 	.clear_inode	= nfs4_clear_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
+	.show_stats	= nfs_show_stats,
 };
 
 /*
@@ -2015,6 +2105,7 @@ out_err:
 out_free:
 	kfree(server->mnt_path);
 	kfree(server->hostname);
+	nfs_free_iostats(server->io_stats);
 	kfree(server);
 	return s;
 }
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
new file mode 100644
index 000000000000..dc080e50ec57
--- /dev/null
+++ b/fs/nfs/iostat.h
@@ -0,0 +1,152 @@
+/*
+ *  linux/fs/nfs/iostat.h
+ *
+ *  Declarations for NFS client per-mount statistics
+ *
+ *  Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
+ *
+ *  NFS client per-mount statistics provide information about the health of
+ *  the NFS client and the health of each NFS mount point.  Generally these
+ *  are not for detailed problem diagnosis, but simply to indicate that there
+ *  is a problem.
+ *
+ *  These counters are not meant to be human-readable, but are meant to be
+ *  integrated into system monitoring tools such as "sar" and "iostat".  As
+ *  such, the counters are sampled by the tools over time, and are never
+ *  zeroed after a file system is mounted.  Moving averages can be computed
+ *  by the tools by taking the difference between two instantaneous samples
+ *  and dividing that by the time between the samples.
+ */
+
+#ifndef _NFS_IOSTAT
+#define _NFS_IOSTAT
+
+#define NFS_IOSTAT_VERS		"1.0"
+
+/*
+ * NFS byte counters
+ *
+ * 1.  SERVER - the number of payload bytes read from or written to the
+ *     server by the NFS client via an NFS READ or WRITE request.
+ *
+ * 2.  NORMAL - the number of bytes read or written by applications via
+ *     the read(2) and write(2) system call interfaces.
+ *
+ * 3.  DIRECT - the number of bytes read or written from files opened
+ *     with the O_DIRECT flag.
+ *
+ * These counters give a view of the data throughput into and out of the NFS
+ * client.  Comparing the number of bytes requested by an application with the
+ * number of bytes the client requests from the server can provide an
+ * indication of client efficiency (per-op, cache hits, etc).
+ *
+ * These counters can also help characterize which access methods are in
+ * use.  DIRECT by itself shows whether there is any O_DIRECT traffic.
+ * NORMAL + DIRECT shows how much data is going through the system call
+ * interface.  A large amount of SERVER traffic without much NORMAL or
+ * DIRECT traffic shows that applications are using mapped files.
+ *
+ * NFS page counters
+ *
+ * These count the number of pages read or written via nfs_readpage(),
+ * nfs_readpages(), or their write equivalents.
+ */
+enum nfs_stat_bytecounters {
+	NFSIOS_NORMALREADBYTES = 0,
+	NFSIOS_NORMALWRITTENBYTES,
+	NFSIOS_DIRECTREADBYTES,
+	NFSIOS_DIRECTWRITTENBYTES,
+	NFSIOS_SERVERREADBYTES,
+	NFSIOS_SERVERWRITTENBYTES,
+	NFSIOS_READPAGES,
+	NFSIOS_WRITEPAGES,
+	__NFSIOS_BYTESMAX,
+};
+
+/*
+ * NFS event counters
+ *
+ * These counters provide a low-overhead way of monitoring client activity
+ * without enabling NFS trace debugging.  The counters show the rate at
+ * which VFS requests are made, and how often the client invalidates its
+ * data and attribute caches.  This allows system administrators to monitor
+ * such things as how close-to-open is working, and answer questions such
+ * as "why are there so many GETATTR requests on the wire?"
+ *
+ * They also count anamolous events such as short reads and writes, silly
+ * renames due to close-after-delete, and operations that change the size
+ * of a file (such operations can often be the source of data corruption
+ * if applications aren't using file locking properly).
+ */
+enum nfs_stat_eventcounters {
+	NFSIOS_INODEREVALIDATE = 0,
+	NFSIOS_DENTRYREVALIDATE,
+	NFSIOS_DATAINVALIDATE,
+	NFSIOS_ATTRINVALIDATE,
+	NFSIOS_VFSOPEN,
+	NFSIOS_VFSLOOKUP,
+	NFSIOS_VFSACCESS,
+	NFSIOS_VFSUPDATEPAGE,
+	NFSIOS_VFSREADPAGE,
+	NFSIOS_VFSREADPAGES,
+	NFSIOS_VFSWRITEPAGE,
+	NFSIOS_VFSWRITEPAGES,
+	NFSIOS_VFSGETDENTS,
+	NFSIOS_VFSSETATTR,
+	NFSIOS_VFSFLUSH,
+	NFSIOS_VFSFSYNC,
+	NFSIOS_VFSLOCK,
+	NFSIOS_VFSRELEASE,
+	NFSIOS_CONGESTIONWAIT,
+	NFSIOS_SETATTRTRUNC,
+	NFSIOS_EXTENDWRITE,
+	NFSIOS_SILLYRENAME,
+	NFSIOS_SHORTREAD,
+	NFSIOS_SHORTWRITE,
+	__NFSIOS_COUNTSMAX,
+};
+
+#ifdef __KERNEL__
+
+#include <linux/percpu.h>
+#include <linux/cache.h>
+
+struct nfs_iostats {
+	unsigned long long	bytes[__NFSIOS_BYTESMAX];
+	unsigned long		events[__NFSIOS_COUNTSMAX];
+} ____cacheline_aligned;
+
+static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat)
+{
+	struct nfs_iostats *iostats;
+	int cpu;
+
+	cpu = get_cpu();
+	iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
+	iostats->events[stat] ++;
+	put_cpu_no_resched();
+}
+
+static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend)
+{
+	struct nfs_iostats *iostats;
+	int cpu;
+
+	cpu = get_cpu();
+	iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
+	iostats->bytes[stat] += addend;
+	put_cpu_no_resched();
+}
+
+static inline struct nfs_iostats *nfs_alloc_iostats(void)
+{
+	return alloc_percpu(struct nfs_iostats);
+}
+
+static inline void nfs_free_iostats(struct nfs_iostats *stats)
+{
+	free_percpu(stats);
+}
+
+#endif
+#endif
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index a522ab97358d..d65e69a06b72 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -4,6 +4,8 @@
 #include <linux/list.h>
 #include <linux/backing-dev.h>
 
+struct nfs_iostats;
+
 /*
  * NFS client parameters stored in the superblock.
  */
@@ -12,6 +14,7 @@ struct nfs_server {
 	struct rpc_clnt *	client_sys;	/* 2nd handle for FSINFO */
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
 	struct nfs_rpc_ops *	rpc_ops;	/* NFS protocol vector */
+	struct nfs_iostats *	io_stats;	/* I/O statistics */
 	struct backing_dev_info	backing_dev_info;
 	int			flags;		/* various flags */
 	unsigned int		caps;		/* server capabilities */
-- 
cgit v1.2.3


From 67ec9f46b889bfb1ab0a4e307d53929d5f0692bf Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:15 -0500
Subject: NFS: report how long an NFS file system has been mounted

Add a field in nfs_server to record a timestamp when a mount succeeds.
Report the number of seconds the file system has been mounted via
nfs_show_stats().

Test plan:
Mount an NFS file system, watch the mountstats reports and compare with
clock time.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c            | 5 +++++
 include/linux/nfs_fs_sb.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8ee74111e519..1b2d782374b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -299,6 +299,9 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 		goto out_no_root;
 	}
 
+	/* mount time stamp, in seconds */
+	server->mount_time = jiffies;
+
 	/* Get some general file system info */
 	if (server->namelen == 0 &&
 	    server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
@@ -674,6 +677,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
 	nfs_show_mount_options(m, nfss, 1);
 
+	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
+
 	seq_printf(m, "\n\tcaps:\t");
 	seq_printf(m, "caps=0x%x", nfss->caps);
 	seq_printf(m, ",wtmult=%d", nfss->wtmult);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d65e69a06b72..65dec21af774 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -35,6 +35,7 @@ struct nfs_server {
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
 	struct sockaddr_in	addr;
+	unsigned long		mount_time;	/* when this fs was mounted */
 #ifdef CONFIG_NFS_V4
 	/* Our own IP address, as a null-terminated string.
 	 * This is used to generate the clientid, and the callback address.
-- 
cgit v1.2.3


From e19b63dafdf7d615b0d36b90990a07e7792b9d3a Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:15 -0500
Subject: SUNRPC: track length of RPC wait queues

RPC wait queue length will eventually be exported to userland via the RPC
iostats interface.

Test plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h | 1 +
 net/sunrpc/sched.c           | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index a390c9b8a01e..6c23f73a799a 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -203,6 +203,7 @@ struct rpc_wait_queue {
 	unsigned char		priority;		/* current priority */
 	unsigned char		count;			/* # task groups remaining serviced so far */
 	unsigned char		nr;			/* # tasks remaining for cookie */
+	unsigned short		qlen;			/* total # tasks waiting in queue */
 #ifdef RPC_DEBUG
 	const char *		name;
 #endif
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 1b74420d1603..aa0449dcd8e5 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -181,6 +181,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *
 	else
 		list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
 	task->u.tk_wait.rpc_waitq = queue;
+	queue->qlen++;
 	rpc_set_queued(task);
 
 	dprintk("RPC: %4d added to queue %p \"%s\"\n",
@@ -215,6 +216,7 @@ static void __rpc_remove_wait_queue(struct rpc_task *task)
 		__rpc_remove_wait_queue_priority(task);
 	else
 		list_del(&task->u.tk_wait.list);
+	queue->qlen--;
 	dprintk("RPC: %4d removed from queue %p \"%s\"\n",
 				task->tk_pid, queue, rpc_qname(queue));
 }
-- 
cgit v1.2.3


From 262ca07de4d7f1bff20361c1353bb14b3607afb2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:16 -0500
Subject: SUNRPC: add a handful of per-xprt counters

Monitor generic transport events.  Add a transport switch callout to
format transport counters for export to user-land.

Test plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 13 ++++++++++++
 net/sunrpc/pmap_clnt.c      |  1 +
 net/sunrpc/xprt.c           | 21 ++++++++++++++------
 net/sunrpc/xprtsock.c       | 48 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 77 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 6ef99b14ff09..7eebbab7160b 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -114,6 +114,7 @@ struct rpc_xprt_ops {
 	void		(*release_request)(struct rpc_task *task);
 	void		(*close)(struct rpc_xprt *xprt);
 	void		(*destroy)(struct rpc_xprt *xprt);
+	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
 };
 
 struct rpc_xprt {
@@ -187,6 +188,18 @@ struct rpc_xprt {
 
 	struct list_head	recv;
 
+	struct {
+		unsigned long		bind_count,	/* total number of binds */
+					connect_count,	/* total number of connects */
+					connect_start,	/* connect start timestamp */
+					connect_time,	/* jiffies waiting for connect */
+					sends,		/* how many complete requests */
+					recvs,		/* how many complete requests */
+					bad_xids;	/* lookup_rqst didn't find XID */
+
+		unsigned long long	req_u,		/* average requests on the wire */
+					bklog_u;	/* backlog queue utilization */
+	} stat;
 
 	void			(*old_data_ready)(struct sock *, int);
 	void			(*old_state_change)(struct sock *);
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 8139ce68e915..332cc5d362d5 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -82,6 +82,7 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
 	rpc_call_setup(child, &msg, 0);
 
 	/* ... and run the child task */
+	task->tk_xprt->stat.bind_count++;
 	rpc_run_child(task, child, pmap_getport_done);
 	return;
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8ff2c8acb223..93a0a3ca0d5f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -548,6 +548,7 @@ void xprt_connect(struct rpc_task *task)
 
 		task->tk_timeout = xprt->connect_timeout;
 		rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL);
+		xprt->stat.connect_start = jiffies;
 		xprt->ops->connect(task);
 	}
 	return;
@@ -558,6 +559,8 @@ static void xprt_connect_status(struct rpc_task *task)
 	struct rpc_xprt	*xprt = task->tk_xprt;
 
 	if (task->tk_status >= 0) {
+		xprt->stat.connect_count++;
+		xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start;
 		dprintk("RPC: %4d xprt_connect_status: connection established\n",
 				task->tk_pid);
 		return;
@@ -601,16 +604,14 @@ static void xprt_connect_status(struct rpc_task *task)
 struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid)
 {
 	struct list_head *pos;
-	struct rpc_rqst	*req = NULL;
 
 	list_for_each(pos, &xprt->recv) {
 		struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list);
-		if (entry->rq_xid == xid) {
-			req = entry;
-			break;
-		}
+		if (entry->rq_xid == xid)
+			return entry;
 	}
-	return req;
+	xprt->stat.bad_xids++;
+	return NULL;
 }
 
 /**
@@ -646,6 +647,7 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
 	dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
 			task->tk_pid, ntohl(req->rq_xid), copied);
 
+	task->tk_xprt->stat.recvs++;
 	list_del_init(&req->rq_list);
 	req->rq_received = req->rq_private_buf.len = copied;
 	rpc_wake_up_task(task);
@@ -744,12 +746,19 @@ void xprt_transmit(struct rpc_task *task)
 	if (status == 0) {
 		dprintk("RPC: %4d xmit complete\n", task->tk_pid);
 		spin_lock_bh(&xprt->transport_lock);
+
 		xprt->ops->set_retrans_timeout(task);
+
+		xprt->stat.sends++;
+		xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
+		xprt->stat.bklog_u += xprt->backlog.qlen;
+
 		/* Don't race with disconnect */
 		if (!xprt_connected(xprt))
 			task->tk_status = -ENOTCONN;
 		else if (!req->rq_received)
 			rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer);
+
 		xprt->ops->release_xprt(xprt, task);
 		spin_unlock_bh(&xprt->transport_lock);
 		return;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c458f8d1d6d1..6766b7f1ecf9 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1114,6 +1114,8 @@ static void xs_tcp_connect_worker(void *args)
 	}
 
 	/* Tell the socket layer to start connecting... */
+	xprt->stat.connect_count++;
+	xprt->stat.connect_start = jiffies;
 	status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr,
 			sizeof(xprt->addr), O_NONBLOCK);
 	dprintk("RPC: %p  connect status %d connected %d sock state %d\n",
@@ -1177,6 +1179,50 @@ static void xs_connect(struct rpc_task *task)
 	}
 }
 
+/**
+ * xs_udp_print_stats - display UDP socket-specifc stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ */
+static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %Lu %Lu\n",
+			xprt->port,
+			xprt->stat.bind_count,
+			xprt->stat.sends,
+			xprt->stat.recvs,
+			xprt->stat.bad_xids,
+			xprt->stat.req_u,
+			xprt->stat.bklog_u);
+}
+
+/**
+ * xs_tcp_print_stats - display TCP socket-specifc stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ */
+static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	long idle_time = 0;
+
+	if (xprt_connected(xprt))
+		idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu\n",
+			xprt->port,
+			xprt->stat.bind_count,
+			xprt->stat.connect_count,
+			xprt->stat.connect_time,
+			idle_time,
+			xprt->stat.sends,
+			xprt->stat.recvs,
+			xprt->stat.bad_xids,
+			xprt->stat.req_u,
+			xprt->stat.bklog_u);
+}
+
 static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
@@ -1191,6 +1237,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.release_request	= xprt_release_rqst_cong,
 	.close			= xs_close,
 	.destroy		= xs_destroy,
+	.print_stats		= xs_udp_print_stats,
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
@@ -1204,6 +1251,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
 	.close			= xs_close,
 	.destroy		= xs_destroy,
+	.print_stats		= xs_tcp_print_stats,
 };
 
 /**
-- 
cgit v1.2.3


From ef759a2e54ed434b2f72b52a14edecd6d4eadf74 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:17 -0500
Subject: SUNRPC: introduce per-task RPC iostats

Account for various things that occur while an RPC task is executed.
Separate timers for RPC round trip and RPC execution time show how
long RPC requests wait in queue before being sent.  Eventually these
will be accumulated at xprt_release time in one place where they can
be viewed from userland.

Test plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h | 6 ++++++
 net/sunrpc/clnt.c            | 2 ++
 net/sunrpc/sched.c           | 3 +++
 net/sunrpc/xprt.c            | 2 ++
 net/sunrpc/xprtsock.c        | 1 +
 5 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 6c23f73a799a..45a64ae963ee 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -86,6 +86,12 @@ struct rpc_task {
 		struct work_struct	tk_work;	/* Async task work queue */
 		struct rpc_wait		tk_wait;	/* RPC wait */
 	} u;
+
+	unsigned short		tk_timeouts;	/* maj timeouts */
+	size_t			tk_bytes_sent;	/* total bytes sent */
+	unsigned long		tk_start;	/* RPC task init timestamp */
+	long			tk_rtt;		/* round-trip time (jiffies) */
+
 #ifdef RPC_DEBUG
 	unsigned short		tk_pid;		/* debugging aid */
 #endif
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index cad7efe2cb22..84eb5b4565fc 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -996,6 +996,8 @@ call_timeout(struct rpc_task *task)
 	}
 
 	dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid);
+	task->tk_timeouts++;
+
 	if (RPC_IS_SOFT(task)) {
 		printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
 				clnt->cl_protname, clnt->cl_server);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index aa0449dcd8e5..cd51b5468332 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -817,6 +817,9 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, int flags, cons
 
 	BUG_ON(task->tk_ops == NULL);
 
+	/* starting timestamp */
+	task->tk_start = jiffies;
+
 	dprintk("RPC: %4d new task procpid %d\n", task->tk_pid,
 				current->pid);
 }
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 93a0a3ca0d5f..c6241976a6ee 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -648,6 +648,8 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
 			task->tk_pid, ntohl(req->rq_xid), copied);
 
 	task->tk_xprt->stat.recvs++;
+	task->tk_rtt = (long)jiffies - req->rq_xtime;
+
 	list_del_init(&req->rq_list);
 	req->rq_received = req->rq_private_buf.len = copied;
 	rpc_wake_up_task(task);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 6766b7f1ecf9..4b4e7dfdff14 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -382,6 +382,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		/* If we've sent the entire packet, immediately
 		 * reset the count of bytes sent. */
 		req->rq_bytes_sent += status;
+		task->tk_bytes_sent += status;
 		if (likely(req->rq_bytes_sent >= req->rq_slen)) {
 			req->rq_bytes_sent = 0;
 			return 0;
-- 
cgit v1.2.3


From 11c556b3d8d481829ab5f9933a25d29b00913b5a Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:22 -0500
Subject: SUNRPC: provide a mechanism for collecting stats in the RPC client

Add a simple mechanism for collecting stats in the RPC client.  Stats are
tabulated during xprt_release.  Note that per_cpu shenanigans are not
required here because the RPC client already serializes on the transport
write lock.

Test plan:
Compile kernel with CONFIG_NFS enabled.  Basic performance regression
testing with high-speed networking and high performance server.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h    |   3 +-
 include/linux/sunrpc/metrics.h |  77 ++++++++++++++++++++++++++++++
 net/sunrpc/clnt.c              |   9 ++--
 net/sunrpc/stats.c             | 105 +++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprt.c              |   2 +
 5 files changed, 192 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/sunrpc/metrics.h

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index f147e6b84332..0f3662002ffc 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -45,7 +45,8 @@ struct rpc_clnt {
 	char *			cl_server;	/* server machine name */
 	char *			cl_protname;	/* protocol name */
 	struct rpc_auth *	cl_auth;	/* authenticator */
-	struct rpc_stat *	cl_stats;	/* statistics */
+	struct rpc_stat *	cl_stats;	/* per-program statistics */
+	struct rpc_iostats *	cl_metrics;	/* per-client statistics */
 
 	unsigned int		cl_softrtry : 1,/* soft timeouts */
 				cl_intr     : 1,/* interruptible */
diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h
new file mode 100644
index 000000000000..8f96e9dc369a
--- /dev/null
+++ b/include/linux/sunrpc/metrics.h
@@ -0,0 +1,77 @@
+/*
+ *  linux/include/linux/sunrpc/metrics.h
+ *
+ *  Declarations for RPC client per-operation metrics
+ *
+ *  Copyright (C) 2005	Chuck Lever <cel@netapp.com>
+ *
+ *  RPC client per-operation statistics provide latency and retry
+ *  information about each type of RPC procedure in a given RPC program.
+ *  These statistics are not for detailed problem diagnosis, but simply
+ *  to indicate whether the problem is local or remote.
+ *
+ *  These counters are not meant to be human-readable, but are meant to be
+ *  integrated into system monitoring tools such as "sar" and "iostat".  As
+ *  such, the counters are sampled by the tools over time, and are never
+ *  zeroed after a file system is mounted.  Moving averages can be computed
+ *  by the tools by taking the difference between two instantaneous samples
+ *  and dividing that by the time between the samples.
+ *
+ *  The counters are maintained in a single array per RPC client, indexed
+ *  by procedure number.  There is no need to maintain separate counter
+ *  arrays per-CPU because these counters are always modified behind locks.
+ */
+
+#ifndef _LINUX_SUNRPC_METRICS_H
+#define _LINUX_SUNRPC_METRICS_H
+
+#include <linux/seq_file.h>
+
+#define RPC_IOSTATS_VERS	"1.0"
+
+struct rpc_iostats {
+	/*
+	 * These counters give an idea about how many request
+	 * transmissions are required, on average, to complete that
+	 * particular procedure.  Some procedures may require more
+	 * than one transmission because the server is unresponsive,
+	 * the client is retransmitting too aggressively, or the
+	 * requests are large and the network is congested.
+	 */
+	unsigned long		om_ops,		/* count of operations */
+				om_ntrans,	/* count of RPC transmissions */
+				om_timeouts;	/* count of major timeouts */
+
+	/*
+	 * These count how many bytes are sent and received for a
+	 * given RPC procedure type.  This indicates how much load a
+	 * particular procedure is putting on the network.  These
+	 * counts include the RPC and ULP headers, and the request
+	 * payload.
+	 */
+	unsigned long long      om_bytes_sent,	/* count of bytes out */
+				om_bytes_recv;	/* count of bytes in */
+
+	/*
+	 * The length of time an RPC request waits in queue before
+	 * transmission, the network + server latency of the request,
+	 * and the total time the request spent from init to release
+	 * are measured.
+	 */
+	unsigned long long	om_queue,	/* jiffies queued for xmit */
+				om_rtt,		/* jiffies for RPC RTT */
+				om_execute;	/* jiffies for RPC execution */
+} ____cacheline_aligned;
+
+struct rpc_task;
+struct rpc_clnt;
+
+/*
+ * EXPORTed functions for managing rpc_iostats structures
+ */
+struct rpc_iostats *	rpc_alloc_iostats(struct rpc_clnt *);
+void			rpc_count_iostats(struct rpc_task *);
+void			rpc_print_iostats(struct seq_file *, struct rpc_clnt *);
+void			rpc_free_iostats(struct rpc_iostats *);
+
+#endif /* _LINUX_SUNRPC_METRICS_H */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 84eb5b4565fc..0bb23e8e9d0c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -28,12 +28,11 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/utsname.h>
+#include <linux/workqueue.h>
 
 #include <linux/sunrpc/clnt.h>
-#include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
-
-#include <linux/nfs.h>
+#include <linux/sunrpc/metrics.h>
 
 
 #define RPC_SLACK_SPACE		(1024)	/* total overkill */
@@ -147,6 +146,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 	clnt->cl_vers     = version->number;
 	clnt->cl_prot     = xprt->prot;
 	clnt->cl_stats    = program->stats;
+	clnt->cl_metrics  = rpc_alloc_iostats(clnt);
 	rpc_init_wait_queue(&clnt->cl_pmap_default.pm_bindwait, "bindwait");
 
 	if (!clnt->cl_port)
@@ -245,6 +245,7 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
 	new->cl_pmap		= &new->cl_pmap_default;
+	new->cl_metrics         = rpc_alloc_iostats(clnt);
 	rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
 	return new;
 out_no_clnt:
@@ -315,6 +316,8 @@ rpc_destroy_client(struct rpc_clnt *clnt)
 	if (clnt->cl_server != clnt->cl_inline_name)
 		kfree(clnt->cl_server);
 out_free:
+	rpc_free_iostats(clnt->cl_metrics);
+	clnt->cl_metrics = NULL;
 	if (clnt->cl_dentry)
 		dput(clnt->cl_dentry);
 	kfree(clnt);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 4979f226e285..24ac7163b9c7 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/metrics.h>
 
 #define RPCDBG_FACILITY	RPCDBG_MISC
 
@@ -106,6 +107,110 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
 	}
 }
 
+/**
+ * rpc_alloc_iostats - allocate an rpc_iostats structure
+ * @clnt: RPC program, version, and xprt
+ *
+ */
+struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt)
+{
+	unsigned int ops = clnt->cl_maxproc;
+	size_t size = ops * sizeof(struct rpc_iostats);
+	struct rpc_iostats *new;
+
+	new = kmalloc(size, GFP_KERNEL);
+	if (new)
+		memset(new, 0 , size);
+	return new;
+}
+EXPORT_SYMBOL(rpc_alloc_iostats);
+
+/**
+ * rpc_free_iostats - release an rpc_iostats structure
+ * @stats: doomed rpc_iostats structure
+ *
+ */
+void rpc_free_iostats(struct rpc_iostats *stats)
+{
+	kfree(stats);
+}
+EXPORT_SYMBOL(rpc_free_iostats);
+
+/**
+ * rpc_count_iostats - tally up per-task stats
+ * @task: completed rpc_task
+ *
+ * Relies on the caller for serialization.
+ */
+void rpc_count_iostats(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_iostats *stats = task->tk_client->cl_metrics;
+	struct rpc_iostats *op_metrics;
+	long rtt, execute, queue;
+
+	if (!stats || !req)
+		return;
+	op_metrics = &stats[task->tk_msg.rpc_proc->p_proc];
+
+	op_metrics->om_ops++;
+	op_metrics->om_ntrans += req->rq_ntrans;
+	op_metrics->om_timeouts += task->tk_timeouts;
+
+	op_metrics->om_bytes_sent += task->tk_bytes_sent;
+	op_metrics->om_bytes_recv += req->rq_received;
+
+	queue = (long)req->rq_xtime - task->tk_start;
+	if (queue < 0)
+		queue = -queue;
+	op_metrics->om_queue += queue;
+
+	rtt = task->tk_rtt;
+	if (rtt < 0)
+		rtt = -rtt;
+	op_metrics->om_rtt += rtt;
+
+	execute = (long)jiffies - task->tk_start;
+	if (execute < 0)
+		execute = -execute;
+	op_metrics->om_execute += execute;
+}
+
+#define MILLISECS_PER_JIFFY	(1000UL / HZ)
+
+void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
+{
+	struct rpc_iostats *stats = clnt->cl_metrics;
+	struct rpc_xprt *xprt = clnt->cl_xprt;
+	unsigned int op, maxproc = clnt->cl_maxproc;
+
+	if (!stats)
+		return;
+
+	seq_printf(seq, "\tRPC iostats version: %s  ", RPC_IOSTATS_VERS);
+	seq_printf(seq, "p/v: %u/%u (%s)\n",
+			clnt->cl_prog, clnt->cl_vers, clnt->cl_protname);
+
+	if (xprt)
+		xprt->ops->print_stats(xprt, seq);
+
+	seq_printf(seq, "\tper-op statistics\n");
+	for (op = 0; op < maxproc; op++) {
+		struct rpc_iostats *metrics = &stats[op];
+		seq_printf(seq, "%12u: ", op);
+		seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
+				metrics->om_ops,
+				metrics->om_ntrans,
+				metrics->om_timeouts,
+				metrics->om_bytes_sent,
+				metrics->om_bytes_recv,
+				metrics->om_queue * MILLISECS_PER_JIFFY,
+				metrics->om_rtt * MILLISECS_PER_JIFFY,
+				metrics->om_execute * MILLISECS_PER_JIFFY);
+	}
+}
+EXPORT_SYMBOL(rpc_print_iostats);
+
 /*
  * Register/unregister RPC proc files
  */
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c6241976a6ee..eb5a262e024e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -44,6 +44,7 @@
 #include <linux/random.h>
 
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/metrics.h>
 
 /*
  * Local variables
@@ -859,6 +860,7 @@ void xprt_release(struct rpc_task *task)
 
 	if (!(req = task->tk_rqstp))
 		return;
+	rpc_count_iostats(task);
 	spin_lock_bh(&xprt->transport_lock);
 	xprt->ops->release_xprt(xprt, task);
 	if (xprt->ops->release_request)
-- 
cgit v1.2.3


From cc0175c1dc1de8f6af0eb0631dcc5b999a6fcc42 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:22 -0500
Subject: SUNRPC: display human-readable procedure name in rpc_iostats output

Add fields to the rpc_procinfo struct that allow the display of a
human-readable name for each procedure in the rpc_iostats output.

Also fix it so that the NFSv4 stats are broken up correctly by
sub-procedure number.  NFSv4 uses only two real RPC procedures:
NULL, and COMPOUND.

Test plan:
Mount with NFSv2, NFSv3, and NFSv4, and do "cat /proc/self/mountstats".

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c              |  4 ++++
 fs/lockd/xdr.c              |  4 +++-
 fs/lockd/xdr4.c             |  4 +++-
 fs/nfs/mount_clnt.c         |  4 ++++
 fs/nfs/nfs2xdr.c            |  4 +++-
 fs/nfs/nfs3xdr.c            |  6 +++++-
 fs/nfs/nfs4xdr.c            |  2 ++
 fs/nfsd/nfs4callback.c      |  2 ++
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/pmap_clnt.c      |  6 ++++++
 net/sunrpc/stats.c          | 14 ++++++++++++--
 11 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 0edc03e67966..84ee39e6a3a2 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -214,12 +214,16 @@ static struct rpc_procinfo	nsm_procedures[] = {
 		.p_encode	= (kxdrproc_t) xdr_encode_mon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat_res,
 		.p_bufsiz	= MAX(SM_mon_sz, SM_monres_sz) << 2,
+		.p_statidx	= SM_MON,
+		.p_name		= "MONITOR",
 	},
 [SM_UNMON] = {
 		.p_proc		= SM_UNMON,
 		.p_encode	= (kxdrproc_t) xdr_encode_unmon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat,
 		.p_bufsiz	= MAX(SM_mon_id_sz, SM_unmonres_sz) << 2,
+		.p_statidx	= SM_UNMON,
+		.p_name		= "UNMONITOR",
 	},
 };
 
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1e984ab14d3f..766ce06146b5 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -546,7 +546,9 @@ nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 	.p_proc      = NLMPROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nlmclt_encode_##argtype,		\
 	.p_decode    = (kxdrproc_t) nlmclt_decode_##restype,		\
-	.p_bufsiz    = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2	\
+	.p_bufsiz    = MAX(NLM_##argtype##_sz, NLM_##restype##_sz) << 2,	\
+	.p_statidx   = NLMPROC_##proc,					\
+	.p_name      = #proc,						\
 	}
 
 static struct rpc_procinfo	nlm_procedures[] = {
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 906ddc203186..36eb175ec335 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -551,7 +551,9 @@ nlm4clt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 	.p_proc      = NLMPROC_##proc,					\
 	.p_encode    = (kxdrproc_t) nlm4clt_encode_##argtype,		\
 	.p_decode    = (kxdrproc_t) nlm4clt_decode_##restype,		\
-	.p_bufsiz    = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2	\
+	.p_bufsiz    = MAX(NLM4_##argtype##_sz, NLM4_##restype##_sz) << 2,	\
+	.p_statidx   = NLMPROC_##proc,					\
+	.p_name      = #proc,						\
 	}
 
 static struct rpc_procinfo	nlm4_procedures[] = {
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index db99b8f678f8..4a1340358223 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -137,6 +137,8 @@ static struct rpc_procinfo	mnt_procedures[] = {
 	  .p_encode		= (kxdrproc_t) xdr_encode_dirpath,	
 	  .p_decode		= (kxdrproc_t) xdr_decode_fhstatus,
 	  .p_bufsiz		= MNT_dirpath_sz << 2,
+	  .p_statidx		= MNTPROC_MNT,
+	  .p_name		= "MOUNT",
 	},
 };
 
@@ -146,6 +148,8 @@ static struct rpc_procinfo mnt3_procedures[] = {
 	  .p_encode		= (kxdrproc_t) xdr_encode_dirpath,
 	  .p_decode		= (kxdrproc_t) xdr_decode_fhstatus3,
 	  .p_bufsiz		= MNT_dirpath_sz << 2,
+	  .p_statidx		= MOUNTPROC3_MNT,
+	  .p_name		= "MOUNT",
 	},
 };
 
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7fc0560c89c9..8cdc792ff3c7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -682,7 +682,9 @@ nfs_stat_to_errno(int stat)
 	.p_encode   =  (kxdrproc_t) nfs_xdr_##argtype,			\
 	.p_decode   =  (kxdrproc_t) nfs_xdr_##restype,			\
 	.p_bufsiz   =  MAX(NFS_##argtype##_sz,NFS_##restype##_sz) << 2,	\
-	.p_timer    =  timer						\
+	.p_timer    =  timer,						\
+	.p_statidx  =  NFSPROC_##proc,					\
+	.p_name     =  #proc,						\
 	}
 struct rpc_procinfo	nfs_procedures[] = {
     PROC(GETATTR,	fhandle,	attrstat, 1),
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index b6c0b5012bce..2d8701a230f0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1109,7 +1109,9 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
 	.p_encode    = (kxdrproc_t) nfs3_xdr_##argtype,			\
 	.p_decode    = (kxdrproc_t) nfs3_xdr_##restype,			\
 	.p_bufsiz    = MAX(NFS3_##argtype##_sz,NFS3_##restype##_sz) << 2,	\
-	.p_timer     = timer						\
+	.p_timer     = timer,						\
+	.p_statidx   = NFS3PROC_##proc,					\
+	.p_name      = #proc,						\
 	}
 
 struct rpc_procinfo	nfs3_procedures[] = {
@@ -1150,6 +1152,7 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
 		.p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
 		.p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2,
 		.p_timer = 1,
+		.p_name = "GETACL",
 	},
 	[ACLPROC3_SETACL] = {
 		.p_proc = ACLPROC3_SETACL,
@@ -1157,6 +1160,7 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {
 		.p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
 		.p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2,
 		.p_timer = 0,
+		.p_name = "SETACL",
 	},
 };
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4bbf5ef57785..b95675349ba3 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4344,6 +4344,8 @@ nfs_stat_to_errno(int stat)
 	.p_encode = (kxdrproc_t) nfs4_xdr_##argtype,		\
 	.p_decode = (kxdrproc_t) nfs4_xdr_##restype,		\
 	.p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,	\
+	.p_statidx = NFSPROC4_CLNT_##proc,			\
+	.p_name   = #proc,					\
     }
 
 struct rpc_procinfo	nfs4_procedures[] = {
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index d828662d737d..4f391cbf2fd1 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -326,6 +326,8 @@ out:
         .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,                    \
         .p_decode = (kxdrproc_t) nfs4_xdr_##restype,                    \
         .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,  \
+        .p_statidx = NFSPROC4_CB_##call,				\
+	.p_name   = #proc,                                              \
 }
 
 static struct rpc_procinfo     nfs4_cb_procedures[] = {
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 0f3662002ffc..3bec751ee249 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -101,6 +101,8 @@ struct rpc_procinfo {
 	unsigned int		p_bufsiz;	/* req. buffer size */
 	unsigned int		p_count;	/* call count */
 	unsigned int		p_timer;	/* Which RTT timer to use */
+	u32			p_statidx;	/* Which procedure to account */
+	char *			p_name;		/* name of procedure */
 };
 
 #define RPC_CONGESTED(clnt)	(RPCXPRT_CONGESTED((clnt)->cl_xprt))
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 332cc5d362d5..efa00bd9ff9e 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -261,6 +261,8 @@ static struct rpc_procinfo	pmap_procedures[] = {
 	  .p_decode		= (kxdrproc_t) xdr_decode_bool,
 	  .p_bufsiz		= 4,
 	  .p_count		= 1,
+	  .p_statidx		= PMAP_SET,
+	  .p_name		= "SET",
 	},
 [PMAP_UNSET] = {
 	  .p_proc		= PMAP_UNSET,
@@ -268,6 +270,8 @@ static struct rpc_procinfo	pmap_procedures[] = {
 	  .p_decode		= (kxdrproc_t) xdr_decode_bool,
 	  .p_bufsiz		= 4,
 	  .p_count		= 1,
+	  .p_statidx		= PMAP_UNSET,
+	  .p_name		= "UNSET",
 	},
 [PMAP_GETPORT] = {
 	  .p_proc		= PMAP_GETPORT,
@@ -275,6 +279,8 @@ static struct rpc_procinfo	pmap_procedures[] = {
 	  .p_decode		= (kxdrproc_t) xdr_decode_port,
 	  .p_bufsiz		= 4,
 	  .p_count		= 1,
+	  .p_statidx		= PMAP_GETPORT,
+	  .p_name		= "GETPORT",
 	},
 };
 
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 24ac7163b9c7..53746793eca2 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -151,7 +151,7 @@ void rpc_count_iostats(struct rpc_task *task)
 
 	if (!stats || !req)
 		return;
-	op_metrics = &stats[task->tk_msg.rpc_proc->p_proc];
+	op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx];
 
 	op_metrics->om_ops++;
 	op_metrics->om_ntrans += req->rq_ntrans;
@@ -176,6 +176,16 @@ void rpc_count_iostats(struct rpc_task *task)
 	op_metrics->om_execute += execute;
 }
 
+void _print_name(struct seq_file *seq, unsigned int op, struct rpc_procinfo *procs)
+{
+	if (procs[op].p_name)
+		seq_printf(seq, "\t%12s: ", procs[op].p_name);
+	else if (op == 0)
+		seq_printf(seq, "\t        NULL: ");
+	else
+		seq_printf(seq, "\t%12u: ", op);
+}
+
 #define MILLISECS_PER_JIFFY	(1000UL / HZ)
 
 void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
@@ -197,7 +207,7 @@ void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
 	seq_printf(seq, "\tper-op statistics\n");
 	for (op = 0; op < maxproc; op++) {
 		struct rpc_iostats *metrics = &stats[op];
-		seq_printf(seq, "%12u: ", op);
+		_print_name(seq, op, clnt->cl_procinfo);
 		seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
 				metrics->om_ops,
 				metrics->om_ntrans,
-- 
cgit v1.2.3


From dead28da8e3fb32601d38fb32b7021122e0a3d21 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:23 -0500
Subject: SUNRPC: eliminate rpc_call()

Clean-up: replace rpc_call() helper with direct call to rpc_call_sync.

This makes NFSv2 and NFSv3 synchronous calls more computationally
efficient, and reduces stack consumption in functions that used to
invoke rpc_call more than once.

Test plan:
Compile kernel with CONFIG_NFS enabled.  Connectathon on NFS version 2,
version 3, and version 4 mount points.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/mon.c               |   7 ++-
 fs/nfs/mount_clnt.c          |  13 +++-
 fs/nfs/nfs3acl.c             |  16 +++--
 fs/nfs/nfs3proc.c            | 140 +++++++++++++++++++++++++++++++------------
 fs/nfs/proc.c                | 107 ++++++++++++++++++++++++++-------
 include/linux/sunrpc/clnt.h  |  14 -----
 include/linux/sunrpc/sched.h |   1 -
 net/sunrpc/pmap_clnt.c       |  34 +++++++----
 8 files changed, 237 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 84ee39e6a3a2..5dd52b70859a 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -35,6 +35,10 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
 	struct rpc_clnt	*clnt;
 	int		status;
 	struct nsm_args	args;
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= res,
+	};
 
 	clnt = nsm_create();
 	if (IS_ERR(clnt)) {
@@ -49,7 +53,8 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
 	args.proc = NLMPROC_NSM_NOTIFY;
 	memset(res, 0, sizeof(*res));
 
-	status = rpc_call(clnt, proc, &args, res, 0);
+	msg.rpc_proc = &clnt->cl_procinfo[proc];
+	status = rpc_call_sync(clnt, &msg, 0);
 	if (status < 0)
 		printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n",
 			status);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 4a1340358223..c44d87bdddb3 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -49,9 +49,12 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
 	struct mnt_fhstatus	result = {
 		.fh		= fh
 	};
+	struct rpc_message msg	= {
+		.rpc_argp	= path,
+		.rpc_resp	= &result,
+	};
 	char			hostname[32];
 	int			status;
-	int			call;
 
 	dprintk("NFS:      nfs_mount(%08x:%s)\n",
 			(unsigned)ntohl(addr->sin_addr.s_addr), path);
@@ -61,8 +64,12 @@ nfsroot_mount(struct sockaddr_in *addr, char *path, struct nfs_fh *fh,
 	if (IS_ERR(mnt_clnt))
 		return PTR_ERR(mnt_clnt);
 
-	call = (version == NFS_MNT3_VERSION) ? MOUNTPROC3_MNT : MNTPROC_MNT;
-	status = rpc_call(mnt_clnt, call, path, &result, 0);
+	if (version == NFS_MNT3_VERSION)
+		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
+	else
+		msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
+
+	status = rpc_call_sync(mnt_clnt, &msg, 0);
 	return status < 0? status : (result.status? -EACCES : 0);
 }
 
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 6a5bbc0ae941..33287879bd23 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -190,6 +190,10 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 	struct nfs3_getaclres res = {
 		.fattr =	&fattr,
 	};
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+	};
 	struct posix_acl *acl;
 	int status, count;
 
@@ -218,8 +222,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 		return NULL;
 
 	dprintk("NFS call getacl\n");
-	status = rpc_call(server->client_acl, ACLPROC3_GETACL,
-			  &args, &res, 0);
+	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+	status = rpc_call_sync(server->client_acl, &msg, 0);
 	dprintk("NFS reply getacl: %d\n", status);
 
 	/* pages may have been allocated at the xdr layer. */
@@ -286,6 +290,10 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		.acl_access = acl,
 		.pages = pages,
 	};
+	struct rpc_message msg = {
+		.rpc_argp	= &args,
+		.rpc_resp	= &fattr,
+	};
 	int status, count;
 
 	status = -EOPNOTSUPP;
@@ -306,8 +314,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 
 	dprintk("NFS call setacl\n");
 	nfs_begin_data_update(inode);
-	status = rpc_call(server->client_acl, ACLPROC3_SETACL,
-			  &args, &fattr, 0);
+	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+	status = rpc_call_sync(server->client_acl, &msg, 0);
 	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
 	spin_unlock(&inode->i_lock);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 7204ba5b2bf8..740f8b1ab04d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -43,21 +43,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 	return res;
 }
 
-static inline int
-nfs3_rpc_call_wrapper(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags)
-{
-	struct rpc_message msg = {
-		.rpc_proc	= &clnt->cl_procinfo[proc],
-		.rpc_argp	= argp,
-		.rpc_resp	= resp,
-	};
-	return nfs3_rpc_wrapper(clnt, &msg, flags);
-}
-
-#define rpc_call(clnt, proc, argp, resp, flags) \
-		nfs3_rpc_call_wrapper(clnt, proc, argp, resp, flags)
-#define rpc_call_sync(clnt, msg, flags) \
-		nfs3_rpc_wrapper(clnt, msg, flags)
+#define rpc_call_sync(clnt, msg, flags)	nfs3_rpc_wrapper(clnt, msg, flags)
 
 static int
 nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
@@ -75,14 +61,21 @@ static int
 do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
 		 struct nfs_fsinfo *info)
 {
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
 	int	status;
 
 	dprintk("%s: call  fsinfo\n", __FUNCTION__);
 	nfs_fattr_init(info->fattr);
-	status = rpc_call(client, NFS3PROC_FSINFO, fhandle, info, 0);
+	status = rpc_call_sync(client, &msg, 0);
 	dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status);
 	if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
-		status = rpc_call(client, NFS3PROC_GETATTR, fhandle, info->fattr, 0);
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+		msg.rpc_resp = info->fattr;
+		status = rpc_call_sync(client, &msg, 0);
 		dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
 	}
 	return status;
@@ -110,12 +103,16 @@ static int
 nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 		struct nfs_fattr *fattr)
 {
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
 	int	status;
 
 	dprintk("NFS call  getattr\n");
 	nfs_fattr_init(fattr);
-	status = rpc_call(server->client, NFS3PROC_GETATTR,
-			  fhandle, fattr, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
 	dprintk("NFS reply getattr: %d\n", status);
 	return status;
 }
@@ -129,11 +126,16 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		.fh		= NFS_FH(inode),
 		.sattr		= sattr,
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_SETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= fattr,
+	};
 	int	status;
 
 	dprintk("NFS call  setattr\n");
 	nfs_fattr_init(fattr);
-	status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	if (status == 0)
 		nfs_setattr_update_inode(inode, sattr);
 	dprintk("NFS reply setattr: %d\n", status);
@@ -155,15 +157,23 @@ nfs3_proc_lookup(struct inode *dir, struct qstr *name,
 		.fh		= fhandle,
 		.fattr		= fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_LOOKUP],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int			status;
 
 	dprintk("NFS call  lookup %s\n", name->name);
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_LOOKUP, &arg, &res, 0);
-	if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR))
-		status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR,
-			 fhandle, fattr, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) {
+		msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
+		msg.rpc_argp = fhandle;
+		msg.rpc_resp = fattr;
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	}
 	dprintk("NFS reply lookup: %d\n", status);
 	if (status >= 0)
 		status = nfs_refresh_inode(dir, &dir_attr);
@@ -183,7 +193,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_ACCESS],
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
-		.rpc_cred	= entry->cred
+		.rpc_cred	= entry->cred,
 	};
 	int mode = entry->mask;
 	int status;
@@ -229,12 +239,16 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
 		.pglen		= pglen,
 		.pages		= &page
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_READLINK],
+		.rpc_argp	= &args,
+		.rpc_resp	= &fattr,
+	};
 	int			status;
 
 	dprintk("NFS call  readlink\n");
 	nfs_fattr_init(&fattr);
-	status = rpc_call(NFS_CLIENT(inode), NFS3PROC_READLINK,
-			  &args, &fattr, 0);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	nfs_refresh_inode(inode, &fattr);
 	dprintk("NFS reply readlink: %d\n", status);
 	return status;
@@ -330,6 +344,11 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		.fh		= &fhandle,
 		.fattr		= &fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_CREATE],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	mode_t mode = sattr->ia_mode;
 	int status;
 
@@ -346,8 +365,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 again:
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(&fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_CREATE, &arg, &res, 0);
-	nfs_post_op_update_inode(dir, &dir_attr);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nfs_refresh_inode(dir, &dir_attr);
 
 	/* If the server doesn't support the exclusive creation semantics,
 	 * try again with simple 'guarded' mode. */
@@ -477,12 +496,17 @@ nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.fromattr	= &old_dir_attr,
 		.toattr		= &new_dir_attr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_RENAME],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int			status;
 
 	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name);
 	nfs_fattr_init(&old_dir_attr);
 	nfs_fattr_init(&new_dir_attr);
-	status = rpc_call(NFS_CLIENT(old_dir), NFS3PROC_RENAME, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
 	nfs_post_op_update_inode(old_dir, &old_dir_attr);
 	nfs_post_op_update_inode(new_dir, &new_dir_attr);
 	dprintk("NFS reply rename: %d\n", status);
@@ -503,12 +527,17 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 		.dir_attr	= &dir_attr,
 		.fattr		= &fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_LINK],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int			status;
 
 	dprintk("NFS call  link %s\n", name->name);
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(&fattr);
-	status = rpc_call(NFS_CLIENT(inode), NFS3PROC_LINK, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	nfs_post_op_update_inode(dir, &dir_attr);
 	nfs_post_op_update_inode(inode, &fattr);
 	dprintk("NFS reply link: %d\n", status);
@@ -534,6 +563,11 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
 		.fh		= fhandle,
 		.fattr		= fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_SYMLINK],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int			status;
 
 	if (path->len > NFS3_MAXPATHLEN)
@@ -541,7 +575,7 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
 	dprintk("NFS call  symlink %s -> %s\n", name->name, path->name);
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_SYMLINK, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_post_op_update_inode(dir, &dir_attr);
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
@@ -563,6 +597,11 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 		.fh		= &fhandle,
 		.fattr		= &fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_MKDIR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int mode = sattr->ia_mode;
 	int status;
 
@@ -572,7 +611,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(&fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_post_op_update_inode(dir, &dir_attr);
 	if (status != 0)
 		goto out;
@@ -594,11 +633,16 @@ nfs3_proc_rmdir(struct inode *dir, struct qstr *name)
 		.name		= name->name,
 		.len		= name->len
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_RMDIR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &dir_attr,
+	};
 	int			status;
 
 	dprintk("NFS call  rmdir %s\n", name->name);
 	nfs_fattr_init(&dir_attr);
-	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_RMDIR, &arg, &dir_attr, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_post_op_update_inode(dir, &dir_attr);
 	dprintk("NFS reply rmdir: %d\n", status);
 	return status;
@@ -675,6 +719,11 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		.fh		= &fh,
 		.fattr		= &fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_MKNOD],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	mode_t mode = sattr->ia_mode;
 	int status;
 
@@ -693,7 +742,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(&fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_post_op_update_inode(dir, &dir_attr);
 	if (status != 0)
 		goto out;
@@ -710,11 +759,16 @@ static int
 nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 		 struct nfs_fsstat *stat)
 {
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSSTAT],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= stat,
+	};
 	int	status;
 
 	dprintk("NFS call  fsstat\n");
 	nfs_fattr_init(stat->fattr);
-	status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
 	dprintk("NFS reply statfs: %d\n", status);
 	return status;
 }
@@ -723,11 +777,16 @@ static int
 nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 		 struct nfs_fsinfo *info)
 {
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_FSINFO],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
 	int	status;
 
 	dprintk("NFS call  fsinfo\n");
 	nfs_fattr_init(info->fattr);
-	status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0);
+	status = rpc_call_sync(server->client_sys, &msg, 0);
 	dprintk("NFS reply fsinfo: %d\n", status);
 	return status;
 }
@@ -736,11 +795,16 @@ static int
 nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 		   struct nfs_pathconf *info)
 {
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs3_procedures[NFS3PROC_PATHCONF],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= info,
+	};
 	int	status;
 
 	dprintk("NFS call  pathconf\n");
 	nfs_fattr_init(info->fattr);
-	status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
 	dprintk("NFS reply pathconf: %d\n", status);
 	return status;
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index f5150d71c03d..2b051ab8bea8 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -58,16 +58,23 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 {
 	struct nfs_fattr *fattr = info->fattr;
 	struct nfs2_fsstat fsinfo;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
 	int status;
 
 	dprintk("%s: call getattr\n", __FUNCTION__);
 	nfs_fattr_init(fattr);
-	status = rpc_call(server->client_sys, NFSPROC_GETATTR, fhandle, fattr, 0);
+	status = rpc_call_sync(server->client_sys, &msg, 0);
 	dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
 	if (status)
 		return status;
 	dprintk("%s: call statfs\n", __FUNCTION__);
-	status = rpc_call(server->client_sys, NFSPROC_STATFS, fhandle, &fsinfo, 0);
+	msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
+	msg.rpc_resp = &fsinfo;
+	status = rpc_call_sync(server->client_sys, &msg, 0);
 	dprintk("%s: reply statfs: %d\n", __FUNCTION__, status);
 	if (status)
 		return status;
@@ -90,12 +97,16 @@ static int
 nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 		struct nfs_fattr *fattr)
 {
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= fattr,
+	};
 	int	status;
 
 	dprintk("NFS call  getattr\n");
 	nfs_fattr_init(fattr);
-	status = rpc_call(server->client, NFSPROC_GETATTR,
-				fhandle, fattr, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
 	dprintk("NFS reply getattr: %d\n", status);
 	return status;
 }
@@ -109,6 +120,11 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		.fh	= NFS_FH(inode),
 		.sattr	= sattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_SETATTR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= fattr,
+	};
 	int	status;
 
 	/* Mask out the non-modebit related stuff from attr->ia_mode */
@@ -116,7 +132,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 	dprintk("NFS call  setattr\n");
 	nfs_fattr_init(fattr);
-	status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	if (status == 0)
 		nfs_setattr_update_inode(inode, sattr);
 	dprintk("NFS reply setattr: %d\n", status);
@@ -136,11 +152,16 @@ nfs_proc_lookup(struct inode *dir, struct qstr *name,
 		.fh		= fhandle,
 		.fattr		= fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_LOOKUP],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int			status;
 
 	dprintk("NFS call  lookup %s\n", name->name);
 	nfs_fattr_init(fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFSPROC_LOOKUP, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
@@ -154,10 +175,14 @@ static int nfs_proc_readlink(struct inode *inode, struct page *page,
 		.pglen		= pglen,
 		.pages		= &page
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_READLINK],
+		.rpc_argp	= &args,
+	};
 	int			status;
 
 	dprintk("NFS call  readlink\n");
-	status = rpc_call(NFS_CLIENT(inode), NFSPROC_READLINK, &args, NULL, 0);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	dprintk("NFS reply readlink: %d\n", status);
 	return status;
 }
@@ -233,11 +258,16 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		.fh		= &fhandle,
 		.fattr		= &fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int			status;
 
 	nfs_fattr_init(&fattr);
 	dprintk("NFS call  create %s\n", dentry->d_name.name);
-	status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	if (status == 0)
 		status = nfs_instantiate(dentry, &fhandle, &fattr);
 	dprintk("NFS reply create: %d\n", status);
@@ -263,6 +293,11 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		.fh		= &fhandle,
 		.fattr		= &fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_CREATE],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int status, mode;
 
 	dprintk("NFS call  mknod %s\n", dentry->d_name.name);
@@ -277,13 +312,13 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	}
 
 	nfs_fattr_init(&fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 
 	if (status == -EINVAL && S_ISFIFO(mode)) {
 		sattr->ia_mode = mode;
 		nfs_fattr_init(&fattr);
-		status = rpc_call(NFS_CLIENT(dir), NFSPROC_CREATE, &arg, &res, 0);
+		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
 		status = nfs_instantiate(dentry, &fhandle, &fattr);
@@ -302,8 +337,6 @@ nfs_proc_remove(struct inode *dir, struct qstr *name)
 	struct rpc_message	msg = { 
 		.rpc_proc	= &nfs_procedures[NFSPROC_REMOVE],
 		.rpc_argp	= &arg,
-		.rpc_resp	= NULL,
-		.rpc_cred	= NULL
 	};
 	int			status;
 
@@ -355,10 +388,14 @@ nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.toname		= new_name->name,
 		.tolen		= new_name->len
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_RENAME],
+		.rpc_argp	= &arg,
+	};
 	int			status;
 
 	dprintk("NFS call  rename %s -> %s\n", old_name->name, new_name->name);
-	status = rpc_call(NFS_CLIENT(old_dir), NFSPROC_RENAME, &arg, NULL, 0);
+	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
 	nfs_mark_for_revalidate(old_dir);
 	nfs_mark_for_revalidate(new_dir);
 	dprintk("NFS reply rename: %d\n", status);
@@ -374,10 +411,14 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 		.toname		= name->name,
 		.tolen		= name->len
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_LINK],
+		.rpc_argp	= &arg,
+	};
 	int			status;
 
 	dprintk("NFS call  link %s\n", name->name);
-	status = rpc_call(NFS_CLIENT(inode), NFSPROC_LINK, &arg, NULL, 0);
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
 	nfs_mark_for_revalidate(inode);
 	nfs_mark_for_revalidate(dir);
 	dprintk("NFS reply link: %d\n", status);
@@ -397,6 +438,10 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
 		.tolen		= path->len,
 		.sattr		= sattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_SYMLINK],
+		.rpc_argp	= &arg,
+	};
 	int			status;
 
 	if (path->len > NFS2_MAXPATHLEN)
@@ -404,7 +449,7 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
 	dprintk("NFS call  symlink %s -> %s\n", name->name, path->name);
 	nfs_fattr_init(fattr);
 	fhandle->size = 0;
-	status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
@@ -425,11 +470,16 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 		.fh		= &fhandle,
 		.fattr		= &fattr
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_MKDIR],
+		.rpc_argp	= &arg,
+		.rpc_resp	= &res,
+	};
 	int			status;
 
 	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
 	nfs_fattr_init(&fattr);
-	status = rpc_call(NFS_CLIENT(dir), NFSPROC_MKDIR, &arg, &res, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
 		status = nfs_instantiate(dentry, &fhandle, &fattr);
@@ -445,10 +495,14 @@ nfs_proc_rmdir(struct inode *dir, struct qstr *name)
 		.name		= name->name,
 		.len		= name->len
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_RMDIR],
+		.rpc_argp	= &arg,
+	};
 	int			status;
 
 	dprintk("NFS call  rmdir %s\n", name->name);
-	status = rpc_call(NFS_CLIENT(dir), NFSPROC_RMDIR, &arg, NULL, 0);
+	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	dprintk("NFS reply rmdir: %d\n", status);
 	return status;
@@ -470,13 +524,12 @@ nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		.fh		= NFS_FH(dir),
 		.cookie		= cookie,
 		.count		= count,
-		.pages		= &page
+		.pages		= &page,
 	};
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
 		.rpc_argp	= &arg,
-		.rpc_resp	= NULL,
-		.rpc_cred	= cred
+		.rpc_cred	= cred,
 	};
 	int			status;
 
@@ -495,11 +548,16 @@ nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 			struct nfs_fsstat *stat)
 {
 	struct nfs2_fsstat fsinfo;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_STATFS],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= &fsinfo,
+	};
 	int	status;
 
 	dprintk("NFS call  statfs\n");
 	nfs_fattr_init(stat->fattr);
-	status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
 	dprintk("NFS reply statfs: %d\n", status);
 	if (status)
 		goto out;
@@ -518,11 +576,16 @@ nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 			struct nfs_fsinfo *info)
 {
 	struct nfs2_fsstat fsinfo;
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs_procedures[NFSPROC_STATFS],
+		.rpc_argp	= fhandle,
+		.rpc_resp	= &fsinfo,
+	};
 	int	status;
 
 	dprintk("NFS call  fsinfo\n");
 	nfs_fattr_init(info->fattr);
-	status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
 	dprintk("NFS reply fsinfo: %d\n", status);
 	if (status)
 		goto out;
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 3bec751ee249..e37c06128e51 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -140,20 +140,6 @@ size_t		rpc_max_payload(struct rpc_clnt *);
 void		rpc_force_rebind(struct rpc_clnt *);
 int		rpc_ping(struct rpc_clnt *clnt, int flags);
 
-static __inline__
-int rpc_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags)
-{
-	struct rpc_message msg = {
-		.rpc_proc	= &clnt->cl_procinfo[proc],
-		.rpc_argp	= argp,
-		.rpc_resp	= resp,
-		.rpc_cred	= NULL
-	};
-	return rpc_call_sync(clnt, &msg, flags);
-}
-		
-extern void rpciod_wake_up(void);
-
 /*
  * Helper function for NFSroot support
  */
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 45a64ae963ee..82a91bb22362 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -276,7 +276,6 @@ void *		rpc_malloc(struct rpc_task *, size_t);
 void		rpc_free(struct rpc_task *);
 int		rpciod_up(void);
 void		rpciod_down(void);
-void		rpciod_wake_up(void);
 int		__rpc_wait_for_completion_task(struct rpc_task *task, int (*)(void *));
 #ifdef RPC_DEBUG
 void		rpc_show_tasks(void);
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index efa00bd9ff9e..d25b054ec921 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -104,6 +104,11 @@ rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot)
 		.pm_prot	= prot,
 		.pm_port	= 0
 	};
+	struct rpc_message msg = {
+		.rpc_proc	= &pmap_procedures[PMAP_GETPORT],
+		.rpc_argp	= &map,
+		.rpc_resp	= &map.pm_port,
+	};
 	struct rpc_clnt	*pmap_clnt;
 	char		hostname[32];
 	int		status;
@@ -117,7 +122,7 @@ rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot)
 		return PTR_ERR(pmap_clnt);
 
 	/* Setup the call info struct */
-	status = rpc_call(pmap_clnt, PMAP_GETPORT, &map, &map.pm_port, 0);
+	status = rpc_call_sync(pmap_clnt, &msg, 0);
 
 	if (status >= 0) {
 		if (map.pm_port != 0)
@@ -162,16 +167,27 @@ pmap_getport_done(struct rpc_task *task)
 int
 rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 {
-	struct sockaddr_in	sin;
-	struct rpc_portmap	map;
+	struct sockaddr_in	sin = {
+		.sin_family	= AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+	};
+	struct rpc_portmap	map = {
+		.pm_prog	= prog,
+		.pm_vers	= vers,
+		.pm_prot	= prot,
+		.pm_port	= port,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &pmap_procedures[port ? PMAP_SET : PMAP_UNSET],
+		.rpc_argp	= &map,
+		.rpc_resp	= okay,
+	};
 	struct rpc_clnt		*pmap_clnt;
 	int error = 0;
 
 	dprintk("RPC: registering (%d, %d, %d, %d) with portmapper.\n",
 			prog, vers, prot, port);
 
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 	pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP, 1);
 	if (IS_ERR(pmap_clnt)) {
 		error = PTR_ERR(pmap_clnt);
@@ -179,13 +195,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 		return error;
 	}
 
-	map.pm_prog = prog;
-	map.pm_vers = vers;
-	map.pm_prot = prot;
-	map.pm_port = port;
-
-	error = rpc_call(pmap_clnt, port? PMAP_SET : PMAP_UNSET,
-					&map, okay, 0);
+	error = rpc_call_sync(pmap_clnt, &msg, 0);
 
 	if (error < 0) {
 		printk(KERN_WARNING
-- 
cgit v1.2.3


From 2e0af86f618c697b44e2d67dff151256c58201c4 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Mon, 20 Mar 2006 13:44:26 -0500
Subject: locks: remove unused posix_block_lock

posix_lock_file() is used to add a blocked lock to Lockd's block, so
posix_block_lock() is no longer needed.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/locks.c         | 15 ---------------
 include/linux/fs.h |  1 -
 2 files changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index d2c5306e3db0..cb940b142c5f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1954,21 +1954,6 @@ void locks_remove_flock(struct file *filp)
 	unlock_kernel();
 }
 
-/**
- *	posix_block_lock - blocks waiting for a file lock
- *	@blocker: the lock which is blocking
- *	@waiter: the lock which conflicts and has to wait
- *
- * lockd needs to block waiting for locks.
- */
-void
-posix_block_lock(struct file_lock *blocker, struct file_lock *waiter)
-{
-	locks_insert_block(blocker, waiter);
-}
-
-EXPORT_SYMBOL(posix_block_lock);
-
 /**
  *	posix_unblock_lock - stop waiting for a file lock
  *      @filp:   how the file was opened
diff --git a/include/linux/fs.h b/include/linux/fs.h
index be21e860a9f2..b01482c721ae 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -757,7 +757,6 @@ extern void locks_remove_flock(struct file *);
 extern struct file_lock *posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
-extern void posix_block_lock(struct file_lock *, struct file_lock *);
 extern int posix_unblock_lock(struct file *, struct file_lock *);
 extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
-- 
cgit v1.2.3


From 8dc7c3115b611c00006eac3ee5b108296432aab7 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Mon, 20 Mar 2006 13:44:26 -0500
Subject: locks,lockd: fix race in nlmsvc_testlock

posix_test_lock() returns a pointer to a struct file_lock which is unprotected
and can be removed while in use by the caller.  Move the conflicting lock from
the return to a parameter, and copy the conflicting lock.

In most cases the caller ends up putting the copy of the conflicting lock on
the stack.  On i386, sizeof(struct file_lock) appears to be about 100 bytes.
We're assuming that's reasonable.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svclock.c  | 12 +++++-------
 fs/locks.c          | 21 +++++++++++++--------
 fs/nfs/file.c       |  7 +++----
 fs/nfsd/nfs4state.c | 13 ++++++-------
 include/linux/fs.h  |  2 +-
 5 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index f5398097b84b..d683dd022e08 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -376,8 +376,6 @@ u32
 nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
 				       struct nlm_lock *conflock)
 {
-	struct file_lock	*fl;
-
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				file->f_file->f_dentry->d_inode->i_sb->s_id,
 				file->f_file->f_dentry->d_inode->i_ino,
@@ -385,14 +383,14 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
-	if ((fl = posix_test_lock(file->f_file, &lock->fl)) != NULL) {
+	if (posix_test_lock(file->f_file, &lock->fl, &conflock->fl)) {
 		dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
-				fl->fl_type, (long long)fl->fl_start,
-				(long long)fl->fl_end);
+				conflock->fl.fl_type,
+				(long long)conflock->fl.fl_start,
+				(long long)conflock->fl.fl_end);
 		conflock->caller = "somehost";	/* FIXME */
 		conflock->oh.len = 0;		/* don't return OH info */
-		conflock->svid = fl->fl_pid;
-		conflock->fl = *fl;
+		conflock->svid = conflock->fl.fl_pid;
 		return nlm_lck_denied;
 	}
 
diff --git a/fs/locks.c b/fs/locks.c
index cb940b142c5f..231b23c12c3d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -672,8 +672,9 @@ static int locks_block_on_timeout(struct file_lock *blocker, struct file_lock *w
 	return result;
 }
 
-struct file_lock *
-posix_test_lock(struct file *filp, struct file_lock *fl)
+int
+posix_test_lock(struct file *filp, struct file_lock *fl,
+		struct file_lock *conflock)
 {
 	struct file_lock *cfl;
 
@@ -684,9 +685,13 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 		if (posix_locks_conflict(cfl, fl))
 			break;
 	}
+	if (cfl) {
+		locks_copy_lock(conflock, cfl);
+		unlock_kernel();
+		return 1;
+	}
 	unlock_kernel();
-
-	return (cfl);
+	return 0;
 }
 
 EXPORT_SYMBOL(posix_test_lock);
@@ -1563,7 +1568,7 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
  */
 int fcntl_getlk(struct file *filp, struct flock __user *l)
 {
-	struct file_lock *fl, file_lock;
+	struct file_lock *fl, cfl, file_lock;
 	struct flock flock;
 	int error;
 
@@ -1587,7 +1592,7 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
 		else
 		  fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
 	} else {
-		fl = posix_test_lock(filp, &file_lock);
+		fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL);
 	}
  
 	flock.l_type = F_UNLCK;
@@ -1717,7 +1722,7 @@ out:
  */
 int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
 {
-	struct file_lock *fl, file_lock;
+	struct file_lock *fl, cfl, file_lock;
 	struct flock64 flock;
 	int error;
 
@@ -1741,7 +1746,7 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
 		else
 		  fl = (file_lock.fl_type == F_UNLCK ? NULL : &file_lock);
 	} else {
-		fl = posix_test_lock(filp, &file_lock);
+		fl = (posix_test_lock(filp, &file_lock, &cfl) ? &cfl : NULL);
 	}
  
 	flock.l_type = F_UNLCK;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1cf07e4ad136..ee140c53dba6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -392,15 +392,14 @@ out_swapfile:
 
 static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct file_lock *cfl;
+	struct file_lock cfl;
 	struct inode *inode = filp->f_mapping->host;
 	int status = 0;
 
 	lock_kernel();
 	/* Try local locking first */
-	cfl = posix_test_lock(filp, fl);
-	if (cfl != NULL) {
-		locks_copy_lock(fl, cfl);
+	if (posix_test_lock(filp, fl, &cfl)) {
+		locks_copy_lock(fl, &cfl);
 		goto out;
 	}
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1143cfb64549..f6ab762bea99 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2639,7 +2639,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	struct nfs4_stateid *lock_stp;
 	struct file *filp;
 	struct file_lock file_lock;
-	struct file_lock *conflock;
+	struct file_lock conflock;
 	int status = 0;
 	unsigned int strhashval;
 
@@ -2775,11 +2775,11 @@ conflicting_lock:
 	/* XXX There is a race here. Future patch needed to provide 
 	 * an atomic posix_lock_and_test_file
 	 */
-	if (!(conflock = posix_test_lock(filp, &file_lock))) {
+	if (!posix_test_lock(filp, &file_lock, &conflock)) {
 		status = nfserr_serverfault;
 		goto out;
 	}
-	nfs4_set_lock_denied(conflock, &lock->lk_denied);
+	nfs4_set_lock_denied(&conflock, &lock->lk_denied);
 out:
 	if (status && lock->lk_is_new && lock_sop)
 		release_stateowner(lock_sop);
@@ -2800,7 +2800,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	struct inode *inode;
 	struct file file;
 	struct file_lock file_lock;
-	struct file_lock *conflicting_lock;
+	struct file_lock conflock;
 	int status;
 
 	if (nfs4_in_grace())
@@ -2864,10 +2864,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	file.f_dentry = current_fh->fh_dentry;
 
 	status = nfs_ok;
-	conflicting_lock = posix_test_lock(&file, &file_lock);
-	if (conflicting_lock) {
+	if (posix_test_lock(&file, &file_lock, &conflock)) {
 		status = nfserr_denied;
-		nfs4_set_lock_denied(conflicting_lock, &lockt->lt_denied);
+		nfs4_set_lock_denied(&conflock, &lockt->lt_denied);
 	}
 out:
 	nfs4_unlock_state();
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b01482c721ae..8ef4dd788a83 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -754,7 +754,7 @@ extern void locks_init_lock(struct file_lock *);
 extern void locks_copy_lock(struct file_lock *, struct file_lock *);
 extern void locks_remove_posix(struct file *, fl_owner_t);
 extern void locks_remove_flock(struct file *);
-extern struct file_lock *posix_test_lock(struct file *, struct file_lock *);
+extern int posix_test_lock(struct file *, struct file_lock *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file *, struct file_lock *);
-- 
cgit v1.2.3


From 7117bf3dfb10b534a017260d9fc643bc1d0afd2a Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Mon, 20 Mar 2006 13:44:26 -0500
Subject: lockd: Remove FL_LOCKD flag

Currently lockd identifies its own locks using the FL_LOCKD flag.  This
doesn't scale well to multiple lock managers--if we did this in nfsv4 too,
for example, we'd be left with only one free flag bit.

Instead, we just check whether the file manager ops (fl_lmops) set on this
lock are our own.

The only use for this is in nlm_traverse_locks, which uses it to find locks
that need cleaning up when freeing a host or a file.

In the long run it might be nice to do reference counting instead of
traversing all the locks like this....

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svclock.c | 2 --
 fs/lockd/svcsubs.c | 2 +-
 include/linux/fs.h | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d683dd022e08..d50946dcddd9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -313,8 +313,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	/* Get existing block (in case client is busy-waiting) */
 	block = nlmsvc_lookup_block(file, lock, 0);
 
-	lock->fl.fl_flags |= FL_LOCKD;
-
 again:
 	/* Lock file against concurrent access */
 	down(&file->f_sema);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 62f4a385177f..601e5b3dfe20 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -182,7 +182,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, int action)
 again:
 	file->f_locks = 0;
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
-		if (!(fl->fl_flags & FL_LOCKD))
+		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
 
 		/* update current lock count */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8ef4dd788a83..d2cffee8fc11 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -667,7 +667,6 @@ extern spinlock_t files_lock;
 #define FL_POSIX	1
 #define FL_FLOCK	2
 #define FL_ACCESS	8	/* not trying to lock, just looking */
-#define FL_LOCKD	16	/* lock held by rpc.lockd */
 #define FL_LEASE	32	/* lease held on this file */
 #define FL_SLEEP	128	/* A blocking lock */
 
-- 
cgit v1.2.3


From 788e7a89a03e364855583c0ab4649b94925efbb9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:27 -0500
Subject: NFS: Cleanup of NFS write code in preparation for asynchronous
 o_direct

This patch inverts the callback hierarchy for NFS write calls.

Instead of having the NFSv2/v3/v4-specific code set up the RPC callback
ops, we allow the original caller to do so. This allows for more
flexibility w.r.t. how to set up and tear down the nfs_write_data
structure while still allowing the NFSv3/v4 code to perform error
handling.

The greater flexibility is needed by the asynchronous O_DIRECT code, which
wants to be able to hold on to the original nfs_write_data structures after
the WRITE RPC call has completed in order to be able to replay them if the
COMMIT call determines that the server has rebooted.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3proc.c       | 66 +++++++++--------------------------
 fs/nfs/nfs4proc.c       | 54 +++++++----------------------
 fs/nfs/proc.c           | 24 +++----------
 fs/nfs/write.c          | 92 +++++++++++++++++++++++++++++++++++--------------
 include/linux/nfs_fs.h  |  7 ----
 include/linux/nfs_xdr.h |  3 +-
 6 files changed, 103 insertions(+), 143 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 740f8b1ab04d..c4f7de8830e9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -849,29 +849,17 @@ nfs3_proc_read_setup(struct nfs_read_data *data)
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void nfs3_write_done(struct rpc_task *task, void *calldata)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct nfs_write_data *data = calldata;
-
 	if (nfs3_async_handle_jukebox(task, data->inode))
-		return;
+		return -EAGAIN;
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(data->inode, data->res.fattr);
-	nfs_writeback_done(task, calldata);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs3_write_ops = {
-	.rpc_call_done = nfs3_write_done,
-	.rpc_release = nfs_writedata_release,
-};
-
-static void
-nfs3_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_write_setup(struct nfs_write_data *data, int how)
 {
-	struct rpc_task		*task = &data->task;
-	struct inode		*inode = data->inode;
-	int			stable;
-	int			flags;
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_WRITE],
 		.rpc_argp	= &data->args,
@@ -879,45 +867,28 @@ nfs3_proc_write_setup(struct nfs_write_data *data, int how)
 		.rpc_cred	= data->cred,
 	};
 
+	data->args.stable = NFS_UNSTABLE;
 	if (how & FLUSH_STABLE) {
-		if (!NFS_I(inode)->ncommit)
-			stable = NFS_FILE_SYNC;
-		else
-			stable = NFS_DATA_SYNC;
-	} else
-		stable = NFS_UNSTABLE;
-	data->args.stable = stable;
-
-	/* Set the initial flags for the task.  */
-	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+		data->args.stable = NFS_FILE_SYNC;
+		if (NFS_I(data->inode)->ncommit)
+			data->args.stable = NFS_DATA_SYNC;
+	}
 
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_write_ops, data);
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
-static void nfs3_commit_done(struct rpc_task *task, void *calldata)
+static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct nfs_write_data *data = calldata;
-
 	if (nfs3_async_handle_jukebox(task, data->inode))
-		return;
+		return -EAGAIN;
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(data->inode, data->res.fattr);
-	nfs_commit_done(task, calldata);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs3_commit_ops = {
-	.rpc_call_done = nfs3_commit_done,
-	.rpc_release = nfs_commit_release,
-};
-
-static void
-nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
 {
-	struct rpc_task		*task = &data->task;
-	struct inode		*inode = data->inode;
-	int			flags;
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_COMMIT],
 		.rpc_argp	= &data->args,
@@ -925,12 +896,7 @@ nfs3_proc_commit_setup(struct nfs_write_data *data, int how)
 		.rpc_cred	= data->cred,
 	};
 
-	/* Set the initial flags for the task.  */
-	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-
-	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_commit_ops, data);
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
 static int
@@ -970,7 +936,9 @@ struct nfs_rpc_ops	nfs_v3_clientops = {
 	.decode_dirent	= nfs3_decode_dirent,
 	.read_setup	= nfs3_proc_read_setup,
 	.write_setup	= nfs3_proc_write_setup,
+	.write_done	= nfs3_write_done,
 	.commit_setup	= nfs3_proc_commit_setup,
+	.commit_done	= nfs3_commit_done,
 	.file_open	= nfs_open,
 	.file_release	= nfs_release,
 	.lock		= nfs3_proc_lock,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f1ff4fa6cce5..ef4dc315ecc2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2388,32 +2388,23 @@ nfs4_proc_read_setup(struct nfs_read_data *data)
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void nfs4_write_done(struct rpc_task *task, void *calldata)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct nfs_write_data *data = calldata;
 	struct inode *inode = data->inode;
 	
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
 		rpc_restart_call(task);
-		return;
+		return -EAGAIN;
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), data->timestamp);
 		nfs_post_op_update_inode(inode, data->res.fattr);
 	}
-	/* Call back common NFS writeback processing */
-	nfs_writeback_done(task, calldata);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs4_write_ops = {
-	.rpc_call_done = nfs4_write_done,
-	.rpc_release = nfs_writedata_release,
-};
-
-static void
-nfs4_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_write_setup(struct nfs_write_data *data, int how)
 {
-	struct rpc_task	*task = &data->task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE],
 		.rpc_argp = &data->args,
@@ -2423,7 +2414,6 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
 	struct inode *inode = data->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
 	int stable;
-	int flags;
 	
 	if (how & FLUSH_STABLE) {
 		if (!NFS_I(inode)->ncommit)
@@ -2438,57 +2428,37 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how)
 
 	data->timestamp   = jiffies;
 
-	/* Set the initial flags for the task.  */
-	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_write_ops, data);
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
-static void nfs4_commit_done(struct rpc_task *task, void *calldata)
+static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct nfs_write_data *data = calldata;
 	struct inode *inode = data->inode;
 	
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
 		rpc_restart_call(task);
-		return;
+		return -EAGAIN;
 	}
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(inode, data->res.fattr);
-	/* Call back common NFS writeback processing */
-	nfs_commit_done(task, calldata);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs4_commit_ops = {
-	.rpc_call_done = nfs4_commit_done,
-	.rpc_release = nfs_commit_release,
-};
-
-static void
-nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
+static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
 {
-	struct rpc_task	*task = &data->task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
 		.rpc_cred = data->cred,
 	};	
-	struct inode *inode = data->inode;
-	struct nfs_server *server = NFS_SERVER(inode);
-	int flags;
+	struct nfs_server *server = NFS_SERVER(data->inode);
 	
 	data->args.bitmask = server->attr_bitmask;
 	data->res.server = server;
 
-	/* Set the initial flags for the task.  */
-	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-
-	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_commit_ops, data);
-	rpc_call_setup(task, &msg, 0);	
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
 /*
@@ -3648,7 +3618,9 @@ struct nfs_rpc_ops	nfs_v4_clientops = {
 	.decode_dirent	= nfs4_decode_dirent,
 	.read_setup	= nfs4_proc_read_setup,
 	.write_setup	= nfs4_proc_write_setup,
+	.write_done	= nfs4_write_done,
 	.commit_setup	= nfs4_proc_commit_setup,
+	.commit_done	= nfs4_commit_done,
 	.file_open      = nfs_open,
 	.file_release   = nfs_release,
 	.lock		= nfs4_proc_lock,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 2b051ab8bea8..608aa5932a1d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -654,26 +654,15 @@ nfs_proc_read_setup(struct nfs_read_data *data)
 	rpc_call_setup(task, &msg, 0);
 }
 
-static void nfs_write_done(struct rpc_task *task, void *calldata)
+static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct nfs_write_data *data = calldata;
-
 	if (task->tk_status >= 0)
 		nfs_post_op_update_inode(data->inode, data->res.fattr);
-	nfs_writeback_done(task, calldata);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs_write_ops = {
-	.rpc_call_done = nfs_write_done,
-	.rpc_release = nfs_writedata_release,
-};
-
-static void
-nfs_proc_write_setup(struct nfs_write_data *data, int how)
+static void nfs_proc_write_setup(struct nfs_write_data *data, int how)
 {
-	struct rpc_task		*task = &data->task;
-	struct inode		*inode = data->inode;
-	int			flags;
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_WRITE],
 		.rpc_argp	= &data->args,
@@ -684,12 +673,8 @@ nfs_proc_write_setup(struct nfs_write_data *data, int how)
 	/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
 	data->args.stable = NFS_FILE_SYNC;
 
-	/* Set the initial flags for the task.  */
-	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
-
 	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_write_ops, data);
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
 static void
@@ -736,6 +721,7 @@ struct nfs_rpc_ops	nfs_v2_clientops = {
 	.decode_dirent	= nfs_decode_dirent,
 	.read_setup	= nfs_proc_read_setup,
 	.write_setup	= nfs_proc_write_setup,
+	.write_done	= nfs_write_done,
 	.commit_setup	= nfs_proc_commit_setup,
 	.file_open	= nfs_open,
 	.file_release	= nfs_release,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e7c8361cf201..5912274ff1a1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -77,12 +77,14 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context*,
 					    struct inode *,
 					    struct page *,
 					    unsigned int, unsigned int);
-static void nfs_writeback_done_partial(struct nfs_write_data *, int);
-static void nfs_writeback_done_full(struct nfs_write_data *, int);
+static int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 static int nfs_wait_on_write_congestion(struct address_space *, int);
 static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int);
 static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
 			   unsigned int npages, int how);
+static const struct rpc_call_ops nfs_write_partial_ops;
+static const struct rpc_call_ops nfs_write_full_ops;
+static const struct rpc_call_ops nfs_commit_ops;
 
 static kmem_cache_t *nfs_wdata_cachep;
 mempool_t *nfs_wdata_mempool;
@@ -872,10 +874,12 @@ static inline int flush_task_priority(int how)
  */
 static void nfs_write_rpcsetup(struct nfs_page *req,
 		struct nfs_write_data *data,
+		const struct rpc_call_ops *call_ops,
 		unsigned int count, unsigned int offset,
 		int how)
 {
 	struct inode		*inode;
+	int flags;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -896,6 +900,9 @@ static void nfs_write_rpcsetup(struct nfs_page *req,
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
 
+	/* Set up the initial task struct.  */
+	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
 	NFS_PROTO(inode)->write_setup(data, how);
 
 	data->task.tk_priority = flush_task_priority(how);
@@ -959,14 +966,15 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how)
 		list_del_init(&data->pages);
 
 		data->pagevec[0] = page;
-		data->complete = nfs_writeback_done_partial;
 
 		if (nbytes > wsize) {
-			nfs_write_rpcsetup(req, data, wsize, offset, how);
+			nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
+					wsize, offset, how);
 			offset += wsize;
 			nbytes -= wsize;
 		} else {
-			nfs_write_rpcsetup(req, data, nbytes, offset, how);
+			nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
+					nbytes, offset, how);
 			nbytes = 0;
 		}
 		nfs_execute_write(data);
@@ -1020,9 +1028,8 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how)
 	}
 	req = nfs_list_entry(data->pages.next);
 
-	data->complete = nfs_writeback_done_full;
 	/* Set up the argument struct */
-	nfs_write_rpcsetup(req, data, count, 0, how);
+	nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
 
 	nfs_execute_write(data);
 	return 0;
@@ -1066,8 +1073,9 @@ nfs_flush_list(struct list_head *head, int wpages, int how)
 /*
  * Handle a write reply that flushed part of a page.
  */
-static void nfs_writeback_done_partial(struct nfs_write_data *data, int status)
+static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
 {
+	struct nfs_write_data	*data = calldata;
 	struct nfs_page		*req = data->req;
 	struct page		*page = req->wb_page;
 
@@ -1077,11 +1085,14 @@ static void nfs_writeback_done_partial(struct nfs_write_data *data, int status)
 		req->wb_bytes,
 		(long long)req_offset(req));
 
-	if (status < 0) {
+	if (nfs_writeback_done(task, data) != 0)
+		return;
+
+	if (task->tk_status < 0) {
 		ClearPageUptodate(page);
 		SetPageError(page);
-		req->wb_context->error = status;
-		dprintk(", error = %d\n", status);
+		req->wb_context->error = task->tk_status;
+		dprintk(", error = %d\n", task->tk_status);
 	} else {
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 		if (data->verf.committed < NFS_FILE_SYNC) {
@@ -1102,6 +1113,11 @@ static void nfs_writeback_done_partial(struct nfs_write_data *data, int status)
 		nfs_writepage_release(req);
 }
 
+static const struct rpc_call_ops nfs_write_partial_ops = {
+	.rpc_call_done = nfs_writeback_done_partial,
+	.rpc_release = nfs_writedata_release,
+};
+
 /*
  * Handle a write reply that flushes a whole page.
  *
@@ -1109,11 +1125,15 @@ static void nfs_writeback_done_partial(struct nfs_write_data *data, int status)
  *	  writebacks since the page->count is kept > 1 for as long
  *	  as the page has a write request pending.
  */
-static void nfs_writeback_done_full(struct nfs_write_data *data, int status)
+static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 {
+	struct nfs_write_data	*data = calldata;
 	struct nfs_page		*req;
 	struct page		*page;
 
+	if (nfs_writeback_done(task, data) != 0)
+		return;
+
 	/* Update attributes as result of writeback. */
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
@@ -1126,13 +1146,13 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status)
 			req->wb_bytes,
 			(long long)req_offset(req));
 
-		if (status < 0) {
+		if (task->tk_status < 0) {
 			ClearPageUptodate(page);
 			SetPageError(page);
-			req->wb_context->error = status;
+			req->wb_context->error = task->tk_status;
 			end_page_writeback(page);
 			nfs_inode_remove_request(req);
-			dprintk(", error = %d\n", status);
+			dprintk(", error = %d\n", task->tk_status);
 			goto next;
 		}
 		end_page_writeback(page);
@@ -1154,18 +1174,28 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status)
 	}
 }
 
+static const struct rpc_call_ops nfs_write_full_ops = {
+	.rpc_call_done = nfs_writeback_done_full,
+	.rpc_release = nfs_writedata_release,
+};
+
+
 /*
  * This function is called when the WRITE call is complete.
  */
-void nfs_writeback_done(struct rpc_task *task, void *calldata)
+static int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 {
-	struct nfs_write_data	*data = calldata;
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
+	int status;
 
 	dprintk("NFS: %4d nfs_writeback_done (status %d)\n",
 		task->tk_pid, task->tk_status);
 
+	/* Call the NFS version-specific code */
+	status = NFS_PROTO(data->inode)->write_done(task, data);
+	if (status != 0)
+		return status;
 	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1210,7 +1240,7 @@ void nfs_writeback_done(struct rpc_task *task, void *calldata)
 				argp->stable = NFS_FILE_SYNC;
 			}
 			rpc_restart_call(task);
-			return;
+			return -EAGAIN;
 		}
 		if (time_before(complain, jiffies)) {
 			printk(KERN_WARNING
@@ -1221,11 +1251,7 @@ void nfs_writeback_done(struct rpc_task *task, void *calldata)
 		/* Can't do anything about it except throw an error. */
 		task->tk_status = -EIO;
 	}
-
-	/*
-	 * Process the nfs_page list
-	 */
-	data->complete(data, task->tk_status);
+	return 0;
 }
 
 
@@ -1239,10 +1265,12 @@ void nfs_commit_release(void *wdata)
  * Set up the argument/result storage required for the RPC call.
  */
 static void nfs_commit_rpcsetup(struct list_head *head,
-		struct nfs_write_data *data, int how)
+		struct nfs_write_data *data,
+		int how)
 {
 	struct nfs_page		*first;
 	struct inode		*inode;
+	int flags;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -1262,7 +1290,10 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 	data->res.fattr   = &data->fattr;
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
-	
+
+	/* Set up the initial task struct.  */
+	flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
+	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, &nfs_commit_ops, data);
 	NFS_PROTO(inode)->commit_setup(data, how);
 
 	data->task.tk_priority = flush_task_priority(how);
@@ -1303,7 +1334,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 /*
  * COMMIT call returned
  */
-void nfs_commit_done(struct rpc_task *task, void *calldata)
+static void nfs_commit_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
 	struct nfs_page		*req;
@@ -1312,6 +1343,10 @@ void nfs_commit_done(struct rpc_task *task, void *calldata)
         dprintk("NFS: %4d nfs_commit_done (status %d)\n",
                                 task->tk_pid, task->tk_status);
 
+	/* Call the NFS version-specific code */
+	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
+		return;
+
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
 		nfs_list_remove_request(req);
@@ -1345,6 +1380,11 @@ void nfs_commit_done(struct rpc_task *task, void *calldata)
 	}
 	sub_page_state(nr_unstable,res);
 }
+
+static const struct rpc_call_ops nfs_commit_ops = {
+	.rpc_call_done = nfs_commit_done,
+	.rpc_release = nfs_commit_release,
+};
 #endif
 
 static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b71da4d4b137..782e59765696 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -407,13 +407,6 @@ extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
-extern void nfs_writeback_done(struct rpc_task *task, void *data);
-extern void nfs_writedata_release(void *data);
-
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-extern void nfs_commit_done(struct rpc_task *, void *data);
-extern void nfs_commit_release(void *data);
-#endif
 
 /*
  * Try to write back everything synchronously (but check the
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6d6f69ec5675..277750cc70c0 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -714,7 +714,6 @@ struct nfs_write_data {
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
-	void (*complete) (struct nfs_write_data *, int);
 	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
 };
 
@@ -770,7 +769,9 @@ struct nfs_rpc_ops {
 	u32 *	(*decode_dirent)(u32 *, struct nfs_entry *, int plus);
 	void	(*read_setup)   (struct nfs_read_data *);
 	void	(*write_setup)  (struct nfs_write_data *, int how);
+	int	(*write_done)  (struct rpc_task *, struct nfs_write_data *);
 	void	(*commit_setup) (struct nfs_write_data *, int how);
+	int	(*commit_done) (struct rpc_task *, struct nfs_write_data *);
 	int	(*file_open)   (struct inode *, struct file *);
 	int	(*file_release) (struct inode *, struct file *);
 	int	(*lock)(struct file *, int, struct file_lock *);
-- 
cgit v1.2.3


From ec06c096edec0755534c7126f4caded69de131c2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:27 -0500
Subject: NFS: Cleanup of NFS read code

Same callback hierarchy inversion as for the NFS write calls. This patch is
not strictly speaking needed by the O_DIRECT code, but avoids confusing
differences between the asynchronous read and write code.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c         | 17 +++++++++++----
 fs/nfs/nfs3proc.c       | 27 +++++------------------
 fs/nfs/nfs4proc.c       | 33 ++++++++--------------------
 fs/nfs/proc.c           | 25 +++++----------------
 fs/nfs/read.c           | 58 ++++++++++++++++++++++++++++++++++---------------
 include/linux/nfs_fs.h  |  4 ++--
 include/linux/nfs_xdr.h |  2 +-
 7 files changed, 77 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index fc07ce4885da..3f87a72bd137 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -218,14 +218,17 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int
  * until the RPCs complete.  This could be long *after* we are woken up in
  * nfs_direct_read_wait (for instance, if someone hits ^C on a slow server).
  */
-static void nfs_direct_read_result(struct nfs_read_data *data, int status)
+static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
+	struct nfs_read_data *data = calldata;
 	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
-	if (likely(status >= 0))
+	if (nfs_readpage_result(task, data) != 0)
+		return;
+	if (likely(task->tk_status >= 0))
 		atomic_add(data->res.count, &dreq->count);
 	else
-		atomic_set(&dreq->error, status);
+		atomic_set(&dreq->error, task->tk_status);
 
 	if (unlikely(atomic_dec_and_test(&dreq->complete))) {
 		nfs_free_user_pages(dreq->pages, dreq->npages, 1);
@@ -234,6 +237,11 @@ static void nfs_direct_read_result(struct nfs_read_data *data, int status)
 	}
 }
 
+static const struct rpc_call_ops nfs_read_direct_ops = {
+	.rpc_call_done = nfs_direct_read_result,
+	.rpc_release = nfs_readdata_release,
+};
+
 /**
  * nfs_direct_read_schedule - dispatch NFS READ operations for a direct read
  * @dreq: address of nfs_direct_req struct for this request
@@ -280,10 +288,11 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq,
 		data->res.eof = 0;
 		data->res.count = bytes;
 
+		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
+				&nfs_read_direct_ops, data);
 		NFS_PROTO(inode)->read_setup(data);
 
 		data->task.tk_cookie = (unsigned long) inode;
-		data->complete = nfs_direct_read_result;
 
 		lock_kernel();
 		rpc_execute(&data->task);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c4f7de8830e9..cf186f0d2b3b 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -811,29 +811,18 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
 
-static void nfs3_read_done(struct rpc_task *task, void *calldata)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
-	struct nfs_read_data *data = calldata;
-
 	if (nfs3_async_handle_jukebox(task, data->inode))
-		return;
+		return -EAGAIN;
 	/* Call back common NFS readpage processing */
 	if (task->tk_status >= 0)
 		nfs_refresh_inode(data->inode, &data->fattr);
-	nfs_readpage_result(task, calldata);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs3_read_ops = {
-	.rpc_call_done = nfs3_read_done,
-	.rpc_release = nfs_readdata_release,
-};
-
-static void
-nfs3_proc_read_setup(struct nfs_read_data *data)
+static void nfs3_proc_read_setup(struct nfs_read_data *data)
 {
-	struct rpc_task		*task = &data->task;
-	struct inode		*inode = data->inode;
-	int			flags;
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_READ],
 		.rpc_argp	= &data->args,
@@ -841,12 +830,7 @@ nfs3_proc_read_setup(struct nfs_read_data *data)
 		.rpc_cred	= data->cred,
 	};
 
-	/* N.B. Do we need to test? Never called for swapfile inode */
-	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
-
-	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_read_ops, data);
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
 static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -935,6 +919,7 @@ struct nfs_rpc_ops	nfs_v3_clientops = {
 	.pathconf	= nfs3_proc_pathconf,
 	.decode_dirent	= nfs3_decode_dirent,
 	.read_setup	= nfs3_proc_read_setup,
+	.read_done	= nfs3_read_done,
 	.write_setup	= nfs3_proc_write_setup,
 	.write_done	= nfs3_write_done,
 	.commit_setup	= nfs3_proc_commit_setup,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ef4dc315ecc2..bad1eae5608a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2345,47 +2345,31 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return err;
 }
 
-static void nfs4_read_done(struct rpc_task *task, void *calldata)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
-	struct nfs_read_data *data = calldata;
-	struct inode *inode = data->inode;
+	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, server) == -EAGAIN) {
 		rpc_restart_call(task);
-		return;
+		return -EAGAIN;
 	}
 	if (task->tk_status > 0)
-		renew_lease(NFS_SERVER(inode), data->timestamp);
-	/* Call back common NFS readpage processing */
-	nfs_readpage_result(task, calldata);
+		renew_lease(server, data->timestamp);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs4_read_ops = {
-	.rpc_call_done = nfs4_read_done,
-	.rpc_release = nfs_readdata_release,
-};
-
-static void
-nfs4_proc_read_setup(struct nfs_read_data *data)
+static void nfs4_proc_read_setup(struct nfs_read_data *data)
 {
-	struct rpc_task	*task = &data->task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ],
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
 		.rpc_cred = data->cred,
 	};
-	struct inode *inode = data->inode;
-	int flags;
 
 	data->timestamp   = jiffies;
 
-	/* N.B. Do we need to test? Never called for swapfile inode */
-	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
-
-	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_read_ops, data);
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
 static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -3617,6 +3601,7 @@ struct nfs_rpc_ops	nfs_v4_clientops = {
 	.pathconf	= nfs4_proc_pathconf,
 	.decode_dirent	= nfs4_decode_dirent,
 	.read_setup	= nfs4_proc_read_setup,
+	.read_done	= nfs4_read_done,
 	.write_setup	= nfs4_proc_write_setup,
 	.write_done	= nfs4_write_done,
 	.commit_setup	= nfs4_proc_commit_setup,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 608aa5932a1d..9dd85cac2df0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -613,10 +613,8 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 
 extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
 
-static void nfs_read_done(struct rpc_task *task, void *calldata)
+static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
-	struct nfs_read_data *data = calldata;
-
 	if (task->tk_status >= 0) {
 		nfs_refresh_inode(data->inode, data->res.fattr);
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
@@ -625,20 +623,11 @@ static void nfs_read_done(struct rpc_task *task, void *calldata)
 		if (data->args.offset + data->args.count >= data->res.fattr->size)
 			data->res.eof = 1;
 	}
-	nfs_readpage_result(task, calldata);
+	return 0;
 }
 
-static const struct rpc_call_ops nfs_read_ops = {
-	.rpc_call_done = nfs_read_done,
-	.rpc_release = nfs_readdata_release,
-};
-
-static void
-nfs_proc_read_setup(struct nfs_read_data *data)
+static void nfs_proc_read_setup(struct nfs_read_data *data)
 {
-	struct rpc_task		*task = &data->task;
-	struct inode		*inode = data->inode;
-	int			flags;
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_READ],
 		.rpc_argp	= &data->args,
@@ -646,12 +635,7 @@ nfs_proc_read_setup(struct nfs_read_data *data)
 		.rpc_cred	= data->cred,
 	};
 
-	/* N.B. Do we need to test? Never called for swapfile inode */
-	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
-
-	/* Finalize the task. */
-	rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_read_ops, data);
-	rpc_call_setup(task, &msg, 0);
+	rpc_call_setup(&data->task, &msg, 0);
 }
 
 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -720,6 +704,7 @@ struct nfs_rpc_ops	nfs_v2_clientops = {
 	.pathconf	= nfs_proc_pathconf,
 	.decode_dirent	= nfs_decode_dirent,
 	.read_setup	= nfs_proc_read_setup,
+	.read_done	= nfs_read_done,
 	.write_setup	= nfs_proc_write_setup,
 	.write_done	= nfs_write_done,
 	.commit_setup	= nfs_proc_commit_setup,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index ae3ddd24cf8f..2da255f0247f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -36,8 +36,8 @@
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
 static int nfs_pagein_one(struct list_head *, struct inode *);
-static void nfs_readpage_result_partial(struct nfs_read_data *, int);
-static void nfs_readpage_result_full(struct nfs_read_data *, int);
+static const struct rpc_call_ops nfs_read_partial_ops;
+static const struct rpc_call_ops nfs_read_full_ops;
 
 static kmem_cache_t *nfs_rdata_cachep;
 mempool_t *nfs_rdata_mempool;
@@ -200,9 +200,11 @@ static void nfs_readpage_release(struct nfs_page *req)
  * Set up the NFS read request struct
  */
 static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+		const struct rpc_call_ops *call_ops,
 		unsigned int count, unsigned int offset)
 {
 	struct inode		*inode;
+	int flags;
 
 	data->req	  = req;
 	data->inode	  = inode = req->wb_context->dentry->d_inode;
@@ -220,6 +222,9 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->res.eof     = 0;
 	nfs_fattr_init(&data->fattr);
 
+	/* Set up the initial task struct. */
+	flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
+	rpc_init_task(&data->task, NFS_CLIENT(inode), flags, call_ops, data);
 	NFS_PROTO(inode)->read_setup(data);
 
 	data->task.tk_cookie = (unsigned long)inode;
@@ -307,14 +312,15 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
 		list_del_init(&data->pages);
 
 		data->pagevec[0] = page;
-		data->complete = nfs_readpage_result_partial;
 
 		if (nbytes > rsize) {
-			nfs_read_rpcsetup(req, data, rsize, offset);
+			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
+					rsize, offset);
 			offset += rsize;
 			nbytes -= rsize;
 		} else {
-			nfs_read_rpcsetup(req, data, nbytes, offset);
+			nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
+					nbytes, offset);
 			nbytes = 0;
 		}
 		nfs_execute_read(data);
@@ -360,8 +366,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
 	}
 	req = nfs_list_entry(data->pages.next);
 
-	data->complete = nfs_readpage_result_full;
-	nfs_read_rpcsetup(req, data, count, 0);
+	nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
 
 	nfs_execute_read(data);
 	return 0;
@@ -395,12 +400,15 @@ nfs_pagein_list(struct list_head *head, int rpages)
 /*
  * Handle a read reply that fills part of a page.
  */
-static void nfs_readpage_result_partial(struct nfs_read_data *data, int status)
+static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
 {
+	struct nfs_read_data *data = calldata;
 	struct nfs_page *req = data->req;
 	struct page *page = req->wb_page;
  
-	if (status >= 0) {
+	if (nfs_readpage_result(task, data) != 0)
+		return;
+	if (task->tk_status >= 0) {
 		unsigned int request = data->args.count;
 		unsigned int result = data->res.count;
 
@@ -419,20 +427,28 @@ static void nfs_readpage_result_partial(struct nfs_read_data *data, int status)
 	}
 }
 
+static const struct rpc_call_ops nfs_read_partial_ops = {
+	.rpc_call_done = nfs_readpage_result_partial,
+	.rpc_release = nfs_readdata_release,
+};
+
 /*
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
-static void nfs_readpage_result_full(struct nfs_read_data *data, int status)
+static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 {
+	struct nfs_read_data *data = calldata;
 	unsigned int count = data->res.count;
 
+	if (nfs_readpage_result(task, data) != 0)
+		return;
 	while (!list_empty(&data->pages)) {
 		struct nfs_page *req = nfs_list_entry(data->pages.next);
 		struct page *page = req->wb_page;
 		nfs_list_remove_request(req);
 
-		if (status >= 0) {
+		if (task->tk_status >= 0) {
 			if (count < PAGE_CACHE_SIZE) {
 				if (count < req->wb_bytes)
 					memclear_highpage_flush(page,
@@ -448,19 +464,27 @@ static void nfs_readpage_result_full(struct nfs_read_data *data, int status)
 	}
 }
 
+static const struct rpc_call_ops nfs_read_full_ops = {
+	.rpc_call_done = nfs_readpage_result_full,
+	.rpc_release = nfs_readdata_release,
+};
+
 /*
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
-void nfs_readpage_result(struct rpc_task *task, void *calldata)
+int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
 {
-	struct nfs_read_data *data = calldata;
 	struct nfs_readargs *argp = &data->args;
 	struct nfs_readres *resp = &data->res;
-	int status = task->tk_status;
+	int status;
 
 	dprintk("NFS: %4d nfs_readpage_result, (status %d)\n",
-		task->tk_pid, status);
+		task->tk_pid, task->tk_status);
+
+	status = NFS_PROTO(data->inode)->read_done(task, data);
+	if (status != 0)
+		return status;
 
 	nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count);
 
@@ -474,14 +498,14 @@ void nfs_readpage_result(struct rpc_task *task, void *calldata)
 			argp->pgbase += resp->count;
 			argp->count -= resp->count;
 			rpc_restart_call(task);
-			return;
+			return -EAGAIN;
 		}
 		task->tk_status = -EIO;
 	}
 	spin_lock(&data->inode->i_lock);
 	NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 	spin_unlock(&data->inode->i_lock);
-	data->complete(data, status);
+	return 0;
 }
 
 /*
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 782e59765696..f55827be4f8e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -492,8 +492,8 @@ static inline void nfs_writedata_free(struct nfs_write_data *p)
 extern int  nfs_readpage(struct file *, struct page *);
 extern int  nfs_readpages(struct file *, struct address_space *,
 		struct list_head *, unsigned);
-extern void nfs_readpage_result(struct rpc_task *, void *);
-extern void  nfs_readdata_release(void *data);
+extern int  nfs_readpage_result(struct rpc_task *, struct nfs_read_data *);
+extern void nfs_readdata_release(void *data);
 
 
 /*
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 277750cc70c0..7fafc4c546b7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -695,7 +695,6 @@ struct nfs_read_data {
 #ifdef CONFIG_NFS_V4
 	unsigned long		timestamp;	/* For lease renewal */
 #endif
-	void (*complete) (struct nfs_read_data *, int);
 	struct page		*page_array[NFS_PAGEVEC_SIZE + 1];
 };
 
@@ -768,6 +767,7 @@ struct nfs_rpc_ops {
 			     struct nfs_pathconf *);
 	u32 *	(*decode_dirent)(u32 *, struct nfs_entry *, int plus);
 	void	(*read_setup)   (struct nfs_read_data *);
+	int	(*read_done)  (struct rpc_task *, struct nfs_read_data *);
 	void	(*write_setup)  (struct nfs_write_data *, int how);
 	int	(*write_done)  (struct rpc_task *, struct nfs_write_data *);
 	void	(*commit_setup) (struct nfs_write_data *, int how);
-- 
cgit v1.2.3


From 462d5b3296b56289efec426499a83faad4c08d9e Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Mon, 20 Mar 2006 13:44:32 -0500
Subject: NFS: make direct write path generate write requests concurrently

Duplicate infrastructure from direct read path that will allow write
path to generate multiple write requests concurrently.  This will
enable us to add support for aio in this path.

Temporarily we will lose the ability to do UNSTABLE writes followed by
a COMMIT in the direct write path.  However, all applications I am
aware of that use NFS O_DIRECT currently write in relatively small
chunks, so this should not be inconvenient in any way.

Test plan:
Millions of fsx-odirect ops. OraSim.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c        | 240 ++++++++++++++++++++++++++++++++-----------------
 fs/nfs/write.c         |   3 +-
 include/linux/nfs_fs.h |   2 +
 3 files changed, 162 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4df21ce28e17..dea3239cdded 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -384,106 +384,185 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	return result;
 }
 
-static ssize_t nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx, unsigned long user_addr, size_t count, loff_t file_offset, struct page **pages, int nr_pages)
+static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
 {
-	const unsigned int wsize = NFS_SERVER(inode)->wsize;
-	size_t request;
-	int curpage, need_commit;
-	ssize_t result, tot_bytes;
-	struct nfs_writeverf first_verf;
-	struct nfs_write_data *wdata;
-
-	wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
-	if (!wdata)
-		return -ENOMEM;
+	struct list_head *list;
+	struct nfs_direct_req *dreq;
+	unsigned int writes = 0;
+	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	wdata->inode = inode;
-	wdata->cred = ctx->cred;
-	wdata->args.fh = NFS_FH(inode);
-	wdata->args.context = ctx;
-	wdata->args.stable = NFS_UNSTABLE;
-	if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize)
-		wdata->args.stable = NFS_FILE_SYNC;
-	wdata->res.fattr = &wdata->fattr;
-	wdata->res.verf = &wdata->verf;
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		return NULL;
+
+	list = &dreq->list;
+	for(;;) {
+		struct nfs_write_data *data = nfs_writedata_alloc(wpages);
+
+		if (unlikely(!data)) {
+			while (!list_empty(list)) {
+				data = list_entry(list->next,
+						  struct nfs_write_data, pages);
+				list_del(&data->pages);
+				nfs_writedata_free(data);
+			}
+			kref_put(&dreq->kref, nfs_direct_req_release);
+			return NULL;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, list);
+
+		data->req = (struct nfs_page *) dreq;
+		writes++;
+		if (nbytes <= wsize)
+			break;
+		nbytes -= wsize;
+	}
+	kref_get(&dreq->kref);
+	atomic_set(&dreq->complete, writes);
+	return dreq;
+}
+
+/*
+ * Collects and returns the final error value/byte-count.
+ */
+static ssize_t nfs_direct_write_wait(struct nfs_direct_req *dreq, int intr)
+{
+	int result = 0;
+
+	if (intr) {
+		result = wait_event_interruptible(dreq->wait,
+					(atomic_read(&dreq->complete) == 0));
+	} else {
+		wait_event(dreq->wait, (atomic_read(&dreq->complete) == 0));
+	}
+
+	if (!result)
+		result = atomic_read(&dreq->error);
+	if (!result)
+		result = atomic_read(&dreq->count);
+
+	kref_put(&dreq->kref, nfs_direct_req_release);
+	return (ssize_t) result;
+}
+
+static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
+{
+	struct nfs_write_data *data = calldata;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+	int status = task->tk_status;
+
+	if (nfs_writeback_done(task, data) != 0)
+		return;
+	/* If the server fell back to an UNSTABLE write, it's an error. */
+	if (unlikely(data->res.verf->committed != NFS_FILE_SYNC))
+		status = -EIO;
+
+	if (likely(status >= 0))
+		atomic_add(data->res.count, &dreq->count);
+	else
+		atomic_set(&dreq->error, status);
+
+	if (unlikely(atomic_dec_and_test(&dreq->complete)))
+		nfs_direct_complete(dreq);
+}
+
+static const struct rpc_call_ops nfs_write_direct_ops = {
+	.rpc_call_done = nfs_direct_write_result,
+	.rpc_release = nfs_writedata_release,
+};
+
+/*
+ * For each nfs_write_data struct that was allocated on the list, dispatch
+ * an NFS WRITE operation
+ *
+ * XXX: For now, support only FILE_SYNC writes.  Later we may add
+ *      support for UNSTABLE + COMMIT.
+ */
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, struct inode *inode, struct nfs_open_context *ctx, unsigned long user_addr, size_t count, loff_t file_offset)
+{
+	struct list_head *list = &dreq->list;
+	struct page **pages = dreq->pages;
+	size_t wsize = NFS_SERVER(inode)->wsize;
+	unsigned int curpage, pgbase;
 
-	nfs_begin_data_update(inode);
-retry:
-	need_commit = 0;
-	tot_bytes = 0;
 	curpage = 0;
-	request = count;
-	wdata->args.pgbase = user_addr & ~PAGE_MASK;
-	wdata->args.offset = file_offset;
+	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		wdata->args.count = request;
-		if (wdata->args.count > wsize)
-			wdata->args.count = wsize;
-		wdata->args.pages = &pages[curpage];
+		struct nfs_write_data *data;
+		size_t bytes;
+
+		bytes = wsize;
+		if (count < wsize)
+			bytes = count;
+
+		data = list_entry(list->next, struct nfs_write_data, pages);
+		list_del_init(&data->pages);
+
+		data->inode = inode;
+		data->cred = ctx->cred;
+		data->args.fh = NFS_FH(inode);
+		data->args.context = ctx;
+		data->args.offset = file_offset;
+		data->args.pgbase = pgbase;
+		data->args.pages = &pages[curpage];
+		data->args.count = bytes;
+		data->res.fattr = &data->fattr;
+		data->res.count = bytes;
+
+		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
+				&nfs_write_direct_ops, data);
+		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
 
-		dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
-			wdata->args.count, (long long) wdata->args.offset,
-			user_addr + tot_bytes, wdata->args.pgbase, curpage);
+		data->task.tk_priority = RPC_PRIORITY_NORMAL;
+		data->task.tk_cookie = (unsigned long) inode;
 
 		lock_kernel();
-		result = NFS_PROTO(inode)->write(wdata);
+		rpc_execute(&data->task);
 		unlock_kernel();
 
-		if (result <= 0) {
-			if (tot_bytes > 0)
-				break;
-			goto out;
-		}
+		dfprintk(VFS, "NFS: %4d initiated direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+				data->task.tk_pid,
+				inode->i_sb->s_id,
+				(long long)NFS_FILEID(inode),
+				bytes,
+				(unsigned long long)data->args.offset);
 
-		if (tot_bytes == 0)
-			memcpy(&first_verf.verifier, &wdata->verf.verifier,
-						sizeof(first_verf.verifier));
-		if (wdata->verf.committed != NFS_FILE_SYNC) {
-			need_commit = 1;
-			if (memcmp(&first_verf.verifier, &wdata->verf.verifier,
-					sizeof(first_verf.verifier)))
-				goto sync_retry;
-		}
+		file_offset += bytes;
+		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
+		pgbase &= ~PAGE_MASK;
 
-		tot_bytes += result;
+		count -= bytes;
+	} while (count != 0);
+}
 
-		/* in case of a short write: stop now, let the app recover */
-		if (result < wdata->args.count)
-			break;
+static ssize_t nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx, unsigned long user_addr, size_t count, loff_t file_offset, struct page **pages, int nr_pages)
+{
+	ssize_t result;
+	sigset_t oldset;
+	struct rpc_clnt *clnt = NFS_CLIENT(inode);
+	struct nfs_direct_req *dreq;
 
-		wdata->args.offset += result;
-		wdata->args.pgbase += result;
-		curpage += wdata->args.pgbase >> PAGE_SHIFT;
-		wdata->args.pgbase &= ~PAGE_MASK;
-		request -= result;
-	} while (request != 0);
+	dreq = nfs_direct_write_alloc(count, NFS_SERVER(inode)->wsize);
+	if (!dreq)
+		return -ENOMEM;
 
-	/*
-	 * Commit data written so far, even in the event of an error
-	 */
-	if (need_commit) {
-		wdata->args.count = tot_bytes;
-		wdata->args.offset = file_offset;
+	dreq->pages = pages;
+	dreq->npages = nr_pages;
 
-		lock_kernel();
-		result = NFS_PROTO(inode)->commit(wdata);
-		unlock_kernel();
+	nfs_begin_data_update(inode);
 
-		if (result < 0 || memcmp(&first_verf.verifier,
-					 &wdata->verf.verifier,
-					 sizeof(first_verf.verifier)) != 0)
-			goto sync_retry;
-	}
-	result = tot_bytes;
+	rpc_clnt_sigmask(clnt, &oldset);
+	nfs_direct_write_schedule(dreq, inode, ctx, user_addr, count,
+				  file_offset);
+	result = nfs_direct_write_wait(dreq, clnt->cl_intr);
+	rpc_clnt_sigunmask(clnt, &oldset);
 
-out:
 	nfs_end_data_update(inode);
-	nfs_writedata_free(wdata);
-	return result;
 
-sync_retry:
-	wdata->args.stable = NFS_FILE_SYNC;
-	goto retry;
+	return result;
 }
 
 /*
@@ -515,7 +594,6 @@ static ssize_t nfs_direct_write(struct inode *inode, struct nfs_open_context *ct
 		nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, size);
 		result = nfs_direct_write_seg(inode, ctx, user_addr, size,
 				file_offset, pages, page_count);
-		nfs_free_user_pages(pages, page_count, 0);
 
 		if (result <= 0) {
 			if (tot_bytes > 0)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5912274ff1a1..875f5b060533 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -77,7 +77,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context*,
 					    struct inode *,
 					    struct page *,
 					    unsigned int, unsigned int);
-static int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 static int nfs_wait_on_write_congestion(struct address_space *, int);
 static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int);
 static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
@@ -1183,7 +1182,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
 /*
  * This function is called when the WRITE call is complete.
  */
-static int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f55827be4f8e..6c130a6b0f4d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -407,6 +407,8 @@ extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
+extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+extern void nfs_writedata_release(void *);
 
 /*
  * Try to write back everything synchronously (but check the
-- 
cgit v1.2.3


From e17b1fc4b35399935f00a635206e183d9292fe4f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:35 -0500
Subject: NFS: Make nfs_commit_alloc() extern

We need to use nfs_commit_alloc() in fs/nfs/direct.c.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c         | 4 ++--
 include/linux/nfs_fs.h | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 875f5b060533..f7c8be0beccf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -91,7 +91,7 @@ static mempool_t *nfs_commit_mempool;
 
 static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
 
-static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
+struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
 {
 	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
 
@@ -112,7 +112,7 @@ static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
 	return p;
 }
 
-static inline void nfs_commit_free(struct nfs_write_data *p)
+void nfs_commit_free(struct nfs_write_data *p)
 {
 	if (p && (p->pagevec != &p->page_array[0]))
 		kfree(p->pagevec);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6c130a6b0f4d..423f202b881c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -410,6 +410,11 @@ extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 extern void nfs_writedata_release(void *);
 
+#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount);
+void nfs_commit_free(struct nfs_write_data *p);
+#endif
+
 /*
  * Try to write back everything synchronously (but check the
  * return value!)
-- 
cgit v1.2.3


From fad61490419b3e494f300e9b2579810ef3bcda31 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:36 -0500
Subject: nfs: Use UNSTABLE + COMMIT for NFS O_DIRECT writes

Currently NFS O_DIRECT writes use FILE_SYNC so that a COMMIT is not
necessary.  This simplifies the internal logic, but this could be a
difficult workload for some servers.

Instead, let's send UNSTABLE writes, and after they all complete, send a
COMMIT for the dirty range.  After the COMMIT returns successfully, then do
the wake_up or fire off aio_complete().

Test plan:
Async direct I/O tests against Solaris (or any server that requires
committed unstable writes).  Reboot server during test.

Based on an earlier patch by Chuck Lever <cel@netapp.com>

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c        | 224 +++++++++++++++++++++++++++++++++++++++++++------
 include/linux/nfs_fs.h |   1 +
 2 files changed, 200 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 737990dd4dfe..f0f2053c7a61 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,11 +69,15 @@ struct nfs_direct_req {
 	struct kref		kref;		/* release manager */
 
 	/* I/O parameters */
-	struct list_head	list;		/* nfs_read/write_data structs */
+	struct list_head	list,		/* nfs_read/write_data structs */
+				rewrite_list;	/* saved nfs_write_data structs */
 	struct file *		filp;		/* file descriptor */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	wait_queue_head_t	wait;		/* wait for i/o completion */
 	struct inode *		inode;		/* target file of i/o */
+	unsigned long		user_addr;	/* location of user's buffer */
+	size_t			user_count;	/* total bytes to move */
+	loff_t			pos;		/* starting offset in file */
 	struct page **		pages;		/* pages in our buffer */
 	unsigned int		npages;		/* count of pages */
 
@@ -82,8 +86,18 @@ struct nfs_direct_req {
 	int			outstanding;	/* i/os we're waiting for */
 	ssize_t			count,		/* bytes actually processed */
 				error;		/* any reported error */
+
+	/* commit state */
+	struct nfs_write_data *	commit_data;	/* special write_data for commits */
+	int			flags;
+#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
+	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -160,11 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 	kref_init(&dreq->kref);
 	init_waitqueue_head(&dreq->wait);
 	INIT_LIST_HEAD(&dreq->list);
+	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	spin_lock_init(&dreq->lock);
 	dreq->outstanding = 0;
 	dreq->count = 0;
 	dreq->error = 0;
+	dreq->flags = 0;
 
 	return dreq;
 }
@@ -299,7 +315,7 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
  * For each nfs_read_data struct that was allocated on the list, dispatch
  * an NFS READ operation
  */
-static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
 {
 	struct file *file = dreq->filp;
 	struct inode *inode = file->f_mapping->host;
@@ -307,11 +323,13 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 							file->private_data;
 	struct list_head *list = &dreq->list;
 	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t rsize = NFS_SERVER(inode)->rsize;
 	unsigned int curpage, pgbase;
 
 	curpage = 0;
-	pgbase = user_addr & ~PAGE_MASK;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_read_data *data;
 		size_t bytes;
@@ -373,6 +391,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!dreq)
 		return -ENOMEM;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
 	dreq->pages = pages;
 	dreq->npages = nr_pages;
 	igrab(inode);
@@ -383,13 +404,137 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_read_schedule(dreq, user_addr, count, pos);
+	nfs_direct_read_schedule(dreq);
 	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
 }
 
+static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
+{
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	while (!list_empty(&dreq->list)) {
+		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
+		list_del(&data->pages);
+		nfs_writedata_release(data);
+	}
+}
+
+#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+{
+	struct list_head *pos;
+
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	list_for_each(pos, &dreq->list)
+		dreq->outstanding++;
+	dreq->count = 0;
+
+	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
+}
+
+static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
+{
+	struct nfs_write_data *data = calldata;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+
+	/* Call the NFS version-specific code */
+	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
+		return;
+	if (unlikely(task->tk_status < 0)) {
+		dreq->error = task->tk_status;
+		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+	}
+	if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
+		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
+		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+	}
+
+	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
+	nfs_direct_write_complete(dreq, data->inode);
+}
+
+static const struct rpc_call_ops nfs_commit_direct_ops = {
+	.rpc_call_done = nfs_direct_commit_result,
+	.rpc_release = nfs_commit_release,
+};
+
+static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
+{
+	struct file *file = dreq->filp;
+	struct nfs_open_context *ctx = (struct nfs_open_context *)
+							file->private_data;
+	struct nfs_write_data *data = dreq->commit_data;
+	struct rpc_task *task = &data->task;
+
+	data->inode = dreq->inode;
+	data->cred = ctx->cred;
+
+	data->args.fh = NFS_FH(data->inode);
+	data->args.offset = dreq->pos;
+	data->args.count = dreq->user_count;
+	data->res.count = 0;
+	data->res.fattr = &data->fattr;
+	data->res.verf = &data->verf;
+
+	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
+				&nfs_commit_direct_ops, data);
+	NFS_PROTO(data->inode)->commit_setup(data, 0);
+
+	data->task.tk_priority = RPC_PRIORITY_NORMAL;
+	data->task.tk_cookie = (unsigned long)data->inode;
+	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
+	dreq->commit_data = NULL;
+
+	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);
+
+	lock_kernel();
+	rpc_execute(&data->task);
+	unlock_kernel();
+}
+
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+{
+	int flags = dreq->flags;
+
+	dreq->flags = 0;
+	switch (flags) {
+		case NFS_ODIRECT_DO_COMMIT:
+			nfs_direct_commit_schedule(dreq);
+			break;
+		case NFS_ODIRECT_RESCHED_WRITES:
+			nfs_direct_write_reschedule(dreq);
+			break;
+		default:
+			nfs_end_data_update(inode);
+			if (dreq->commit_data != NULL)
+				nfs_commit_free(dreq->commit_data);
+			nfs_direct_free_writedata(dreq);
+			nfs_direct_complete(dreq);
+	}
+}
+
+static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+{
+	dreq->commit_data = nfs_commit_alloc(0);
+	if (dreq->commit_data != NULL)
+		dreq->commit_data->req = (struct nfs_page *) dreq;
+}
+#else
+static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
+{
+	dreq->commit_data = NULL;
+}
+
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+{
+	nfs_end_data_update(inode);
+	nfs_direct_free_writedata(dreq);
+	nfs_direct_complete(dreq);
+}
+#endif
+
 static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
 {
 	struct list_head *list;
@@ -424,14 +569,13 @@ static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize
 			break;
 		nbytes -= wsize;
 	}
+
+	nfs_alloc_commit_data(dreq);
+
 	kref_get(&dreq->kref);
 	return dreq;
 }
 
-/*
- * NB: Return the value of the first error return code.  Subsequent
- *     errors after the first one are ignored.
- */
 static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
@@ -440,41 +584,62 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 
 	if (nfs_writeback_done(task, data) != 0)
 		return;
-	/* If the server fell back to an UNSTABLE write, it's an error. */
-	if (unlikely(data->res.verf->committed != NFS_FILE_SYNC))
-		status = -EIO;
 
 	spin_lock(&dreq->lock);
 
 	if (likely(status >= 0))
 		dreq->count += data->res.count;
 	else
-		dreq->error = status;
+		dreq->error = task->tk_status;
 
+	if (data->res.verf->committed != NFS_FILE_SYNC) {
+		switch (dreq->flags) {
+			case 0:
+				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
+				dreq->flags = NFS_ODIRECT_DO_COMMIT;
+				break;
+			case NFS_ODIRECT_DO_COMMIT:
+				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
+					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
+					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+				}
+		}
+	}
+	/* In case we have to resend */
+	data->args.stable = NFS_FILE_SYNC;
+
+	spin_unlock(&dreq->lock);
+}
+
+/*
+ * NB: Return the value of the first error return code.  Subsequent
+ *     errors after the first one are ignored.
+ */
+static void nfs_direct_write_release(void *calldata)
+{
+	struct nfs_write_data *data = calldata;
+	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
+
+	spin_lock(&dreq->lock);
 	if (--dreq->outstanding) {
 		spin_unlock(&dreq->lock);
 		return;
 	}
-
 	spin_unlock(&dreq->lock);
 
-	nfs_end_data_update(data->inode);
-	nfs_direct_complete(dreq);
+	nfs_direct_write_complete(dreq, data->inode);
 }
 
 static const struct rpc_call_ops nfs_write_direct_ops = {
 	.rpc_call_done = nfs_direct_write_result,
-	.rpc_release = nfs_writedata_release,
+	.rpc_release = nfs_direct_write_release,
 };
 
 /*
  * For each nfs_write_data struct that was allocated on the list, dispatch
  * an NFS WRITE operation
- *
- * XXX: For now, support only FILE_SYNC writes.  Later we may add
- *      support for UNSTABLE + COMMIT.
  */
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
 {
 	struct file *file = dreq->filp;
 	struct inode *inode = file->f_mapping->host;
@@ -482,11 +647,13 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 							file->private_data;
 	struct list_head *list = &dreq->list;
 	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int curpage, pgbase;
 
 	curpage = 0;
-	pgbase = user_addr & ~PAGE_MASK;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_write_data *data;
 		size_t bytes;
@@ -496,7 +663,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 			bytes = count;
 
 		data = list_entry(list->next, struct nfs_write_data, pages);
-		list_del_init(&data->pages);
+		list_move_tail(&data->pages, &dreq->rewrite_list);
 
 		data->inode = inode;
 		data->cred = ctx->cred;
@@ -512,7 +679,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 
 		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
 				&nfs_write_direct_ops, data);
-		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
+		NFS_PROTO(inode)->write_setup(data, sync);
 
 		data->task.tk_priority = RPC_PRIORITY_NORMAL;
 		data->task.tk_cookie = (unsigned long) inode;
@@ -544,11 +711,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
+	size_t wsize = NFS_SERVER(inode)->wsize;
+	int sync = 0;
 
-	dreq = nfs_direct_write_alloc(count, NFS_SERVER(inode)->wsize);
+	dreq = nfs_direct_write_alloc(count, wsize);
 	if (!dreq)
 		return -ENOMEM;
+	if (dreq->commit_data == NULL || count < wsize)
+		sync = FLUSH_STABLE;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
 	dreq->pages = pages;
 	dreq->npages = nr_pages;
 	igrab(inode);
@@ -562,7 +736,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_write_schedule(dreq, user_addr, count, pos);
+	nfs_direct_write_schedule(dreq, sync);
 	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 423f202b881c..9f84c8a5ea43 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -422,6 +422,7 @@ void nfs_commit_free(struct nfs_write_data *p);
 extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
+extern void nfs_commit_release(void *wdata);
 #else
 static inline int
 nfs_commit_inode(struct inode *inode, int how)
-- 
cgit v1.2.3


From 3feb2d49394b7874348a6e43c076b780c1d222c5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:37 -0500
Subject: NFS: Uninline nfs_writedata_(alloc|free) and
 nfs_readdata_(alloc|free)

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c          | 32 +++++++++++++++++++++++-
 fs/nfs/write.c         | 32 +++++++++++++++++++++++-
 include/linux/nfs_fs.h | 67 +++-----------------------------------------------
 3 files changed, 66 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2da255f0247f..3961524fd4ab 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -40,10 +40,40 @@ static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
 static kmem_cache_t *nfs_rdata_cachep;
-mempool_t *nfs_rdata_mempool;
+static mempool_t *nfs_rdata_mempool;
 
 #define MIN_POOL_READ	(32)
 
+struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
+{
+	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
+
+	if (p) {
+		memset(p, 0, sizeof(*p));
+		INIT_LIST_HEAD(&p->pages);
+		if (pagecount < NFS_PAGEVEC_SIZE)
+			p->pagevec = &p->page_array[0];
+		else {
+			size_t size = ++pagecount * sizeof(struct page *);
+			p->pagevec = kmalloc(size, GFP_NOFS);
+			if (p->pagevec) {
+				memset(p->pagevec, 0, size);
+			} else {
+				mempool_free(p, nfs_rdata_mempool);
+				p = NULL;
+			}
+		}
+	}
+	return p;
+}
+
+void nfs_readdata_free(struct nfs_read_data *p)
+{
+	if (p && (p->pagevec != &p->page_array[0]))
+		kfree(p->pagevec);
+	mempool_free(p, nfs_rdata_mempool);
+}
+
 void nfs_readdata_release(void *data)
 {
         nfs_readdata_free(data);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f7c8be0beccf..647e3217c2e1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -86,7 +86,7 @@ static const struct rpc_call_ops nfs_write_full_ops;
 static const struct rpc_call_ops nfs_commit_ops;
 
 static kmem_cache_t *nfs_wdata_cachep;
-mempool_t *nfs_wdata_mempool;
+static mempool_t *nfs_wdata_mempool;
 static mempool_t *nfs_commit_mempool;
 
 static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
@@ -119,6 +119,36 @@ void nfs_commit_free(struct nfs_write_data *p)
 	mempool_free(p, nfs_commit_mempool);
 }
 
+struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
+{
+	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
+
+	if (p) {
+		memset(p, 0, sizeof(*p));
+		INIT_LIST_HEAD(&p->pages);
+		if (pagecount < NFS_PAGEVEC_SIZE)
+			p->pagevec = &p->page_array[0];
+		else {
+			size_t size = ++pagecount * sizeof(struct page *);
+			p->pagevec = kmalloc(size, GFP_NOFS);
+			if (p->pagevec) {
+				memset(p->pagevec, 0, size);
+			} else {
+				mempool_free(p, nfs_wdata_mempool);
+				p = NULL;
+			}
+		}
+	}
+	return p;
+}
+
+void nfs_writedata_free(struct nfs_write_data *p)
+{
+	if (p && (p->pagevec != &p->page_array[0]))
+		kfree(p->pagevec);
+	mempool_free(p, nfs_wdata_mempool);
+}
+
 void nfs_writedata_release(void *wdata)
 {
 	nfs_writedata_free(wdata);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 9f84c8a5ea43..55de0770df4a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -462,37 +462,8 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page)
 /*
  * Allocate and free nfs_write_data structures
  */
-extern mempool_t *nfs_wdata_mempool;
-
-static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
-{
-	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
-
-	if (p) {
-		memset(p, 0, sizeof(*p));
-		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
-		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
-				mempool_free(p, nfs_wdata_mempool);
-				p = NULL;
-			}
-		}
-	}
-	return p;
-}
-
-static inline void nfs_writedata_free(struct nfs_write_data *p)
-{
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
-	mempool_free(p, nfs_wdata_mempool);
-}
+extern struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount);
+extern void nfs_writedata_free(struct nfs_write_data *p);
 
 /*
  * linux/fs/nfs/read.c
@@ -503,41 +474,11 @@ extern int  nfs_readpages(struct file *, struct address_space *,
 extern int  nfs_readpage_result(struct rpc_task *, struct nfs_read_data *);
 extern void nfs_readdata_release(void *data);
 
-
 /*
  * Allocate and free nfs_read_data structures
  */
-extern mempool_t *nfs_rdata_mempool;
-
-static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
-{
-	struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
-
-	if (p) {
-		memset(p, 0, sizeof(*p));
-		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
-		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
-				mempool_free(p, nfs_rdata_mempool);
-				p = NULL;
-			}
-		}
-	}
-	return p;
-}
-
-static inline void nfs_readdata_free(struct nfs_read_data *p)
-{
-	if (p && (p->pagevec != &p->page_array[0]))
-		kfree(p->pagevec);
-	mempool_free(p, nfs_rdata_mempool);
-}
+extern struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount);
+extern void nfs_readdata_free(struct nfs_read_data *p);
 
 /*
  * linux/fs/nfs3proc.c
-- 
cgit v1.2.3


From 6849c0cab69f5d1a0fc7b05fa5bfb3dec53f86df Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:39 -0500
Subject: lockd: Add refcounting to struct nlm_block

Otherwise, the block may disappear from underneath us when in
nlmsvc_retry_blocked.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svclock.c          | 94 ++++++++++++++++++++++-----------------------
 include/linux/lockd/lockd.h |  3 +-
 2 files changed, 47 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 1d3a74df93f3..20caeceffb14 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -39,6 +39,7 @@
 #define nlm_deadlock	nlm_lck_denied
 #endif
 
+static void nlmsvc_release_block(struct nlm_block *block);
 static void	nlmsvc_insert_block(struct nlm_block *block, unsigned long);
 static int	nlmsvc_remove_block(struct nlm_block *block);
 
@@ -58,6 +59,7 @@ nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
 	struct nlm_block **bp, *b;
 
 	dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when);
+	kref_get(&block->b_count);
 	if (block->b_queued)
 		nlmsvc_remove_block(block);
 	bp = &nlm_blocked;
@@ -90,6 +92,7 @@ nlmsvc_remove_block(struct nlm_block *block)
 		if (b == block) {
 			*bp = block->b_next;
 			block->b_queued = 0;
+			nlmsvc_release_block(block);
 			return 1;
 		}
 	}
@@ -123,6 +126,7 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock, int remove)
 				*head = block->b_next;
 				block->b_queued = 0;
 			}
+			kref_get(&block->b_count);
 			return block;
 		}
 	}
@@ -155,6 +159,8 @@ nlmsvc_find_block(struct nlm_cookie *cookie,  struct sockaddr_in *sin)
 			break;
 	}
 
+	if (block != NULL)
+		kref_get(&block->b_count);
 	return block;
 }
 
@@ -188,6 +194,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	memset(block, 0, sizeof(*block));
 	locks_init_lock(&block->b_call.a_args.lock.fl);
 	locks_init_lock(&block->b_call.a_res.lock.fl);
+	kref_init(&block->b_count);
 
 	if (!nlmclnt_setgrantargs(&block->b_call, lock))
 		goto failed_free;
@@ -228,27 +235,24 @@ failed:
  * It is the caller's responsibility to check whether the file
  * can be closed hereafter.
  */
-static int
-nlmsvc_delete_block(struct nlm_block *block)
+static int nlmsvc_unlink_block(struct nlm_block *block)
 {
-	struct file_lock	*fl = &block->b_call.a_args.lock.fl;
-	struct nlm_file		*file = block->b_file;
-	struct nlm_block	**bp;
 	int status;
-
-	dprintk("lockd: deleting block %p...\n", block);
+	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
+	status = posix_unblock_lock(block->b_file->f_file, &block->b_call.a_args.lock.fl);
 	nlmsvc_remove_block(block);
-	status = posix_unblock_lock(file->f_file, fl);
+	return status;
+}
 
-	/* If the block is in the middle of a GRANT callback,
-	 * don't kill it yet. */
-	if (block->b_incall) {
-		nlmsvc_insert_block(block, NLM_NEVER);
-		block->b_done = 1;
-		return status;
-	}
+static void nlmsvc_free_block(struct kref *kref)
+{
+	struct nlm_block *block = container_of(kref, struct nlm_block, b_count);
+	struct nlm_file		*file = block->b_file;
+	struct nlm_block	**bp;
+
+	dprintk("lockd: freeing block %p...\n", block);
 
 	/* Remove block from file's list of blocks */
 	for (bp = &file->f_blocks; *bp; bp = &(*bp)->b_fnext) {
@@ -262,7 +266,12 @@ nlmsvc_delete_block(struct nlm_block *block)
 		nlm_release_host(block->b_host);
 	nlmclnt_freegrantargs(&block->b_call);
 	kfree(block);
-	return status;
+}
+
+static void nlmsvc_release_block(struct nlm_block *block)
+{
+	if (block != NULL)
+		kref_put(&block->b_count, nlmsvc_free_block);
 }
 
 /*
@@ -282,7 +291,7 @@ nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
 			block->b_host->h_inuse = 1;
 		else if (action == NLM_ACT_UNLOCK) {
 			if (host == NULL || host == block->b_host)
-				nlmsvc_delete_block(block);
+				nlmsvc_unlink_block(block);
 		}
 	}
 	up(&file->f_sema);
@@ -318,9 +327,9 @@ again:
 	block = nlmsvc_lookup_block(file, lock, 0);
 	if (block == NULL) {
 		if (newblock != NULL)
-			lock = &newblock->b_call.a_args.lock.fl;
+			lock = &newblock->b_call.a_args.lock;
 	} else
-		lock = &block->b_call.a_args.lock.fl;
+		lock = &block->b_call.a_args.lock;
 
 	error = posix_lock_file(file->f_file, &lock->fl);
 	lock->fl.fl_flags &= ~FL_SLEEP;
@@ -363,12 +372,10 @@ again:
 
 	/* Append to list of blocked */
 	nlmsvc_insert_block(newblock, NLM_NEVER);
-	newblock = NULL;
-
 out:
 	up(&file->f_sema);
-	if (newblock != NULL)
-		nlmsvc_delete_block(newblock);
+	nlmsvc_release_block(newblock);
+	nlmsvc_release_block(block);
 	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
 	return ret;
 }
@@ -450,8 +457,10 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 				(long long)lock->fl.fl_end);
 
 	down(&file->f_sema);
-	if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL)
-		status = nlmsvc_delete_block(block);
+	if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL) {
+		status = nlmsvc_unlink_block(block);
+		nlmsvc_release_block(block);
+	}
 	up(&file->f_sema);
 	return status ? nlm_lck_denied : nlm_granted;
 }
@@ -514,7 +523,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 	down(&file->f_sema);
 
 	/* Unlink block request from list */
-	nlmsvc_remove_block(block);
+	nlmsvc_unlink_block(block);
 
 	/* If b_granted is true this means we've been here before.
 	 * Just retry the grant callback, possibly refreshing the RPC
@@ -525,7 +534,6 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 	}
 
 	/* Try the lock operation again */
-	posix_unblock_lock(file->f_file, &lock->fl);
 	lock->fl.fl_flags |= FL_SLEEP;
 	error = posix_lock_file(file->f_file, &lock->fl);
 	lock->fl.fl_flags &= ~FL_SLEEP;
@@ -548,16 +556,15 @@ callback:
 	/* Lock was granted by VFS. */
 	dprintk("lockd: GRANTing blocked lock.\n");
 	block->b_granted = 1;
-	block->b_incall  = 1;
 
 	/* Schedule next grant callback in 30 seconds */
 	nlmsvc_insert_block(block, 30 * HZ);
 
 	/* Call the client */
-	nlm_get_host(block->b_call.a_host);
+	kref_get(&block->b_count);
 	if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG,
 						&nlmsvc_grant_ops) < 0)
-		nlm_release_host(block->b_call.a_host);
+		nlmsvc_release_block(block);
 out_unlock:
 	up(&file->f_sema);
 }
@@ -573,20 +580,10 @@ out_unlock:
 static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 {
 	struct nlm_rqst		*call = data;
-	struct nlm_block	*block;
+	struct nlm_block	*block = container_of(call, struct nlm_block, b_call);
 	unsigned long		timeout;
-	struct sockaddr_in	*peer_addr = RPC_PEERADDR(task->tk_client);
 
 	dprintk("lockd: GRANT_MSG RPC callback\n");
-	dprintk("callback: looking for cookie %s, host (%u.%u.%u.%u)\n",
-		nlmdbg_cookie2a(&call->a_args.cookie),
-		NIPQUAD(peer_addr->sin_addr.s_addr));
-	if (!(block = nlmsvc_find_block(&call->a_args.cookie, peer_addr))) {
-		dprintk("lockd: no block for cookie %s, host (%u.%u.%u.%u)\n",
-			nlmdbg_cookie2a(&call->a_args.cookie),
-			NIPQUAD(peer_addr->sin_addr.s_addr));
-		return;
-	}
 
 	/* Technically, we should down the file semaphore here. Since we
 	 * move the block towards the head of the queue only, no harm
@@ -603,9 +600,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 	}
 	nlmsvc_insert_block(block, timeout);
 	svc_wake_up(block->b_daemon);
-	block->b_incall = 0;
-
-	nlm_release_host(call->a_host);
+	nlmsvc_release_block(block);
 }
 
 static const struct rpc_call_ops nlmsvc_grant_ops = {
@@ -631,20 +626,19 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status
 
 	file->f_count++;
 	down(&file->f_sema);
-	block = nlmsvc_find_block(cookie, &rqstp->rq_addr);
 	if (block) {
 		if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
 			/* Try again in a couple of seconds */
 			nlmsvc_insert_block(block, 10 * HZ);
-			up(&file->f_sema);
 		} else {
 			/* Lock is now held by client, or has been rejected.
 			 * In both cases, the block should be removed. */
-			nlmsvc_delete_block(block);
-			up(&file->f_sema);
+			nlmsvc_unlink_block(block);
 		}
 	}
+	up(&file->f_sema);
 	nlm_release_file(file);
+	nlmsvc_release_block(block);
 }
 
 /*
@@ -667,10 +661,12 @@ nlmsvc_retry_blocked(void)
 			break;
 		dprintk("nlmsvc_retry_blocked(%p, when=%ld, done=%d)\n",
 			block, block->b_when, block->b_done);
+		kref_get(&block->b_count);
 		if (block->b_done)
-			nlmsvc_delete_block(block);
+			nlmsvc_unlink_block(block);
 		else
 			nlmsvc_grant_blocked(block);
+		nlmsvc_release_block(block);
 	}
 
 	if ((block = nlm_blocked) && block->b_when != NLM_NEVER)
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index ef21ed296039..08ab9773f762 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -14,6 +14,7 @@
 #include <linux/config.h>
 #include <linux/in.h>
 #include <linux/fs.h>
+#include <linux/kref.h>
 #include <linux/utsname.h>
 #include <linux/nfsd/nfsfh.h>
 #include <linux/lockd/bind.h>
@@ -110,6 +111,7 @@ struct nlm_file {
  */
 #define NLM_NEVER		(~(unsigned long) 0)
 struct nlm_block {
+	struct kref		b_count;	/* Reference count */
 	struct nlm_block *	b_next;		/* linked list (all blocks) */
 	struct nlm_block *	b_fnext;	/* linked list (per file) */
 	struct nlm_rqst		b_call;		/* RPC args & callback info */
@@ -119,7 +121,6 @@ struct nlm_block {
 	unsigned int		b_id;		/* block id */
 	unsigned char		b_queued;	/* re-queued */
 	unsigned char		b_granted;	/* VFS granted lock */
-	unsigned char		b_incall;	/* doing callback */
 	unsigned char		b_done;		/* callback complete */
 	struct nlm_file *	b_file;		/* file in question */
 };
-- 
cgit v1.2.3


From 5e1abf8cb713a0b94f5a400c7b9b797990cd9dec Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:39 -0500
Subject: lockd: Clean up of the server-side GRANTED code

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         | 43 ----------------------------------
 fs/lockd/svclock.c          | 56 ++++++++++++++++++++++++++++++++++++++++++---
 include/linux/lockd/lockd.h |  2 --
 3 files changed, 53 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index acc3eb13a02b..80ae3127699f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -147,49 +147,6 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
 		fl->fl_ops->fl_release_private(fl);
 }
 
-/*
- * Initialize arguments for GRANTED call. The nlm_rqst structure
- * has been cleared already.
- */
-int
-nlmclnt_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
-{
-	locks_copy_lock(&call->a_args.lock.fl, &lock->fl);
-	memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh));
-	call->a_args.lock.caller = system_utsname.nodename;
-	call->a_args.lock.oh.len = lock->oh.len;
-
-	/* set default data area */
-	call->a_args.lock.oh.data = call->a_owner;
-	call->a_args.lock.svid = lock->fl.fl_pid;
-
-	if (lock->oh.len > NLMCLNT_OHSIZE) {
-		void *data = kmalloc(lock->oh.len, GFP_KERNEL);
-		if (!data) {
-			nlmclnt_freegrantargs(call);
-			return 0;
-		}
-		call->a_args.lock.oh.data = (u8 *) data;
-	}
-
-	memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len);
-	return 1;
-}
-
-void
-nlmclnt_freegrantargs(struct nlm_rqst *call)
-{
-	struct file_lock *fl = &call->a_args.lock.fl;
-	/*
-	 * Check whether we allocated memory for the owner.
-	 */
-	if (call->a_args.lock.oh.data != (u8 *) call->a_owner) {
-		kfree(call->a_args.lock.oh.data);
-	}
-	if (fl->fl_ops && fl->fl_ops->fl_release_private)
-		fl->fl_ops->fl_release_private(fl);
-}
-
 /*
  * This is the main entry point for the NLM client.
  */
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 20caeceffb14..3c7dd956d9c1 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -43,6 +43,8 @@ static void nlmsvc_release_block(struct nlm_block *block);
 static void	nlmsvc_insert_block(struct nlm_block *block, unsigned long);
 static int	nlmsvc_remove_block(struct nlm_block *block);
 
+static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
+static void nlmsvc_freegrantargs(struct nlm_rqst *call);
 static const struct rpc_call_ops nlmsvc_grant_ops;
 
 /*
@@ -196,7 +198,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	locks_init_lock(&block->b_call.a_res.lock.fl);
 	kref_init(&block->b_count);
 
-	if (!nlmclnt_setgrantargs(&block->b_call, lock))
+	if (!nlmsvc_setgrantargs(&block->b_call, lock))
 		goto failed_free;
 
 	/* Set notifier function for VFS, and init args */
@@ -264,7 +266,7 @@ static void nlmsvc_free_block(struct kref *kref)
 
 	if (block->b_host)
 		nlm_release_host(block->b_host);
-	nlmclnt_freegrantargs(&block->b_call);
+	nlmsvc_freegrantargs(&block->b_call);
 	kfree(block);
 }
 
@@ -298,6 +300,49 @@ nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
 	return 0;
 }
 
+/*
+ * Initialize arguments for GRANTED call. The nlm_rqst structure
+ * has been cleared already.
+ */
+static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
+{
+	locks_copy_lock(&call->a_args.lock.fl, &lock->fl);
+	memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh));
+	call->a_args.lock.caller = system_utsname.nodename;
+	call->a_args.lock.oh.len = lock->oh.len;
+
+	/* set default data area */
+	call->a_args.lock.oh.data = call->a_owner;
+	call->a_args.lock.svid = lock->fl.fl_pid;
+
+	if (lock->oh.len > NLMCLNT_OHSIZE) {
+		void *data = kmalloc(lock->oh.len, GFP_KERNEL);
+		if (!data) {
+			nlmsvc_freegrantargs(call);
+			return 0;
+		}
+		call->a_args.lock.oh.data = (u8 *) data;
+	}
+
+	memcpy(call->a_args.lock.oh.data, lock->oh.data, lock->oh.len);
+	return 1;
+}
+
+static void nlmsvc_freegrantargs(struct nlm_rqst *call)
+{
+	struct file_lock *fl = &call->a_args.lock.fl;
+	/*
+	 * Check whether we allocated memory for the owner.
+	 */
+	if (call->a_args.lock.oh.data != (u8 *) call->a_owner) {
+		kfree(call->a_args.lock.oh.data);
+	}
+	if (fl->fl_ops && fl->fl_ops->fl_release_private)
+		fl->fl_ops->fl_release_private(fl);
+	if (fl->fl_lmops && fl->fl_lmops->fl_release_private)
+		fl->fl_lmops->fl_release_private(fl);
+}
+
 /*
  * Attempt to establish a lock, and if it can't be granted, block it
  * if required.
@@ -600,11 +645,16 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 	}
 	nlmsvc_insert_block(block, timeout);
 	svc_wake_up(block->b_daemon);
-	nlmsvc_release_block(block);
+}
+
+void nlmsvc_grant_release(void *data)
+{
+	nlmsvc_release_block(data);
 }
 
 static const struct rpc_call_ops nlmsvc_grant_ops = {
 	.rpc_call_done = nlmsvc_grant_callback,
+	.rpc_release = nlmsvc_grant_release,
 };
 
 /*
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 08ab9773f762..860a93f6ce6d 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -153,8 +153,6 @@ long		  nlmclnt_block(struct nlm_rqst *req, long timeout);
 u32		  nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *);
 void		  nlmclnt_recovery(struct nlm_host *, u32);
 int		  nlmclnt_reclaim(struct nlm_host *, struct file_lock *);
-int		  nlmclnt_setgrantargs(struct nlm_rqst *, struct nlm_lock *);
-void		  nlmclnt_freegrantargs(struct nlm_rqst *);
 
 /*
  * Host cache
-- 
cgit v1.2.3


From 26bcbf965f857c710adafd16cf424f043006b5dd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 20 Mar 2006 13:44:40 -0500
Subject: lockd: stop abusing file_lock_list

Currently lockd directly access the file_lock_list from fs/locks.c.
It does so to mark locks granted or reclaimable.  This is very
suboptimal, because a) lockd needs to poke into locks.c internals, and
b) it needs to iterate over all locks in the system for marking locks
granted or reclaimable.

This patch adds lists for granted and reclaimable locks to the nlm_host
structure instead, and adds locks to those.

nlmclnt_lock:
	now adds the lock to h_granted instead of setting the
	NFS_LCK_GRANTED, still O(1)

nlmclnt_mark_reclaim:
	goes away completely, replaced by a list_splice_init.
	Complexity reduced from O(locks in the system) to O(1)

reclaimer:
	iterates over h_reclaim now, complexity reduced from
	O(locks in the system) to O(locks per nlm_host)

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c         | 54 ++++++++-------------------------------------
 fs/lockd/clntproc.c         | 11 ++++-----
 fs/lockd/host.c             |  2 ++
 fs/locks.c                  |  5 +----
 include/linux/fs.h          |  2 --
 include/linux/lockd/lockd.h |  2 ++
 include/linux/nfs_fs_i.h    |  8 +------
 7 files changed, 21 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8ae79ae4b998..0fc0ee267b04 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -154,34 +154,6 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  * server crash.
  */
 
-/*
- * Mark the locks for reclaiming.
- * FIXME: In 2.5 we don't want to iterate through any global file_lock_list.
- *        Maintain NLM lock reclaiming lists in the nlm_host instead.
- */
-static
-void nlmclnt_mark_reclaim(struct nlm_host *host)
-{
-	struct file_lock *fl;
-	struct inode *inode;
-	struct list_head *tmp;
-
-	list_for_each(tmp, &file_lock_list) {
-		fl = list_entry(tmp, struct file_lock, fl_link);
-
-		inode = fl->fl_file->f_dentry->d_inode;
-		if (inode->i_sb->s_magic != NFS_SUPER_MAGIC)
-			continue;
-		if (fl->fl_u.nfs_fl.owner == NULL)
-			continue;
-		if (fl->fl_u.nfs_fl.owner->host != host)
-			continue;
-		if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_GRANTED))
-			continue;
-		fl->fl_u.nfs_fl.flags |= NFS_LCK_RECLAIM;
-	}
-}
-
 /*
  * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
  * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
@@ -194,7 +166,12 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 	host->h_state++;
 	host->h_nextrebind = 0;
 	nlm_rebind_host(host);
-	nlmclnt_mark_reclaim(host);
+
+	/*
+	 * Mark the locks for reclaiming.
+	 */
+	list_splice_init(&host->h_granted, &host->h_reclaim);
+
 	dprintk("NLM: reclaiming locks for host %s", host->h_name);
 }
 
@@ -223,9 +200,7 @@ reclaimer(void *ptr)
 {
 	struct nlm_host	  *host = (struct nlm_host *) ptr;
 	struct nlm_wait	  *block;
-	struct list_head *tmp;
-	struct file_lock *fl;
-	struct inode *inode;
+	struct file_lock *fl, *next;
 
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
@@ -237,20 +212,9 @@ reclaimer(void *ptr)
 
 	/* First, reclaim all locks that have been marked. */
 restart:
-	list_for_each(tmp, &file_lock_list) {
-		fl = list_entry(tmp, struct file_lock, fl_link);
-
-		inode = fl->fl_file->f_dentry->d_inode;
-		if (inode->i_sb->s_magic != NFS_SUPER_MAGIC)
-			continue;
-		if (fl->fl_u.nfs_fl.owner == NULL)
-			continue;
-		if (fl->fl_u.nfs_fl.owner->host != host)
-			continue;
-		if (!(fl->fl_u.nfs_fl.flags & NFS_LCK_RECLAIM))
-			continue;
+	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
+		list_del(&fl->fl_u.nfs_fl.list);
 
-		fl->fl_u.nfs_fl.flags &= ~NFS_LCK_RECLAIM;
 		nlmclnt_reclaim(host, fl);
 		if (signalled())
 			break;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 80ae3127699f..cb469431bd1d 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -465,7 +465,6 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho
 {
 	BUG_ON(fl->fl_ops != NULL);
 	fl->fl_u.nfs_fl.state = 0;
-	fl->fl_u.nfs_fl.flags = 0;
 	fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner);
 	fl->fl_ops = &nlmclnt_lock_ops;
 }
@@ -552,8 +551,8 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 
 	if (resp->status == NLM_LCK_GRANTED) {
 		fl->fl_u.nfs_fl.state = host->h_state;
-		fl->fl_u.nfs_fl.flags |= NFS_LCK_GRANTED;
 		fl->fl_flags |= FL_SLEEP;
+		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
 		do_vfs_lock(fl);
 	}
 	status = nlm_stat_to_errno(resp->status);
@@ -619,9 +618,11 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	struct nlm_res	*resp = &req->a_res;
 	int		status;
 
-	/* Clean the GRANTED flag now so the lock doesn't get
-	 * reclaimed while we're stuck in the unlock call. */
-	fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED;
+	/*
+	 * Remove from the granted list now so the lock doesn't get
+	 * reclaimed while we're stuck in the unlock call.
+	 */
+	list_del(&fl->fl_u.nfs_fl.list);
 
 	/*
 	 * Note: the server is supposed to either grant us the unlock
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 100e78229700..f456f8ed9acd 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -123,6 +123,8 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	nlm_hosts[hash]    = host;
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
+	INIT_LIST_HEAD(&host->h_granted);
+	INIT_LIST_HEAD(&host->h_reclaim);
 
 	if (++nrhosts > NLM_HOST_MAX)
 		next_gc = 0;
diff --git a/fs/locks.c b/fs/locks.c
index c83b5dbe0ed9..56f996e98bbc 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -139,10 +139,7 @@ int lease_break_time = 45;
 #define for_each_lock(inode, lockp) \
 	for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 
-LIST_HEAD(file_lock_list);
-
-EXPORT_SYMBOL(file_lock_list);
-
+static LIST_HEAD(file_lock_list);
 static LIST_HEAD(blocked_list);
 
 static kmem_cache_t *filelock_cache;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d2cffee8fc11..5dc0fa288a4c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -730,8 +730,6 @@ struct file_lock {
 #define OFFT_OFFSET_MAX	INT_LIMIT(off_t)
 #endif
 
-extern struct list_head file_lock_list;
-
 #include <linux/fcntl.h>
 
 extern int fcntl_getlk(struct file *, struct flock __user *);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 860a93f6ce6d..b0f63b6ab0d4 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -59,6 +59,8 @@ struct nlm_host {
 	unsigned long		h_expires;	/* eligible for GC */
 	struct list_head	h_lockowners;	/* Lockowners for the client */
 	spinlock_t		h_lock;
+	struct list_head	h_granted;	/* Locks in GRANTED state */
+	struct list_head	h_reclaim;	/* Locks in RECLAIM state */
 };
 
 /*
diff --git a/include/linux/nfs_fs_i.h b/include/linux/nfs_fs_i.h
index e2c18dabff86..861730275ba0 100644
--- a/include/linux/nfs_fs_i.h
+++ b/include/linux/nfs_fs_i.h
@@ -12,8 +12,8 @@ struct nlm_lockowner;
  */
 struct nfs_lock_info {
 	u32		state;
-	u32		flags;
 	struct nlm_lockowner *owner;
+	struct list_head list;
 };
 
 struct nfs4_lock_state;
@@ -21,10 +21,4 @@ struct nfs4_lock_info {
 	struct nfs4_lock_state *owner;
 };
 
-/*
- * Lock flag values
- */
-#define NFS_LCK_GRANTED		0x0001		/* lock has been granted */
-#define NFS_LCK_RECLAIM		0x0002		/* lock marked for reclaiming */
-
 #endif
-- 
cgit v1.2.3


From 3a649b884637c4fdff50a6beebc3dc0e6082e048 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:44 -0500
Subject: NLM: Simplify client locks

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c         | 43 ++++++++++++++++---------------------------
 fs/lockd/clntproc.c         | 39 ++++++++++++++++-----------------------
 include/linux/lockd/lockd.h |  7 +++----
 3 files changed, 35 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 7cf41c1e1a88..bce744468708 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -44,32 +44,25 @@ static LIST_HEAD(nlm_blocked);
 /*
  * Queue up a lock for blocking so that the GRANTED request can see it
  */
-int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl)
+struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl)
 {
 	struct nlm_wait *block;
 
-	BUG_ON(req->a_block != NULL);
 	block = kmalloc(sizeof(*block), GFP_KERNEL);
-	if (block == NULL)
-		return -ENOMEM;
-	block->b_host = host;
-	block->b_lock = fl;
-	init_waitqueue_head(&block->b_wait);
-	block->b_status = NLM_LCK_BLOCKED;
-
-	list_add(&block->b_list, &nlm_blocked);
-	req->a_block = block;
-
-	return 0;
+	if (block != NULL) {
+		block->b_host = host;
+		block->b_lock = fl;
+		init_waitqueue_head(&block->b_wait);
+		block->b_status = NLM_LCK_BLOCKED;
+		list_add(&block->b_list, &nlm_blocked);
+	}
+	return block;
 }
 
-void nlmclnt_finish_block(struct nlm_rqst *req)
+void nlmclnt_finish_block(struct nlm_wait *block)
 {
-	struct nlm_wait *block = req->a_block;
-
 	if (block == NULL)
 		return;
-	req->a_block = NULL;
 	list_del(&block->b_list);
 	kfree(block);
 }
@@ -77,15 +70,14 @@ void nlmclnt_finish_block(struct nlm_rqst *req)
 /*
  * Block on a lock
  */
-long nlmclnt_block(struct nlm_rqst *req, long timeout)
+int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 {
-	struct nlm_wait	*block = req->a_block;
 	long ret;
 
 	/* A borken server might ask us to block even if we didn't
 	 * request it. Just say no!
 	 */
-	if (!req->a_args.block)
+	if (block == NULL)
 		return -EAGAIN;
 
 	/* Go to sleep waiting for GRANT callback. Some servers seem
@@ -99,13 +91,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout)
 	ret = wait_event_interruptible_timeout(block->b_wait,
 			block->b_status != NLM_LCK_BLOCKED,
 			timeout);
-
-	if (block->b_status != NLM_LCK_BLOCKED) {
-		req->a_res.status = block->b_status;
-		block->b_status = NLM_LCK_BLOCKED;
-	}
-
-	return ret;
+	if (ret < 0)
+		return -ERESTARTSYS;
+	req->a_res.status = block->b_status;
+	return 0;
 }
 
 /*
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c25044f3b660..8af017105854 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -136,15 +136,14 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 				(unsigned int)fl->fl_u.nfs_fl.owner->pid,
 				system_utsname.nodename);
 	lock->svid = fl->fl_u.nfs_fl.owner->pid;
-	locks_copy_lock(&lock->fl, fl);
+	lock->fl.fl_start = fl->fl_start;
+	lock->fl.fl_end = fl->fl_end;
+	lock->fl.fl_type = fl->fl_type;
 }
 
 static void nlmclnt_release_lockargs(struct nlm_rqst *req)
 {
-	struct file_lock *fl = &req->a_args.lock.fl;
-
-	if (fl->fl_ops && fl->fl_ops->fl_release_private)
-		fl->fl_ops->fl_release_private(fl);
+	BUG_ON(req->a_args.lock.fl.fl_ops != NULL);
 }
 
 /*
@@ -455,7 +454,6 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
 {
 	list_del(&fl->fl_u.nfs_fl.list);
 	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
-	fl->fl_ops = NULL;
 }
 
 static struct file_lock_operations nlmclnt_lock_ops = {
@@ -515,41 +513,36 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 {
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
-	long timeout;
-	int status;
+	struct nlm_wait *block = NULL;
+	int status = -ENOLCK;
 
 	if (!host->h_monitored && nsm_monitor(host) < 0) {
 		printk(KERN_NOTICE "lockd: failed to monitor %s\n",
 					host->h_name);
-		status = -ENOLCK;
 		goto out;
 	}
 
-	if (req->a_args.block) {
-		status = nlmclnt_prepare_block(req, host, fl);
-		if (status < 0)
-			goto out;
-	}
+	block = nlmclnt_prepare_block(host, fl);
 	for(;;) {
 		status = nlmclnt_call(req, NLMPROC_LOCK);
 		if (status < 0)
 			goto out_unblock;
-		if (resp->status != NLM_LCK_BLOCKED)
+		if (!req->a_args.block)
 			break;
-		/* Wait on an NLM blocking lock */
-		timeout = nlmclnt_block(req, NLMCLNT_POLL_TIMEOUT);
 		/* Did a reclaimer thread notify us of a server reboot? */
 		if (resp->status ==  NLM_LCK_DENIED_GRACE_PERIOD)
 			continue;
 		if (resp->status != NLM_LCK_BLOCKED)
 			break;
-		if (timeout >= 0)
-			continue;
-		/* We were interrupted. Send a CANCEL request to the server
+		/* Wait on an NLM blocking lock */
+		status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
+		/* if we were interrupted. Send a CANCEL request to the server
 		 * and exit
 		 */
-		status = (int)timeout;
-		goto out_unblock;
+		if (status < 0)
+			goto out_unblock;
+		if (resp->status != NLM_LCK_BLOCKED)
+			break;
 	}
 
 	if (resp->status == NLM_LCK_GRANTED) {
@@ -560,7 +553,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 	status = nlm_stat_to_errno(resp->status);
 out_unblock:
-	nlmclnt_finish_block(req);
+	nlmclnt_finish_block(block);
 	/* Cancel the blocked request if it is still pending */
 	if (resp->status == NLM_LCK_BLOCKED)
 		nlmclnt_cancel(host, req->a_args.block, fl);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index b0f63b6ab0d4..cb9933d04091 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -86,7 +86,6 @@ struct nlm_rqst {
 	struct nlm_host *	a_host;		/* host handle */
 	struct nlm_args		a_args;		/* arguments */
 	struct nlm_res		a_res;		/* result */
-	struct nlm_wait *	a_block;
 	unsigned int		a_retries;	/* Retry count */
 	char			a_owner[NLMCLNT_OHSIZE];
 };
@@ -149,9 +148,9 @@ extern unsigned long		nlmsvc_timeout;
  * Lockd client functions
  */
 struct nlm_rqst * nlmclnt_alloc_call(void);
-int		  nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl);
-void		  nlmclnt_finish_block(struct nlm_rqst *req);
-long		  nlmclnt_block(struct nlm_rqst *req, long timeout);
+struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl);
+void		  nlmclnt_finish_block(struct nlm_wait *block);
+int		  nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout);
 u32		  nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *);
 void		  nlmclnt_recovery(struct nlm_host *, u32);
 int		  nlmclnt_reclaim(struct nlm_host *, struct file_lock *);
-- 
cgit v1.2.3


From 92737230dd3f1478033819d4bc20339f8da852da Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:45 -0500
Subject: NLM: Add nlmclnt_release_call

Add a helper function to simplify the freeing of NLM client requests.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         | 195 +++++++++++++++++---------------------------
 fs/lockd/svc4proc.c         |  34 ++++----
 fs/lockd/svclock.c          |  66 +++++++--------
 fs/lockd/svcproc.c          |  33 +++-----
 include/linux/lockd/lockd.h |  10 ++-
 5 files changed, 135 insertions(+), 203 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 7a239864b8bf..3f8ad7c54efa 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -152,9 +152,8 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
 int
 nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 {
-	struct nfs_server	*nfssrv = NFS_SERVER(inode);
 	struct nlm_host		*host;
-	struct nlm_rqst		reqst, *call = &reqst;
+	struct nlm_rqst		*call;
 	sigset_t		oldset;
 	unsigned long		flags;
 	int			status, proto, vers;
@@ -168,23 +167,17 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 	/* Retrieve transport protocol from NFS client */
 	proto = NFS_CLIENT(inode)->cl_xprt->prot;
 
-	if (!(host = nlmclnt_lookup_host(NFS_ADDR(inode), proto, vers)))
+	host = nlmclnt_lookup_host(NFS_ADDR(inode), proto, vers);
+	if (host == NULL)
 		return -ENOLCK;
 
-	/* Create RPC client handle if not there, and copy soft
-	 * and intr flags from NFS client. */
-	if (host->h_rpcclnt == NULL) {
-		struct rpc_clnt	*clnt;
+	call = nlm_alloc_call(host);
+	if (call == NULL)
+		return -ENOMEM;
 
-		/* Bind an rpc client to this host handle (does not
-		 * perform a portmapper lookup) */
-		if (!(clnt = nlm_bind_host(host))) {
-			status = -ENOLCK;
-			goto done;
-		}
-		clnt->cl_softrtry = nfssrv->client->cl_softrtry;
-		clnt->cl_intr = nfssrv->client->cl_intr;
-	}
+	nlmclnt_locks_init_private(fl, host);
+	/* Set up the argument struct */
+	nlmclnt_setlockargs(call, fl);
 
 	/* Keep the old signal mask */
 	spin_lock_irqsave(&current->sighand->siglock, flags);
@@ -197,26 +190,10 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 	    && (current->flags & PF_EXITING)) {
 		sigfillset(&current->blocked);	/* Mask all signals */
 		recalc_sigpending();
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
 
-		call = nlmclnt_alloc_call();
-		if (!call) {
-			status = -ENOMEM;
-			goto out_restore;
-		}
 		call->a_flags = RPC_TASK_ASYNC;
-	} else {
-		spin_unlock_irqrestore(&current->sighand->siglock, flags);
-		memset(call, 0, sizeof(*call));
-		locks_init_lock(&call->a_args.lock.fl);
-		locks_init_lock(&call->a_res.lock.fl);
 	}
-	call->a_host = host;
-
-	nlmclnt_locks_init_private(fl, host);
-
-	/* Set up the argument struct */
-	nlmclnt_setlockargs(call, fl);
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
 		if (fl->fl_type != F_UNLCK) {
@@ -229,24 +206,26 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 	else
 		status = -EINVAL;
 
- out_restore:
+	fl->fl_ops->fl_release_private(fl);
+	fl->fl_ops = NULL;
+
 	spin_lock_irqsave(&current->sighand->siglock, flags);
 	current->blocked = oldset;
 	recalc_sigpending();
 	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 
-done:
 	dprintk("lockd: clnt proc returns %d\n", status);
-	nlm_release_host(host);
 	return status;
 }
 EXPORT_SYMBOL(nlmclnt_proc);
 
 /*
  * Allocate an NLM RPC call struct
+ *
+ * Note: the caller must hold a reference to host. In case of failure,
+ * this reference will be released.
  */
-struct nlm_rqst *
-nlmclnt_alloc_call(void)
+struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
 {
 	struct nlm_rqst	*call;
 
@@ -255,16 +234,30 @@ nlmclnt_alloc_call(void)
 		if (call != NULL) {
 			locks_init_lock(&call->a_args.lock.fl);
 			locks_init_lock(&call->a_res.lock.fl);
+			call->a_host = host;
 			return call;
 		}
 		if (signalled())
 			break;
-		printk("nlmclnt_alloc_call: failed, waiting for memory\n");
+		printk("nlm_alloc_call: failed, waiting for memory\n");
 		schedule_timeout_interruptible(5*HZ);
 	}
+	nlm_release_host(host);
 	return NULL;
 }
 
+void nlm_release_call(struct nlm_rqst *call)
+{
+	nlm_release_host(call->a_host);
+	nlmclnt_release_lockargs(call);
+	kfree(call);
+}
+
+static void nlmclnt_rpc_release(void *data)
+{
+	return nlm_release_call(data);
+}
+
 static int nlm_wait_on_grace(wait_queue_head_t *queue)
 {
 	DEFINE_WAIT(wait);
@@ -361,7 +354,7 @@ in_grace_period:
 /*
  * Generic NLM call, async version.
  */
-int nlmsvc_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
 {
 	struct nlm_host	*host = req->a_host;
 	struct rpc_clnt	*clnt;
@@ -369,48 +362,23 @@ int nlmsvc_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops
 		.rpc_argp	= &req->a_args,
 		.rpc_resp	= &req->a_res,
 	};
-	int		status;
-
-	dprintk("lockd: call procedure %d on %s (async)\n",
-			(int)proc, host->h_name);
-
-	/* If we have no RPC client yet, create one. */
-	if ((clnt = nlm_bind_host(host)) == NULL)
-		return -ENOLCK;
-	msg.rpc_proc = &clnt->cl_procinfo[proc];
-
-        /* bootstrap and kick off the async RPC call */
-        status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req);
-
-	return status;
-}
-
-static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
-{
-	struct nlm_host	*host = req->a_host;
-	struct rpc_clnt	*clnt;
-	struct nlm_args	*argp = &req->a_args;
-	struct nlm_res	*resp = &req->a_res;
-	struct rpc_message msg = {
-		.rpc_argp	= argp,
-		.rpc_resp	= resp,
-	};
-	int		status;
+	int status = -ENOLCK;
 
 	dprintk("lockd: call procedure %d on %s (async)\n",
 			(int)proc, host->h_name);
 
 	/* If we have no RPC client yet, create one. */
-	if ((clnt = nlm_bind_host(host)) == NULL)
-		return -ENOLCK;
+	clnt = nlm_bind_host(host);
+	if (clnt == NULL)
+		goto out_err;
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 
-	/* Increment host refcount */
-	nlm_get_host(host);
         /* bootstrap and kick off the async RPC call */
         status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req);
-	if (status < 0)
-		nlm_release_host(host);
+	if (status == 0)
+		return 0;
+out_err:
+	nlm_release_call(req);
 	return status;
 }
 
@@ -423,26 +391,28 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
 	int	status;
 
 	status = nlmclnt_call(req, NLMPROC_TEST);
-	nlmclnt_release_lockargs(req);
 	if (status < 0)
-		return status;
+		goto out;
 
-	status = req->a_res.status;
-	if (status == NLM_LCK_GRANTED) {
-		fl->fl_type = F_UNLCK;
-	} if (status == NLM_LCK_DENIED) {
-		/*
-		 * Report the conflicting lock back to the application.
-		 */
-		fl->fl_start = req->a_res.lock.fl.fl_start;
-		fl->fl_end = req->a_res.lock.fl.fl_start;
-		fl->fl_type = req->a_res.lock.fl.fl_type;
-		fl->fl_pid = 0;
-	} else {
-		return nlm_stat_to_errno(req->a_res.status);
+	switch (req->a_res.status) {
+		case NLM_LCK_GRANTED:
+			fl->fl_type = F_UNLCK;
+			break;
+		case NLM_LCK_DENIED:
+			/*
+			 * Report the conflicting lock back to the application.
+			 */
+			fl->fl_start = req->a_res.lock.fl.fl_start;
+			fl->fl_end = req->a_res.lock.fl.fl_start;
+			fl->fl_type = req->a_res.lock.fl.fl_type;
+			fl->fl_pid = 0;
+			break;
+		default:
+			status = nlm_stat_to_errno(req->a_res.status);
 	}
-
-	return 0;
+out:
+	nlm_release_call(req);
+	return status;
 }
 
 static void nlmclnt_locks_copy_lock(struct file_lock *new, struct file_lock *fl)
@@ -560,7 +530,7 @@ out_unblock:
 	if (resp->status == NLM_LCK_BLOCKED)
 		nlmclnt_cancel(host, req->a_args.block, fl);
 out:
-	nlmclnt_release_lockargs(req);
+	nlm_release_call(req);
 	return status;
 }
 
@@ -623,32 +593,24 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 */
 	do_vfs_lock(fl);
 
-	if (req->a_flags & RPC_TASK_ASYNC) {
-		status = nlmclnt_async_call(req, NLMPROC_UNLOCK,
-					&nlmclnt_unlock_ops);
-		/* Hrmf... Do the unlock early since locks_remove_posix()
-		 * really expects us to free the lock synchronously */
-		if (status < 0) {
-			nlmclnt_release_lockargs(req);
-			kfree(req);
-		}
-		return status;
-	}
+	if (req->a_flags & RPC_TASK_ASYNC)
+		return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
 
 	status = nlmclnt_call(req, NLMPROC_UNLOCK);
-	nlmclnt_release_lockargs(req);
 	if (status < 0)
-		return status;
+		goto out;
 
+	status = 0;
 	if (resp->status == NLM_LCK_GRANTED)
-		return 0;
+		goto out;
 
 	if (resp->status != NLM_LCK_DENIED_NOLOCKS)
 		printk("lockd: unexpected unlock status: %d\n", resp->status);
-
 	/* What to do now? I'm out of my depth... */
-
-	return -ENOLCK;
+	status = -ENOLCK;
+out:
+	nlm_release_call(req);
+	return status;
 }
 
 static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
@@ -670,9 +632,6 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
 	if (status != NLM_LCK_GRANTED)
 		printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status);
 die:
-	nlm_release_host(req->a_host);
-	nlmclnt_release_lockargs(req);
-	kfree(req);
 	return;
  retry_rebind:
 	nlm_rebind_host(req->a_host);
@@ -682,6 +641,7 @@ die:
 
 static const struct rpc_call_ops nlmclnt_unlock_ops = {
 	.rpc_call_done = nlmclnt_unlock_callback,
+	.rpc_release = nlmclnt_rpc_release,
 };
 
 /*
@@ -703,20 +663,15 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
 	recalc_sigpending();
 	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 
-	req = nlmclnt_alloc_call();
+	req = nlm_alloc_call(nlm_get_host(host));
 	if (!req)
 		return -ENOMEM;
-	req->a_host  = host;
 	req->a_flags = RPC_TASK_ASYNC;
 
 	nlmclnt_setlockargs(req, fl);
 	req->a_args.block = block;
 
-	status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
-	if (status < 0) {
-		nlmclnt_release_lockargs(req);
-		kfree(req);
-	}
+	status = nlm_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 
 	spin_lock_irqsave(&current->sighand->siglock, flags);
 	current->blocked = oldset;
@@ -757,9 +712,6 @@ static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
 	}
 
 die:
-	nlm_release_host(req->a_host);
-	nlmclnt_release_lockargs(req);
-	kfree(req);
 	return;
 
 retry_cancel:
@@ -773,6 +725,7 @@ retry_cancel:
 
 static const struct rpc_call_ops nlmclnt_cancel_ops = {
 	.rpc_call_done = nlmclnt_cancel_callback,
+	.rpc_release = nlmclnt_rpc_release,
 };
 
 /*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index ac4a700af01a..cb51c7025825 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -480,43 +480,37 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
 	struct nlm_host	*host;
 	struct nlm_rqst	*call;
 
-	if (!(call = nlmclnt_alloc_call()))
+	host = nlmsvc_lookup_host(rqstp);
+	if (host == NULL)
 		return rpc_system_err;
 
-	host = nlmsvc_lookup_host(rqstp);
-	if (!host) {
-		kfree(call);
+	call = nlm_alloc_call(host);
+	if (call == NULL)
 		return rpc_system_err;
-	}
+
 
 	call->a_flags = RPC_TASK_ASYNC;
-	call->a_host  = host;
 	memcpy(&call->a_args, resp, sizeof(*resp));
 
-	if (nlmsvc_async_call(call, proc, &nlm4svc_callback_ops) < 0)
-		goto error;
-
+	if (nlm_async_call(call, proc, &nlm4svc_callback_ops) < 0)
+		return rpc_system_err;
 	return rpc_success;
- error:
-	kfree(call);
-	nlm_release_host(host);
-	return rpc_system_err;
 }
 
 static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
 {
-	struct nlm_rqst	*call = data;
+	dprintk("lockd: %4d callback returned %d\n", task->tk_pid,
+			-task->tk_status);
+}
 
-	if (task->tk_status < 0) {
-		dprintk("lockd: %4d callback failed (errno = %d)\n",
-					task->tk_pid, -task->tk_status);
-	}
-	nlm_release_host(call->a_host);
-	kfree(call);
+static void nlm4svc_callback_release(void *data)
+{
+	nlm_release_call(data);
 }
 
 static const struct rpc_call_ops nlm4svc_callback_ops = {
 	.rpc_call_done = nlm4svc_callback_exit,
+	.rpc_release = nlm4svc_callback_release,
 };
 
 /*
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index a95d260b7137..185bf7ea1c0c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -117,12 +117,12 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock, int remove)
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end, lock->fl.fl_type);
 	for (head = &nlm_blocked; (block = *head) != 0; head = &block->b_next) {
-		fl = &block->b_call.a_args.lock.fl;
+		fl = &block->b_call->a_args.lock.fl;
 		dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
 				block->b_file, fl->fl_pid,
 				(long long)fl->fl_start,
 				(long long)fl->fl_end, fl->fl_type,
-				nlmdbg_cookie2a(&block->b_call.a_args.cookie));
+				nlmdbg_cookie2a(&block->b_call->a_args.cookie));
 		if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) {
 			if (remove) {
 				*head = block->b_next;
@@ -156,7 +156,7 @@ nlmsvc_find_block(struct nlm_cookie *cookie,  struct sockaddr_in *sin)
 	for (block = nlm_blocked; block; block = block->b_next) {
 		dprintk("cookie: head of blocked queue %p, block %p\n", 
 			nlm_blocked, block);
-		if (nlm_cookie_match(&block->b_call.a_args.cookie,cookie)
+		if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)
 				&& nlm_cmp_addr(sin, &block->b_host->h_addr))
 			break;
 	}
@@ -182,28 +182,30 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 {
 	struct nlm_block	*block;
 	struct nlm_host		*host;
-	struct nlm_rqst		*call;
+	struct nlm_rqst		*call = NULL;
 
 	/* Create host handle for callback */
 	host = nlmsvc_lookup_host(rqstp);
 	if (host == NULL)
 		return NULL;
 
+	call = nlm_alloc_call(host);
+	if (call == NULL)
+		return NULL;
+
 	/* Allocate memory for block, and initialize arguments */
-	if (!(block = (struct nlm_block *) kmalloc(sizeof(*block), GFP_KERNEL)))
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
+	if (block == NULL)
 		goto failed;
-	memset(block, 0, sizeof(*block));
-	locks_init_lock(&block->b_call.a_args.lock.fl);
-	locks_init_lock(&block->b_call.a_res.lock.fl);
 	kref_init(&block->b_count);
 
-	if (!nlmsvc_setgrantargs(&block->b_call, lock))
+	if (!nlmsvc_setgrantargs(call, lock))
 		goto failed_free;
 
 	/* Set notifier function for VFS, and init args */
-	block->b_call.a_args.lock.fl.fl_flags |= FL_SLEEP;
-	block->b_call.a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
-	block->b_call.a_args.cookie = *cookie;	/* see above */
+	call->a_args.lock.fl.fl_flags |= FL_SLEEP;
+	call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
+	call->a_args.cookie = *cookie;	/* see above */
 
 	dprintk("lockd: created block %p...\n", block);
 
@@ -217,16 +219,16 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	file->f_blocks  = block;
 
 	/* Set up RPC arguments for callback */
-	call = &block->b_call;
-	call->a_host    = host;
+	block->b_call = call;
 	call->a_flags   = RPC_TASK_ASYNC;
+	call->a_block = block;
 
 	return block;
 
 failed_free:
 	kfree(block);
 failed:
-	nlm_release_host(host);
+	nlm_release_call(call);
 	return NULL;
 }
 
@@ -242,7 +244,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
 	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
-	status = posix_unblock_lock(block->b_file->f_file, &block->b_call.a_args.lock.fl);
+	status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl);
 	nlmsvc_remove_block(block);
 	return status;
 }
@@ -263,9 +265,8 @@ static void nlmsvc_free_block(struct kref *kref)
 		}
 	}
 
-	if (block->b_host)
-		nlm_release_host(block->b_host);
-	nlmsvc_freegrantargs(&block->b_call);
+	nlmsvc_freegrantargs(block->b_call);
+	nlm_release_call(block->b_call);
 	kfree(block);
 }
 
@@ -316,10 +317,8 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
 
 	if (lock->oh.len > NLMCLNT_OHSIZE) {
 		void *data = kmalloc(lock->oh.len, GFP_KERNEL);
-		if (!data) {
-			nlmsvc_freegrantargs(call);
+		if (!data)
 			return 0;
-		}
 		call->a_args.lock.oh.data = (u8 *) data;
 	}
 
@@ -329,17 +328,8 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
 
 static void nlmsvc_freegrantargs(struct nlm_rqst *call)
 {
-	struct file_lock *fl = &call->a_args.lock.fl;
-	/*
-	 * Check whether we allocated memory for the owner.
-	 */
-	if (call->a_args.lock.oh.data != (u8 *) call->a_owner) {
+	if (call->a_args.lock.oh.data != call->a_owner)
 		kfree(call->a_args.lock.oh.data);
-	}
-	if (fl->fl_ops && fl->fl_ops->fl_release_private)
-		fl->fl_ops->fl_release_private(fl);
-	if (fl->fl_lmops && fl->fl_lmops->fl_release_private)
-		fl->fl_lmops->fl_release_private(fl);
 }
 
 /*
@@ -371,9 +361,9 @@ again:
 	block = nlmsvc_lookup_block(file, lock, 0);
 	if (block == NULL) {
 		if (newblock != NULL)
-			lock = &newblock->b_call.a_args.lock;
+			lock = &newblock->b_call->a_args.lock;
 	} else
-		lock = &block->b_call.a_args.lock;
+		lock = &block->b_call->a_args.lock;
 
 	error = posix_lock_file(file->f_file, &lock->fl);
 	lock->fl.fl_flags &= ~FL_SLEEP;
@@ -523,7 +513,7 @@ nlmsvc_notify_blocked(struct file_lock *fl)
 
 	dprintk("lockd: VFS unblock notification for block %p\n", fl);
 	for (bp = &nlm_blocked; (block = *bp) != 0; bp = &block->b_next) {
-		if (nlm_compare_locks(&block->b_call.a_args.lock.fl, fl)) {
+		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
 			nlmsvc_insert_block(block, 0);
 			svc_wake_up(block->b_daemon);
 			return;
@@ -558,7 +548,7 @@ static void
 nlmsvc_grant_blocked(struct nlm_block *block)
 {
 	struct nlm_file		*file = block->b_file;
-	struct nlm_lock		*lock = &block->b_call.a_args.lock;
+	struct nlm_lock		*lock = &block->b_call->a_args.lock;
 	int			error;
 
 	dprintk("lockd: grant blocked lock %p\n", block);
@@ -606,7 +596,7 @@ callback:
 
 	/* Call the client */
 	kref_get(&block->b_count);
-	if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG,
+	if (nlm_async_call(block->b_call, NLMPROC_GRANTED_MSG,
 						&nlmsvc_grant_ops) < 0)
 		nlmsvc_release_block(block);
 out_unlock:
@@ -624,7 +614,7 @@ out_unlock:
 static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 {
 	struct nlm_rqst		*call = data;
-	struct nlm_block	*block = container_of(call, struct nlm_block, b_call);
+	struct nlm_block	*block = call->a_block;
 	unsigned long		timeout;
 
 	dprintk("lockd: GRANT_MSG RPC callback\n");
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 4986fbe44540..956d1d71e2af 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -505,43 +505,36 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
 	struct nlm_host	*host;
 	struct nlm_rqst	*call;
 
-	if (!(call = nlmclnt_alloc_call()))
+	host = nlmsvc_lookup_host(rqstp);
+	if (host == NULL)
 		return rpc_system_err;
 
-	host = nlmsvc_lookup_host(rqstp);
-	if (!host) {
-		kfree(call);
+	call = nlm_alloc_call(host);
+	if (call == NULL)
 		return rpc_system_err;
-	}
 
 	call->a_flags = RPC_TASK_ASYNC;
-	call->a_host  = host;
 	memcpy(&call->a_args, resp, sizeof(*resp));
 
-	if (nlmsvc_async_call(call, proc, &nlmsvc_callback_ops) < 0)
-		goto error;
-
+	if (nlm_async_call(call, proc, &nlmsvc_callback_ops) < 0)
+		return rpc_system_err;
 	return rpc_success;
- error:
-	nlm_release_host(host);
-	kfree(call);
-	return rpc_system_err;
 }
 
 static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
 {
-	struct nlm_rqst	*call = data;
+	dprintk("lockd: %4d callback returned %d\n", task->tk_pid,
+			-task->tk_status);
+}
 
-	if (task->tk_status < 0) {
-		dprintk("lockd: %4d callback failed (errno = %d)\n",
-					task->tk_pid, -task->tk_status);
-	}
-	nlm_release_host(call->a_host);
-	kfree(call);
+static void nlmsvc_callback_release(void *data)
+{
+	nlm_release_call(data);
 }
 
 static const struct rpc_call_ops nlmsvc_callback_ops = {
 	.rpc_call_done = nlmsvc_callback_exit,
+	.rpc_release = nlmsvc_callback_release,
 };
 
 /*
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index cb9933d04091..e7ba8110d579 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -86,8 +86,9 @@ struct nlm_rqst {
 	struct nlm_host *	a_host;		/* host handle */
 	struct nlm_args		a_args;		/* arguments */
 	struct nlm_res		a_res;		/* result */
+	struct nlm_block *	a_block;
 	unsigned int		a_retries;	/* Retry count */
-	char			a_owner[NLMCLNT_OHSIZE];
+	u8			a_owner[NLMCLNT_OHSIZE];
 };
 
 /*
@@ -115,7 +116,7 @@ struct nlm_block {
 	struct kref		b_count;	/* Reference count */
 	struct nlm_block *	b_next;		/* linked list (all blocks) */
 	struct nlm_block *	b_fnext;	/* linked list (per file) */
-	struct nlm_rqst		b_call;		/* RPC args & callback info */
+	struct nlm_rqst	*	b_call;		/* RPC args & callback info */
 	struct svc_serv *	b_daemon;	/* NLM service */
 	struct nlm_host *	b_host;		/* host handle for RPC clnt */
 	unsigned long		b_when;		/* next re-xmit */
@@ -147,7 +148,9 @@ extern unsigned long		nlmsvc_timeout;
 /*
  * Lockd client functions
  */
-struct nlm_rqst * nlmclnt_alloc_call(void);
+struct nlm_rqst * nlm_alloc_call(struct nlm_host *host);
+void		  nlm_release_call(struct nlm_rqst *);
+int		  nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *);
 struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl);
 void		  nlmclnt_finish_block(struct nlm_wait *block);
 int		  nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout);
@@ -172,7 +175,6 @@ extern struct nlm_host *nlm_find_client(void);
 /*
  * Server-side lock handling
  */
-int		  nlmsvc_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *);
 u32		  nlmsvc_lock(struct svc_rqst *, struct nlm_file *,
 					struct nlm_lock *, int, struct nlm_cookie *);
 u32		  nlmsvc_unlock(struct nlm_file *, struct nlm_lock *);
-- 
cgit v1.2.3


From d47166244860eb5dfdb12ee4703968beef8a0db2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:45 -0500
Subject: lockd: Add helper for *_RES callbacks

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntproc.c         |  27 +++++---
 fs/lockd/svc4proc.c         | 150 ++++++++++++++++----------------------------
 fs/lockd/svcproc.c          | 143 ++++++++++++++++-------------------------
 include/linux/lockd/lockd.h |   1 +
 4 files changed, 129 insertions(+), 192 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 3f8ad7c54efa..f96e38155b5c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -354,14 +354,10 @@ in_grace_period:
 /*
  * Generic NLM call, async version.
  */
-int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops)
 {
 	struct nlm_host	*host = req->a_host;
 	struct rpc_clnt	*clnt;
-	struct rpc_message msg = {
-		.rpc_argp	= &req->a_args,
-		.rpc_resp	= &req->a_res,
-	};
 	int status = -ENOLCK;
 
 	dprintk("lockd: call procedure %d on %s (async)\n",
@@ -371,10 +367,10 @@ int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk
 	clnt = nlm_bind_host(host);
 	if (clnt == NULL)
 		goto out_err;
-	msg.rpc_proc = &clnt->cl_procinfo[proc];
+	msg->rpc_proc = &clnt->cl_procinfo[proc];
 
         /* bootstrap and kick off the async RPC call */
-        status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req);
+        status = rpc_call_async(clnt, msg, RPC_TASK_ASYNC, tk_ops, req);
 	if (status == 0)
 		return 0;
 out_err:
@@ -382,6 +378,23 @@ out_err:
 	return status;
 }
 
+int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_message msg = {
+		.rpc_argp	= &req->a_args,
+		.rpc_resp	= &req->a_res,
+	};
+	return __nlm_async_call(req, proc, &msg, tk_ops);
+}
+
+int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_message msg = {
+		.rpc_argp	= &req->a_res,
+	};
+	return __nlm_async_call(req, proc, &msg, tk_ops);
+}
+
 /*
  * TEST for the presence of a conflicting lock
  */
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index cb51c7025825..a2dd9ccb9b32 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -21,10 +21,6 @@
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
-static u32	nlm4svc_callback(struct svc_rqst *, u32, struct nlm_res *);
-
-static const struct rpc_call_ops nlm4svc_callback_ops;
-
 /*
  * Obtain client and file from arguments
  */
@@ -233,84 +229,90 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return rpc_success;
 }
 
+/*
+ * This is the generic lockd callback for async RPC calls
+ */
+static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
+{
+	dprintk("lockd: %4d callback returned %d\n", task->tk_pid,
+			-task->tk_status);
+}
+
+static void nlm4svc_callback_release(void *data)
+{
+	nlm_release_call(data);
+}
+
+static const struct rpc_call_ops nlm4svc_callback_ops = {
+	.rpc_call_done = nlm4svc_callback_exit,
+	.rpc_release = nlm4svc_callback_release,
+};
+
 /*
  * `Async' versions of the above service routines. They aren't really,
  * because we send the callback before the reply proper. I hope this
  * doesn't break any clients.
  */
-static int
-nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
-					     void	     *resp)
+static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+		int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
 {
-	struct nlm_res	res;
-	u32		stat;
+	struct nlm_host	*host;
+	struct nlm_rqst	*call;
+	int stat;
 
-	dprintk("lockd: TEST_MSG      called\n");
-	memset(&res, 0, sizeof(res));
+	host = nlmsvc_lookup_host(rqstp);
+	if (host == NULL)
+		return rpc_system_err;
+
+	call = nlm_alloc_call(host);
+	if (call == NULL)
+		return rpc_system_err;
+
+	stat = func(rqstp, argp, &call->a_res);
+	if (stat != 0) {
+		nlm_release_call(call);
+		return stat;
+	}
 
-	if ((stat = nlm4svc_proc_test(rqstp, argp, &res)) == 0)
-		stat = nlm4svc_callback(rqstp, NLMPROC_TEST_RES, &res);
-	return stat;
+	call->a_flags = RPC_TASK_ASYNC;
+	if (nlm_async_reply(call, proc, &nlm4svc_callback_ops) < 0)
+		return rpc_system_err;
+	return rpc_success;
 }
 
-static int
-nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static int nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void	     *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
+	dprintk("lockd: TEST_MSG      called\n");
+	return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test);
+}
 
+static int nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void	     *resp)
+{
 	dprintk("lockd: LOCK_MSG      called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlm4svc_proc_lock(rqstp, argp, &res)) == 0)
-		stat = nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, &res);
-	return stat;
+	return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock);
 }
 
-static int
-nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static int nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					       void	       *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
-
 	dprintk("lockd: CANCEL_MSG    called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlm4svc_proc_cancel(rqstp, argp, &res)) == 0)
-		stat = nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, &res);
-	return stat;
+	return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel);
 }
 
-static int
-nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static int nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                void            *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
-
 	dprintk("lockd: UNLOCK_MSG    called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlm4svc_proc_unlock(rqstp, argp, &res)) == 0)
-		stat = nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, &res);
-	return stat;
+	return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock);
 }
 
-static int
-nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                 void            *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
-
 	dprintk("lockd: GRANTED_MSG   called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlm4svc_proc_granted(rqstp, argp, &res)) == 0)
-		stat = nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, &res);
-	return stat;
+	return nlm4svc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlm4svc_proc_granted);
 }
 
 /*
@@ -471,48 +473,6 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
 }
 
 
-/*
- * This is the generic lockd callback for async RPC calls
- */
-static u32
-nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
-{
-	struct nlm_host	*host;
-	struct nlm_rqst	*call;
-
-	host = nlmsvc_lookup_host(rqstp);
-	if (host == NULL)
-		return rpc_system_err;
-
-	call = nlm_alloc_call(host);
-	if (call == NULL)
-		return rpc_system_err;
-
-
-	call->a_flags = RPC_TASK_ASYNC;
-	memcpy(&call->a_args, resp, sizeof(*resp));
-
-	if (nlm_async_call(call, proc, &nlm4svc_callback_ops) < 0)
-		return rpc_system_err;
-	return rpc_success;
-}
-
-static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
-{
-	dprintk("lockd: %4d callback returned %d\n", task->tk_pid,
-			-task->tk_status);
-}
-
-static void nlm4svc_callback_release(void *data)
-{
-	nlm_release_call(data);
-}
-
-static const struct rpc_call_ops nlm4svc_callback_ops = {
-	.rpc_call_done = nlm4svc_callback_exit,
-	.rpc_release = nlm4svc_callback_release,
-};
-
 /*
  * NLM Server procedures.
  */
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 956d1d71e2af..d210cf304e92 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -22,10 +22,6 @@
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
-static u32	nlmsvc_callback(struct svc_rqst *, u32, struct nlm_res *);
-
-static const struct rpc_call_ops nlmsvc_callback_ops;
-
 #ifdef CONFIG_LOCKD_V4
 static u32
 cast_to_nlm(u32 status, u32 vers)
@@ -261,84 +257,92 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return rpc_success;
 }
 
+/*
+ * This is the generic lockd callback for async RPC calls
+ */
+static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
+{
+	dprintk("lockd: %4d callback returned %d\n", task->tk_pid,
+			-task->tk_status);
+}
+
+static void nlmsvc_callback_release(void *data)
+{
+	nlm_release_call(data);
+}
+
+static const struct rpc_call_ops nlmsvc_callback_ops = {
+	.rpc_call_done = nlmsvc_callback_exit,
+	.rpc_release = nlmsvc_callback_release,
+};
+
 /*
  * `Async' versions of the above service routines. They aren't really,
  * because we send the callback before the reply proper. I hope this
  * doesn't break any clients.
  */
-static int
-nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
-					     void	     *resp)
+static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+		int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
 {
-	struct nlm_res	res;
-	u32		stat;
+	struct nlm_host	*host;
+	struct nlm_rqst	*call;
+	int stat;
 
-	dprintk("lockd: TEST_MSG      called\n");
-	memset(&res, 0, sizeof(res));
+	host = nlmsvc_lookup_host(rqstp);
+	if (host == NULL)
+		return rpc_system_err;
+
+	call = nlm_alloc_call(host);
+	if (call == NULL)
+		return rpc_system_err;
+
+	stat = func(rqstp, argp, &call->a_res);
+	if (stat != 0) {
+		nlm_release_call(call);
+		return stat;
+	}
 
-	if ((stat = nlmsvc_proc_test(rqstp, argp, &res)) == 0)
-		stat = nlmsvc_callback(rqstp, NLMPROC_TEST_RES, &res);
-	return stat;
+	call->a_flags = RPC_TASK_ASYNC;
+	if (nlm_async_reply(call, proc, &nlmsvc_callback_ops) < 0)
+		return rpc_system_err;
+	return rpc_success;
 }
 
-static int
-nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static int nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void	     *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
+	dprintk("lockd: TEST_MSG      called\n");
+	return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test);
+}
 
+static int nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+					     void	     *resp)
+{
 	dprintk("lockd: LOCK_MSG      called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlmsvc_proc_lock(rqstp, argp, &res)) == 0)
-		stat = nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, &res);
-	return stat;
+	return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock);
 }
 
-static int
-nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static int nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					       void	       *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
-
 	dprintk("lockd: CANCEL_MSG    called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlmsvc_proc_cancel(rqstp, argp, &res)) == 0)
-		stat = nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, &res);
-	return stat;
+	return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel);
 }
 
 static int
 nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                void            *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
-
 	dprintk("lockd: UNLOCK_MSG    called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlmsvc_proc_unlock(rqstp, argp, &res)) == 0)
-		stat = nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, &res);
-	return stat;
+	return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock);
 }
 
 static int
 nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                 void            *resp)
 {
-	struct nlm_res	res;
-	u32		stat;
-
 	dprintk("lockd: GRANTED_MSG   called\n");
-	memset(&res, 0, sizeof(res));
-
-	if ((stat = nlmsvc_proc_granted(rqstp, argp, &res)) == 0)
-		stat = nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, &res);
-	return stat;
+	return nlmsvc_callback(rqstp, NLMPROC_GRANTED_RES, argp, nlmsvc_proc_granted);
 }
 
 /*
@@ -496,47 +500,6 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
 	return rpc_success;
 }
 
-/*
- * This is the generic lockd callback for async RPC calls
- */
-static u32
-nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp)
-{
-	struct nlm_host	*host;
-	struct nlm_rqst	*call;
-
-	host = nlmsvc_lookup_host(rqstp);
-	if (host == NULL)
-		return rpc_system_err;
-
-	call = nlm_alloc_call(host);
-	if (call == NULL)
-		return rpc_system_err;
-
-	call->a_flags = RPC_TASK_ASYNC;
-	memcpy(&call->a_args, resp, sizeof(*resp));
-
-	if (nlm_async_call(call, proc, &nlmsvc_callback_ops) < 0)
-		return rpc_system_err;
-	return rpc_success;
-}
-
-static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
-{
-	dprintk("lockd: %4d callback returned %d\n", task->tk_pid,
-			-task->tk_status);
-}
-
-static void nlmsvc_callback_release(void *data)
-{
-	nlm_release_call(data);
-}
-
-static const struct rpc_call_ops nlmsvc_callback_ops = {
-	.rpc_call_done = nlmsvc_callback_exit,
-	.rpc_release = nlmsvc_callback_release,
-};
-
 /*
  * NLM Server procedures.
  */
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index e7ba8110d579..a04137d0c5de 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -151,6 +151,7 @@ extern unsigned long		nlmsvc_timeout;
 struct nlm_rqst * nlm_alloc_call(struct nlm_host *host);
 void		  nlm_release_call(struct nlm_rqst *);
 int		  nlm_async_call(struct nlm_rqst *, u32, const struct rpc_call_ops *);
+int		  nlm_async_reply(struct nlm_rqst *, u32, const struct rpc_call_ops *);
 struct nlm_wait * nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl);
 void		  nlmclnt_finish_block(struct nlm_wait *block);
 int		  nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout);
-- 
cgit v1.2.3


From 5428154827c2bf7cfdc9dab60db1e0eaa57c027a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:49 -0500
Subject: SUNRPC: Fix a 'Busy inodes' error in rpc_pipefs

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h        |  1 +
 include/linux/sunrpc/rpc_pipe_fs.h |  2 ++
 net/sunrpc/clnt.c                  | 24 ++++++++++++++++++++----
 net/sunrpc/rpc_pipe.c              | 22 ++++++++++++----------
 4 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index e37c06128e51..8fe9f35eba31 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -60,6 +60,7 @@ struct rpc_clnt {
 	int			cl_nodelen;	/* nodename length */
 	char 			cl_nodename[UNX_MAXNODENAME];
 	char			cl_pathname[30];/* Path in rpc_pipe_fs */
+	struct vfsmount *	cl_vfsmnt;
 	struct dentry *		cl_dentry;	/* inode */
 	struct rpc_clnt *	cl_parent;	/* Points to parent of clones */
 	struct rpc_rtt		cl_rtt_default;
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index 63929349571f..2c2189cb30aa 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -45,6 +45,8 @@ extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *);
 extern int rpc_rmdir(char *);
 extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags);
 extern int rpc_unlink(char *);
+extern struct vfsmount *rpc_get_mount(void);
+extern void rpc_put_mount(void);
 
 #endif
 #endif
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0bb23e8e9d0c..9f775302d1df 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -70,8 +70,15 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 	static uint32_t clntid;
 	int error;
 
+	clnt->cl_vfsmnt = ERR_PTR(-ENOENT);
+	clnt->cl_dentry = ERR_PTR(-ENOENT);
 	if (dir_name == NULL)
 		return 0;
+
+	clnt->cl_vfsmnt = rpc_get_mount();
+	if (IS_ERR(clnt->cl_vfsmnt))
+		return PTR_ERR(clnt->cl_vfsmnt);
+
 	for (;;) {
 		snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname),
 				"%s/clnt%x", dir_name,
@@ -84,6 +91,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 		if (error != -EEXIST) {
 			printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n",
 					clnt->cl_pathname, error);
+			rpc_put_mount();
 			return error;
 		}
 	}
@@ -175,7 +183,11 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 	return clnt;
 
 out_no_auth:
-	rpc_rmdir(clnt->cl_pathname);
+	if (!IS_ERR(clnt->cl_dentry)) {
+		rpc_rmdir(clnt->cl_pathname);
+		dput(clnt->cl_dentry);
+		rpc_put_mount();
+	}
 out_no_path:
 	if (clnt->cl_server != clnt->cl_inline_name)
 		kfree(clnt->cl_server);
@@ -240,13 +252,15 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	new->cl_autobind = 0;
 	new->cl_oneshot = 0;
 	new->cl_dead = 0;
-	dget(new->cl_dentry);
+	if (!IS_ERR(new->cl_dentry)) {
+		dget(new->cl_dentry);
+		rpc_get_mount();
+	}
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
 	new->cl_pmap		= &new->cl_pmap_default;
 	new->cl_metrics         = rpc_alloc_iostats(clnt);
-	rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
 	return new;
 out_no_clnt:
 	printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
@@ -318,8 +332,10 @@ rpc_destroy_client(struct rpc_clnt *clnt)
 out_free:
 	rpc_free_iostats(clnt->cl_metrics);
 	clnt->cl_metrics = NULL;
-	if (clnt->cl_dentry)
+	if (!IS_ERR(clnt->cl_dentry)) {
 		dput(clnt->cl_dentry);
+		rpc_put_mount();
+	}
 	kfree(clnt);
 	return 0;
 }
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 72b22172f0af..391d2bfc71aa 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -435,14 +435,17 @@ static struct rpc_filelist authfiles[] = {
 	},
 };
 
-static int
-rpc_get_mount(void)
+struct vfsmount *rpc_get_mount(void)
 {
-	return simple_pin_fs("rpc_pipefs", &rpc_mount, &rpc_mount_count);
+	int err;
+
+	err = simple_pin_fs("rpc_pipefs", &rpc_mount, &rpc_mount_count);
+	if (err != 0)
+		return ERR_PTR(err);
+	return rpc_mount;
 }
 
-static void
-rpc_put_mount(void)
+void rpc_put_mount(void)
 {
 	simple_release_fs(&rpc_mount, &rpc_mount_count);
 }
@@ -452,12 +455,13 @@ rpc_lookup_parent(char *path, struct nameidata *nd)
 {
 	if (path[0] == '\0')
 		return -ENOENT;
-	if (rpc_get_mount()) {
+	nd->mnt = rpc_get_mount();
+	if (IS_ERR(nd->mnt)) {
 		printk(KERN_WARNING "%s: %s failed to mount "
 			       "pseudofilesystem \n", __FILE__, __FUNCTION__);
-		return -ENODEV;
+		return PTR_ERR(nd->mnt);
 	}
-	nd->mnt = mntget(rpc_mount);
+	mntget(nd->mnt);
 	nd->dentry = dget(rpc_mount->mnt_root);
 	nd->last_type = LAST_ROOT;
 	nd->flags = LOOKUP_PARENT;
@@ -594,7 +598,6 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry)
 	d_instantiate(dentry, inode);
 	dir->i_nlink++;
 	inode_dir_notify(dir, DN_CREATE);
-	rpc_get_mount();
 	return 0;
 out_err:
 	printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n",
@@ -615,7 +618,6 @@ __rpc_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!error) {
 		inode_dir_notify(dir, DN_DELETE);
 		d_drop(dentry);
-		rpc_put_mount();
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From c42de9dd67250fe984e0e31c9b542d721af6454b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 20 Mar 2006 13:44:51 -0500
Subject: NFS: Fix a race in nfs_sync_inode()

Kudos to Neil Brown for spotting the problem:

"in nfs_sync_inode, there is effectively the sequence:

   nfs_wait_on_requests
   nfs_flush_inode
   nfs_commit_inode

 This seems a bit racy to me as if the only requests are on the
 ->commit list, and nfs_commit_inode is called separately after
 nfs_wait_on_requests completes, and before nfs_commit_inode start
 (say: by nfs_write_inode) then none of these function will return
 >0, yet there will be some pending request that aren't waited for."

The solution is to search for requests to wait upon, search for dirty
requests, and search for uncommitted requests while holding the
nfsi->req_lock

The patch also cleans up nfs_sync_inode(), getting rid of the redundant
FLUSH_WAIT flag. It turns out that we were always setting it.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  4 +--
 fs/nfs/write.c         | 72 +++++++++++++++++++++++++++++++++++---------------
 include/linux/nfs_fs.h | 10 +++----
 3 files changed, 56 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a0cda53461b3..60aac58270a8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -137,7 +137,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 static int
 nfs_write_inode(struct inode *inode, int sync)
 {
-	int flags = sync ? FLUSH_WAIT : 0;
+	int flags = sync ? FLUSH_SYNC : 0;
 	int ret;
 
 	ret = nfs_commit_inode(inode, flags);
@@ -1051,7 +1051,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	int err;
 
 	/* Flush out writes to the server in order to update c/mtime */
-	nfs_sync_inode(inode, 0, 0, FLUSH_WAIT|FLUSH_NOCOMMIT);
+	nfs_sync_inode_wait(inode, 0, 0, FLUSH_NOCOMMIT);
 
 	/*
 	 * We may force a getattr if the user cares about atime.
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2beb1268bb1b..3f5225404c97 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -539,8 +539,7 @@ nfs_mark_request_commit(struct nfs_page *req)
  *
  * Interruptible by signals only if mounted with intr flag.
  */
-static int
-nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int npages)
+static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *req;
@@ -553,7 +552,6 @@ nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int
 	else
 		idx_end = idx_start + npages - 1;
 
-	spin_lock(&nfsi->req_lock);
 	next = idx_start;
 	while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) {
 		if (req->wb_index > idx_end)
@@ -566,15 +564,25 @@ nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int
 		spin_unlock(&nfsi->req_lock);
 		error = nfs_wait_on_request(req);
 		nfs_release_request(req);
+		spin_lock(&nfsi->req_lock);
 		if (error < 0)
 			return error;
-		spin_lock(&nfsi->req_lock);
 		res++;
 	}
-	spin_unlock(&nfsi->req_lock);
 	return res;
 }
 
+static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int npages)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret;
+
+	spin_lock(&nfsi->req_lock);
+	ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
+	spin_unlock(&nfsi->req_lock);
+	return ret;
+}
+
 /*
  * nfs_scan_dirty - Scan an inode for dirty requests
  * @inode: NFS inode to scan
@@ -626,6 +634,11 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
 	}
 	return res;
 }
+#else
+static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+{
+	return 0;
+}
 #endif
 
 static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
@@ -1421,6 +1434,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
 	.rpc_call_done = nfs_commit_done,
 	.rpc_release = nfs_commit_release,
 };
+#else
+static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+{
+	return 0;
+}
 #endif
 
 static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
@@ -1460,28 +1478,38 @@ int nfs_commit_inode(struct inode *inode, int how)
 }
 #endif
 
-int nfs_sync_inode(struct inode *inode, unsigned long idx_start,
-		  unsigned int npages, int how)
+int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
+		unsigned int npages, int how)
 {
+	struct nfs_inode *nfsi = NFS_I(inode);
+	LIST_HEAD(head);
 	int nocommit = how & FLUSH_NOCOMMIT;
-	int wait = how & FLUSH_WAIT;
-	int error;
-
-	how &= ~(FLUSH_WAIT|FLUSH_NOCOMMIT);
+	int pages, ret;
 
+	how &= ~FLUSH_NOCOMMIT;
+	spin_lock(&nfsi->req_lock);
 	do {
-		if (wait) {
-			error = nfs_wait_on_requests(inode, idx_start, npages);
-			if (error != 0)
-				continue;
-		}
-		error = nfs_flush_inode(inode, idx_start, npages, how);
-		if (error != 0)
+		ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
+		if (ret != 0)
 			continue;
-		if (!nocommit)
-			error = nfs_commit_inode(inode, how);
-	} while (error > 0);
-	return error;
+		pages = nfs_scan_dirty(inode, &head, idx_start, npages);
+		if (pages != 0) {
+			spin_unlock(&nfsi->req_lock);
+			ret = nfs_flush_list(inode, &head, pages, how);
+			spin_lock(&nfsi->req_lock);
+			continue;
+		}
+		if (nocommit)
+			break;
+		pages = nfs_scan_commit(inode, &head, 0, 0);
+		if (pages == 0)
+			break;
+		spin_unlock(&nfsi->req_lock);
+		ret = nfs_commit_list(inode, &head, how);
+		spin_lock(&nfsi->req_lock);
+	} while (ret >= 0);
+	spin_unlock(&nfsi->req_lock);
+	return ret;
 }
 
 int nfs_init_writepagecache(void)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 55de0770df4a..cbebd7d1b9e8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -56,9 +56,7 @@
  * When flushing a cluster of dirty pages, there can be different
  * strategies:
  */
-#define FLUSH_AGING		0	/* only flush old buffers */
 #define FLUSH_SYNC		1	/* file being synced, or contention */
-#define FLUSH_WAIT		2	/* wait for completion */
 #define FLUSH_STABLE		4	/* commit to stable storage */
 #define FLUSH_LOWPRI		8	/* low priority background flush */
 #define FLUSH_HIGHPRI		16	/* high priority memory reclaim flush */
@@ -419,7 +417,7 @@ void nfs_commit_free(struct nfs_write_data *p);
  * Try to write back everything synchronously (but check the
  * return value!)
  */
-extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
+extern int  nfs_sync_inode_wait(struct inode *, unsigned long, unsigned int, int);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 extern int  nfs_commit_inode(struct inode *, int);
 extern void nfs_commit_release(void *wdata);
@@ -440,7 +438,7 @@ nfs_have_writebacks(struct inode *inode)
 static inline int
 nfs_wb_all(struct inode *inode)
 {
-	int error = nfs_sync_inode(inode, 0, 0, FLUSH_WAIT);
+	int error = nfs_sync_inode_wait(inode, 0, 0, 0);
 	return (error < 0) ? error : 0;
 }
 
@@ -449,8 +447,8 @@ nfs_wb_all(struct inode *inode)
  */
 static inline int nfs_wb_page_priority(struct inode *inode, struct page* page, int how)
 {
-	int error = nfs_sync_inode(inode, page->index, 1,
-			how | FLUSH_WAIT | FLUSH_STABLE);
+	int error = nfs_sync_inode_wait(inode, page->index, 1,
+			how | FLUSH_STABLE);
 	return (error < 0) ? error : 0;
 }
 
-- 
cgit v1.2.3


From b63862f46547487388e582e8ac9083830d34f058 Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Thu, 3 Nov 2005 15:41:46 +0000
Subject: [PATCH] Filter rule comparators

Currently, audit only supports the "=" and "!=" operators in the -F
filter rules.

This patch reworks the support for "=" and "!=", and adds support
for ">", ">=", "<", and "<=".

This turned out to be a pretty clean, and simply process.  I ended up
using the high order bits of the "field", as suggested by Steve and Amy.
This allowed for no changes whatsoever to the netlink communications.
See the documentation within the patch in the include/linux/audit.h
area, where there is a table that explains the reasoning of the bitmask
assignments clearly.

The patch adds a new function, audit_comparator(left, op, right).
This function will perform the specified comparison (op, which defaults
to "==" for backward compatibility) between two values (left and right).
If the negate bit is on, it will negate whatever that result was.  This
value is returned.

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h |  29 ++++++++++++-
 kernel/auditsc.c      | 117 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 103 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index da3c01955f3d..2408cb77899c 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -98,6 +98,13 @@
 #define AUDIT_WORD(nr) ((__u32)((nr)/32))
 #define AUDIT_BIT(nr)  (1 << ((nr) - AUDIT_WORD(nr)*32))
 
+/* This bitmask is used to validate user input.  It represents all bits that
+ * are currently used in an audit field constant understood by the kernel.
+ * If you are adding a new #define AUDIT_<whatever>, please ensure that
+ * AUDIT_UNUSED_BITS is updated if need be. */
+#define AUDIT_UNUSED_BITS	0x0FFFFC00
+
+
 /* Rule fields */
 				/* These are useful when checking the
 				 * task structure at task creation time
@@ -128,8 +135,28 @@
 #define AUDIT_ARG2      (AUDIT_ARG0+2)
 #define AUDIT_ARG3      (AUDIT_ARG0+3)
 
-#define AUDIT_NEGATE    0x80000000
+#define AUDIT_NEGATE			0x80000000
 
+/* These are the supported operators.
+ *	4  2  1
+ *	=  >  <
+ *	-------
+ *	0  0  0		0	nonsense
+ *	0  0  1		1	<
+ *	0  1  0		2	>
+ *	0  1  1		3	!=
+ *	1  0  0		4	=
+ *	1  0  1		5	<=
+ *	1  1  0		6	>=
+ *	1  1  1		7	all operators
+ */
+#define AUDIT_LESS_THAN			0x10000000
+#define AUDIT_GREATER_THAN		0x20000000
+#define AUDIT_NOT_EQUAL			0x30000000
+#define AUDIT_EQUAL			0x40000000
+#define AUDIT_LESS_THAN_OR_EQUAL	(AUDIT_LESS_THAN|AUDIT_EQUAL)
+#define AUDIT_GREATER_THAN_OR_EQUAL	(AUDIT_GREATER_THAN|AUDIT_EQUAL)
+#define AUDIT_OPERATORS			(AUDIT_EQUAL|AUDIT_NOT_EQUAL)
 
 /* Status symbols */
 				/* Mask values */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 51a4f58a4d81..95076fa12202 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,7 @@
  * Handles all system-call specific auditing features.
  *
  * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright (C) 2005 IBM Corporation
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -27,6 +28,9 @@
  * this file -- see entry.S) is based on a GPL'd patch written by
  * okir@suse.de and Copyright 2003 SuSE Linux AG.
  *
+ * The support of additional filter rules compares (>, <, >=, <=) was
+ * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
+ *
  */
 
 #include <linux/init.h>
@@ -252,6 +256,7 @@ static inline int audit_add_rule(struct audit_rule *rule,
 				  struct list_head *list)
 {
 	struct audit_entry  *entry;
+	int i;
 
 	/* Do not use the _rcu iterator here, since this is the only
 	 * addition routine. */
@@ -261,6 +266,16 @@ static inline int audit_add_rule(struct audit_rule *rule,
 		}
 	}
 
+	for (i = 0; i < rule->field_count; i++) {
+		if (rule->fields[i] & AUDIT_UNUSED_BITS)
+			return -EINVAL;
+		if ( rule->fields[i] & AUDIT_NEGATE )
+			rule->fields[i] |= AUDIT_NOT_EQUAL;
+		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
+			rule->fields[i] |= AUDIT_EQUAL;
+		rule->fields[i] &= (~AUDIT_NEGATE);
+	}
+
 	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
 		return -ENOMEM;
 	if (audit_copy_rule(&entry->rule, rule)) {
@@ -394,6 +409,26 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 	return err;
 }
 
+static int audit_comparator(const u32 left, const u32 op, const u32 right)
+{
+	switch (op) {
+	case AUDIT_EQUAL:
+		return (left == right);
+	case AUDIT_NOT_EQUAL:
+		return (left != right);
+	case AUDIT_LESS_THAN:
+		return (left < right);
+	case AUDIT_LESS_THAN_OR_EQUAL:
+		return (left <= right);
+	case AUDIT_GREATER_THAN:
+		return (left > right);
+	case AUDIT_GREATER_THAN_OR_EQUAL:
+		return (left >= right);
+	default:
+		return -EINVAL;
+	}
+}
+
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
 static int audit_filter_rules(struct task_struct *tsk,
@@ -404,62 +439,63 @@ static int audit_filter_rules(struct task_struct *tsk,
 	int i, j;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
 		u32 value  = rule->values[i];
 		int result = 0;
 
 		switch (field) {
 		case AUDIT_PID:
-			result = (tsk->pid == value);
+			result = audit_comparator(tsk->pid, op, value);
 			break;
 		case AUDIT_UID:
-			result = (tsk->uid == value);
+			result = audit_comparator(tsk->uid, op, value);
 			break;
 		case AUDIT_EUID:
-			result = (tsk->euid == value);
+			result = audit_comparator(tsk->euid, op, value);
 			break;
 		case AUDIT_SUID:
-			result = (tsk->suid == value);
+			result = audit_comparator(tsk->suid, op, value);
 			break;
 		case AUDIT_FSUID:
-			result = (tsk->fsuid == value);
+			result = audit_comparator(tsk->fsuid, op, value);
 			break;
 		case AUDIT_GID:
-			result = (tsk->gid == value);
+			result = audit_comparator(tsk->gid, op, value);
 			break;
 		case AUDIT_EGID:
-			result = (tsk->egid == value);
+			result = audit_comparator(tsk->egid, op, value);
 			break;
 		case AUDIT_SGID:
-			result = (tsk->sgid == value);
+			result = audit_comparator(tsk->sgid, op, value);
 			break;
 		case AUDIT_FSGID:
-			result = (tsk->fsgid == value);
+			result = audit_comparator(tsk->fsgid, op, value);
 			break;
 		case AUDIT_PERS:
-			result = (tsk->personality == value);
+			result = audit_comparator(tsk->personality, op, value);
 			break;
 		case AUDIT_ARCH:
-			if (ctx) 
-				result = (ctx->arch == value);
+ 			if (ctx)
+				result = audit_comparator(ctx->arch, op, value);
 			break;
 
 		case AUDIT_EXIT:
 			if (ctx && ctx->return_valid)
-				result = (ctx->return_code == value);
+				result = audit_comparator(ctx->return_code, op, value);
 			break;
 		case AUDIT_SUCCESS:
 			if (ctx && ctx->return_valid) {
 				if (value)
-					result = (ctx->return_valid == AUDITSC_SUCCESS);
+					result = audit_comparator(ctx->return_valid, op, AUDITSC_SUCCESS);
 				else
-					result = (ctx->return_valid == AUDITSC_FAILURE);
+					result = audit_comparator(ctx->return_valid, op, AUDITSC_FAILURE);
 			}
 			break;
 		case AUDIT_DEVMAJOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (MAJOR(ctx->names[j].dev)==value) {
+					if (audit_comparator(MAJOR(ctx->names[j].dev),	op, value)) {
 						++result;
 						break;
 					}
@@ -469,7 +505,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_DEVMINOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (MINOR(ctx->names[j].dev)==value) {
+					if (audit_comparator(MINOR(ctx->names[j].dev), op, value)) {
 						++result;
 						break;
 					}
@@ -479,7 +515,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (ctx->names[j].ino == value) {
+					if (audit_comparator(ctx->names[j].ino, op, value)) {
 						++result;
 						break;
 					}
@@ -489,19 +525,17 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_LOGINUID:
 			result = 0;
 			if (ctx)
-				result = (ctx->loginuid == value);
+				result = audit_comparator(ctx->loginuid, op, value);
 			break;
 		case AUDIT_ARG0:
 		case AUDIT_ARG1:
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
 			if (ctx)
-				result = (ctx->argv[field-AUDIT_ARG0]==value);
+				result = audit_comparator(ctx->argv[field-AUDIT_ARG0], op, value);
 			break;
 		}
 
-		if (rule->fields[i] & AUDIT_NEGATE)
-			result = !result;
 		if (!result)
 			return 0;
 	}
@@ -550,49 +584,48 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 
 	rcu_read_lock();
 	if (!list_empty(list)) {
-		    int word = AUDIT_WORD(ctx->major);
-		    int bit  = AUDIT_BIT(ctx->major);
-
-		    list_for_each_entry_rcu(e, list, list) {
-			    if ((e->rule.mask[word] & bit) == bit
-				&& audit_filter_rules(tsk, &e->rule, ctx, &state)) {
-				    rcu_read_unlock();
-				    return state;
-			    }
-		    }
+		int word = AUDIT_WORD(ctx->major);
+		int bit  = AUDIT_BIT(ctx->major);
+
+		list_for_each_entry_rcu(e, list, list) {
+			if ((e->rule.mask[word] & bit) == bit
+					&& audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+				rcu_read_unlock();
+				return state;
+			}
+		}
 	}
 	rcu_read_unlock();
 	return AUDIT_BUILD_CONTEXT;
 }
 
 static int audit_filter_user_rules(struct netlink_skb_parms *cb,
-			      struct audit_rule *rule,
-			      enum audit_state *state)
+				   struct audit_rule *rule,
+				   enum audit_state *state)
 {
 	int i;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
 		u32 value  = rule->values[i];
 		int result = 0;
 
 		switch (field) {
 		case AUDIT_PID:
-			result = (cb->creds.pid == value);
+			result = audit_comparator(cb->creds.pid, op, value);
 			break;
 		case AUDIT_UID:
-			result = (cb->creds.uid == value);
+			result = audit_comparator(cb->creds.uid, op, value);
 			break;
 		case AUDIT_GID:
-			result = (cb->creds.gid == value);
+			result = audit_comparator(cb->creds.gid, op, value);
 			break;
 		case AUDIT_LOGINUID:
-			result = (cb->loginuid == value);
+			result = audit_comparator(cb->loginuid, op, value);
 			break;
 		}
 
-		if (rule->fields[i] & AUDIT_NEGATE)
-			result = !result;
 		if (!result)
 			return 0;
 	}
-- 
cgit v1.2.3


From 90d526c074ae5db484388da56c399acf892b6c17 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Thu, 3 Nov 2005 15:48:08 +0000
Subject: [PATCH] Define new range of userspace messages.

The attached patch updates various items for the new user space
messages. Please apply.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h       | 19 +++++++++++++++----
 kernel/audit.c              |  2 ++
 security/selinux/nlmsgtab.c |  6 ++++--
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 2408cb77899c..fd65078e794a 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -33,11 +33,20 @@
  * 1200 - 1299 messages internal to the audit daemon
  * 1300 - 1399 audit event messages
  * 1400 - 1499 SE Linux use
- * 1500 - 1999 future use
- * 2000 is for otherwise unclassified kernel audit messages
+ * 1500 - 1599 kernel LSPP events
+ * 1600 - 1699 kernel crypto events
+ * 1700 - 1999 future kernel use (maybe integrity labels and related events)
+ * 2000 is for otherwise unclassified kernel audit messages (legacy)
+ * 2001 - 2099 unused (kernel)
+ * 2100 - 2199 user space anomaly records
+ * 2200 - 2299 user space actions taken in response to anomalies
+ * 2300 - 2399 user space generated LSPP events
+ * 2400 - 2499 user space crypto events
+ * 2500 - 2999 future user space (maybe integrity labels and related events)
  *
- * Messages from 1000-1199 are bi-directional. 1200-1299 are exclusively user
- * space. Anything over that is kernel --> user space communication.
+ * Messages from 1000-1199 are bi-directional. 1200-1299 & 2100 - 2999 are
+ * exclusively user space. 1300-2099 is kernel --> user space 
+ * communication.
  */
 #define AUDIT_GET		1000	/* Get status */
 #define AUDIT_SET		1001	/* Set status (enable/disable/auditd) */
@@ -54,6 +63,8 @@
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
 #define AUDIT_LAST_USER_MSG	1199
+#define AUDIT_FIRST_USER_MSG2	2100	/* More user space messages */
+#define AUDIT_LAST_USER_MSG2	2999
  
 #define AUDIT_DAEMON_START      1200    /* Daemon startup record */
 #define AUDIT_DAEMON_END        1201    /* Daemon normal stop record */
diff --git a/kernel/audit.c b/kernel/audit.c
index 973ca5a9e0d6..6d61dd79a605 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -369,6 +369,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
 		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
@@ -449,6 +450,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
 		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 			return 0;
 
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 69b9329b2054..d7c0e912c5f3 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -145,8 +145,10 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 		break;
 
 	case SECCLASS_NETLINK_AUDIT_SOCKET:
-		if (nlmsg_type >= AUDIT_FIRST_USER_MSG &&
-		    nlmsg_type <= AUDIT_LAST_USER_MSG) {
+		if ((nlmsg_type >= AUDIT_FIRST_USER_MSG &&
+		     nlmsg_type <= AUDIT_LAST_USER_MSG) ||
+		    (nlmsg_type >= AUDIT_FIRST_USER_MSG2 &&
+                     nlmsg_type <= AUDIT_LAST_USER_MSG2)) {
 			*perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY;
 		} else {
 			err = nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
-- 
cgit v1.2.3


From f38aa94224c5517a40ba56d453779f70d3229803 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 3 Nov 2005 15:57:06 +0000
Subject: [PATCH] Pass dentry, not just name, in fsnotify creation hooks.

The audit hooks (to be added shortly) will want to see dentry->d_inode
too, not just the name.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/namei.c               | 10 +++++-----
 include/linux/fsnotify.h |  9 +++++----
 kernel/auditsc.c         |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 8dc2b038d5d9..f6619af9e957 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1472,7 +1472,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	DQUOT_INIT(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error)
-		fsnotify_create(dir, dentry->d_name.name);
+		fsnotify_create(dir, dentry);
 	return error;
 }
 
@@ -1793,7 +1793,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	DQUOT_INIT(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
 	if (!error)
-		fsnotify_create(dir, dentry->d_name.name);
+		fsnotify_create(dir, dentry);
 	return error;
 }
 
@@ -1870,7 +1870,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DQUOT_INIT(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error)
-		fsnotify_mkdir(dir, dentry->d_name.name);
+		fsnotify_mkdir(dir, dentry);
 	return error;
 }
 
@@ -2133,7 +2133,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 	DQUOT_INIT(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
 	if (!error)
-		fsnotify_create(dir, dentry->d_name.name);
+		fsnotify_create(dir, dentry);
 	return error;
 }
 
@@ -2210,7 +2210,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&old_dentry->d_inode->i_mutex);
 	if (!error)
-		fsnotify_create(dir, new_dentry->d_name.name);
+		fsnotify_create(dir, new_dentry);
 	return error;
 }
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 03b8e7932b83..b5ff64d2f092 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -70,19 +70,20 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 /*
  * fsnotify_create - 'name' was linked in
  */
-static inline void fsnotify_create(struct inode *inode, const char *name)
+static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
-	inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name);
 }
 
 /*
  * fsnotify_mkdir - directory 'name' was created
  */
-static inline void fsnotify_mkdir(struct inode *inode, const char *name)
+static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
-	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
+				  dentry->d_name.name);
 }
 
 /*
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 95076fa12202..55ba331757c5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -515,7 +515,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(ctx->names[j].ino, op, value)) {
+					if ( audit_comparator(ctx->names[j].ino, op, value)) {
 						++result;
 						break;
 					}
-- 
cgit v1.2.3


From 73241ccca0f7786933f1d31b3d86f2456549953a Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 3 Nov 2005 16:00:25 +0000
Subject: [PATCH] Collect more inode information during syscall processing.

This patch augments the collection of inode info during syscall
processing. It represents part of the functionality that was provided
by the auditfs patch included in RHEL4.

Specifically, it:

- Collects information for target inodes created or removed during
  syscalls.  Previous code only collects information for the target
  inode's parent.

- Adds the audit_inode() hook to syscalls that operate on a file
  descriptor (e.g. fchown), enabling audit to do inode filtering for
  these calls.

- Modifies filtering code to check audit context for either an inode #
  or a parent inode # matching a given rule.

- Modifies logging to provide inode # for both parent and child.

- Protect debug info from NULL audit_names.name.

[AV: folded a later typo fix from the same author]

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               |   1 +
 fs/open.c                |   8 ++-
 fs/xattr.c               |  11 +++-
 include/linux/audit.h    |  18 +++++-
 include/linux/fsnotify.h |   5 ++
 kernel/auditsc.c         | 142 +++++++++++++++++++++++++++++++++++++++--------
 6 files changed, 157 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index f6619af9e957..51cfc9c3ed00 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1353,6 +1353,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return -ENOENT;
 
 	BUG_ON(victim->d_parent->d_inode != dir);
+	audit_inode_child(victim->d_name.name, victim->d_inode, dir->i_ino);
 
 	error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
 	if (error)
diff --git a/fs/open.c b/fs/open.c
index 70e0230d8e77..70510004d06e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/rcupdate.h>
+#include <linux/audit.h>
 
 #include <asm/unistd.h>
 
@@ -626,6 +627,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	dentry = file->f_dentry;
 	inode = dentry->d_inode;
 
+	audit_inode(NULL, inode, 0);
+
 	err = -EROFS;
 	if (IS_RDONLY(inode))
 		goto out_putf;
@@ -775,7 +778,10 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 
 	file = fget(fd);
 	if (file) {
-		error = chown_common(file->f_dentry, user, group);
+		struct dentry * dentry;
+		dentry = file->f_dentry;
+		audit_inode(NULL, dentry->d_inode, 0);
+		error = chown_common(dentry, user, group);
 		fput(file);
 	}
 	return error;
diff --git a/fs/xattr.c b/fs/xattr.c
index 80eca7d3d69f..e416190f5e9c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -17,6 +17,7 @@
 #include <linux/syscalls.h>
 #include <linux/module.h>
 #include <linux/fsnotify.h>
+#include <linux/audit.h>
 #include <asm/uaccess.h>
 
 
@@ -234,12 +235,15 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
 	      size_t size, int flags)
 {
 	struct file *f;
+	struct dentry *dentry;
 	int error = -EBADF;
 
 	f = fget(fd);
 	if (!f)
 		return error;
-	error = setxattr(f->f_dentry, name, value, size, flags);
+	dentry = f->f_dentry;
+	audit_inode(NULL, dentry->d_inode, 0);
+	error = setxattr(dentry, name, value, size, flags);
 	fput(f);
 	return error;
 }
@@ -458,12 +462,15 @@ asmlinkage long
 sys_fremovexattr(int fd, char __user *name)
 {
 	struct file *f;
+	struct dentry *dentry;
 	int error = -EBADF;
 
 	f = fget(fd);
 	if (!f)
 		return error;
-	error = removexattr(f->f_dentry, name);
+	dentry = f->f_dentry;
+	audit_inode(NULL, dentry->d_inode, 0);
+	error = removexattr(dentry, name);
 	fput(f);
 	return error;
 }
diff --git a/include/linux/audit.h b/include/linux/audit.h
index fd65078e794a..739b954cb242 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -260,7 +260,20 @@ extern void audit_syscall_entry(struct task_struct *task, int arch,
 extern void audit_syscall_exit(struct task_struct *task, int failed, long return_code);
 extern void audit_getname(const char *name);
 extern void audit_putname(const char *name);
-extern void audit_inode(const char *name, const struct inode *inode, unsigned flags);
+extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags);
+extern void __audit_inode_child(const char *dname, const struct inode *inode,
+				unsigned long pino);
+static inline void audit_inode(const char *name, const struct inode *inode,
+			       unsigned flags) {
+	if (unlikely(current->audit_context))
+		__audit_inode(name, inode, flags);
+}
+static inline void audit_inode_child(const char *dname, 
+				     const struct inode *inode, 
+				     unsigned long pino) {
+	if (unlikely(current->audit_context))
+		__audit_inode_child(dname, inode, pino);
+}
 
 				/* Private API (for audit.c only) */
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
@@ -283,7 +296,10 @@ extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 #define audit_syscall_exit(t,f,r) do { ; } while (0)
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
+#define __audit_inode(n,i,f) do { ; } while (0)
+#define __audit_inode_child(d,i,p) do { ; } while (0)
 #define audit_inode(n,i,f) do { ; } while (0)
+#define audit_inode_child(d,i,p) do { ; } while (0)
 #define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index b5ff64d2f092..94919c376a72 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -15,6 +15,7 @@
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/audit.h>
 
 /*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
@@ -45,6 +46,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
 	}
+	audit_inode_child(old_name, source, old_dir->i_ino);
+	audit_inode_child(new_name, target, new_dir->i_ino);
 }
 
 /*
@@ -74,6 +77,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name);
+	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
 }
 
 /*
@@ -84,6 +88,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
 				  dentry->d_name.name);
+	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
 }
 
 /*
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 55ba331757c5..73f932b7deb6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,7 @@
  * Handles all system-call specific auditing features.
  *
  * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
  * Copyright (C) 2005 IBM Corporation
  * All Rights Reserved.
  *
@@ -31,11 +32,16 @@
  * The support of additional filter rules compares (>, <, >=, <=) was
  * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
  *
+ * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
+ * filesystem information.
  */
 
 #include <linux/init.h>
 #include <asm/types.h>
 #include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -97,12 +103,12 @@ enum audit_state {
 struct audit_names {
 	const char	*name;
 	unsigned long	ino;
+	unsigned long	pino;
 	dev_t		dev;
 	umode_t		mode;
 	uid_t		uid;
 	gid_t		gid;
 	dev_t		rdev;
-	unsigned	flags;
 };
 
 struct audit_aux_data {
@@ -515,7 +521,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if ( audit_comparator(ctx->names[j].ino, op, value)) {
+					if (audit_comparator(ctx->names[j].ino, op, value) ||
+					    audit_comparator(ctx->names[j].pino, op, value)) {
 						++result;
 						break;
 					}
@@ -696,17 +703,17 @@ static inline void audit_free_names(struct audit_context *context)
 #if AUDIT_DEBUG == 2
 	if (context->auditable
 	    ||context->put_count + context->ino_count != context->name_count) {
-		printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+		printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
 		       " name_count=%d put_count=%d"
 		       " ino_count=%d [NOT freeing]\n",
-		       __LINE__,
+		       __FILE__, __LINE__,
 		       context->serial, context->major, context->in_syscall,
 		       context->name_count, context->put_count,
 		       context->ino_count);
 		for (i = 0; i < context->name_count; i++)
 			printk(KERN_ERR "names[%d] = %p = %s\n", i,
 			       context->names[i].name,
-			       context->names[i].name);
+			       context->names[i].name ?: "(null)");
 		dump_stack();
 		return;
 	}
@@ -932,27 +939,34 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		}
 	}
 	for (i = 0; i < context->name_count; i++) {
+		unsigned long ino  = context->names[i].ino;
+		unsigned long pino = context->names[i].pino;
+
 		ab = audit_log_start(context, gfp_mask, AUDIT_PATH);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
 		audit_log_format(ab, "item=%d", i);
-		if (context->names[i].name) {
-			audit_log_format(ab, " name=");
+
+		audit_log_format(ab, " name=");
+		if (context->names[i].name)
 			audit_log_untrustedstring(ab, context->names[i].name);
-		}
-		audit_log_format(ab, " flags=%x\n", context->names[i].flags);
-			 
-		if (context->names[i].ino != (unsigned long)-1)
-			audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
-					     " ouid=%u ogid=%u rdev=%02x:%02x",
-					 context->names[i].ino,
-					 MAJOR(context->names[i].dev),
-					 MINOR(context->names[i].dev),
-					 context->names[i].mode,
-					 context->names[i].uid,
-					 context->names[i].gid,
-					 MAJOR(context->names[i].rdev),
+		else
+			audit_log_format(ab, "(null)");
+
+		if (pino != (unsigned long)-1)
+			audit_log_format(ab, " parent=%lu",  pino);
+		if (ino != (unsigned long)-1)
+			audit_log_format(ab, " inode=%lu",  ino);
+		if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
+			audit_log_format(ab, " dev=%02x:%02x mode=%#o" 
+					 " ouid=%u ogid=%u rdev=%02x:%02x", 
+					 MAJOR(context->names[i].dev), 
+					 MINOR(context->names[i].dev), 
+					 context->names[i].mode, 
+					 context->names[i].uid, 
+					 context->names[i].gid, 
+					 MAJOR(context->names[i].rdev), 
 					 MINOR(context->names[i].rdev));
 		audit_log_end(ab);
 	}
@@ -1174,7 +1188,7 @@ void audit_putname(const char *name)
 			for (i = 0; i < context->name_count; i++)
 				printk(KERN_ERR "name[%d] = %p = %s\n", i,
 				       context->names[i].name,
-				       context->names[i].name);
+				       context->names[i].name ?: "(null)");
 		}
 #endif
 		__putname(name);
@@ -1204,7 +1218,7 @@ void audit_putname(const char *name)
  *
  * Called from fs/namei.c:path_lookup().
  */
-void audit_inode(const char *name, const struct inode *inode, unsigned flags)
+void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
@@ -1230,13 +1244,93 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags)
 		++context->ino_count;
 #endif
 	}
-	context->names[idx].flags = flags;
-	context->names[idx].ino   = inode->i_ino;
 	context->names[idx].dev	  = inode->i_sb->s_dev;
 	context->names[idx].mode  = inode->i_mode;
 	context->names[idx].uid   = inode->i_uid;
 	context->names[idx].gid   = inode->i_gid;
 	context->names[idx].rdev  = inode->i_rdev;
+	if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && 
+	    (strcmp(name, ".") != 0)) {
+		context->names[idx].ino   = (unsigned long)-1;
+		context->names[idx].pino  = inode->i_ino;
+	} else {
+		context->names[idx].ino   = inode->i_ino;
+		context->names[idx].pino  = (unsigned long)-1;
+	}
+}
+
+/**
+ * audit_inode_child - collect inode info for created/removed objects
+ * @dname: inode's dentry name
+ * @inode: inode being audited
+ * @pino: inode number of dentry parent
+ *
+ * For syscalls that create or remove filesystem objects, audit_inode
+ * can only collect information for the filesystem object's parent.
+ * This call updates the audit context with the child's information.
+ * Syscalls that create a new filesystem object must be hooked after
+ * the object is created.  Syscalls that remove a filesystem object
+ * must be hooked prior, in order to capture the target inode during
+ * unsuccessful attempts.
+ */
+void __audit_inode_child(const char *dname, const struct inode *inode,
+			 unsigned long pino)
+{
+	int idx;
+	struct audit_context *context = current->audit_context;
+
+	if (!context->in_syscall)
+		return;
+
+	/* determine matching parent */
+	if (dname)
+		for (idx = 0; idx < context->name_count; idx++)
+			if (context->names[idx].pino == pino) {
+				const char *n;
+				const char *name = context->names[idx].name;
+				int dlen = strlen(dname);
+				int nlen = name ? strlen(name) : 0;
+
+				if (nlen < dlen)
+					continue;
+				
+				/* disregard trailing slashes */
+				n = name + nlen - 1;
+				while ((*n == '/') && (n > name))
+					n--;
+
+				/* find last path component */
+				n = n - dlen + 1;
+				if (n < name)
+					continue;
+				else if (n > name) {
+					if (*--n != '/')
+						continue;
+					else
+						n++;
+				}
+
+				if (strncmp(n, dname, dlen) == 0)
+					goto update_context;
+			}
+
+	/* catch-all in case match not found */
+	idx = context->name_count++;
+	context->names[idx].name  = NULL;
+	context->names[idx].pino  = pino;
+#if AUDIT_DEBUG
+	context->ino_count++;
+#endif
+
+update_context:
+	if (inode) {
+		context->names[idx].ino   = inode->i_ino;
+		context->names[idx].dev	  = inode->i_sb->s_dev;
+		context->names[idx].mode  = inode->i_mode;
+		context->names[idx].uid   = inode->i_uid;
+		context->names[idx].gid   = inode->i_gid;
+		context->names[idx].rdev  = inode->i_rdev;
+	}
 }
 
 /**
-- 
cgit v1.2.3


From c8edc80c8b8c397c53f4f659a05b9ea6208029bf Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Thu, 3 Nov 2005 16:12:36 +0000
Subject: [PATCH] Exclude messages by message type

    - Add a new, 5th filter called "exclude".
    - And add a new field AUDIT_MSGTYPE.
    - Define a new function audit_filter_exclude() that takes a message type
      as input and examines all rules in the filter.  It returns '1' if the
      message is to be excluded, and '0' otherwise.
    - Call the audit_filter_exclude() function near the top of
      audit_log_start() just after asserting audit_initialized.  If the
      message type is not to be audited, return NULL very early, before
      doing a lot of work.
[combined with followup fix for bug in original patch, Nov 4, same author]
[combined with later renaming AUDIT_FILTER_EXCLUDE->AUDIT_FILTER_TYPE
and audit_filter_exclude() -> audit_filter_type()]

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  5 ++++-
 kernel/audit.c        |  3 +++
 kernel/auditsc.c      | 35 ++++++++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 739b954cb242..8fa1a8fbc04d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -92,8 +92,9 @@
 #define AUDIT_FILTER_ENTRY	0x02	/* Apply rule at syscall entry */
 #define AUDIT_FILTER_WATCH	0x03	/* Apply rule to file system watches */
 #define AUDIT_FILTER_EXIT	0x04	/* Apply rule at syscall exit */
+#define AUDIT_FILTER_TYPE	0x05	/* Apply rule at audit_log_start */
 
-#define AUDIT_NR_FILTERS	5
+#define AUDIT_NR_FILTERS	6
 
 #define AUDIT_FILTER_PREPEND	0x10	/* Prepend to front of list */
 
@@ -132,6 +133,7 @@
 #define AUDIT_LOGINUID	9
 #define AUDIT_PERS	10
 #define AUDIT_ARCH	11
+#define AUDIT_MSGTYPE	12
 
 				/* These are ONLY useful when checking
 				 * at syscall exit time (AUDIT_AT_EXIT). */
@@ -289,6 +291,7 @@ extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern void audit_signal_info(int sig, struct task_struct *t);
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
+extern int audit_filter_type(int type);
 #else
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
diff --git a/kernel/audit.c b/kernel/audit.c
index 6d61dd79a605..1c3eb1b12bfc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -702,6 +702,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	if (!audit_initialized)
 		return NULL;
 
+	if (unlikely(audit_filter_type(type)))
+		return NULL;
+
 	if (gfp_mask & __GFP_WAIT)
 		reserve = 0;
 	else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 73f932b7deb6..31917ac730af 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -187,7 +187,8 @@ static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[2]),
 	LIST_HEAD_INIT(audit_filter_list[3]),
 	LIST_HEAD_INIT(audit_filter_list[4]),
-#if AUDIT_NR_FILTERS != 5
+	LIST_HEAD_INIT(audit_filter_list[5]),
+#if AUDIT_NR_FILTERS != 6
 #error Fix audit_filter_list initialiser
 #endif
 };
@@ -663,6 +664,38 @@ int audit_filter_user(struct netlink_skb_parms *cb, int type)
 	return ret; /* Audit by default */
 }
 
+int audit_filter_type(int type)
+{
+	struct audit_entry *e;
+	int result = 0;
+	
+	rcu_read_lock();
+	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+		goto unlock_and_return;
+
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
+				list) {
+		struct audit_rule *rule = &e->rule;
+		int i;
+		for (i = 0; i < rule->field_count; i++) {
+			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
+			u32 value  = rule->values[i];
+			if ( field == AUDIT_MSGTYPE ) {
+				result = audit_comparator(type, op, value); 
+				if (!result)
+					break;
+			}
+		}
+		if (result)
+			goto unlock_and_return;
+	}
+unlock_and_return:
+	rcu_read_unlock();
+	return result;
+}
+
+
 /* This should be called with task_lock() held. */
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
-- 
cgit v1.2.3


From 8c8570fb8feef2bc166bee75a85748b25cda22d9 Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Thu, 3 Nov 2005 17:15:16 +0000
Subject: [PATCH] Capture selinux subject/object context information.

This patch extends existing audit records with subject/object context
information. Audit records associated with filesystem inodes, ipc, and
tasks now contain SELinux label information in the field "subj" if the
item is performing the action, or in "obj" if the item is the receiver
of an action.

These labels are collected via hooks in SELinux and appended to the
appropriate record in the audit code.

This additional information is required for Common Criteria Labeled
Security Protection Profile (LSPP).

[AV: fixed kmalloc flags use]
[folded leak fixes]
[folded cleanup from akpm (kfree(NULL)]
[folded audit_inode_context() leak fix]
[folded akpm's fix for audit_ipc_perm() definition in case of !CONFIG_AUDIT]

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h    |   8 ++-
 include/linux/security.h |  27 +++++++++
 ipc/msg.c                |   5 +-
 ipc/sem.c                |   5 +-
 ipc/shm.c                |   4 +-
 kernel/audit.c           |   2 +-
 kernel/auditsc.c         | 142 ++++++++++++++++++++++++++++++++++++++++++++---
 security/dummy.c         |   6 ++
 security/selinux/hooks.c |  96 +++++++++++++++-----------------
 9 files changed, 226 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8fa1a8fbc04d..1912d8e8ae90 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -285,13 +285,14 @@ extern void auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
-extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
+extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern void audit_signal_info(int sig, struct task_struct *t);
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 extern int audit_filter_type(int type);
+extern int audit_set_macxattr(const char *name);
 #else
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
@@ -306,12 +307,13 @@ extern int audit_filter_type(int type);
 #define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
-#define audit_ipc_perms(q,u,g,m) ({ 0; })
+#define audit_ipc_perms(q,u,g,m,i) ({ 0; })
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
 #define audit_signal_info(s,t) do { ; } while (0)
 #define audit_filter_user(cb,t) ({ 1; })
+#define audit_set_macxattr(n) do { ; } while (0)
 #endif
 
 #ifdef CONFIG_AUDIT
@@ -340,6 +342,7 @@ extern void		    audit_send_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
+extern void		    audit_panic(const char *message);
 extern struct semaphore audit_netlink_sem;
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
@@ -350,6 +353,7 @@ extern struct semaphore audit_netlink_sem;
 #define audit_log_hex(a,b,l) do { ; } while (0)
 #define audit_log_untrustedstring(a,s) do { ; } while (0)
 #define audit_log_d_path(b,p,d,v) do { ; } while (0)
+#define audit_panic(m) do { ; } while (0)
 #endif
 #endif
 #endif
diff --git a/include/linux/security.h b/include/linux/security.h
index 7cbef482e13a..ec0bbbc3ffc2 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -869,6 +869,11 @@ struct swap_info_struct;
  *	@ipcp contains the kernel IPC permission structure
  *	@flag contains the desired (requested) permission set
  *	Return 0 if permission is granted.
+ * @ipc_getsecurity:
+ *      Copy the security label associated with the ipc object into
+ *      @buffer.  @buffer may be NULL to request the size of the buffer 
+ *      required.  @size indicates the size of @buffer in bytes. Return 
+ *      number of bytes used/required on success.
  *
  * Security hooks for individual messages held in System V IPC message queues
  * @msg_msg_alloc_security:
@@ -1168,6 +1173,7 @@ struct security_operations {
 	int (*inode_getxattr) (struct dentry *dentry, char *name);
 	int (*inode_listxattr) (struct dentry *dentry);
 	int (*inode_removexattr) (struct dentry *dentry, char *name);
+	char *(*inode_xattr_getsuffix) (void);
   	int (*inode_getsecurity)(struct inode *inode, const char *name, void *buffer, size_t size, int err);
   	int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, int flags);
   	int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size);
@@ -1217,6 +1223,7 @@ struct security_operations {
 	void (*task_to_inode)(struct task_struct *p, struct inode *inode);
 
 	int (*ipc_permission) (struct kern_ipc_perm * ipcp, short flag);
+	int (*ipc_getsecurity)(struct kern_ipc_perm *ipcp, void *buffer, size_t size);
 
 	int (*msg_msg_alloc_security) (struct msg_msg * msg);
 	void (*msg_msg_free_security) (struct msg_msg * msg);
@@ -1674,6 +1681,11 @@ static inline int security_inode_removexattr (struct dentry *dentry, char *name)
 	return security_ops->inode_removexattr (dentry, name);
 }
 
+static inline const char *security_inode_xattr_getsuffix(void)
+{
+	return security_ops->inode_xattr_getsuffix();
+}
+
 static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	if (unlikely (IS_PRIVATE (inode)))
@@ -1869,6 +1881,11 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp,
 	return security_ops->ipc_permission (ipcp, flag);
 }
 
+static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	return security_ops->ipc_getsecurity(ipcp, buffer, size);
+}
+
 static inline int security_msg_msg_alloc (struct msg_msg * msg)
 {
 	return security_ops->msg_msg_alloc_security (msg);
@@ -2316,6 +2333,11 @@ static inline int security_inode_removexattr (struct dentry *dentry, char *name)
 	return cap_inode_removexattr(dentry, name);
 }
 
+static inline const char *security_inode_xattr_getsuffix (void)
+{
+	return NULL ;
+}
+
 static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	return -EOPNOTSUPP;
@@ -2499,6 +2521,11 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp,
 	return 0;
 }
 
+static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int security_msg_msg_alloc (struct msg_msg * msg)
 {
 	return 0;
diff --git a/ipc/msg.c b/ipc/msg.c
index fbf757064a32..8c30ec2f6e34 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -429,8 +429,6 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 			return -EFAULT;
 		if (copy_msqid_from_user (&setbuf, buf, version))
 			return -EFAULT;
-		if ((err = audit_ipc_perms(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode)))
-			return err;
 		break;
 	case IPC_RMID:
 		break;
@@ -461,6 +459,9 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 	switch (cmd) {
 	case IPC_SET:
 	{
+		if ((err = audit_ipc_perms(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp)))
+			goto out_unlock_up;
+
 		err = -EPERM;
 		if (setbuf.qbytes > msg_ctlmnb && !capable(CAP_SYS_RESOURCE))
 			goto out_unlock_up;
diff --git a/ipc/sem.c b/ipc/sem.c
index 31fd4027d2b5..59696a840be1 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -809,8 +809,6 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 	if(cmd == IPC_SET) {
 		if(copy_semid_from_user (&setbuf, arg.buf, version))
 			return -EFAULT;
-		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode)))
-			return err;
 	}
 	sma = sem_lock(semid);
 	if(sma==NULL)
@@ -821,7 +819,6 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 		goto out_unlock;
 	}	
 	ipcp = &sma->sem_perm;
-	
 	if (current->euid != ipcp->cuid && 
 	    current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) {
 	    	err=-EPERM;
@@ -838,6 +835,8 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 		err = 0;
 		break;
 	case IPC_SET:
+		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp)))
+			goto out_unlock;
 		ipcp->uid = setbuf.uid;
 		ipcp->gid = setbuf.gid;
 		ipcp->mode = (ipcp->mode & ~S_IRWXUGO)
diff --git a/ipc/shm.c b/ipc/shm.c
index 9162123a7b23..a88c8a02e7f3 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -620,13 +620,13 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
 			err = -EFAULT;
 			goto out;
 		}
-		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode)))
-			return err;
 		down(&shm_ids.sem);
 		shp = shm_lock(shmid);
 		err=-EINVAL;
 		if(shp==NULL)
 			goto out_up;
+		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode, &(shp->shm_perm))))
+			goto out_unlock_up;
 		err = shm_checkid(shp,shmid);
 		if(err)
 			goto out_unlock_up;
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c3eb1b12bfc..45c123ef77a7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -142,7 +142,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 	nlh->nlmsg_pid = pid;
 }
 
-static void audit_panic(const char *message)
+void audit_panic(const char *message)
 {
 	switch (audit_failure)
 	{
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 31917ac730af..4e2256ec7cf3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -34,6 +34,9 @@
  *
  * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
  * filesystem information.
+ *
+ * Subject and object context labeling support added by <danjones@us.ibm.com>
+ * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
  */
 
 #include <linux/init.h>
@@ -53,6 +56,7 @@
 #include <linux/netlink.h>
 #include <linux/compiler.h>
 #include <asm/unistd.h>
+#include <linux/security.h>
 
 /* 0 = no checking
    1 = put_count checking
@@ -109,6 +113,7 @@ struct audit_names {
 	uid_t		uid;
 	gid_t		gid;
 	dev_t		rdev;
+	char		*ctx;
 };
 
 struct audit_aux_data {
@@ -125,6 +130,7 @@ struct audit_aux_data_ipcctl {
 	uid_t			uid;
 	gid_t			gid;
 	mode_t			mode;
+	char 			*ctx;
 };
 
 struct audit_aux_data_socketcall {
@@ -743,10 +749,11 @@ static inline void audit_free_names(struct audit_context *context)
 		       context->serial, context->major, context->in_syscall,
 		       context->name_count, context->put_count,
 		       context->ino_count);
-		for (i = 0; i < context->name_count; i++)
+		for (i = 0; i < context->name_count; i++) {
 			printk(KERN_ERR "names[%d] = %p = %s\n", i,
 			       context->names[i].name,
 			       context->names[i].name ?: "(null)");
+		}
 		dump_stack();
 		return;
 	}
@@ -756,9 +763,13 @@ static inline void audit_free_names(struct audit_context *context)
 	context->ino_count  = 0;
 #endif
 
-	for (i = 0; i < context->name_count; i++)
+	for (i = 0; i < context->name_count; i++) {
+		char *p = context->names[i].ctx;
+		context->names[i].ctx = NULL;
+		kfree(p);
 		if (context->names[i].name)
 			__putname(context->names[i].name);
+	}
 	context->name_count = 0;
 	if (context->pwd)
 		dput(context->pwd);
@@ -778,6 +789,12 @@ static inline void audit_free_aux(struct audit_context *context)
 			dput(axi->dentry);
 			mntput(axi->mnt);
 		}
+		if ( aux->type == AUDIT_IPC ) {
+			struct audit_aux_data_ipcctl *axi = (void *)aux;
+			if (axi->ctx)
+				kfree(axi->ctx);
+		}
+
 		context->aux = aux->next;
 		kfree(aux);
 	}
@@ -862,7 +879,38 @@ static inline void audit_free_context(struct audit_context *context)
 		printk(KERN_ERR "audit: freed %d contexts\n", count);
 }
 
-static void audit_log_task_info(struct audit_buffer *ab)
+static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
+{
+	char *ctx = NULL;
+	ssize_t len = 0;
+
+	len = security_getprocattr(current, "current", NULL, 0);
+	if (len < 0) {
+		if (len != -EINVAL)
+			goto error_path;
+		return;
+	}
+
+	ctx = kmalloc(len, gfp_mask);
+	if (!ctx) {
+		goto error_path;
+		return;
+	}
+
+	len = security_getprocattr(current, "current", ctx, len);
+	if (len < 0 )
+		goto error_path;
+
+	audit_log_format(ab, " subj=%s", ctx);
+
+error_path:
+	if (ctx)
+		kfree(ctx);
+	audit_panic("security_getprocattr error in audit_log_task_context");
+	return;
+}
+
+static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask)
 {
 	char name[sizeof(current->comm)];
 	struct mm_struct *mm = current->mm;
@@ -875,6 +923,10 @@ static void audit_log_task_info(struct audit_buffer *ab)
 	if (!mm)
 		return;
 
+	/*
+	 * this is brittle; all callers that pass GFP_ATOMIC will have
+	 * NULL current->mm and we won't get here.
+	 */
 	down_read(&mm->mmap_sem);
 	vma = mm->mmap;
 	while (vma) {
@@ -888,6 +940,7 @@ static void audit_log_task_info(struct audit_buffer *ab)
 		vma = vma->vm_next;
 	}
 	up_read(&mm->mmap_sem);
+	audit_log_task_context(ab, gfp_mask);
 }
 
 static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
@@ -923,7 +976,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		  context->gid,
 		  context->euid, context->suid, context->fsuid,
 		  context->egid, context->sgid, context->fsgid);
-	audit_log_task_info(ab);
+	audit_log_task_info(ab, gfp_mask);
 	audit_log_end(ab);
 
 	for (aux = context->aux; aux; aux = aux->next) {
@@ -936,8 +989,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		case AUDIT_IPC: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab, 
-					 " qbytes=%lx iuid=%u igid=%u mode=%x",
-					 axi->qbytes, axi->uid, axi->gid, axi->mode);
+					 " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s",
+					 axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx);
 			break; }
 
 		case AUDIT_SOCKETCALL: {
@@ -1001,6 +1054,11 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 					 context->names[i].gid, 
 					 MAJOR(context->names[i].rdev), 
 					 MINOR(context->names[i].rdev));
+		if (context->names[i].ctx) {
+			audit_log_format(ab, " obj=%s",
+					context->names[i].ctx);
+		}
+
 		audit_log_end(ab);
 	}
 }
@@ -1243,6 +1301,39 @@ void audit_putname(const char *name)
 #endif
 }
 
+void audit_inode_context(int idx, const struct inode *inode)
+{
+	struct audit_context *context = current->audit_context;
+	char *ctx = NULL;
+	int len = 0;
+
+	if (!security_inode_xattr_getsuffix())
+		return;
+
+	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), NULL, 0, 0);
+	if (len < 0) 
+		goto error_path;
+
+	ctx = kmalloc(len, GFP_KERNEL);
+	if (!ctx) 
+		goto error_path;
+
+	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), ctx, len, 0);
+	if (len < 0)
+		goto error_path;
+
+	kfree(context->names[idx].ctx);
+	context->names[idx].ctx = ctx;
+	return;
+
+error_path:
+	if (ctx)
+		kfree(ctx);
+	audit_panic("error in audit_inode_context");
+	return;
+}
+
+
 /**
  * audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1282,6 +1373,7 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
 	context->names[idx].uid   = inode->i_uid;
 	context->names[idx].gid   = inode->i_gid;
 	context->names[idx].rdev  = inode->i_rdev;
+	audit_inode_context(idx, inode);
 	if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && 
 	    (strcmp(name, ".") != 0)) {
 		context->names[idx].ino   = (unsigned long)-1;
@@ -1363,6 +1455,7 @@ update_context:
 		context->names[idx].uid   = inode->i_uid;
 		context->names[idx].gid   = inode->i_gid;
 		context->names[idx].rdev  = inode->i_rdev;
+		audit_inode_context(idx, inode);
 	}
 }
 
@@ -1423,6 +1516,38 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
 	return ctx ? ctx->loginuid : -1;
 }
 
+static char *audit_ipc_context(struct kern_ipc_perm *ipcp)
+{
+	struct audit_context *context = current->audit_context;
+	char *ctx = NULL;
+	int len = 0;
+
+	if (likely(!context))
+		return NULL;
+
+	len = security_ipc_getsecurity(ipcp, NULL, 0);
+	if (len == -EOPNOTSUPP)
+		goto ret;
+	if (len < 0)
+		goto error_path;
+
+	ctx = kmalloc(len, GFP_ATOMIC);
+	if (!ctx)
+		goto error_path;
+
+	len = security_ipc_getsecurity(ipcp, ctx, len);
+	if (len < 0)
+		goto error_path;
+
+	return ctx;
+
+error_path:
+	kfree(ctx);
+	audit_panic("error in audit_ipc_context");
+ret:
+	return NULL;
+}
+
 /**
  * audit_ipc_perms - record audit data for ipc
  * @qbytes: msgq bytes
@@ -1432,7 +1557,7 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
 {
 	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
@@ -1440,7 +1565,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	if (likely(!context))
 		return 0;
 
-	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
 	if (!ax)
 		return -ENOMEM;
 
@@ -1448,6 +1573,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	ax->uid = uid;
 	ax->gid = gid;
 	ax->mode = mode;
+	ax->ctx = audit_ipc_context(ipcp);
 
 	ax->d.type = AUDIT_IPC;
 	ax->d.next = context->aux;
diff --git a/security/dummy.c b/security/dummy.c
index f1a5bd98bf10..6febe7d39fa0 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -558,6 +558,11 @@ static int dummy_ipc_permission (struct kern_ipc_perm *ipcp, short flag)
 	return 0;
 }
 
+static int dummy_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
 static int dummy_msg_msg_alloc_security (struct msg_msg *msg)
 {
 	return 0;
@@ -959,6 +964,7 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, task_reparent_to_init);
  	set_to_dummy_if_null(ops, task_to_inode);
 	set_to_dummy_if_null(ops, ipc_permission);
+	set_to_dummy_if_null(ops, ipc_getsecurity);
 	set_to_dummy_if_null(ops, msg_msg_alloc_security);
 	set_to_dummy_if_null(ops, msg_msg_free_security);
 	set_to_dummy_if_null(ops, msg_queue_alloc_security);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b65c201e9ff5..9c08a19cc81b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -117,6 +117,32 @@ static struct security_operations *secondary_ops = NULL;
 static LIST_HEAD(superblock_security_head);
 static DEFINE_SPINLOCK(sb_security_lock);
 
+/* Return security context for a given sid or just the context 
+   length if the buffer is null or length is 0 */
+static int selinux_getsecurity(u32 sid, void *buffer, size_t size)
+{
+	char *context;
+	unsigned len;
+	int rc;
+
+	rc = security_sid_to_context(sid, &context, &len);
+	if (rc)
+		return rc;
+
+	if (!buffer || !size)
+		goto getsecurity_exit;
+
+	if (size < len) {
+		len = -ERANGE;
+		goto getsecurity_exit;
+	}
+	memcpy(buffer, context, len);
+
+getsecurity_exit:
+	kfree(context);
+	return len;
+}
+
 /* Allocate and free functions for each kind of security blob. */
 
 static int task_alloc_security(struct task_struct *task)
@@ -2209,6 +2235,11 @@ static int selinux_inode_removexattr (struct dentry *dentry, char *name)
 	return -EACCES;
 }
 
+static const char *selinux_inode_xattr_getsuffix(void)
+{
+      return XATTR_SELINUX_SUFFIX;
+}
+
 /*
  * Copy the in-core inode security context value to the user.  If the
  * getxattr() prior to this succeeded, check to see if we need to
@@ -2219,44 +2250,11 @@ static int selinux_inode_removexattr (struct dentry *dentry, char *name)
 static int selinux_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	struct inode_security_struct *isec = inode->i_security;
-	char *context;
-	unsigned len;
-	int rc;
-
-	if (strcmp(name, XATTR_SELINUX_SUFFIX)) {
-		rc = -EOPNOTSUPP;
-		goto out;
-	}
-
-	rc = security_sid_to_context(isec->sid, &context, &len);
-	if (rc)
-		goto out;
-
-	/* Probe for required buffer size */
-	if (!buffer || !size) {
-		rc = len;
-		goto out_free;
-	}
 
-	if (size < len) {
-		rc = -ERANGE;
-		goto out_free;
-	}
+	if (strcmp(name, XATTR_SELINUX_SUFFIX))
+		return -EOPNOTSUPP;
 
-	if (err > 0) {
-		if ((len == err) && !(memcmp(context, buffer, len))) {
-			/* Don't need to canonicalize value */
-			rc = err;
-			goto out_free;
-		}
-		memset(buffer, 0, size);
-	}
-	memcpy(buffer, context, len);
-	rc = len;
-out_free:
-	kfree(context);
-out:
-	return rc;
+	return selinux_getsecurity(isec->sid, buffer, size);
 }
 
 static int selinux_inode_setsecurity(struct inode *inode, const char *name,
@@ -4022,6 +4020,13 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 	return ipc_has_perm(ipcp, av);
 }
 
+static int selinux_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	struct ipc_security_struct *isec = ipcp->security;
+
+	return selinux_getsecurity(isec->sid, buffer, size);
+}
+
 /* module stacking operations */
 static int selinux_register_security (const char *name, struct security_operations *ops)
 {
@@ -4063,8 +4068,7 @@ static int selinux_getprocattr(struct task_struct *p,
 			       char *name, void *value, size_t size)
 {
 	struct task_security_struct *tsec;
-	u32 sid, len;
-	char *context;
+	u32 sid;
 	int error;
 
 	if (current != p) {
@@ -4073,9 +4077,6 @@ static int selinux_getprocattr(struct task_struct *p,
 			return error;
 	}
 
-	if (!size)
-		return -ERANGE;
-
 	tsec = p->security;
 
 	if (!strcmp(name, "current"))
@@ -4092,16 +4093,7 @@ static int selinux_getprocattr(struct task_struct *p,
 	if (!sid)
 		return 0;
 
-	error = security_sid_to_context(sid, &context, &len);
-	if (error)
-		return error;
-	if (len > size) {
-		kfree(context);
-		return -ERANGE;
-	}
-	memcpy(value, context, len);
-	kfree(context);
-	return len;
+	return selinux_getsecurity(sid, value, size);
 }
 
 static int selinux_setprocattr(struct task_struct *p,
@@ -4259,6 +4251,7 @@ static struct security_operations selinux_ops = {
 	.inode_getxattr =		selinux_inode_getxattr,
 	.inode_listxattr =		selinux_inode_listxattr,
 	.inode_removexattr =		selinux_inode_removexattr,
+	.inode_xattr_getsuffix =        selinux_inode_xattr_getsuffix,
 	.inode_getsecurity =            selinux_inode_getsecurity,
 	.inode_setsecurity =            selinux_inode_setsecurity,
 	.inode_listsecurity =           selinux_inode_listsecurity,
@@ -4296,6 +4289,7 @@ static struct security_operations selinux_ops = {
 	.task_to_inode =                selinux_task_to_inode,
 
 	.ipc_permission =		selinux_ipc_permission,
+	.ipc_getsecurity =		selinux_ipc_getsecurity,
 
 	.msg_msg_alloc_security =	selinux_msg_msg_alloc_security,
 	.msg_msg_free_security =	selinux_msg_msg_free_security,
-- 
cgit v1.2.3


From 7306a0b9b3e2056a616c84841288ca2431a05627 Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Wed, 16 Nov 2005 15:53:13 +0000
Subject: [PATCH] Miscellaneous bug and warning fixes

This patch fixes a couple of bugs revealed in new features recently
added to -mm1:
* fixes warnings due to inconsistent use of const struct inode *inode
* fixes bug that prevent a kernel from booting with audit on, and SELinux off
  due to a missing function in security/dummy.c
* fixes a bug that throws spurious audit_panic() messages due to a missing
  return just before an error_path label
* some reasonable house cleaning in audit_ipc_context(),
  audit_inode_context(), and audit_log_task_context()

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/security.h |  8 ++++----
 kernel/auditsc.c         | 21 ++++++++++++---------
 security/dummy.c         |  8 +++++++-
 security/selinux/hooks.c |  2 +-
 4 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index ec0bbbc3ffc2..2a502250eb5c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1173,8 +1173,8 @@ struct security_operations {
 	int (*inode_getxattr) (struct dentry *dentry, char *name);
 	int (*inode_listxattr) (struct dentry *dentry);
 	int (*inode_removexattr) (struct dentry *dentry, char *name);
-	char *(*inode_xattr_getsuffix) (void);
-  	int (*inode_getsecurity)(struct inode *inode, const char *name, void *buffer, size_t size, int err);
+	const char *(*inode_xattr_getsuffix) (void);
+  	int (*inode_getsecurity)(const struct inode *inode, const char *name, void *buffer, size_t size, int err);
   	int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, int flags);
   	int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size);
 
@@ -1686,7 +1686,7 @@ static inline const char *security_inode_xattr_getsuffix(void)
 	return security_ops->inode_xattr_getsuffix();
 }
 
-static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	if (unlikely (IS_PRIVATE (inode)))
 		return 0;
@@ -2338,7 +2338,7 @@ static inline const char *security_inode_xattr_getsuffix (void)
 	return NULL ;
 }
 
-static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4e2256ec7cf3..4ef14515da35 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -892,21 +892,20 @@ static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
 	}
 
 	ctx = kmalloc(len, gfp_mask);
-	if (!ctx) {
+	if (!ctx)
 		goto error_path;
-		return;
-	}
 
 	len = security_getprocattr(current, "current", ctx, len);
 	if (len < 0 )
 		goto error_path;
 
 	audit_log_format(ab, " subj=%s", ctx);
+	return;
 
 error_path:
 	if (ctx)
 		kfree(ctx);
-	audit_panic("security_getprocattr error in audit_log_task_context");
+	audit_panic("error in audit_log_task_context");
 	return;
 }
 
@@ -1304,13 +1303,16 @@ void audit_putname(const char *name)
 void audit_inode_context(int idx, const struct inode *inode)
 {
 	struct audit_context *context = current->audit_context;
+	const char *suffix = security_inode_xattr_getsuffix();
 	char *ctx = NULL;
 	int len = 0;
 
-	if (!security_inode_xattr_getsuffix())
-		return;
+	if (!suffix)
+		goto ret;
 
-	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), NULL, 0, 0);
+	len = security_inode_getsecurity(inode, suffix, NULL, 0, 0);
+	if (len == -EOPNOTSUPP)
+		goto ret;
 	if (len < 0) 
 		goto error_path;
 
@@ -1318,18 +1320,19 @@ void audit_inode_context(int idx, const struct inode *inode)
 	if (!ctx) 
 		goto error_path;
 
-	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), ctx, len, 0);
+	len = security_inode_getsecurity(inode, suffix, ctx, len, 0);
 	if (len < 0)
 		goto error_path;
 
 	kfree(context->names[idx].ctx);
 	context->names[idx].ctx = ctx;
-	return;
+	goto ret;
 
 error_path:
 	if (ctx)
 		kfree(ctx);
 	audit_panic("error in audit_inode_context");
+ret:
 	return;
 }
 
diff --git a/security/dummy.c b/security/dummy.c
index 6febe7d39fa0..0a553d39729f 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -378,7 +378,7 @@ static int dummy_inode_removexattr (struct dentry *dentry, char *name)
 	return 0;
 }
 
-static int dummy_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static int dummy_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	return -EOPNOTSUPP;
 }
@@ -393,6 +393,11 @@ static int dummy_inode_listsecurity(struct inode *inode, char *buffer, size_t bu
 	return 0;
 }
 
+static const char *dummy_inode_xattr_getsuffix(void)
+{
+	return NULL;
+}
+
 static int dummy_file_permission (struct file *file, int mask)
 {
 	return 0;
@@ -930,6 +935,7 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, inode_getxattr);
 	set_to_dummy_if_null(ops, inode_listxattr);
 	set_to_dummy_if_null(ops, inode_removexattr);
+	set_to_dummy_if_null(ops, inode_xattr_getsuffix);
 	set_to_dummy_if_null(ops, inode_getsecurity);
 	set_to_dummy_if_null(ops, inode_setsecurity);
 	set_to_dummy_if_null(ops, inode_listsecurity);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9c08a19cc81b..81b726b1a419 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2247,7 +2247,7 @@ static const char *selinux_inode_xattr_getsuffix(void)
  *
  * Permission check is handled by selinux_inode_getxattr hook.
  */
-static int selinux_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static int selinux_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	struct inode_security_struct *isec = inode->i_security;
 
-- 
cgit v1.2.3


From fe7752bab26a9ac0651b695ad4f55659761f68f7 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 15 Dec 2005 18:33:52 +0000
Subject: [PATCH] Fix audit record filtering with !CONFIG_AUDITSYSCALL

This fixes the per-user and per-message-type filtering when syscall
auditing isn't enabled.

[AV: folded followup fix from the same author]

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  18 +--
 kernel/Makefile       |   2 +-
 kernel/audit.c        |   1 +
 kernel/audit.h        |  70 ++++++++++
 kernel/auditfilter.c  | 378 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/auditsc.c      | 380 +-------------------------------------------------
 6 files changed, 459 insertions(+), 390 deletions(-)
 create mode 100644 kernel/audit.h
 create mode 100644 kernel/auditfilter.c

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1912d8e8ae90..fbc21d6267f3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -278,8 +278,6 @@ static inline void audit_inode_child(const char *dname,
 }
 
 				/* Private API (for audit.c only) */
-extern int  audit_receive_filter(int type, int pid, int uid, int seq,
-				 void *data, uid_t loginuid);
 extern unsigned int audit_serial(void);
 extern void auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
@@ -290,8 +288,6 @@ extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern void audit_signal_info(int sig, struct task_struct *t);
-extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
-extern int audit_filter_type(int type);
 extern int audit_set_macxattr(const char *name);
 #else
 #define audit_alloc(t) ({ 0; })
@@ -304,7 +300,6 @@ extern int audit_set_macxattr(const char *name);
 #define __audit_inode_child(d,i,p) do { ; } while (0)
 #define audit_inode(n,i,f) do { ; } while (0)
 #define audit_inode_child(d,i,p) do { ; } while (0)
-#define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
 #define audit_ipc_perms(q,u,g,m,i) ({ 0; })
@@ -312,7 +307,6 @@ extern int audit_set_macxattr(const char *name);
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
 #define audit_signal_info(s,t) do { ; } while (0)
-#define audit_filter_user(cb,t) ({ 1; })
 #define audit_set_macxattr(n) do { ; } while (0)
 #endif
 
@@ -337,13 +331,11 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,
 					     const char *prefix,
 					     struct dentry *dentry,
 					     struct vfsmount *vfsmnt);
-				/* Private API (for auditsc.c only) */
-extern void		    audit_send_reply(int pid, int seq, int type,
-					     int done, int multi,
-					     void *payload, int size);
-extern void		    audit_log_lost(const char *message);
-extern void		    audit_panic(const char *message);
-extern struct semaphore audit_netlink_sem;
+				/* Private API (for audit.c only) */
+extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
+extern int audit_filter_type(int type);
+extern int  audit_receive_filter(int type, int pid, int uid, int seq,
+				 void *data, uid_t loginuid);
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
 #define audit_log_start(c,g,t) ({ NULL; })
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde815d..58cf129e5622 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
-obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 45c123ef77a7..07c5d2bdd38c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -305,6 +305,7 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+	return 0;
 }
 
 /**
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 000000000000..7643e46daeb2
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,70 @@
+/* audit -- definition of audit_context structure and supporting types 
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/fs.h>
+#include <linux/audit.h>
+
+/* 0 = no checking
+   1 = put_count checking
+   2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* At task start time, the audit_state is set in the audit_context using
+   a per-task filter.  At syscall entry, the audit_state is augmented by
+   the syscall filter. */
+enum audit_state {
+	AUDIT_DISABLED,		/* Do not create per-task audit_context.
+				 * No syscall-specific audit records can
+				 * be generated. */
+	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
+				 * but don't necessarily fill it in at
+				 * syscall entry time (i.e., filter
+				 * instead). */
+	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
+				 * and always fill it in at syscall
+				 * entry time.  This makes a full
+				 * syscall record available if some
+				 * other part of the kernel decides it
+				 * should be recorded. */
+	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
+				 * always fill it in at syscall entry
+				 * time, and always write out the audit
+				 * record at syscall exit time.  */
+};
+
+/* Rule lists */
+struct audit_entry {
+	struct list_head  list;
+	struct rcu_head   rcu;
+	struct audit_rule rule;
+};
+
+
+extern int audit_pid;
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+
+extern void		    audit_send_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
+extern void		    audit_log_lost(const char *message);
+extern void		    audit_panic(const char *message);
+extern struct semaphore audit_netlink_sem;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 000000000000..7f347c360876
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,378 @@
+/* auditfilter.c -- filtering of audit events
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/netlink.h>
+#include "audit.h"
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+	LIST_HEAD_INIT(audit_filter_list[0]),
+	LIST_HEAD_INIT(audit_filter_list[1]),
+	LIST_HEAD_INIT(audit_filter_list[2]),
+	LIST_HEAD_INIT(audit_filter_list[3]),
+	LIST_HEAD_INIT(audit_filter_list[4]),
+	LIST_HEAD_INIT(audit_filter_list[5]),
+#if AUDIT_NR_FILTERS != 6
+#error Fix audit_filter_list initialiser
+#endif
+};
+
+/* Copy rule from user-space to kernel-space.  Called from 
+ * audit_add_rule during AUDIT_ADD. */
+static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+	int i;
+
+	if (s->action != AUDIT_NEVER
+	    && s->action != AUDIT_POSSIBLE
+	    && s->action != AUDIT_ALWAYS)
+		return -1;
+	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+		return -1;
+	if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
+		return -1;
+
+	d->flags	= s->flags;
+	d->action	= s->action;
+	d->field_count	= s->field_count;
+	for (i = 0; i < d->field_count; i++) {
+		d->fields[i] = s->fields[i];
+		d->values[i] = s->values[i];
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+	return 0;
+}
+
+/* Check to see if two rules are identical.  It is called from
+ * audit_add_rule during AUDIT_ADD and 
+ * audit_del_rule during AUDIT_DEL. */
+static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+	int i;
+
+	if (a->flags != b->flags)
+		return 1;
+
+	if (a->action != b->action)
+		return 1;
+
+	if (a->field_count != b->field_count)
+		return 1;
+
+	for (i = 0; i < a->field_count; i++) {
+		if (a->fields[i] != b->fields[i]
+		    || a->values[i] != b->values[i])
+			return 1;
+	}
+
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+		if (a->mask[i] != b->mask[i])
+			return 1;
+
+	return 0;
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_rule *rule,
+				  struct list_head *list)
+{
+	struct audit_entry  *entry;
+	int i;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * addition routine. */
+	list_for_each_entry(entry, list, list) {
+		if (!audit_compare_rule(rule, &entry->rule)) {
+			return -EEXIST;
+		}
+	}
+
+	for (i = 0; i < rule->field_count; i++) {
+		if (rule->fields[i] & AUDIT_UNUSED_BITS)
+			return -EINVAL;
+		if ( rule->fields[i] & AUDIT_NEGATE )
+			rule->fields[i] |= AUDIT_NOT_EQUAL;
+		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
+			rule->fields[i] |= AUDIT_EQUAL;
+		rule->fields[i] &= (~AUDIT_NEGATE);
+	}
+
+	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+		return -ENOMEM;
+	if (audit_copy_rule(&entry->rule, rule)) {
+		kfree(entry);
+		return -EINVAL;
+	}
+
+	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
+		list_add_rcu(&entry->list, list);
+	} else {
+		list_add_tail_rcu(&entry->list, list);
+	}
+
+	return 0;
+}
+
+static inline void audit_free_rule(struct rcu_head *head)
+{
+	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+	kfree(e);
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+				 struct list_head *list)
+{
+	struct audit_entry  *e;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * deletion routine. */
+	list_for_each_entry(e, list, list) {
+		if (!audit_compare_rule(rule, &e->rule)) {
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule);
+			return 0;
+		}
+	}
+	return -ENOENT;		/* No matching rule */
+}
+
+static int audit_list_rules(void *_dest)
+{
+	int pid, seq;
+	int *dest = _dest;
+	struct audit_entry *entry;
+	int i;
+
+	pid = dest[0];
+	seq = dest[1];
+	kfree(dest);
+
+	down(&audit_netlink_sem);
+
+	/* The *_rcu iterators not needed here because we are
+	   always called with audit_netlink_sem held. */
+	for (i=0; i<AUDIT_NR_FILTERS; i++) {
+		list_for_each_entry(entry, &audit_filter_list[i], list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+	}
+	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+	
+	up(&audit_netlink_sem);
+	return 0;
+}
+
+/**
+ * audit_receive_filter - apply all rules to the specified message type
+ * @type: audit message type
+ * @pid: target pid for netlink audit messages
+ * @uid: target uid for netlink audit messages
+ * @seq: netlink audit message sequence (serial) number
+ * @data: payload data
+ * @loginuid: loginuid of sender
+ */
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
+							uid_t loginuid)
+{
+	struct task_struct *tsk;
+	int *dest;
+	int		   err = 0;
+	unsigned listnr;
+
+	switch (type) {
+	case AUDIT_LIST:
+		/* We can't just spew out the rules here because we might fill
+		 * the available socket buffer space and deadlock waiting for
+		 * auditctl to read from it... which isn't ever going to
+		 * happen if we're actually running in the context of auditctl
+		 * trying to _send_ the stuff */
+		 
+		dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+		if (!dest)
+			return -ENOMEM;
+		dest[0] = pid;
+		dest[1] = seq;
+
+		tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+		if (IS_ERR(tsk)) {
+			kfree(dest);
+			err = PTR_ERR(tsk);
+		}
+		break;
+	case AUDIT_ADD:
+		listnr = ((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+		switch(listnr) {
+		default:
+			return -EINVAL;
+
+		case AUDIT_FILTER_USER:
+		case AUDIT_FILTER_TYPE:
+#ifdef CONFIG_AUDITSYSCALL
+		case AUDIT_FILTER_ENTRY:
+		case AUDIT_FILTER_EXIT:
+		case AUDIT_FILTER_TASK:
+#endif
+			;
+		}
+		err = audit_add_rule(data, &audit_filter_list[listnr]);
+		if (!err)
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				  "auid=%u added an audit rule\n", loginuid);
+		break;
+	case AUDIT_DEL:
+		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+		if (listnr >= AUDIT_NR_FILTERS)
+			return -EINVAL;
+
+		err = audit_del_rule(data, &audit_filter_list[listnr]);
+		if (!err)
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				  "auid=%u removed an audit rule\n", loginuid);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return err;
+}
+
+int audit_comparator(const u32 left, const u32 op, const u32 right)
+{
+	switch (op) {
+	case AUDIT_EQUAL:
+		return (left == right);
+	case AUDIT_NOT_EQUAL:
+		return (left != right);
+	case AUDIT_LESS_THAN:
+		return (left < right);
+	case AUDIT_LESS_THAN_OR_EQUAL:
+		return (left <= right);
+	case AUDIT_GREATER_THAN:
+		return (left > right);
+	case AUDIT_GREATER_THAN_OR_EQUAL:
+		return (left >= right);
+	default:
+		return -EINVAL;
+	}
+}
+
+
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+				   struct audit_rule *rule,
+				   enum audit_state *state)
+{
+	int i;
+
+	for (i = 0; i < rule->field_count; i++) {
+		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
+		u32 value  = rule->values[i];
+		int result = 0;
+
+		switch (field) {
+		case AUDIT_PID:
+			result = audit_comparator(cb->creds.pid, op, value);
+			break;
+		case AUDIT_UID:
+			result = audit_comparator(cb->creds.uid, op, value);
+			break;
+		case AUDIT_GID:
+			result = audit_comparator(cb->creds.gid, op, value);
+			break;
+		case AUDIT_LOGINUID:
+			result = audit_comparator(cb->loginuid, op, value);
+			break;
+		}
+
+		if (!result)
+			return 0;
+	}
+	switch (rule->action) {
+	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
+	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
+	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	}
+	return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb, int type)
+{
+	struct audit_entry *e;
+	enum audit_state   state;
+	int ret = 1;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+		if (audit_filter_user_rules(cb, &e->rule, &state)) {
+			if (state == AUDIT_DISABLED)
+				ret = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret; /* Audit by default */
+}
+
+int audit_filter_type(int type)
+{
+	struct audit_entry *e;
+	int result = 0;
+	
+	rcu_read_lock();
+	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+		goto unlock_and_return;
+
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
+				list) {
+		struct audit_rule *rule = &e->rule;
+		int i;
+		for (i = 0; i < rule->field_count; i++) {
+			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
+			u32 value  = rule->values[i];
+			if ( field == AUDIT_MSGTYPE ) {
+				result = audit_comparator(type, op, value); 
+				if (!result)
+					break;
+			}
+		}
+		if (result)
+			goto unlock_and_return;
+	}
+unlock_and_return:
+	rcu_read_unlock();
+	return result;
+}
+
+
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4ef14515da35..17719b303638 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -52,17 +52,15 @@
 #include <linux/audit.h>
 #include <linux/personality.h>
 #include <linux/time.h>
-#include <linux/kthread.h>
 #include <linux/netlink.h>
 #include <linux/compiler.h>
 #include <asm/unistd.h>
 #include <linux/security.h>
+#include <linux/list.h>
 
-/* 0 = no checking
-   1 = put_count checking
-   2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
+#include "audit.h"
+
+extern struct list_head audit_filter_list[];
 
 /* No syscall auditing will take place unless audit_enabled != 0. */
 extern int audit_enabled;
@@ -76,29 +74,6 @@ extern int audit_enabled;
  * path_lookup. */
 #define AUDIT_NAMES_RESERVED 7
 
-/* At task start time, the audit_state is set in the audit_context using
-   a per-task filter.  At syscall entry, the audit_state is augmented by
-   the syscall filter. */
-enum audit_state {
-	AUDIT_DISABLED,		/* Do not create per-task audit_context.
-				 * No syscall-specific audit records can
-				 * be generated. */
-	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
-				 * but don't necessarily fill it in at
-				 * syscall entry time (i.e., filter
-				 * instead). */
-	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
-				 * and always fill it in at syscall
-				 * entry time.  This makes a full
-				 * syscall record available if some
-				 * other part of the kernel decides it
-				 * should be recorded. */
-	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
-				 * always fill it in at syscall entry
-				 * time, and always write out the audit
-				 * record at syscall exit time.  */
-};
-
 /* When fs/namei.c:getname() is called, we store the pointer in name and
  * we don't let putname() free it (instead we free all of the saved
  * pointers at syscall exit time).
@@ -183,264 +158,6 @@ struct audit_context {
 #endif
 };
 
-				/* Public API */
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
-static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
-	LIST_HEAD_INIT(audit_filter_list[0]),
-	LIST_HEAD_INIT(audit_filter_list[1]),
-	LIST_HEAD_INIT(audit_filter_list[2]),
-	LIST_HEAD_INIT(audit_filter_list[3]),
-	LIST_HEAD_INIT(audit_filter_list[4]),
-	LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
-#error Fix audit_filter_list initialiser
-#endif
-};
-
-struct audit_entry {
-	struct list_head  list;
-	struct rcu_head   rcu;
-	struct audit_rule rule;
-};
-
-extern int audit_pid;
-
-/* Copy rule from user-space to kernel-space.  Called from 
- * audit_add_rule during AUDIT_ADD. */
-static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
-{
-	int i;
-
-	if (s->action != AUDIT_NEVER
-	    && s->action != AUDIT_POSSIBLE
-	    && s->action != AUDIT_ALWAYS)
-		return -1;
-	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
-		return -1;
-	if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
-		return -1;
-
-	d->flags	= s->flags;
-	d->action	= s->action;
-	d->field_count	= s->field_count;
-	for (i = 0; i < d->field_count; i++) {
-		d->fields[i] = s->fields[i];
-		d->values[i] = s->values[i];
-	}
-	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
-	return 0;
-}
-
-/* Check to see if two rules are identical.  It is called from
- * audit_add_rule during AUDIT_ADD and 
- * audit_del_rule during AUDIT_DEL. */
-static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
-{
-	int i;
-
-	if (a->flags != b->flags)
-		return 1;
-
-	if (a->action != b->action)
-		return 1;
-
-	if (a->field_count != b->field_count)
-		return 1;
-
-	for (i = 0; i < a->field_count; i++) {
-		if (a->fields[i] != b->fields[i]
-		    || a->values[i] != b->values[i])
-			return 1;
-	}
-
-	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
-		if (a->mask[i] != b->mask[i])
-			return 1;
-
-	return 0;
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
- * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_rule *rule,
-				  struct list_head *list)
-{
-	struct audit_entry  *entry;
-	int i;
-
-	/* Do not use the _rcu iterator here, since this is the only
-	 * addition routine. */
-	list_for_each_entry(entry, list, list) {
-		if (!audit_compare_rule(rule, &entry->rule)) {
-			return -EEXIST;
-		}
-	}
-
-	for (i = 0; i < rule->field_count; i++) {
-		if (rule->fields[i] & AUDIT_UNUSED_BITS)
-			return -EINVAL;
-		if ( rule->fields[i] & AUDIT_NEGATE )
-			rule->fields[i] |= AUDIT_NOT_EQUAL;
-		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
-			rule->fields[i] |= AUDIT_EQUAL;
-		rule->fields[i] &= (~AUDIT_NEGATE);
-	}
-
-	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
-		return -ENOMEM;
-	if (audit_copy_rule(&entry->rule, rule)) {
-		kfree(entry);
-		return -EINVAL;
-	}
-
-	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
-		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
-		list_add_rcu(&entry->list, list);
-	} else {
-		list_add_tail_rcu(&entry->list, list);
-	}
-
-	return 0;
-}
-
-static inline void audit_free_rule(struct rcu_head *head)
-{
-	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
-	kfree(e);
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
- * audit_netlink_sem. */
-static inline int audit_del_rule(struct audit_rule *rule,
-				 struct list_head *list)
-{
-	struct audit_entry  *e;
-
-	/* Do not use the _rcu iterator here, since this is the only
-	 * deletion routine. */
-	list_for_each_entry(e, list, list) {
-		if (!audit_compare_rule(rule, &e->rule)) {
-			list_del_rcu(&e->list);
-			call_rcu(&e->rcu, audit_free_rule);
-			return 0;
-		}
-	}
-	return -ENOENT;		/* No matching rule */
-}
-
-static int audit_list_rules(void *_dest)
-{
-	int pid, seq;
-	int *dest = _dest;
-	struct audit_entry *entry;
-	int i;
-
-	pid = dest[0];
-	seq = dest[1];
-	kfree(dest);
-
-	down(&audit_netlink_sem);
-
-	/* The *_rcu iterators not needed here because we are
-	   always called with audit_netlink_sem held. */
-	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(entry, &audit_filter_list[i], list)
-			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 &entry->rule, sizeof(entry->rule));
-	}
-	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
-	
-	up(&audit_netlink_sem);
-	return 0;
-}
-
-/**
- * audit_receive_filter - apply all rules to the specified message type
- * @type: audit message type
- * @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
- * @seq: netlink audit message sequence (serial) number
- * @data: payload data
- * @loginuid: loginuid of sender
- */
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-							uid_t loginuid)
-{
-	struct task_struct *tsk;
-	int *dest;
-	int		   err = 0;
-	unsigned listnr;
-
-	switch (type) {
-	case AUDIT_LIST:
-		/* We can't just spew out the rules here because we might fill
-		 * the available socket buffer space and deadlock waiting for
-		 * auditctl to read from it... which isn't ever going to
-		 * happen if we're actually running in the context of auditctl
-		 * trying to _send_ the stuff */
-		 
-		dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
-		if (!dest)
-			return -ENOMEM;
-		dest[0] = pid;
-		dest[1] = seq;
-
-		tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
-		if (IS_ERR(tsk)) {
-			kfree(dest);
-			err = PTR_ERR(tsk);
-		}
-		break;
-	case AUDIT_ADD:
-		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		if (listnr >= AUDIT_NR_FILTERS)
-			return -EINVAL;
-
-		err = audit_add_rule(data, &audit_filter_list[listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u added an audit rule\n", loginuid);
-		break;
-	case AUDIT_DEL:
-		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		if (listnr >= AUDIT_NR_FILTERS)
-			return -EINVAL;
-
-		err = audit_del_rule(data, &audit_filter_list[listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u removed an audit rule\n", loginuid);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return err;
-}
-
-static int audit_comparator(const u32 left, const u32 op, const u32 right)
-{
-	switch (op) {
-	case AUDIT_EQUAL:
-		return (left == right);
-	case AUDIT_NOT_EQUAL:
-		return (left != right);
-	case AUDIT_LESS_THAN:
-		return (left < right);
-	case AUDIT_LESS_THAN_OR_EQUAL:
-		return (left <= right);
-	case AUDIT_GREATER_THAN:
-		return (left > right);
-	case AUDIT_GREATER_THAN_OR_EQUAL:
-		return (left >= right);
-	default:
-		return -EINVAL;
-	}
-}
 
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
@@ -613,95 +330,6 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 	return AUDIT_BUILD_CONTEXT;
 }
 
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
-				   struct audit_rule *rule,
-				   enum audit_state *state)
-{
-	int i;
-
-	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-		u32 value  = rule->values[i];
-		int result = 0;
-
-		switch (field) {
-		case AUDIT_PID:
-			result = audit_comparator(cb->creds.pid, op, value);
-			break;
-		case AUDIT_UID:
-			result = audit_comparator(cb->creds.uid, op, value);
-			break;
-		case AUDIT_GID:
-			result = audit_comparator(cb->creds.gid, op, value);
-			break;
-		case AUDIT_LOGINUID:
-			result = audit_comparator(cb->loginuid, op, value);
-			break;
-		}
-
-		if (!result)
-			return 0;
-	}
-	switch (rule->action) {
-	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
-	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
-	}
-	return 1;
-}
-
-int audit_filter_user(struct netlink_skb_parms *cb, int type)
-{
-	struct audit_entry *e;
-	enum audit_state   state;
-	int ret = 1;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
-		if (audit_filter_user_rules(cb, &e->rule, &state)) {
-			if (state == AUDIT_DISABLED)
-				ret = 0;
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	return ret; /* Audit by default */
-}
-
-int audit_filter_type(int type)
-{
-	struct audit_entry *e;
-	int result = 0;
-	
-	rcu_read_lock();
-	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
-		goto unlock_and_return;
-
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
-				list) {
-		struct audit_rule *rule = &e->rule;
-		int i;
-		for (i = 0; i < rule->field_count; i++) {
-			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-			u32 value  = rule->values[i];
-			if ( field == AUDIT_MSGTYPE ) {
-				result = audit_comparator(type, op, value); 
-				if (!result)
-					break;
-			}
-		}
-		if (result)
-			goto unlock_and_return;
-	}
-unlock_and_return:
-	rcu_read_unlock();
-	return result;
-}
-
-
 /* This should be called with task_lock() held. */
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
-- 
cgit v1.2.3


From af601e4623d0303bfafa54ec728b7ae8493a8e1b Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Wed, 4 Jan 2006 14:08:39 +0000
Subject: [PATCH] SE Linux audit events

Attached is a patch that hardwires important SE Linux events to the audit
system. Please Apply.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h          |  3 +++
 security/selinux/selinuxfs.c   | 11 +++++++++++
 security/selinux/ss/services.c | 15 +++++++++------
 3 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index fbc21d6267f3..8868c96ca8a2 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -83,6 +83,9 @@
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
 #define AUDIT_AVC_PATH		1402	/* dentry, vfsmount pair from avc */
+#define AUDIT_MAC_POLICY_LOAD	1403	/* Policy file load */
+#define AUDIT_MAC_STATUS	1404	/* Changed enforcing,permissive,off */
+#define AUDIT_MAC_CONFIG_CHANGE	1405	/* Changes to booleans */
 
 #define AUDIT_KERNEL		2000	/* Asynchronous audit record. NOT A REQUEST. */
 
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index b5fa02d17b1e..5eba6664eac0 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -21,6 +21,7 @@
 #include <linux/major.h>
 #include <linux/seq_file.h>
 #include <linux/percpu.h>
+#include <linux/audit.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 
@@ -126,6 +127,10 @@ static ssize_t sel_write_enforce(struct file * file, const char __user * buf,
 		length = task_has_security(current, SECURITY__SETENFORCE);
 		if (length)
 			goto out;
+		audit_log(current->audit_context, GFP_KERNEL, AUDIT_MAC_STATUS,
+			"enforcing=%d old_enforcing=%d auid=%u", new_value, 
+			selinux_enforcing,
+			audit_get_loginuid(current->audit_context));
 		selinux_enforcing = new_value;
 		if (selinux_enforcing)
 			avc_ss_reset(0);
@@ -176,6 +181,9 @@ static ssize_t sel_write_disable(struct file * file, const char __user * buf,
 		length = selinux_disable();
 		if (length < 0)
 			goto out;
+		audit_log(current->audit_context, GFP_KERNEL, AUDIT_MAC_STATUS,
+			"selinux=0 auid=%u",
+			audit_get_loginuid(current->audit_context));
 	}
 
 	length = count;
@@ -261,6 +269,9 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf,
 		length = ret;
 	else
 		length = count;
+	audit_log(current->audit_context, GFP_KERNEL, AUDIT_MAC_POLICY_LOAD,
+		"policy loaded auid=%u",
+		audit_get_loginuid(current->audit_context));
 out:
 	up(&sel_sem);
 	vfree(data);
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 8a764928ff4b..d877cd16a813 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -1758,19 +1758,22 @@ int security_set_bools(int len, int *values)
 		goto out;
 	}
 
-	printk(KERN_INFO "security: committed booleans { ");
 	for (i = 0; i < len; i++) {
+		if (!!values[i] != policydb.bool_val_to_struct[i]->state) {
+			audit_log(current->audit_context, GFP_ATOMIC,
+				AUDIT_MAC_CONFIG_CHANGE,
+				"bool=%s val=%d old_val=%d auid=%u",
+				policydb.p_bool_val_to_name[i],
+				!!values[i],
+				policydb.bool_val_to_struct[i]->state,
+				audit_get_loginuid(current->audit_context));
+		}
 		if (values[i]) {
 			policydb.bool_val_to_struct[i]->state = 1;
 		} else {
 			policydb.bool_val_to_struct[i]->state = 0;
 		}
-		if (i != 0)
-			printk(", ");
-		printk("%s:%d", policydb.p_bool_val_to_name[i],
-		       policydb.bool_val_to_struct[i]->state);
 	}
-	printk(" }\n");
 
 	for (cur = policydb.cond_list; cur != NULL; cur = cur->next) {
 		rc = evaluate_cond_node(&policydb, cur);
-- 
cgit v1.2.3


From 93315ed6dd12dacfc941f9eb8ca0293aadf99793 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Tue, 7 Feb 2006 12:05:27 -0500
Subject: [PATCH] audit string fields interface + consumer

Updated patch to dynamically allocate audit rule fields in kernel's
internal representation.  Added unlikely() calls for testing memory
allocation result.

Amy Griffis wrote:     [Wed Jan 11 2006, 02:02:31PM EST]
> Modify audit's kernel-userspace interface to allow the specification
> of string fields in audit rules.
>
> Signed-off-by: Amy Griffis <amy.griffis@hp.com>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
(cherry picked from 5ffc4a863f92351b720fe3e9c5cd647accff9e03 commit)
---
 include/linux/audit.h       |  31 ++-
 kernel/audit.c              |  19 +-
 kernel/audit.h              |  23 ++-
 kernel/auditfilter.c        | 467 +++++++++++++++++++++++++++++++++-----------
 kernel/auditsc.c            |  50 +++--
 security/selinux/nlmsgtab.c |   3 +
 6 files changed, 448 insertions(+), 145 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8868c96ca8a2..8a3b98175c25 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -50,15 +50,18 @@
  */
 #define AUDIT_GET		1000	/* Get status */
 #define AUDIT_SET		1001	/* Set status (enable/disable/auditd) */
-#define AUDIT_LIST		1002	/* List syscall filtering rules */
-#define AUDIT_ADD		1003	/* Add syscall filtering rule */
-#define AUDIT_DEL		1004	/* Delete syscall filtering rule */
+#define AUDIT_LIST		1002	/* List syscall rules -- deprecated */
+#define AUDIT_ADD		1003	/* Add syscall rule -- deprecated */
+#define AUDIT_DEL		1004	/* Delete syscall rule -- deprecated */
 #define AUDIT_USER		1005	/* Message from userspace -- deprecated */
 #define AUDIT_LOGIN		1006	/* Define the login id and information */
 #define AUDIT_WATCH_INS		1007	/* Insert file/dir watch entry */
 #define AUDIT_WATCH_REM		1008	/* Remove file/dir watch entry */
 #define AUDIT_WATCH_LIST	1009	/* List all file/dir watches */
 #define AUDIT_SIGNAL_INFO	1010	/* Get info about sender of signal to auditd */
+#define AUDIT_ADD_RULE		1011	/* Add syscall filtering rule */
+#define AUDIT_DEL_RULE		1012	/* Delete syscall filtering rule */
+#define AUDIT_LIST_RULES	1013	/* List syscall filtering rules */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
@@ -229,6 +232,26 @@ struct audit_status {
 	__u32		backlog;	/* messages waiting in queue */
 };
 
+/* audit_rule_data supports filter rules with both integer and string
+ * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
+ * AUDIT_LIST_RULES requests.
+ */
+struct audit_rule_data {
+	__u32		flags;	/* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */
+	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
+	__u32		field_count;
+	__u32		mask[AUDIT_BITMASK_SIZE];
+	__u32		fields[AUDIT_MAX_FIELDS];
+	__u32		values[AUDIT_MAX_FIELDS];
+	__u32		fieldflags[AUDIT_MAX_FIELDS];
+	__u32		buflen;	/* total length of string fields */
+	char		buf[0];	/* string fields buffer */
+};
+
+/* audit_rule is supported to maintain backward compatibility with
+ * userspace.  It supports integer fields only and corresponds to
+ * AUDIT_ADD, AUDIT_DEL and AUDIT_LIST requests.
+ */
 struct audit_rule {		/* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */
 	__u32		flags;	/* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */
 	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
@@ -338,7 +361,7 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 extern int audit_filter_type(int type);
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
-				 void *data, uid_t loginuid);
+				 void *data, size_t datasz, uid_t loginuid);
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
 #define audit_log_start(c,g,t) ({ NULL; })
diff --git a/kernel/audit.c b/kernel/audit.c
index 07c5d2bdd38c..4eb97b62d7fa 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -52,6 +52,7 @@
 #include <linux/audit.h>
 
 #include <net/sock.h>
+#include <net/netlink.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 
@@ -361,9 +362,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 	switch (msg_type) {
 	case AUDIT_GET:
 	case AUDIT_LIST:
+	case AUDIT_LIST_RULES:
 	case AUDIT_SET:
 	case AUDIT_ADD:
+	case AUDIT_ADD_RULE:
 	case AUDIT_DEL:
+	case AUDIT_DEL_RULE:
 	case AUDIT_SIGNAL_INFO:
 		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
 			err = -EPERM;
@@ -470,12 +474,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		break;
 	case AUDIT_ADD:
 	case AUDIT_DEL:
-		if (nlh->nlmsg_len < sizeof(struct audit_rule))
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule))
 			return -EINVAL;
 		/* fallthrough */
 	case AUDIT_LIST:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
-					   uid, seq, data, loginuid);
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid);
+		break;
+	case AUDIT_ADD_RULE:
+	case AUDIT_DEL_RULE:
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+			return -EINVAL;
+		/* fallthrough */
+	case AUDIT_LIST_RULES:
+		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid);
 		break;
 	case AUDIT_SIGNAL_INFO:
 		sig_data.uid = audit_sig_uid;
diff --git a/kernel/audit.h b/kernel/audit.h
index 7643e46daeb2..4b602cdcabef 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -52,10 +52,27 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_field {
+	u32			type;
+	u32			val;
+	u32			op;
+};
+
+struct audit_krule {
+	int			vers_ops;
+	u32			flags;
+	u32			listnr;
+	u32			action;
+	u32			mask[AUDIT_BITMASK_SIZE];
+	u32			buflen; /* for data alloc on list rules */
+	u32			field_count;
+	struct audit_field	*fields;
+};
+
 struct audit_entry {
-	struct list_head  list;
-	struct rcu_head   rcu;
-	struct audit_rule rule;
+	struct list_head	list;
+	struct rcu_head		rcu;
+	struct audit_krule	rule;
 };
 
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a3a32752f973..686d514a3518 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -40,52 +40,279 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
-/* Copy rule from user-space to kernel-space.  Called from 
- * audit_add_rule during AUDIT_ADD. */
-static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+static inline void audit_free_rule(struct audit_entry *e)
 {
+	kfree(e->rule.fields);
+	kfree(e);
+}
+
+static inline void audit_free_rule_rcu(struct rcu_head *head)
+{
+	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+	audit_free_rule(e);
+}
+
+/* Unpack a filter field's string representation from user-space
+ * buffer. */
+static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
+{
+	char *str;
+
+	if (!*bufp || (len == 0) || (len > *remain))
+		return ERR_PTR(-EINVAL);
+
+	/* Of the currently implemented string fields, PATH_MAX
+	 * defines the longest valid length.
+	 */
+	if (len > PATH_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	str = kmalloc(len + 1, GFP_KERNEL);
+	if (unlikely(!str))
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(str, *bufp, len);
+	str[len] = 0;
+	*bufp += len;
+	*remain -= len;
+
+	return str;
+}
+
+/* Common user-space to kernel rule translation. */
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+{
+	unsigned listnr;
+	struct audit_entry *entry;
+	struct audit_field *fields;
+	int i, err;
+
+	err = -EINVAL;
+	listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
+	switch(listnr) {
+	default:
+		goto exit_err;
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+#ifdef CONFIG_AUDITSYSCALL
+	case AUDIT_FILTER_ENTRY:
+	case AUDIT_FILTER_EXIT:
+	case AUDIT_FILTER_TASK:
+#endif
+		;
+	}
+	if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
+	    rule->action != AUDIT_ALWAYS)
+		goto exit_err;
+	if (rule->field_count > AUDIT_MAX_FIELDS)
+		goto exit_err;
+
+	err = -ENOMEM;
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (unlikely(!entry))
+		goto exit_err;
+	fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL);
+	if (unlikely(!fields)) {
+		kfree(entry);
+		goto exit_err;
+	}
+
+	memset(&entry->rule, 0, sizeof(struct audit_krule));
+	memset(fields, 0, sizeof(struct audit_field));
+
+	entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
+	entry->rule.listnr = listnr;
+	entry->rule.action = rule->action;
+	entry->rule.field_count = rule->field_count;
+	entry->rule.fields = fields;
+
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+		entry->rule.mask[i] = rule->mask[i];
+
+	return entry;
+
+exit_err:
+	return ERR_PTR(err);
+}
+
+/* Translate struct audit_rule to kernel's rule respresentation.
+ * Exists for backward compatibility with userspace. */
+static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+{
+	struct audit_entry *entry;
+	int err = 0;
 	int i;
 
-	if (s->action != AUDIT_NEVER
-	    && s->action != AUDIT_POSSIBLE
-	    && s->action != AUDIT_ALWAYS)
-		return -1;
-	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
-		return -1;
-	if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
-		return -1;
-
-	d->flags	= s->flags;
-	d->action	= s->action;
-	d->field_count	= s->field_count;
-	for (i = 0; i < d->field_count; i++) {
-		d->fields[i] = s->fields[i];
-		d->values[i] = s->values[i];
+	entry = audit_to_entry_common(rule);
+	if (IS_ERR(entry))
+		goto exit_nofree;
+
+	for (i = 0; i < rule->field_count; i++) {
+		struct audit_field *f = &entry->rule.fields[i];
+
+		if (rule->fields[i] & AUDIT_UNUSED_BITS) {
+			err = -EINVAL;
+			goto exit_free;
+		}
+
+		f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
+		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
+		f->val = rule->values[i];
+
+		entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
+		if (f->op & AUDIT_NEGATE)
+			f->op |= AUDIT_NOT_EQUAL;
+		else if (!(f->op & AUDIT_OPERATORS))
+			f->op |= AUDIT_EQUAL;
+		f->op &= ~AUDIT_NEGATE;
 	}
-	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
-	return 0;
+
+exit_nofree:
+	return entry;
+
+exit_free:
+	audit_free_rule(entry);
+	return ERR_PTR(err);
 }
 
-/* Check to see if two rules are identical.  It is called from
- * audit_add_rule during AUDIT_ADD and 
- * audit_del_rule during AUDIT_DEL. */
-static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+/* Translate struct audit_rule_data to kernel's rule respresentation. */
+static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
+					       size_t datasz)
 {
+	int err = 0;
+	struct audit_entry *entry;
+	void *bufp;
+	/* size_t remain = datasz - sizeof(struct audit_rule_data); */
 	int i;
 
-	if (a->flags != b->flags)
-		return 1;
+	entry = audit_to_entry_common((struct audit_rule *)data);
+	if (IS_ERR(entry))
+		goto exit_nofree;
 
-	if (a->action != b->action)
-		return 1;
+	bufp = data->buf;
+	entry->rule.vers_ops = 2;
+	for (i = 0; i < data->field_count; i++) {
+		struct audit_field *f = &entry->rule.fields[i];
+
+		err = -EINVAL;
+		if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
+		    data->fieldflags[i] & ~AUDIT_OPERATORS)
+			goto exit_free;
+
+		f->op = data->fieldflags[i] & AUDIT_OPERATORS;
+		f->type = data->fields[i];
+		switch(f->type) {
+		/* call type-specific conversion routines here */
+		default:
+			f->val = data->values[i];
+		}
+	}
+
+exit_nofree:
+	return entry;
+
+exit_free:
+	audit_free_rule(entry);
+	return ERR_PTR(err);
+}
+
+/* Pack a filter field's string representation into data block. */
+static inline size_t audit_pack_string(void **bufp, char *str)
+{
+	size_t len = strlen(str);
+
+	memcpy(*bufp, str, len);
+	*bufp += len;
+
+	return len;
+}
+
+/* Translate kernel rule respresentation to struct audit_rule.
+ * Exists for backward compatibility with userspace. */
+static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
+{
+	struct audit_rule *rule;
+	int i;
+
+	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+	if (unlikely(!rule))
+		return ERR_PTR(-ENOMEM);
+	memset(rule, 0, sizeof(*rule));
+
+	rule->flags = krule->flags | krule->listnr;
+	rule->action = krule->action;
+	rule->field_count = krule->field_count;
+	for (i = 0; i < rule->field_count; i++) {
+		rule->values[i] = krule->fields[i].val;
+		rule->fields[i] = krule->fields[i].type;
+
+		if (krule->vers_ops == 1) {
+			if (krule->fields[i].op & AUDIT_NOT_EQUAL)
+				rule->fields[i] |= AUDIT_NEGATE;
+		} else {
+			rule->fields[i] |= krule->fields[i].op;
+		}
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
+
+	return rule;
+}
 
-	if (a->field_count != b->field_count)
+/* Translate kernel rule respresentation to struct audit_rule_data. */
+static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
+{
+	struct audit_rule_data *data;
+	void *bufp;
+	int i;
+
+	data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+	if (unlikely(!data))
+		return ERR_PTR(-ENOMEM);
+	memset(data, 0, sizeof(*data));
+
+	data->flags = krule->flags | krule->listnr;
+	data->action = krule->action;
+	data->field_count = krule->field_count;
+	bufp = data->buf;
+	for (i = 0; i < data->field_count; i++) {
+		struct audit_field *f = &krule->fields[i];
+
+		data->fields[i] = f->type;
+		data->fieldflags[i] = f->op;
+		switch(f->type) {
+		/* call type-specific conversion routines here */
+		default:
+			data->values[i] = f->val;
+		}
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+
+	return data;
+}
+
+/* Compare two rules in kernel format.  Considered success if rules
+ * don't match. */
+static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
+{
+	int i;
+
+	if (a->flags != b->flags ||
+	    a->listnr != b->listnr ||
+	    a->action != b->action ||
+	    a->field_count != b->field_count)
 		return 1;
 
 	for (i = 0; i < a->field_count; i++) {
-		if (a->fields[i] != b->fields[i]
-		    || a->values[i] != b->values[i])
+		if (a->fields[i].type != b->fields[i].type ||
+		    a->fields[i].op != b->fields[i].op)
 			return 1;
+
+		switch(a->fields[i].type) {
+		/* call type-specific comparison routines here */
+		default:
+			if (a->fields[i].val != b->fields[i].val)
+				return 1;
+		}
 	}
 
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
@@ -95,41 +322,21 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 	return 0;
 }
 
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
+/* Add rule to given filterlist if not a duplicate.  Protected by
  * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_rule *rule,
+static inline int audit_add_rule(struct audit_entry *entry,
 				  struct list_head *list)
 {
-	struct audit_entry  *entry;
-	int i;
+	struct audit_entry *e;
 
 	/* Do not use the _rcu iterator here, since this is the only
 	 * addition routine. */
-	list_for_each_entry(entry, list, list) {
-		if (!audit_compare_rule(rule, &entry->rule))
+	list_for_each_entry(e, list, list) {
+		if (!audit_compare_rule(&entry->rule, &e->rule))
 			return -EEXIST;
 	}
 
-	for (i = 0; i < rule->field_count; i++) {
-		if (rule->fields[i] & AUDIT_UNUSED_BITS)
-			return -EINVAL;
-		if ( rule->fields[i] & AUDIT_NEGATE)
-			rule->fields[i] |= AUDIT_NOT_EQUAL;
-		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
-			rule->fields[i] |= AUDIT_EQUAL;
-		rule->fields[i] &= ~AUDIT_NEGATE;
-	}
-
-	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
-		return -ENOMEM;
-	if (audit_copy_rule(&entry->rule, rule)) {
-		kfree(entry);
-		return -EINVAL;
-	}
-
 	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
-		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
 		list_add_rcu(&entry->list, list);
 	} else {
 		list_add_tail_rcu(&entry->list, list);
@@ -138,16 +345,9 @@ static inline int audit_add_rule(struct audit_rule *rule,
 	return 0;
 }
 
-static inline void audit_free_rule(struct rcu_head *head)
-{
-	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
-	kfree(e);
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
+/* Remove an existing rule from filterlist.  Protected by
  * audit_netlink_sem. */
-static inline int audit_del_rule(struct audit_rule *rule,
+static inline int audit_del_rule(struct audit_entry *entry,
 				 struct list_head *list)
 {
 	struct audit_entry  *e;
@@ -155,16 +355,18 @@ static inline int audit_del_rule(struct audit_rule *rule,
 	/* Do not use the _rcu iterator here, since this is the only
 	 * deletion routine. */
 	list_for_each_entry(e, list, list) {
-		if (!audit_compare_rule(rule, &e->rule)) {
+		if (!audit_compare_rule(&entry->rule, &e->rule)) {
 			list_del_rcu(&e->list);
-			call_rcu(&e->rcu, audit_free_rule);
+			call_rcu(&e->rcu, audit_free_rule_rcu);
 			return 0;
 		}
 	}
 	return -ENOENT;		/* No matching rule */
 }
 
-static int audit_list_rules(void *_dest)
+/* List rules using struct audit_rule.  Exists for backward
+ * compatibility with userspace. */
+static int audit_list(void *_dest)
 {
 	int pid, seq;
 	int *dest = _dest;
@@ -180,9 +382,16 @@ static int audit_list_rules(void *_dest)
 	/* The *_rcu iterators not needed here because we are
 	   always called with audit_netlink_sem held. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(entry, &audit_filter_list[i], list)
+		list_for_each_entry(entry, &audit_filter_list[i], list) {
+			struct audit_rule *rule;
+
+			rule = audit_krule_to_rule(&entry->rule);
+			if (unlikely(!rule))
+				break;
 			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 &entry->rule, sizeof(entry->rule));
+					 rule, sizeof(*rule));
+			kfree(rule);
+		}
 	}
 	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
 	
@@ -190,6 +399,40 @@ static int audit_list_rules(void *_dest)
 	return 0;
 }
 
+/* List rules using struct audit_rule_data. */
+static int audit_list_rules(void *_dest)
+{
+	int pid, seq;
+	int *dest = _dest;
+	struct audit_entry *e;
+	int i;
+
+	pid = dest[0];
+	seq = dest[1];
+	kfree(dest);
+
+	down(&audit_netlink_sem);
+
+	/* The *_rcu iterators not needed here because we are
+	   always called with audit_netlink_sem held. */
+	for (i=0; i<AUDIT_NR_FILTERS; i++) {
+		list_for_each_entry(e, &audit_filter_list[i], list) {
+			struct audit_rule_data *data;
+
+			data = audit_krule_to_data(&e->rule);
+			if (unlikely(!data))
+				break;
+			audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+					 data, sizeof(*data));
+			kfree(data);
+		}
+	}
+	audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+
+	up(&audit_netlink_sem);
+	return 0;
+}
+
 /**
  * audit_receive_filter - apply all rules to the specified message type
  * @type: audit message type
@@ -197,18 +440,20 @@ static int audit_list_rules(void *_dest)
  * @uid: target uid for netlink audit messages
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
+ * @datasz: size of payload data
  * @loginuid: loginuid of sender
  */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-							uid_t loginuid)
+			 size_t datasz, uid_t loginuid)
 {
 	struct task_struct *tsk;
 	int *dest;
-	int		   err = 0;
-	unsigned listnr;
+	int err = 0;
+	struct audit_entry *entry;
 
 	switch (type) {
 	case AUDIT_LIST:
+	case AUDIT_LIST_RULES:
 		/* We can't just spew out the rules here because we might fill
 		 * the available socket buffer space and deadlock waiting for
 		 * auditctl to read from it... which isn't ever going to
@@ -221,41 +466,48 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 		dest[0] = pid;
 		dest[1] = seq;
 
-		tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+		if (type == AUDIT_LIST)
+			tsk = kthread_run(audit_list, dest, "audit_list");
+		else
+			tsk = kthread_run(audit_list_rules, dest,
+					  "audit_list_rules");
 		if (IS_ERR(tsk)) {
 			kfree(dest);
 			err = PTR_ERR(tsk);
 		}
 		break;
 	case AUDIT_ADD:
-		listnr = ((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		switch(listnr) {
-		default:
-			return -EINVAL;
-
-		case AUDIT_FILTER_USER:
-		case AUDIT_FILTER_TYPE:
-#ifdef CONFIG_AUDITSYSCALL
-		case AUDIT_FILTER_ENTRY:
-		case AUDIT_FILTER_EXIT:
-		case AUDIT_FILTER_TASK:
-#endif
-			;
-		}
-		err = audit_add_rule(data, &audit_filter_list[listnr]);
+	case AUDIT_ADD_RULE:
+		if (type == AUDIT_ADD)
+			entry = audit_rule_to_entry(data);
+		else
+			entry = audit_data_to_entry(data, datasz);
+		if (IS_ERR(entry))
+			return PTR_ERR(entry);
+
+		err = audit_add_rule(entry,
+				     &audit_filter_list[entry->rule.listnr]);
 		if (!err)
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 				  "auid=%u added an audit rule\n", loginuid);
+		else
+			audit_free_rule(entry);
 		break;
 	case AUDIT_DEL:
-		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		if (listnr >= AUDIT_NR_FILTERS)
-			return -EINVAL;
-
-		err = audit_del_rule(data, &audit_filter_list[listnr]);
+	case AUDIT_DEL_RULE:
+		if (type == AUDIT_DEL)
+			entry = audit_rule_to_entry(data);
+		else
+			entry = audit_data_to_entry(data, datasz);
+		if (IS_ERR(entry))
+			return PTR_ERR(entry);
+
+		err = audit_del_rule(entry,
+				     &audit_filter_list[entry->rule.listnr]);
 		if (!err)
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 				  "auid=%u removed an audit rule\n", loginuid);
+		audit_free_rule(entry);
 		break;
 	default:
 		return -EINVAL;
@@ -287,29 +539,27 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
 
 
 static int audit_filter_user_rules(struct netlink_skb_parms *cb,
-				   struct audit_rule *rule,
+				   struct audit_krule *rule,
 				   enum audit_state *state)
 {
 	int i;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-		u32 value  = rule->values[i];
+		struct audit_field *f = &rule->fields[i];
 		int result = 0;
 
-		switch (field) {
+		switch (f->type) {
 		case AUDIT_PID:
-			result = audit_comparator(cb->creds.pid, op, value);
+			result = audit_comparator(cb->creds.pid, f->op, f->val);
 			break;
 		case AUDIT_UID:
-			result = audit_comparator(cb->creds.uid, op, value);
+			result = audit_comparator(cb->creds.uid, f->op, f->val);
 			break;
 		case AUDIT_GID:
-			result = audit_comparator(cb->creds.gid, op, value);
+			result = audit_comparator(cb->creds.gid, f->op, f->val);
 			break;
 		case AUDIT_LOGINUID:
-			result = audit_comparator(cb->loginuid, op, value);
+			result = audit_comparator(cb->loginuid, f->op, f->val);
 			break;
 		}
 
@@ -354,14 +604,11 @@ int audit_filter_type(int type)
 
 	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
 				list) {
-		struct audit_rule *rule = &e->rule;
 		int i;
-		for (i = 0; i < rule->field_count; i++) {
-			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-			u32 value  = rule->values[i];
-			if ( field == AUDIT_MSGTYPE ) {
-				result = audit_comparator(type, op, value); 
+		for (i = 0; i < e->rule.field_count; i++) {
+			struct audit_field *f = &e->rule.fields[i];
+			if (f->type == AUDIT_MSGTYPE) {
+				result = audit_comparator(type, f->op, f->val);
 				if (!result)
 					break;
 			}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 17719b303638..ba0878854777 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -162,70 +162,68 @@ struct audit_context {
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
 static int audit_filter_rules(struct task_struct *tsk,
-			      struct audit_rule *rule,
+			      struct audit_krule *rule,
 			      struct audit_context *ctx,
 			      enum audit_state *state)
 {
 	int i, j;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-		u32 value  = rule->values[i];
+		struct audit_field *f = &rule->fields[i];
 		int result = 0;
 
-		switch (field) {
+		switch (f->type) {
 		case AUDIT_PID:
-			result = audit_comparator(tsk->pid, op, value);
+			result = audit_comparator(tsk->pid, f->op, f->val);
 			break;
 		case AUDIT_UID:
-			result = audit_comparator(tsk->uid, op, value);
+			result = audit_comparator(tsk->uid, f->op, f->val);
 			break;
 		case AUDIT_EUID:
-			result = audit_comparator(tsk->euid, op, value);
+			result = audit_comparator(tsk->euid, f->op, f->val);
 			break;
 		case AUDIT_SUID:
-			result = audit_comparator(tsk->suid, op, value);
+			result = audit_comparator(tsk->suid, f->op, f->val);
 			break;
 		case AUDIT_FSUID:
-			result = audit_comparator(tsk->fsuid, op, value);
+			result = audit_comparator(tsk->fsuid, f->op, f->val);
 			break;
 		case AUDIT_GID:
-			result = audit_comparator(tsk->gid, op, value);
+			result = audit_comparator(tsk->gid, f->op, f->val);
 			break;
 		case AUDIT_EGID:
-			result = audit_comparator(tsk->egid, op, value);
+			result = audit_comparator(tsk->egid, f->op, f->val);
 			break;
 		case AUDIT_SGID:
-			result = audit_comparator(tsk->sgid, op, value);
+			result = audit_comparator(tsk->sgid, f->op, f->val);
 			break;
 		case AUDIT_FSGID:
-			result = audit_comparator(tsk->fsgid, op, value);
+			result = audit_comparator(tsk->fsgid, f->op, f->val);
 			break;
 		case AUDIT_PERS:
-			result = audit_comparator(tsk->personality, op, value);
+			result = audit_comparator(tsk->personality, f->op, f->val);
 			break;
 		case AUDIT_ARCH:
  			if (ctx)
-				result = audit_comparator(ctx->arch, op, value);
+				result = audit_comparator(ctx->arch, f->op, f->val);
 			break;
 
 		case AUDIT_EXIT:
 			if (ctx && ctx->return_valid)
-				result = audit_comparator(ctx->return_code, op, value);
+				result = audit_comparator(ctx->return_code, f->op, f->val);
 			break;
 		case AUDIT_SUCCESS:
 			if (ctx && ctx->return_valid) {
-				if (value)
-					result = audit_comparator(ctx->return_valid, op, AUDITSC_SUCCESS);
+				if (f->val)
+					result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
 				else
-					result = audit_comparator(ctx->return_valid, op, AUDITSC_FAILURE);
+					result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
 			}
 			break;
 		case AUDIT_DEVMAJOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(MAJOR(ctx->names[j].dev),	op, value)) {
+					if (audit_comparator(MAJOR(ctx->names[j].dev),	f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -235,7 +233,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_DEVMINOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(MINOR(ctx->names[j].dev), op, value)) {
+					if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -245,8 +243,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(ctx->names[j].ino, op, value) ||
-					    audit_comparator(ctx->names[j].pino, op, value)) {
+					if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
+					    audit_comparator(ctx->names[j].pino, f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -256,14 +254,14 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_LOGINUID:
 			result = 0;
 			if (ctx)
-				result = audit_comparator(ctx->loginuid, op, value);
+				result = audit_comparator(ctx->loginuid, f->op, f->val);
 			break;
 		case AUDIT_ARG0:
 		case AUDIT_ARG1:
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
 			if (ctx)
-				result = audit_comparator(ctx->argv[field-AUDIT_ARG0], op, value);
+				result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
 			break;
 		}
 
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index d7c0e912c5f3..73158244cf8c 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -99,6 +99,9 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
 	{ AUDIT_LIST,		NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
 	{ AUDIT_ADD,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_DEL,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
+	{ AUDIT_LIST_RULES,	NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
+	{ AUDIT_ADD_RULE,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
+	{ AUDIT_DEL_RULE,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_USER,		NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 	{ AUDIT_SIGNAL_INFO,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
 };
-- 
cgit v1.2.3


From 5d3301088f7e412992d9e61cc3604cbdff3090ff Mon Sep 17 00:00:00 2001
From: Steve Grubb <viro@zeniv.linux.org.uk>
Date: Mon, 9 Jan 2006 09:48:17 -0500
Subject: [PATCH] add/remove rule update

Hi,

The following patch adds a little more information to the add/remove rule message emitted
by the kernel.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  2 +-
 kernel/auditfilter.c  | 16 +++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8a3b98175c25..d760430c8de3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -240,7 +240,7 @@ struct audit_rule_data {
 	__u32		flags;	/* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */
 	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
 	__u32		field_count;
-	__u32		mask[AUDIT_BITMASK_SIZE];
+	__u32		mask[AUDIT_BITMASK_SIZE]; /* syscall(s) affected */
 	__u32		fields[AUDIT_MAX_FIELDS];
 	__u32		values[AUDIT_MAX_FIELDS];
 	__u32		fieldflags[AUDIT_MAX_FIELDS];
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 686d514a3518..35f8fa82bb8b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -487,10 +487,11 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_add_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u added an audit rule\n", loginuid);
-		else
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			"auid=%u add rule to list=%d res=%d\n",
+			loginuid, entry->rule.listnr, !err);
+
+		if (err)
 			audit_free_rule(entry);
 		break;
 	case AUDIT_DEL:
@@ -504,9 +505,10 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_del_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u removed an audit rule\n", loginuid);
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			"auid=%u remove rule from list=%d res=%d\n",
+			loginuid, entry->rule.listnr, !err);
+
 		audit_free_rule(entry);
 		break;
 	default:
-- 
cgit v1.2.3


From 5bdb98868062c1b14025883049551af343233187 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Sat, 3 Dec 2005 08:39:35 -0500
Subject: [PATCH] promiscuous mode

Hi,

When a network interface goes into promiscuous mode, its an important security
issue. The attached patch is intended to capture that action and send an
event to the audit system.

The patch carves out a new block of numbers for kernel detected anomalies.
These are events that may indicate suspicious activity. Other examples of
potential kernel anomalies would be: exceeding disk quota, rlimit violations,
changes to syscall entry table.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h | 7 ++++++-
 net/core/dev.c        | 7 +++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index d760430c8de3..1c47c59058c1 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -35,7 +35,8 @@
  * 1400 - 1499 SE Linux use
  * 1500 - 1599 kernel LSPP events
  * 1600 - 1699 kernel crypto events
- * 1700 - 1999 future kernel use (maybe integrity labels and related events)
+ * 1700 - 1799 kernel anomaly records
+ * 1800 - 1999 future kernel use (maybe integrity labels and related events)
  * 2000 is for otherwise unclassified kernel audit messages (legacy)
  * 2001 - 2099 unused (kernel)
  * 2100 - 2199 user space anomaly records
@@ -90,6 +91,10 @@
 #define AUDIT_MAC_STATUS	1404	/* Changed enforcing,permissive,off */
 #define AUDIT_MAC_CONFIG_CHANGE	1405	/* Changes to booleans */
 
+#define AUDIT_FIRST_KERN_ANOM_MSG   1700
+#define AUDIT_LAST_KERN_ANOM_MSG    1799
+#define AUDIT_ANOM_PROMISCUOUS      1700 /* Device changed promiscuous mode */
+
 #define AUDIT_KERNEL		2000	/* Asynchronous audit record. NOT A REQUEST. */
 
 /* Rule flags */
diff --git a/net/core/dev.c b/net/core/dev.c
index 2afb0de95329..e9f84a66ce81 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include <net/iw_handler.h>
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
+#include <linux/audit.h>
 
 /*
  *	The list of packet types we will receive (as opposed to discard)
@@ -2120,6 +2121,12 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
 		printk(KERN_INFO "device %s %s promiscuous mode\n",
 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
 		       					       "left");
+		audit_log(current->audit_context, GFP_ATOMIC,
+			AUDIT_ANOM_PROMISCUOUS,
+			"dev=%s prom=%d old_prom=%d auid=%u",
+			dev->name, (dev->flags & IFF_PROMISC),
+			(old_flags & IFF_PROMISC),
+			audit_get_loginuid(current->audit_context)); 
 	}
 }
 
-- 
cgit v1.2.3


From d358788f3f30113e49882187d794832905e42592 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Mon, 20 Mar 2006 20:00:09 +0000
Subject: [SERIAL] kernel console should send CRLF not LFCR

Glen Turner reported that writing LFCR rather than the more
traditional CRLF causes issues with some terminals.

Since this aflicts many serial drivers, extract the common code
to a library function (uart_console_write) and arrange for each
driver to supply a "putchar" function.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/21285.c          | 19 +++++++------------
 drivers/serial/8250.c           | 26 +++++++++-----------------
 drivers/serial/8250_early.c     |  9 ++-------
 drivers/serial/amba-pl010.c     | 24 ++++++++----------------
 drivers/serial/amba-pl011.c     | 20 +++++---------------
 drivers/serial/at91_serial.c    | 24 ++++++++----------------
 drivers/serial/au1x00_uart.c    | 26 +++++++++-----------------
 drivers/serial/clps711x.c       | 24 ++++++++----------------
 drivers/serial/dz.c             | 12 +++++-------
 drivers/serial/imx.c            | 26 +++++++++-----------------
 drivers/serial/ip22zilog.c      | 11 +++--------
 drivers/serial/m32r_sio.c       | 26 +++++++++-----------------
 drivers/serial/mpc52xx_uart.c   | 11 +++++------
 drivers/serial/pmac_zilog.c     | 23 +++++++++++------------
 drivers/serial/pxa.c            | 26 +++++++++-----------------
 drivers/serial/s3c2410.c        | 26 ++++++++++----------------
 drivers/serial/sa1100.c         | 26 ++++++++++----------------
 drivers/serial/serial_core.c    | 21 +++++++++++++++++++++
 drivers/serial/serial_lh7a40x.c | 17 +++++++----------
 drivers/serial/serial_txx9.c    | 26 +++++++++-----------------
 drivers/serial/sunsab.c         | 10 +++-------
 drivers/serial/sunsu.c          | 26 +++++++++-----------------
 drivers/serial/sunzilog.c       | 13 ++++---------
 drivers/serial/vr41xx_siu.c     | 16 +++++++---------
 include/linux/serial_core.h     |  3 +++
 25 files changed, 190 insertions(+), 301 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/21285.c b/drivers/serial/21285.c
index 8c5c276c5577..7572665a8855 100644
--- a/drivers/serial/21285.c
+++ b/drivers/serial/21285.c
@@ -375,23 +375,18 @@ static void serial21285_setup_ports(void)
 }
 
 #ifdef CONFIG_SERIAL_21285_CONSOLE
+static void serial21285_console_putchar(struct uart_port *port, int ch)
+{
+	while (*CSR_UARTFLG & 0x20)
+		barrier();
+	*CSR_UARTDR = ch;
+}
 
 static void
 serial21285_console_write(struct console *co, const char *s,
 			  unsigned int count)
 {
-	int i;
-
-	for (i = 0; i < count; i++) {
-		while (*CSR_UARTFLG & 0x20)
-			barrier();
-		*CSR_UARTDR = s[i];
-		if (s[i] == '\n') {
-			while (*CSR_UARTFLG & 0x20)
-				barrier();
-			*CSR_UARTDR = '\r';
-		}
-	}
+	uart_console_write(&serial21285_port, s, count, serial21285_console_putchar);
 }
 
 static void __init
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 7aca22c9976d..5996d3cd0ed8 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -2182,6 +2182,14 @@ static inline void wait_for_xmitr(struct uart_8250_port *up, int bits)
 	}
 }
 
+static void serial8250_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_8250_port *up = (struct uart_8250_port *)port;
+
+	wait_for_xmitr(up, UART_LSR_THRE);
+	serial_out(up, UART_TX, ch);
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -2193,7 +2201,6 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct uart_8250_port *up = &serial8250_ports[co->index];
 	unsigned int ier;
-	int i;
 
 	touch_nmi_watchdog();
 
@@ -2207,22 +2214,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
 	else
 		serial_out(up, UART_IER, 0);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++, s++) {
-		wait_for_xmitr(up, UART_LSR_THRE);
-
-		/*
-		 *	Send the character out.
-		 *	If a LF, also do CR...
-		 */
-		serial_out(up, UART_TX, *s);
-		if (*s == 10) {
-			wait_for_xmitr(up, UART_LSR_THRE);
-			serial_out(up, UART_TX, 13);
-		}
-	}
+	uart_console_write(&up->port, s, count, serial8250_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/8250_early.c b/drivers/serial/8250_early.c
index 59ba5d993b4b..7e511199b4c5 100644
--- a/drivers/serial/8250_early.c
+++ b/drivers/serial/8250_early.c
@@ -74,7 +74,7 @@ static void __init wait_for_xmitr(struct uart_port *port)
 	}
 }
 
-static void __init putc(struct uart_port *port, unsigned char c)
+static void __init putc(struct uart_port *port, int c)
 {
 	wait_for_xmitr(port);
 	serial_out(port, UART_TX, c);
@@ -89,12 +89,7 @@ static void __init early_uart_write(struct console *console, const char *s, unsi
 	ier = serial_in(port, UART_IER);
 	serial_out(port, UART_IER, 0);
 
-	while (*s && count-- > 0) {
-		putc(port, *s);
-		if (*s == '\n')
-			putc(port, '\r');
-		s++;
-	}
+	uart_console_write(port, s, count, putc);
 
 	/* Wait for transmitter to become empty and restore the IER */
 	wait_for_xmitr(port);
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index 321a3b3a5728..e04d5e82d9ae 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -591,12 +591,18 @@ static struct uart_amba_port amba_ports[UART_NR] = {
 
 #ifdef CONFIG_SERIAL_AMBA_PL010_CONSOLE
 
+static void pl010_console_putchar(struct uart_port *port, int ch)
+{
+	while (!UART_TX_READY(UART_GET_FR(port)))
+		barrier();
+	UART_PUT_CHAR(port, ch);
+}
+
 static void
 pl010_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct uart_port *port = &amba_ports[co->index].port;
 	unsigned int status, old_cr;
-	int i;
 
 	/*
 	 *	First save the CR then disable the interrupts
@@ -604,21 +610,7 @@ pl010_console_write(struct console *co, const char *s, unsigned int count)
 	old_cr = UART_GET_CR(port);
 	UART_PUT_CR(port, UART01x_CR_UARTEN);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++) {
-		do {
-			status = UART_GET_FR(port);
-		} while (!UART_TX_READY(status));
-		UART_PUT_CHAR(port, s[i]);
-		if (s[i] == '\n') {
-			do {
-				status = UART_GET_FR(port);
-			} while (!UART_TX_READY(status));
-			UART_PUT_CHAR(port, '\r');
-		}
-	}
+	uart_console_write(port, s, count, pl010_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/amba-pl011.c b/drivers/serial/amba-pl011.c
index 034a029e356e..3d966cfc9a38 100644
--- a/drivers/serial/amba-pl011.c
+++ b/drivers/serial/amba-pl011.c
@@ -587,14 +587,12 @@ static struct uart_amba_port *amba_ports[UART_NR];
 
 #ifdef CONFIG_SERIAL_AMBA_PL011_CONSOLE
 
-static inline void
-pl011_console_write_char(struct uart_amba_port *uap, char ch)
+static void pl011_console_putchar(struct uart_port *port, int ch)
 {
-	unsigned int status;
+	struct uart_amba_port *uap = (struct uart_amba_port *)port;
 
-	do {
-		status = readw(uap->port.membase + UART01x_FR);
-	} while (status & UART01x_FR_TXFF);
+	while (readw(uap->port.membase + UART01x_FR) & UART01x_FR_TXFF)
+		barrier();
 	writew(ch, uap->port.membase + UART01x_DR);
 }
 
@@ -603,7 +601,6 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct uart_amba_port *uap = amba_ports[co->index];
 	unsigned int status, old_cr, new_cr;
-	int i;
 
 	clk_enable(uap->clk);
 
@@ -615,14 +612,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
 	new_cr |= UART01x_CR_UARTEN | UART011_CR_TXE;
 	writew(new_cr, uap->port.membase + UART011_CR);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++) {
-		pl011_console_write_char(uap, s[i]);
-		if (s[i] == '\n')
-			pl011_console_write_char(uap, '\r');
-	}
+	uart_console_write(&uap->port, s, count, pl011_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/at91_serial.c b/drivers/serial/at91_serial.c
index 2113feb75c39..6547fe0cef96 100644
--- a/drivers/serial/at91_serial.c
+++ b/drivers/serial/at91_serial.c
@@ -711,6 +711,12 @@ void __init at91_register_uart(int idx, int port)
 }
 
 #ifdef CONFIG_SERIAL_AT91_CONSOLE
+static void at91_console_putchar(struct uart_port *port, int ch)
+{
+	while (!(UART_GET_CSR(port) & AT91_US_TXRDY))
+		barrier();
+	UART_PUT_CHAR(port, ch);
+}
 
 /*
  * Interrupts are disabled on entering
@@ -718,7 +724,7 @@ void __init at91_register_uart(int idx, int port)
 static void at91_console_write(struct console *co, const char *s, u_int count)
 {
 	struct uart_port *port = at91_ports + co->index;
-	unsigned int status, i, imr;
+	unsigned int status, imr;
 
 	/*
 	 *	First, save IMR and then disable interrupts
@@ -726,21 +732,7 @@ static void at91_console_write(struct console *co, const char *s, u_int count)
 	imr = UART_GET_IMR(port);	/* get interrupt mask */
 	UART_PUT_IDR(port, AT91_US_RXRDY | AT91_US_TXRDY);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++) {
-		do {
-			status = UART_GET_CSR(port);
-		} while (!(status & AT91_US_TXRDY));
-		UART_PUT_CHAR(port, s[i]);
-		if (s[i] == '\n') {
-			do {
-				status = UART_GET_CSR(port);
-			} while (!(status & AT91_US_TXRDY));
-			UART_PUT_CHAR(port, '\r');
-		}
-	}
+	uart_console_write(port, s, count, at91_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/au1x00_uart.c b/drivers/serial/au1x00_uart.c
index 344022fe53ef..29f94bbb79be 100644
--- a/drivers/serial/au1x00_uart.c
+++ b/drivers/serial/au1x00_uart.c
@@ -1121,6 +1121,14 @@ static inline void wait_for_xmitr(struct uart_8250_port *up)
 	}
 }
 
+static void au1x00_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_8250_port *up = (struct uart_8250_port *)port;
+
+	wait_for_xmitr(up);
+	serial_out(up, UART_TX, ch);
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -1132,7 +1140,6 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct uart_8250_port *up = &serial8250_ports[co->index];
 	unsigned int ier;
-	int i;
 
 	/*
 	 *	First save the UER then disable the interrupts
@@ -1140,22 +1147,7 @@ serial8250_console_write(struct console *co, const char *s, unsigned int count)
 	ier = serial_in(up, UART_IER);
 	serial_out(up, UART_IER, 0);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++, s++) {
-		wait_for_xmitr(up);
-
-		/*
-		 *	Send the character out.
-		 *	If a LF, also do CR...
-		 */
-		serial_out(up, UART_TX, *s);
-		if (*s == 10) {
-			wait_for_xmitr(up);
-			serial_out(up, UART_TX, 13);
-		}
-	}
+	uart_console_write(&up->port, s, count, au1x00_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/clps711x.c b/drivers/serial/clps711x.c
index ce7b2e4ecd17..2691112c84ad 100644
--- a/drivers/serial/clps711x.c
+++ b/drivers/serial/clps711x.c
@@ -424,6 +424,13 @@ static struct uart_port clps711x_ports[UART_NR] = {
 };
 
 #ifdef CONFIG_SERIAL_CLPS711X_CONSOLE
+static void clps711xuart_console_putchar(struct uart_port *port, int ch)
+{
+	while (clps_readl(SYSFLG(port)) & SYSFLG_UTXFF)
+		barrier();
+	clps_writel(ch, UARTDR(port));
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -438,7 +445,6 @@ clps711xuart_console_write(struct console *co, const char *s,
 {
 	struct uart_port *port = clps711x_ports + co->index;
 	unsigned int status, syscon;
-	int i;
 
 	/*
 	 *	Ensure that the port is enabled.
@@ -446,21 +452,7 @@ clps711xuart_console_write(struct console *co, const char *s,
 	syscon = clps_readl(SYSCON(port));
 	clps_writel(syscon | SYSCON_UARTEN, SYSCON(port));
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++) {
-		do {
-			status = clps_readl(SYSFLG(port));
-		} while (status & SYSFLG_UTXFF);
-		clps_writel(s[i], UARTDR(port));
-		if (s[i] == '\n') {
-			do {
-				status = clps_readl(SYSFLG(port));
-			} while (status & SYSFLG_UTXFF);
-			clps_writel('\r', UARTDR(port));
-		}
-	}
+	uart_console_write(port, s, count, clps711xuart_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/dz.c b/drivers/serial/dz.c
index ba5541de673b..bf71bad5c34f 100644
--- a/drivers/serial/dz.c
+++ b/drivers/serial/dz.c
@@ -674,11 +674,12 @@ static void dz_reset(struct dz_port *dport)
 }
 
 #ifdef CONFIG_SERIAL_DZ_CONSOLE
-static void dz_console_put_char(struct dz_port *dport, unsigned char ch)
+static void dz_console_putchar(struct uart_port *port, int ch)
 {
+	struct dz_port *dport = (struct dz_port *)uport;
 	unsigned long flags;
 	int loops = 2500;
-	unsigned short tmp = ch;
+	unsigned short tmp = (unsigned char)ch;
 	/* this code sends stuff out to serial device - spinning its
 	   wheels and waiting. */
 
@@ -694,6 +695,7 @@ static void dz_console_put_char(struct dz_port *dport, unsigned char ch)
 
 	spin_unlock_irqrestore(&dport->port.lock, flags);
 }
+
 /*
  * -------------------------------------------------------------------
  * dz_console_print ()
@@ -710,11 +712,7 @@ static void dz_console_print(struct console *cons,
 #ifdef DEBUG_DZ
 	prom_printf((char *) str);
 #endif
-	while (count--) {
-		if (*str == '\n')
-			dz_console_put_char(dport, '\r');
-		dz_console_put_char(dport, *str++);
-	}
+	uart_console_write(&dport->port, str, count, dz_console_putchar);
 }
 
 static int __init dz_console_setup(struct console *co, char *options)
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 4d53fb5ca87b..c3b7a6673e9c 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -743,6 +743,13 @@ static void __init imx_init_ports(void)
 }
 
 #ifdef CONFIG_SERIAL_IMX_CONSOLE
+static void imx_console_putchar(struct uart_port *port, int ch)
+{
+	struct imx_port *sport = (struct imx_port *)port;
+	while ((UTS((u32)sport->port.membase) & UTS_TXFULL))
+		barrier();
+	URTX0((u32)sport->port.membase) = ch;
+}
 
 /*
  * Interrupts are disabled on entering
@@ -751,7 +758,7 @@ static void
 imx_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct imx_port *sport = &imx_ports[co->index];
-	unsigned int old_ucr1, old_ucr2, i;
+	unsigned int old_ucr1, old_ucr2;
 
 	/*
 	 *	First, save UCR1/2 and then disable interrupts
@@ -764,22 +771,7 @@ imx_console_write(struct console *co, const char *s, unsigned int count)
 	                   & ~(UCR1_TXMPTYEN | UCR1_RRDYEN | UCR1_RTSDEN);
 	UCR2((u32)sport->port.membase) = old_ucr2 | UCR2_TXEN;
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++) {
-
-		while ((UTS((u32)sport->port.membase) & UTS_TXFULL))
-			barrier();
-
-		URTX0((u32)sport->port.membase) = s[i];
-
-		if (s[i] == '\n') {
-			while ((UTS((u32)sport->port.membase) & UTS_TXFULL))
-				barrier();
-			URTX0((u32)sport->port.membase) = '\r';
-		}
-	}
+	uart_console_write(&sport->port, s, count, imx_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/ip22zilog.c b/drivers/serial/ip22zilog.c
index 193722d680cf..651772474ac1 100644
--- a/drivers/serial/ip22zilog.c
+++ b/drivers/serial/ip22zilog.c
@@ -967,8 +967,9 @@ static struct zilog_layout * __init get_zs(int chip)
 #define ZS_PUT_CHAR_MAX_DELAY	2000	/* 10 ms */
 
 #ifdef CONFIG_SERIAL_IP22_ZILOG_CONSOLE
-static void ip22zilog_put_char(struct zilog_channel *channel, unsigned char ch)
+static void ip22zilog_put_char(struct uart_port *port, int ch)
 {
+	struct zilog_channel *channel = ZILOG_CHANNEL_FROM_PORT(port);
 	int loops = ZS_PUT_CHAR_MAX_DELAY;
 
 	/* This is a timed polling loop so do not switch the explicit
@@ -992,16 +993,10 @@ static void
 ip22zilog_console_write(struct console *con, const char *s, unsigned int count)
 {
 	struct uart_ip22zilog_port *up = &ip22zilog_port_table[con->index];
-	struct zilog_channel *channel = ZILOG_CHANNEL_FROM_PORT(&up->port);
 	unsigned long flags;
-	int i;
 
 	spin_lock_irqsave(&up->port.lock, flags);
-	for (i = 0; i < count; i++, s++) {
-		ip22zilog_put_char(channel, *s);
-		if (*s == 10)
-			ip22zilog_put_char(channel, 13);
-	}
+	uart_console_write(&up->port, s, count, ip22zilog_put_char);
 	udelay(2);
 	spin_unlock_irqrestore(&up->port.lock, flags);
 }
diff --git a/drivers/serial/m32r_sio.c b/drivers/serial/m32r_sio.c
index 242a04104393..876bc5e027bb 100644
--- a/drivers/serial/m32r_sio.c
+++ b/drivers/serial/m32r_sio.c
@@ -1039,6 +1039,14 @@ static inline void wait_for_xmitr(struct uart_sio_port *up)
 	}
 }
 
+static void m32r_sio_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_sio_port *up = (struct uart_sio_port *)port;
+
+	wait_for_xmitr(up);
+	sio_out(up, SIOTXB, ch);
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -1058,23 +1066,7 @@ static void m32r_sio_console_write(struct console *co, const char *s,
 	ier = sio_in(up, SIOTRCR);
 	sio_out(up, SIOTRCR, 0);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++, s++) {
-		wait_for_xmitr(up);
-
-		/*
-		 *	Send the character out.
-		 *	If a LF, also do CR...
-		 */
-		sio_out(up, SIOTXB, *s);
-
-		if (*s == 10) {
-			wait_for_xmitr(up);
-			sio_out(up, SIOTXB, 13);
-		}
-	}
+	uart_console_write(&up->port, s, count, m32r_sio_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index 61dd17d7bace..928e6cf12dca 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -603,15 +603,14 @@ mpc52xx_console_write(struct console *co, const char *s, unsigned int count)
 		udelay(1);
 
 	/* Write all the chars */
-	for ( i=0 ; i<count ; i++ ) {
-	
-		/* Send the char */
-		out_8(&psc->mpc52xx_psc_buffer_8, *s);
-
+	for (i = 0; i < count; i++, s++) {
 		/* Line return handling */
-		if ( *s++ == '\n' )
+		if (*s == '\n')
 			out_8(&psc->mpc52xx_psc_buffer_8, '\r');
 		
+		/* Send the char */
+		out_8(&psc->mpc52xx_psc_buffer_8, *s);
+
 		/* Wait the TX buffer to be empty */
 		j = 20000;	/* Maximum wait */	
 		while (!(in_be16(&psc->mpc52xx_psc_status) & 
diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index 9b7ed58cb53b..513ff8597707 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -1916,6 +1916,16 @@ static void __exit exit_pmz(void)
 
 #ifdef CONFIG_SERIAL_PMACZILOG_CONSOLE
 
+static void pmz_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_pmac_port *uap = (struct uart_pmac_port *)port;
+
+	/* Wait for the transmit buffer to empty. */
+	while ((read_zsreg(uap, R0) & Tx_BUF_EMP) == 0)
+		udelay(5);
+	write_zsdata(uap, ch);
+}
+
 /*
  * Print a string to the serial port trying not to disturb
  * any possible real use of the port...
@@ -1924,7 +1934,6 @@ static void pmz_console_write(struct console *con, const char *s, unsigned int c
 {
 	struct uart_pmac_port *uap = &pmz_ports[con->index];
 	unsigned long flags;
-	int i;
 
 	if (ZS_IS_ASLEEP(uap))
 		return;
@@ -1934,17 +1943,7 @@ static void pmz_console_write(struct console *con, const char *s, unsigned int c
 	write_zsreg(uap, R1, uap->curregs[1] & ~TxINT_ENAB);
 	write_zsreg(uap, R5, uap->curregs[5] | TxENABLE | RTS | DTR);
 
-	for (i = 0; i < count; i++) {
-		/* Wait for the transmit buffer to empty. */
-		while ((read_zsreg(uap, R0) & Tx_BUF_EMP) == 0)
-			udelay(5);
-		write_zsdata(uap, s[i]);
-		if (s[i] == 10) {
-			while ((read_zsreg(uap, R0) & Tx_BUF_EMP) == 0)
-				udelay(5);
-			write_zsdata(uap, R13);
-		}
-	}
+	uart_console_write(&uap->port, s, count, pmz_console_putchar);
 
 	/* Restore the values in the registers. */
 	write_zsreg(uap, R1, uap->curregs[1]);
diff --git a/drivers/serial/pxa.c b/drivers/serial/pxa.c
index 10535f00301f..77d4568ccc3a 100644
--- a/drivers/serial/pxa.c
+++ b/drivers/serial/pxa.c
@@ -619,6 +619,14 @@ static inline void wait_for_xmitr(struct uart_pxa_port *up)
 	}
 }
 
+static void serial_pxa_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_pxa_port *up = (struct uart_pxa_port *)port;
+
+	wait_for_xmitr(up);
+	serial_out(up, UART_TX, ch);
+}
+
 /*
  * Print a string to the serial port trying not to disturb
  * any possible real use of the port...
@@ -630,7 +638,6 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct uart_pxa_port *up = &serial_pxa_ports[co->index];
 	unsigned int ier;
-	int i;
 
 	/*
 	 *	First save the IER then disable the interrupts
@@ -638,22 +645,7 @@ serial_pxa_console_write(struct console *co, const char *s, unsigned int count)
 	ier = serial_in(up, UART_IER);
 	serial_out(up, UART_IER, UART_IER_UUE);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++, s++) {
-		wait_for_xmitr(up);
-
-		/*
-		 *	Send the character out.
-		 *	If a LF, also do CR...
-		 */
-		serial_out(up, UART_TX, *s);
-		if (*s == 10) {
-			wait_for_xmitr(up);
-			serial_out(up, UART_TX, 13);
-		}
-	}
+	uart_console_write(&up->port, s, count, serial_pxa_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/s3c2410.c b/drivers/serial/s3c2410.c
index 7410e093a6b9..e4d701239702 100644
--- a/drivers/serial/s3c2410.c
+++ b/drivers/serial/s3c2410.c
@@ -1584,25 +1584,19 @@ s3c24xx_serial_console_txrdy(struct uart_port *port, unsigned int ufcon)
 }
 
 static void
-s3c24xx_serial_console_write(struct console *co, const char *s,
-			     unsigned int count)
+s3c24xx_serial_console_putchar(struct uart_port *port, int ch)
 {
-	int i;
 	unsigned int ufcon = rd_regl(cons_uart, S3C2410_UFCON);
+	while (!s3c24xx_serial_console_txrdy(port, ufcon))
+		barrier();
+	wr_regb(cons_uart, S3C2410_UTXH, ch);
+}
 
-	for (i = 0; i < count; i++) {
-		while (!s3c24xx_serial_console_txrdy(cons_uart, ufcon))
-			barrier();
-
-		wr_regb(cons_uart, S3C2410_UTXH, s[i]);
-
-		if (s[i] == '\n') {
-			while (!s3c24xx_serial_console_txrdy(cons_uart, ufcon))
-				barrier();
-
-			wr_regb(cons_uart, S3C2410_UTXH, '\r');
-		}
-	}
+static void
+s3c24xx_serial_console_write(struct console *co, const char *s,
+			     unsigned int count)
+{
+	uart_console_write(cons_uart, s, count, s3c24xx_serial_console_putchar);
 }
 
 static void __init
diff --git a/drivers/serial/sa1100.c b/drivers/serial/sa1100.c
index 2c00b8625852..c2d9068b491d 100644
--- a/drivers/serial/sa1100.c
+++ b/drivers/serial/sa1100.c
@@ -689,6 +689,14 @@ void __init sa1100_register_uart(int idx, int port)
 
 
 #ifdef CONFIG_SERIAL_SA1100_CONSOLE
+static void sa1100_console_putchar(struct uart_port *port, int ch)
+{
+	struct sa1100_port *sport = (struct sa1100_port *)port;
+
+	while (!(UART_GET_UTSR1(sport) & UTSR1_TNF))
+		barrier();
+	UART_PUT_CHAR(sport, ch);
+}
 
 /*
  * Interrupts are disabled on entering
@@ -697,7 +705,7 @@ static void
 sa1100_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct sa1100_port *sport = &sa1100_ports[co->index];
-	unsigned int old_utcr3, status, i;
+	unsigned int old_utcr3, status;
 
 	/*
 	 *	First, save UTCR3 and then disable interrupts
@@ -706,21 +714,7 @@ sa1100_console_write(struct console *co, const char *s, unsigned int count)
 	UART_PUT_UTCR3(sport, (old_utcr3 & ~(UTCR3_RIE | UTCR3_TIE)) |
 				UTCR3_TXE);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++) {
-		do {
-			status = UART_GET_UTSR1(sport);
-		} while (!(status & UTSR1_TNF));
-		UART_PUT_CHAR(sport, s[i]);
-		if (s[i] == '\n') {
-			do {
-				status = UART_GET_UTSR1(sport);
-			} while (!(status & UTSR1_TNF));
-			UART_PUT_CHAR(sport, '\r');
-		}
-	}
+	uart_console_write(&sport->port, s, count, sa1100_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index cc1faa31d124..fcd7744c4253 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1754,6 +1754,27 @@ static int uart_read_proc(char *page, char **start, off_t off,
 #endif
 
 #ifdef CONFIG_SERIAL_CORE_CONSOLE
+/*
+ *	uart_console_write - write a console message to a serial port
+ *	@port: the port to write the message
+ *	@s: array of characters
+ *	@count: number of characters in string to write
+ *	@write: function to write character to port
+ */
+void uart_console_write(struct uart_port *port, const char *s,
+			unsigned int count,
+			void (*putchar)(struct uart_port *, int))
+{
+	unsigned int i;
+
+	for (i = 0; i < count; i++, s++) {
+		if (*s == '\n')
+			putchar(port, '\r');
+		putchar(port, *s);
+	}
+}
+EXPORT_SYMBOL_GPL(uart_console_write);
+
 /*
  *	Check whether an invalid uart number has been specified, and
  *	if so, search for the first available port that does have
diff --git a/drivers/serial/serial_lh7a40x.c b/drivers/serial/serial_lh7a40x.c
index 04186eaae227..aa521b8e0d4e 100644
--- a/drivers/serial/serial_lh7a40x.c
+++ b/drivers/serial/serial_lh7a40x.c
@@ -543,6 +543,12 @@ static struct uart_port_lh7a40x lh7a40x_ports[DEV_NR] = {
 #else
 # define LH7A40X_CONSOLE &lh7a40x_console
 
+static void lh7a40xuart_console_putchar(struct uart_port *port, int ch)
+{
+	while (UR(port, UART_R_STATUS) & nTxRdy)
+		;
+	UR(port, UART_R_DATA) = ch;
+}
 
 static void lh7a40xuart_console_write (struct console* co,
 				       const char* s,
@@ -556,16 +562,7 @@ static void lh7a40xuart_console_write (struct console* co,
 	UR (port, UART_R_INTEN) = 0;		/* Disable all interrupts */
 	BIT_SET (port, UART_R_CON, UARTEN | SIRDIS); /* Enable UART */
 
-	for (; count-- > 0; ++s) {
-		while (UR (port, UART_R_STATUS) & nTxRdy)
-			;
-		UR (port, UART_R_DATA) = *s;
-		if (*s == '\n') {
-			while ((UR (port, UART_R_STATUS) & TxBusy))
-				;
-			UR (port, UART_R_DATA) = '\r';
-		}
-	}
+	uart_console_write(port, s, count, lh7a40xuart_console_putchar);
 
 				/* Wait until all characters are sent */
 	while (UR (port, UART_R_STATUS) & TxBusy)
diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c
index ee98a867bc6d..1a259cee1a98 100644
--- a/drivers/serial/serial_txx9.c
+++ b/drivers/serial/serial_txx9.c
@@ -854,6 +854,14 @@ static inline void wait_for_xmitr(struct uart_txx9_port *up)
 	}
 }
 
+static void serial_txx9_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_txx9_port *up = (struct uart_txx9_port *)port;
+
+	wait_for_xmitr(up);
+	sio_out(up, TXX9_SITFIFO, ch);
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -865,7 +873,6 @@ serial_txx9_console_write(struct console *co, const char *s, unsigned int count)
 {
 	struct uart_txx9_port *up = &serial_txx9_ports[co->index];
 	unsigned int ier, flcr;
-	int i;
 
 	/*
 	 *	First save the UER then disable the interrupts
@@ -879,22 +886,7 @@ serial_txx9_console_write(struct console *co, const char *s, unsigned int count)
 	if (!(up->port.flags & UPF_CONS_FLOW) && (flcr & TXX9_SIFLCR_TES))
 		sio_out(up, TXX9_SIFLCR, flcr & ~TXX9_SIFLCR_TES);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++, s++) {
-		wait_for_xmitr(up);
-
-		/*
-		 *	Send the character out.
-		 *	If a LF, also do CR...
-		 */
-		sio_out(up, TXX9_SITFIFO, *s);
-		if (*s == 10) {
-			wait_for_xmitr(up);
-			sio_out(up, TXX9_SITFIFO, 13);
-		}
-	}
+	uart_console_write(&up->port, s, count, serial_txx9_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/sunsab.c b/drivers/serial/sunsab.c
index 85664228a0b6..be95eabd0394 100644
--- a/drivers/serial/sunsab.c
+++ b/drivers/serial/sunsab.c
@@ -861,8 +861,9 @@ static int num_channels;
 
 #ifdef CONFIG_SERIAL_SUNSAB_CONSOLE
 
-static __inline__ void sunsab_console_putchar(struct uart_sunsab_port *up, char c)
+static void sunsab_console_putchar(struct uart_port *port, int c)
 {
+	struct uart_sunsab_port *up = (struct uart_sunsab_port *)port;
 	unsigned long flags;
 
 	spin_lock_irqsave(&up->port.lock, flags);
@@ -876,13 +877,8 @@ static __inline__ void sunsab_console_putchar(struct uart_sunsab_port *up, char
 static void sunsab_console_write(struct console *con, const char *s, unsigned n)
 {
 	struct uart_sunsab_port *up = &sunsab_ports[con->index];
-	int i;
 
-	for (i = 0; i < n; i++) {
-		if (*s == '\n')
-			sunsab_console_putchar(up, '\r');
-		sunsab_console_putchar(up, *s++);
-	}
+	uart_console_write(&up->port, s, n, sunsab_console_putchar);
 	sunsab_tec_wait(up);
 }
 
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 4e453fa966ae..9ca1d8b9364b 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -1376,6 +1376,14 @@ static __inline__ void wait_for_xmitr(struct uart_sunsu_port *up)
 	}
 }
 
+static void sunsu_console_putchar(struct uart_port *port, int ch)
+{
+	struct uart_sunsu_port *up = (struct uart_sunsu_port *)port;
+
+	wait_for_xmitr(up);
+	serial_out(up, UART_TX, ch);
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -1385,7 +1393,6 @@ static void sunsu_console_write(struct console *co, const char *s,
 {
 	struct uart_sunsu_port *up = &sunsu_ports[co->index];
 	unsigned int ier;
-	int i;
 
 	/*
 	 *	First save the UER then disable the interrupts
@@ -1393,22 +1400,7 @@ static void sunsu_console_write(struct console *co, const char *s,
 	ier = serial_in(up, UART_IER);
 	serial_out(up, UART_IER, 0);
 
-	/*
-	 *	Now, do each character
-	 */
-	for (i = 0; i < count; i++, s++) {
-		wait_for_xmitr(up);
-
-		/*
-		 *	Send the character out.
-		 *	If a LF, also do CR...
-		 */
-		serial_out(up, UART_TX, *s);
-		if (*s == 10) {
-			wait_for_xmitr(up);
-			serial_out(up, UART_TX, 13);
-		}
-	}
+	uart_console_write(&up->port, s, count, sunsu_console_putchar);
 
 	/*
 	 *	Finally, wait for transmitter to become empty
diff --git a/drivers/serial/sunzilog.c b/drivers/serial/sunzilog.c
index 5cc4d4c2935c..b0c46b9a5930 100644
--- a/drivers/serial/sunzilog.c
+++ b/drivers/serial/sunzilog.c
@@ -1252,8 +1252,9 @@ static struct zilog_layout __iomem * __init get_zs(int chip, int node)
 
 #define ZS_PUT_CHAR_MAX_DELAY	2000	/* 10 ms */
 
-static void sunzilog_put_char(struct zilog_channel __iomem *channel, unsigned char ch)
+static void sunzilog_putchar(struct uart_port *port, int ch)
 {
+	struct zilog_channel *channel = ZILOG_CHANNEL_FROM_PORT(port);
 	int loops = ZS_PUT_CHAR_MAX_DELAY;
 
 	/* This is a timed polling loop so do not switch the explicit
@@ -1284,7 +1285,7 @@ static int sunzilog_serio_write(struct serio *serio, unsigned char ch)
 
 	spin_lock_irqsave(&sunzilog_serio_lock, flags);
 
-	sunzilog_put_char(ZILOG_CHANNEL_FROM_PORT(&up->port), ch);
+	sunzilog_putchar(&up->port, ch);
 
 	spin_unlock_irqrestore(&sunzilog_serio_lock, flags);
 
@@ -1325,16 +1326,10 @@ static void
 sunzilog_console_write(struct console *con, const char *s, unsigned int count)
 {
 	struct uart_sunzilog_port *up = &sunzilog_port_table[con->index];
-	struct zilog_channel *channel = ZILOG_CHANNEL_FROM_PORT(&up->port);
 	unsigned long flags;
-	int i;
 
 	spin_lock_irqsave(&up->port.lock, flags);
-	for (i = 0; i < count; i++, s++) {
-		sunzilog_put_char(channel, *s);
-		if (*s == 10)
-			sunzilog_put_char(channel, 13);
-	}
+	uart_console_write(&up->port, s, count, sunzilog_putchar);
 	udelay(2);
 	spin_unlock_irqrestore(&up->port.lock, flags);
 }
diff --git a/drivers/serial/vr41xx_siu.c b/drivers/serial/vr41xx_siu.c
index d61494d185cd..3f88b8e81937 100644
--- a/drivers/serial/vr41xx_siu.c
+++ b/drivers/serial/vr41xx_siu.c
@@ -821,25 +821,23 @@ static void wait_for_xmitr(struct uart_port *port)
 	}
 }
 
+static void siu_console_putchar(struct uart_port *port, int ch)
+{
+	wait_for_xmitr(port);
+	siu_write(port, UART_TX, ch);
+}
+
 static void siu_console_write(struct console *con, const char *s, unsigned count)
 {
 	struct uart_port *port;
 	uint8_t ier;
-	unsigned i;
 
 	port = &siu_uart_ports[con->index];
 
 	ier = siu_read(port, UART_IER);
 	siu_write(port, UART_IER, 0);
 
-	for (i = 0; i < count && *s != '\0'; i++, s++) {
-		wait_for_xmitr(port);
-		siu_write(port, UART_TX, *s);
-		if (*s == '\n') {
-			wait_for_xmitr(port);
-			siu_write(port, UART_TX, '\r');
-		}
-	}
+	uart_console_write(port, s, count, siu_console_putchar);
 
 	wait_for_xmitr(port);
 	siu_write(port, UART_IER, ier);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 4041122dabfc..f7434e5086f5 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -366,6 +366,9 @@ void uart_parse_options(char *options, int *baud, int *parity, int *bits,
 int uart_set_options(struct uart_port *port, struct console *co, int baud,
 		     int parity, int bits, int flow);
 struct tty_driver *uart_console_device(struct console *co, int *index);
+void uart_console_write(struct uart_port *port, const char *s,
+			unsigned int count,
+			void (*putchar)(struct uart_port *, int));
 
 /*
  * Port/driver registration/removal
-- 
cgit v1.2.3


From 58383af629efb07e5a0694e445eda0c65b16e1de Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Mon, 6 Feb 2006 14:12:43 -0800
Subject: [PATCH] kobj_map semaphore to mutex conversion

Convert the kobj_map code to use a mutex instead of a semaphore.  It
converts the single two users as well, genhd.c and char_dev.c.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c            | 31 ++++++++++++++++---------------
 drivers/base/map.c       | 21 +++++++++++----------
 fs/char_dev.c            | 17 +++++++++--------
 include/linux/kobj_map.h |  4 ++--
 4 files changed, 38 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index db57546a709d..64510fd88621 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -15,12 +15,13 @@
 #include <linux/kmod.h>
 #include <linux/kobj_map.h>
 #include <linux/buffer_head.h>
+#include <linux/mutex.h>
 
 #define MAX_PROBE_HASH 255	/* random */
 
 static struct subsystem block_subsys;
 
-static DECLARE_MUTEX(block_subsys_sem);
+static DEFINE_MUTEX(block_subsys_lock);
 
 /*
  * Can be deleted altogether. Later.
@@ -46,7 +47,7 @@ struct blkdev_info {
 /*
  * iterate over a list of blkdev_info structures.  allows
  * the major_names array to be iterated over from outside this file
- * must be called with the block_subsys_sem held
+ * must be called with the block_subsys_lock held
  */
 void *get_next_blkdev(void *dev)
 {
@@ -85,20 +86,20 @@ out:
 
 void *acquire_blkdev_list(void)
 {
-        down(&block_subsys_sem);
+        mutex_lock(&block_subsys_lock);
         return get_next_blkdev(NULL);
 }
 
 void release_blkdev_list(void *dev)
 {
-        up(&block_subsys_sem);
+        mutex_unlock(&block_subsys_lock);
         kfree(dev);
 }
 
 
 /*
  * Count the number of records in the blkdev_list.
- * must be called with the block_subsys_sem held
+ * must be called with the block_subsys_lock held
  */
 int count_blkdev_list(void)
 {
@@ -118,7 +119,7 @@ int count_blkdev_list(void)
 /*
  * extract the major and name values from a blkdev_info struct
  * passed in as a void to *dev.  Must be called with
- * block_subsys_sem held
+ * block_subsys_lock held
  */
 int get_blkdev_info(void *dev, int *major, char **name)
 {
@@ -138,7 +139,7 @@ int register_blkdev(unsigned int major, const char *name)
 	struct blk_major_name **n, *p;
 	int index, ret = 0;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 
 	/* temporary */
 	if (major == 0) {
@@ -183,7 +184,7 @@ int register_blkdev(unsigned int major, const char *name)
 		kfree(p);
 	}
 out:
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 	return ret;
 }
 
@@ -197,7 +198,7 @@ int unregister_blkdev(unsigned int major, const char *name)
 	int index = major_to_index(major);
 	int ret = 0;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 	for (n = &major_names[index]; *n; n = &(*n)->next)
 		if ((*n)->major == major)
 			break;
@@ -207,7 +208,7 @@ int unregister_blkdev(unsigned int major, const char *name)
 		p = *n;
 		*n = p->next;
 	}
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 	kfree(p);
 
 	return ret;
@@ -301,7 +302,7 @@ static void *part_start(struct seq_file *part, loff_t *pos)
 	struct list_head *p;
 	loff_t l = *pos;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 	list_for_each(p, &block_subsys.kset.list)
 		if (!l--)
 			return list_entry(p, struct gendisk, kobj.entry);
@@ -318,7 +319,7 @@ static void *part_next(struct seq_file *part, void *v, loff_t *pos)
 
 static void part_stop(struct seq_file *part, void *v)
 {
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 }
 
 static int show_partition(struct seq_file *part, void *v)
@@ -377,7 +378,7 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
 
 static int __init genhd_device_init(void)
 {
-	bdev_map = kobj_map_init(base_probe, &block_subsys_sem);
+	bdev_map = kobj_map_init(base_probe, &block_subsys_lock);
 	blk_dev_init();
 	subsystem_register(&block_subsys);
 	return 0;
@@ -611,7 +612,7 @@ static void *diskstats_start(struct seq_file *part, loff_t *pos)
 	loff_t k = *pos;
 	struct list_head *p;
 
-	down(&block_subsys_sem);
+	mutex_lock(&block_subsys_lock);
 	list_for_each(p, &block_subsys.kset.list)
 		if (!k--)
 			return list_entry(p, struct gendisk, kobj.entry);
@@ -628,7 +629,7 @@ static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
 
 static void diskstats_stop(struct seq_file *part, void *v)
 {
-	up(&block_subsys_sem);
+	mutex_unlock(&block_subsys_lock);
 }
 
 static int diskstats_show(struct seq_file *s, void *v)
diff --git a/drivers/base/map.c b/drivers/base/map.c
index b449dae6f0d3..e87017f36853 100644
--- a/drivers/base/map.c
+++ b/drivers/base/map.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <linux/kdev_t.h>
 #include <linux/kobject.h>
 #include <linux/kobj_map.h>
@@ -25,7 +26,7 @@ struct kobj_map {
 		int (*lock)(dev_t, void *);
 		void *data;
 	} *probes[255];
-	struct semaphore *sem;
+	struct mutex *lock;
 };
 
 int kobj_map(struct kobj_map *domain, dev_t dev, unsigned long range,
@@ -53,7 +54,7 @@ int kobj_map(struct kobj_map *domain, dev_t dev, unsigned long range,
 		p->range = range;
 		p->data = data;
 	}
-	down(domain->sem);
+	mutex_lock(domain->lock);
 	for (i = 0, p -= n; i < n; i++, p++, index++) {
 		struct probe **s = &domain->probes[index % 255];
 		while (*s && (*s)->range < range)
@@ -61,7 +62,7 @@ int kobj_map(struct kobj_map *domain, dev_t dev, unsigned long range,
 		p->next = *s;
 		*s = p;
 	}
-	up(domain->sem);
+	mutex_unlock(domain->lock);
 	return 0;
 }
 
@@ -75,7 +76,7 @@ void kobj_unmap(struct kobj_map *domain, dev_t dev, unsigned long range)
 	if (n > 255)
 		n = 255;
 
-	down(domain->sem);
+	mutex_lock(domain->lock);
 	for (i = 0; i < n; i++, index++) {
 		struct probe **s;
 		for (s = &domain->probes[index % 255]; *s; s = &(*s)->next) {
@@ -88,7 +89,7 @@ void kobj_unmap(struct kobj_map *domain, dev_t dev, unsigned long range)
 			}
 		}
 	}
-	up(domain->sem);
+	mutex_unlock(domain->lock);
 	kfree(found);
 }
 
@@ -99,7 +100,7 @@ struct kobject *kobj_lookup(struct kobj_map *domain, dev_t dev, int *index)
 	unsigned long best = ~0UL;
 
 retry:
-	down(domain->sem);
+	mutex_lock(domain->lock);
 	for (p = domain->probes[MAJOR(dev) % 255]; p; p = p->next) {
 		struct kobject *(*probe)(dev_t, int *, void *);
 		struct module *owner;
@@ -120,7 +121,7 @@ retry:
 			module_put(owner);
 			continue;
 		}
-		up(domain->sem);
+		mutex_unlock(domain->lock);
 		kobj = probe(dev, index, data);
 		/* Currently ->owner protects _only_ ->probe() itself. */
 		module_put(owner);
@@ -128,11 +129,11 @@ retry:
 			return kobj;
 		goto retry;
 	}
-	up(domain->sem);
+	mutex_unlock(domain->lock);
 	return NULL;
 }
 
-struct kobj_map *kobj_map_init(kobj_probe_t *base_probe, struct semaphore *sem)
+struct kobj_map *kobj_map_init(kobj_probe_t *base_probe, struct mutex *lock)
 {
 	struct kobj_map *p = kmalloc(sizeof(struct kobj_map), GFP_KERNEL);
 	struct probe *base = kzalloc(sizeof(*base), GFP_KERNEL);
@@ -149,6 +150,6 @@ struct kobj_map *kobj_map_init(kobj_probe_t *base_probe, struct semaphore *sem)
 	base->get = base_probe;
 	for (i = 0; i < 255; i++)
 		p->probes[i] = base;
-	p->sem = sem;
+	p->lock = lock;
 	return p;
 }
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 21195c481637..5c36345c9bf7 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -19,6 +19,7 @@
 #include <linux/kobject.h>
 #include <linux/kobj_map.h>
 #include <linux/cdev.h>
+#include <linux/mutex.h>
 
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
@@ -28,7 +29,7 @@ static struct kobj_map *cdev_map;
 
 #define MAX_PROBE_HASH 255	/* random */
 
-static DECLARE_MUTEX(chrdevs_lock);
+static DEFINE_MUTEX(chrdevs_lock);
 
 static struct char_device_struct {
 	struct char_device_struct *next;
@@ -88,13 +89,13 @@ out:
 
 void *acquire_chrdev_list(void)
 {
-	down(&chrdevs_lock);
+	mutex_lock(&chrdevs_lock);
 	return get_next_chrdev(NULL);
 }
 
 void release_chrdev_list(void *dev)
 {
-	up(&chrdevs_lock);
+	mutex_unlock(&chrdevs_lock);
 	kfree(dev);
 }
 
@@ -151,7 +152,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 
 	memset(cd, 0, sizeof(struct char_device_struct));
 
-	down(&chrdevs_lock);
+	mutex_lock(&chrdevs_lock);
 
 	/* temporary */
 	if (major == 0) {
@@ -186,10 +187,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 	}
 	cd->next = *cp;
 	*cp = cd;
-	up(&chrdevs_lock);
+	mutex_unlock(&chrdevs_lock);
 	return cd;
 out:
-	up(&chrdevs_lock);
+	mutex_unlock(&chrdevs_lock);
 	kfree(cd);
 	return ERR_PTR(ret);
 }
@@ -200,7 +201,7 @@ __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct)
 	struct char_device_struct *cd = NULL, **cp;
 	int i = major_to_index(major);
 
-	down(&chrdevs_lock);
+	mutex_lock(&chrdevs_lock);
 	for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next)
 		if ((*cp)->major == major &&
 		    (*cp)->baseminor == baseminor &&
@@ -210,7 +211,7 @@ __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct)
 		cd = *cp;
 		*cp = cd->next;
 	}
-	up(&chrdevs_lock);
+	mutex_unlock(&chrdevs_lock);
 	return cd;
 }
 
diff --git a/include/linux/kobj_map.h b/include/linux/kobj_map.h
index cbe7d8008042..bafe178a381f 100644
--- a/include/linux/kobj_map.h
+++ b/include/linux/kobj_map.h
@@ -1,6 +1,6 @@
 #ifdef __KERNEL__
 
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 typedef struct kobject *kobj_probe_t(dev_t, int *, void *);
 struct kobj_map;
@@ -9,6 +9,6 @@ int kobj_map(struct kobj_map *, dev_t, unsigned long, struct module *,
 	     kobj_probe_t *, int (*)(dev_t, void *), void *);
 void kobj_unmap(struct kobj_map *, dev_t, unsigned long);
 struct kobject *kobj_lookup(struct kobj_map *, dev_t, int *);
-struct kobj_map *kobj_map_init(kobj_probe_t *, struct semaphore *);
+struct kobj_map *kobj_map_init(kobj_probe_t *, struct mutex *);
 
 #endif
-- 
cgit v1.2.3


From 9f28bb7e1d0188a993403ab39b774785892805e1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Mar 2006 13:17:13 -0800
Subject: [PATCH] add EXPORT_SYMBOL_GPL_FUTURE()

This patch adds the ability to mark symbols that will be changed in the
future, so that kernel modules that don't include MODULE_LICENSE("GPL")
and use the symbols, will be flagged and printed out to the system log.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/m68knommu/kernel/vmlinux.lds.S | 10 ++++
 arch/v850/kernel/vmlinux.lds.S      |  8 ++++
 include/asm-generic/vmlinux.lds.h   | 14 ++++++
 include/linux/module.h              |  9 ++++
 kernel/module.c                     | 49 +++++++++++++++++++-
 scripts/genksyms/keywords.c_shipped | 91 +++++++++++++++++++------------------
 scripts/genksyms/keywords.gperf     |  1 +
 7 files changed, 135 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index ac9de2661c0b..a331cc90797c 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -269,6 +269,11 @@ SECTIONS {
 		*(__ksymtab_gpl)
 		__stop___ksymtab_gpl = .;
 
+		/* Kernel symbol table: GPL-future symbols */
+		__start___ksymtab_gpl_future = .;
+		*(__ksymtab_gpl_future)
+		__stop___ksymtab_gpl_future = .;
+
 		/* Kernel symbol table: Normal symbols */
 		__start___kcrctab = .;
 		*(__kcrctab)
@@ -279,6 +284,11 @@ SECTIONS {
 		*(__kcrctab_gpl)
 		__stop___kcrctab_gpl = .;
 
+		/* Kernel symbol table: GPL-future symbols */
+		__start___kcrctab_gpl_future = .;
+		*(__kcrctab_gpl_future)
+		__stop___kcrctab_gpl_future = .;
+
 		/* Kernel symbol table: strings */
 		*(__ksymtab_strings)
 
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 5be05f47109e..5b2ffcc6e2b2 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -64,6 +64,10 @@
 		___start___ksymtab_gpl = .;				      \
 			*(__ksymtab_gpl)				      \
 		___stop___ksymtab_gpl = .;				      \
+		/* Kernel symbol table: GPL-future symbols */		      \
+		___start___ksymtab_gpl_future = .;			      \
+			*(__ksymtab_gpl_future)				      \
+		___stop___ksymtab_gpl_future = .;			      \
 		/* Kernel symbol table: strings */			      \
 			*(__ksymtab_strings)				      \
 		/* Kernel symbol table: Normal symbols */		      \
@@ -74,6 +78,10 @@
 		___start___kcrctab_gpl = .;				      \
 			*(__kcrctab_gpl)				      \
 		___stop___kcrctab_gpl = .;				      \
+		/* Kernel symbol table: GPL-future symbols */		      \
+		___start___kcrctab_gpl_future = .;			      \
+			*(__kcrctab_gpl_future)				      \
+		___stop___kcrctab_gpl_future = .;			      \
 		/* Built-in module parameters */			      \
 		. = ALIGN (4) ;						      \
 		___start___param = .;					      \
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 35de20cf8fac..9d11550b4818 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -58,6 +58,13 @@
 		VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .;		\
 	}								\
 									\
+	/* Kernel symbol table: GPL-future-only symbols */		\
+	__ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \
+		VMLINUX_SYMBOL(__start___ksymtab_gpl_future) = .;	\
+		*(__ksymtab_gpl_future)					\
+		VMLINUX_SYMBOL(__stop___ksymtab_gpl_future) = .;	\
+	}								\
+									\
 	/* Kernel symbol table: Normal symbols */			\
 	__kcrctab         : AT(ADDR(__kcrctab) - LOAD_OFFSET) {		\
 		VMLINUX_SYMBOL(__start___kcrctab) = .;			\
@@ -72,6 +79,13 @@
 		VMLINUX_SYMBOL(__stop___kcrctab_gpl) = .;		\
 	}								\
 									\
+	/* Kernel symbol table: GPL-future-only symbols */		\
+	__kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \
+		VMLINUX_SYMBOL(__start___kcrctab_gpl_future) = .;	\
+		*(__kcrctab_gpl_future)					\
+		VMLINUX_SYMBOL(__stop___kcrctab_gpl_future) = .;	\
+	}								\
+									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
diff --git a/include/linux/module.h b/include/linux/module.h
index 84d75f3a8aca..a25d5f61548c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -198,6 +198,9 @@ void *__symbol_get_gpl(const char *symbol);
 #define EXPORT_SYMBOL_GPL(sym)					\
 	__EXPORT_SYMBOL(sym, "_gpl")
 
+#define EXPORT_SYMBOL_GPL_FUTURE(sym)				\
+	__EXPORT_SYMBOL(sym, "_gpl_future")
+
 #endif
 
 struct module_ref
@@ -255,6 +258,11 @@ struct module
 	unsigned int num_gpl_syms;
 	const unsigned long *gpl_crcs;
 
+	/* symbols that will be GPL-only in the near future. */
+	const struct kernel_symbol *gpl_future_syms;
+	unsigned int num_gpl_future_syms;
+	const unsigned long *gpl_future_crcs;
+
 	/* Exception table */
 	unsigned int num_exentries;
 	const struct exception_table_entry *extable;
@@ -441,6 +449,7 @@ void module_remove_driver(struct device_driver *);
 #else /* !CONFIG_MODULES... */
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
+#define EXPORT_SYMBOL_GPL_FUTURE(sym)
 
 /* Given an address, look for it in the exception tables. */
 static inline const struct exception_table_entry *
diff --git a/kernel/module.c b/kernel/module.c
index 2a892b20d68f..5ca99fbe9f44 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -126,8 +126,11 @@ extern const struct kernel_symbol __start___ksymtab[];
 extern const struct kernel_symbol __stop___ksymtab[];
 extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
+extern const unsigned long __start___kcrctab_gpl_future[];
 
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -172,6 +175,22 @@ static unsigned long __find_symbol(const char *name,
 			return ks->value;
 		}
 	}
+	ks = lookup_symbol(name, __start___ksymtab_gpl_future,
+				 __stop___ksymtab_gpl_future);
+	if (ks) {
+		if (!gplok) {
+			printk(KERN_WARNING "Symbol %s is being used "
+			       "by a non-GPL module, which will not "
+			       "be allowed in the future\n", name);
+			printk(KERN_WARNING "Please see the file "
+			       "Documentation/feature-removal-schedule.txt "
+			       "in the kernel source tree for more "
+			       "details.\n");
+		}
+		*crc = symversion(__start___kcrctab_gpl_future,
+				  (ks - __start___ksymtab_gpl_future));
+		return ks->value;
+	}
 
 	/* Now try modules. */ 
 	list_for_each_entry(mod, &modules, list) {
@@ -191,6 +210,23 @@ static unsigned long __find_symbol(const char *name,
 				return ks->value;
 			}
 		}
+		ks = lookup_symbol(name, mod->gpl_future_syms,
+				   (mod->gpl_future_syms +
+				    mod->num_gpl_future_syms));
+		if (ks) {
+			if (!gplok) {
+				printk(KERN_WARNING "Symbol %s is being used "
+				       "by a non-GPL module, which will not "
+				       "be allowed in the future\n", name);
+				printk(KERN_WARNING "Please see the file "
+				       "Documentation/feature-removal-schedule.txt "
+				       "in the kernel source tree for more "
+				       "details.\n");
+			}
+			*crc = symversion(mod->gpl_future_crcs,
+					  (ks - mod->gpl_future_syms));
+			return ks->value;
+		}
 	}
 	DEBUGP("Failed to find symbol %s\n", name);
  	return 0;
@@ -1546,7 +1582,8 @@ static struct module *load_module(void __user *umod,
 	char *secstrings, *args, *modmagic, *strtab = NULL;
 	unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
-		crcindex, gplcrcindex, versindex, pcpuindex;
+		crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
+		gplfuturecrcindex;
 	long arglen;
 	struct module *mod;
 	long err = 0;
@@ -1627,8 +1664,10 @@ static struct module *load_module(void __user *umod,
 	/* Optional sections */
 	exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
 	gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
+	gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
 	crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
 	gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+	gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
 	setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
 	exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
 	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1784,10 +1823,16 @@ static struct module *load_module(void __user *umod,
 	mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
 	if (gplcrcindex)
 		mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
+	mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
+					sizeof(*mod->gpl_future_syms);
+	mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
+	if (gplfuturecrcindex)
+		mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
 
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !crcindex) || 
-	    (mod->num_gpl_syms && !gplcrcindex)) {
+	    (mod->num_gpl_syms && !gplcrcindex) ||
+	    (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
 		printk(KERN_WARNING "%s: No versions for exported symbols."
 		       " Tainting kernel.\n", mod->name);
 		add_taint(TAINT_FORCED_MODULE);
diff --git a/scripts/genksyms/keywords.c_shipped b/scripts/genksyms/keywords.c_shipped
index ee4647805c58..d8153f572e40 100644
--- a/scripts/genksyms/keywords.c_shipped
+++ b/scripts/genksyms/keywords.c_shipped
@@ -52,9 +52,9 @@ is_reserved_hash (register const char *str, register unsigned int len)
       71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
       71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
       71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
-      71, 71, 71, 71, 71, 71, 71, 71, 71, 15,
-      71, 71, 71, 71, 71, 71, 15, 71, 71, 71,
-      10, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+      71, 71, 71, 71, 71, 71, 71, 71, 71,  0,
+      71, 71, 71, 71, 71, 71, 35, 71, 71, 71,
+       5, 71, 71, 71, 71, 71, 71, 71, 71, 71,
       71, 71, 71, 71, 71,  0, 71,  0, 71,  5,
        5,  0, 10, 20, 71, 25, 71, 71, 20,  0,
       20, 30, 25, 71, 10,  5,  0, 20, 15, 71,
@@ -84,9 +84,9 @@ is_reserved_word (register const char *str, register unsigned int len)
 {
   enum
     {
-      TOTAL_KEYWORDS = 41,
+      TOTAL_KEYWORDS = 42,
       MIN_WORD_LENGTH = 3,
-      MAX_WORD_LENGTH = 17,
+      MAX_WORD_LENGTH = 24,
       MIN_HASH_VALUE = 3,
       MAX_HASH_VALUE = 70
     };
@@ -94,104 +94,105 @@ is_reserved_word (register const char *str, register unsigned int len)
   static const struct resword wordlist[] =
     {
       {""}, {""}, {""},
-#line 24 "scripts/genksyms/keywords.gperf"
+#line 25 "scripts/genksyms/keywords.gperf"
       {"asm", ASM_KEYW},
       {""},
-#line 7 "scripts/genksyms/keywords.gperf"
+#line 8 "scripts/genksyms/keywords.gperf"
       {"__asm", ASM_KEYW},
       {""},
-#line 8 "scripts/genksyms/keywords.gperf"
+#line 9 "scripts/genksyms/keywords.gperf"
       {"__asm__", ASM_KEYW},
       {""},
-#line 21 "scripts/genksyms/keywords.gperf"
+#line 22 "scripts/genksyms/keywords.gperf"
       {"_restrict", RESTRICT_KEYW},
-#line 50 "scripts/genksyms/keywords.gperf"
+#line 51 "scripts/genksyms/keywords.gperf"
       {"__typeof__", TYPEOF_KEYW},
-#line 9 "scripts/genksyms/keywords.gperf"
+#line 10 "scripts/genksyms/keywords.gperf"
       {"__attribute", ATTRIBUTE_KEYW},
-#line 11 "scripts/genksyms/keywords.gperf"
+#line 12 "scripts/genksyms/keywords.gperf"
       {"__const", CONST_KEYW},
-#line 10 "scripts/genksyms/keywords.gperf"
+#line 11 "scripts/genksyms/keywords.gperf"
       {"__attribute__", ATTRIBUTE_KEYW},
-#line 12 "scripts/genksyms/keywords.gperf"
+#line 13 "scripts/genksyms/keywords.gperf"
       {"__const__", CONST_KEYW},
-#line 16 "scripts/genksyms/keywords.gperf"
+#line 17 "scripts/genksyms/keywords.gperf"
       {"__signed__", SIGNED_KEYW},
-#line 42 "scripts/genksyms/keywords.gperf"
+#line 43 "scripts/genksyms/keywords.gperf"
       {"static", STATIC_KEYW},
       {""},
-#line 15 "scripts/genksyms/keywords.gperf"
+#line 16 "scripts/genksyms/keywords.gperf"
       {"__signed", SIGNED_KEYW},
-#line 30 "scripts/genksyms/keywords.gperf"
+#line 31 "scripts/genksyms/keywords.gperf"
       {"char", CHAR_KEYW},
       {""},
-#line 43 "scripts/genksyms/keywords.gperf"
+#line 44 "scripts/genksyms/keywords.gperf"
       {"struct", STRUCT_KEYW},
-#line 22 "scripts/genksyms/keywords.gperf"
-      {"__restrict__", RESTRICT_KEYW},
 #line 23 "scripts/genksyms/keywords.gperf"
+      {"__restrict__", RESTRICT_KEYW},
+#line 24 "scripts/genksyms/keywords.gperf"
       {"restrict", RESTRICT_KEYW},
-#line 33 "scripts/genksyms/keywords.gperf"
+#line 34 "scripts/genksyms/keywords.gperf"
       {"enum", ENUM_KEYW},
-#line 17 "scripts/genksyms/keywords.gperf"
+#line 18 "scripts/genksyms/keywords.gperf"
       {"__volatile", VOLATILE_KEYW},
-#line 34 "scripts/genksyms/keywords.gperf"
+#line 35 "scripts/genksyms/keywords.gperf"
       {"extern", EXTERN_KEYW},
-#line 18 "scripts/genksyms/keywords.gperf"
+#line 19 "scripts/genksyms/keywords.gperf"
       {"__volatile__", VOLATILE_KEYW},
-#line 37 "scripts/genksyms/keywords.gperf"
+#line 38 "scripts/genksyms/keywords.gperf"
       {"int", INT_KEYW},
-      {""},
-#line 31 "scripts/genksyms/keywords.gperf"
-      {"const", CONST_KEYW},
+#line 7 "scripts/genksyms/keywords.gperf"
+      {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW},
 #line 32 "scripts/genksyms/keywords.gperf"
+      {"const", CONST_KEYW},
+#line 33 "scripts/genksyms/keywords.gperf"
       {"double", DOUBLE_KEYW},
       {""},
-#line 13 "scripts/genksyms/keywords.gperf"
+#line 14 "scripts/genksyms/keywords.gperf"
       {"__inline", INLINE_KEYW},
-#line 29 "scripts/genksyms/keywords.gperf"
+#line 30 "scripts/genksyms/keywords.gperf"
       {"auto", AUTO_KEYW},
-#line 14 "scripts/genksyms/keywords.gperf"
+#line 15 "scripts/genksyms/keywords.gperf"
       {"__inline__", INLINE_KEYW},
-#line 41 "scripts/genksyms/keywords.gperf"
+#line 42 "scripts/genksyms/keywords.gperf"
       {"signed", SIGNED_KEYW},
       {""},
-#line 46 "scripts/genksyms/keywords.gperf"
+#line 47 "scripts/genksyms/keywords.gperf"
       {"unsigned", UNSIGNED_KEYW},
       {""},
-#line 40 "scripts/genksyms/keywords.gperf"
+#line 41 "scripts/genksyms/keywords.gperf"
       {"short", SHORT_KEYW},
-#line 49 "scripts/genksyms/keywords.gperf"
+#line 50 "scripts/genksyms/keywords.gperf"
       {"typeof", TYPEOF_KEYW},
-#line 44 "scripts/genksyms/keywords.gperf"
+#line 45 "scripts/genksyms/keywords.gperf"
       {"typedef", TYPEDEF_KEYW},
-#line 48 "scripts/genksyms/keywords.gperf"
+#line 49 "scripts/genksyms/keywords.gperf"
       {"volatile", VOLATILE_KEYW},
       {""},
-#line 35 "scripts/genksyms/keywords.gperf"
+#line 36 "scripts/genksyms/keywords.gperf"
       {"float", FLOAT_KEYW},
       {""}, {""},
-#line 39 "scripts/genksyms/keywords.gperf"
+#line 40 "scripts/genksyms/keywords.gperf"
       {"register", REGISTER_KEYW},
-#line 47 "scripts/genksyms/keywords.gperf"
+#line 48 "scripts/genksyms/keywords.gperf"
       {"void", VOID_KEYW},
       {""},
-#line 36 "scripts/genksyms/keywords.gperf"
+#line 37 "scripts/genksyms/keywords.gperf"
       {"inline", INLINE_KEYW},
       {""},
 #line 5 "scripts/genksyms/keywords.gperf"
       {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW},
       {""},
-#line 20 "scripts/genksyms/keywords.gperf"
+#line 21 "scripts/genksyms/keywords.gperf"
       {"_Bool", BOOL_KEYW},
       {""},
 #line 6 "scripts/genksyms/keywords.gperf"
       {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW},
       {""}, {""}, {""}, {""}, {""}, {""},
-#line 38 "scripts/genksyms/keywords.gperf"
+#line 39 "scripts/genksyms/keywords.gperf"
       {"long", LONG_KEYW},
       {""}, {""}, {""}, {""}, {""},
-#line 45 "scripts/genksyms/keywords.gperf"
+#line 46 "scripts/genksyms/keywords.gperf"
       {"union", UNION_KEYW}
     };
 
diff --git a/scripts/genksyms/keywords.gperf b/scripts/genksyms/keywords.gperf
index b6bec765996e..c75e0c8d8f0c 100644
--- a/scripts/genksyms/keywords.gperf
+++ b/scripts/genksyms/keywords.gperf
@@ -4,6 +4,7 @@ struct resword { const char *name; int token; }
 %%
 EXPORT_SYMBOL, EXPORT_SYMBOL_KEYW
 EXPORT_SYMBOL_GPL, EXPORT_SYMBOL_KEYW
+EXPORT_SYMBOL_GPL_FUTURE, EXPORT_SYMBOL_KEYW
 __asm, ASM_KEYW
 __asm__, ASM_KEYW
 __attribute, ATTRIBUTE_KEYW
-- 
cgit v1.2.3


From 03e88ae1b13dfdc8bbaa59b8198e1ca53aad12ac Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 16 Feb 2006 13:50:23 -0800
Subject: [PATCH] fix module sysfs files reference counting

The module files, refcnt, version, and srcversion did not properly
increment the owner's module reference count, allowing the modules to
be removed while the files were open, causing oopses.

This patch fixes this, and also fixes the problem that the version and
srcversion files were not showing up, unless CONFIG_MODULE_UNLOAD was
enabled, which is not correct.

Cc: Nathan Lynch <ntl@pobox.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/module.h |  1 +
 kernel/module.c        | 77 ++++++++++++++++++++------------------------------
 kernel/params.c        | 10 -------
 3 files changed, 32 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index a25d5f61548c..70bd843c71cb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -245,6 +245,7 @@ struct module
 	/* Sysfs stuff. */
 	struct module_kobject mkobj;
 	struct module_param_attrs *param_attrs;
+	struct module_attribute *modinfo_attrs;
 	const char *version;
 	const char *srcversion;
 
diff --git a/kernel/module.c b/kernel/module.c
index 5ca99fbe9f44..77764f22f021 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -429,7 +429,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 }
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_MODULE_UNLOAD
 #define MODINFO_ATTR(field)	\
 static void setup_modinfo_##field(struct module *mod, const char *s)  \
 {                                                                     \
@@ -461,12 +460,7 @@ static struct module_attribute modinfo_##field = {                    \
 MODINFO_ATTR(version);
 MODINFO_ATTR(srcversion);
 
-static struct module_attribute *modinfo_attrs[] = {
-	&modinfo_version,
-	&modinfo_srcversion,
-	NULL,
-};
-
+#ifdef CONFIG_MODULE_UNLOAD
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
@@ -781,6 +775,15 @@ static inline void module_unload_init(struct module *mod)
 }
 #endif /* CONFIG_MODULE_UNLOAD */
 
+static struct module_attribute *modinfo_attrs[] = {
+	&modinfo_version,
+	&modinfo_srcversion,
+#ifdef CONFIG_MODULE_UNLOAD
+	&refcnt,
+#endif
+	NULL,
+};
+
 #ifdef CONFIG_OBSOLETE_MODPARM
 /* Bounds checking done below */
 static int obsparm_copy_string(const char *val, struct kernel_param *kp)
@@ -1106,37 +1109,28 @@ static inline void remove_sect_attrs(struct module *mod)
 }
 #endif /* CONFIG_KALLSYMS */
 
-
-#ifdef CONFIG_MODULE_UNLOAD
-static inline int module_add_refcnt_attr(struct module *mod)
-{
-	return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
-	return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-#else
-static inline int module_add_refcnt_attr(struct module *mod)
-{
-	return 0;
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_UNLOAD
 static int module_add_modinfo_attrs(struct module *mod)
 {
 	struct module_attribute *attr;
+	struct module_attribute *temp_attr;
 	int error = 0;
 	int i;
 
+	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+					(ARRAY_SIZE(modinfo_attrs) + 1)),
+					GFP_KERNEL);
+	if (!mod->modinfo_attrs)
+		return -ENOMEM;
+
+	temp_attr = mod->modinfo_attrs;
 	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
 		if (!attr->test ||
-		    (attr->test && attr->test(mod)))
-			error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+		    (attr->test && attr->test(mod))) {
+			memcpy(temp_attr, attr, sizeof(*temp_attr));
+			temp_attr->attr.owner = mod;
+			error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+			++temp_attr;
+		}
 	}
 	return error;
 }
@@ -1146,12 +1140,16 @@ static void module_remove_modinfo_attrs(struct module *mod)
 	struct module_attribute *attr;
 	int i;
 
-	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+		/* pick a field to test for end of list */
+		if (!attr->attr.name)
+			break;
 		sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
-		attr->free(mod);
+		if (attr->free)
+			attr->free(mod);
 	}
+	kfree(mod->modinfo_attrs);
 }
-#endif
 
 static int mod_sysfs_setup(struct module *mod,
 			   struct kernel_param *kparam,
@@ -1169,19 +1167,13 @@ static int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out;
 
-	err = module_add_refcnt_attr(mod);
-	if (err)
-		goto out_unreg;
-
 	err = module_param_sysfs_setup(mod, kparam, num_params);
 	if (err)
 		goto out_unreg;
 
-#ifdef CONFIG_MODULE_UNLOAD
 	err = module_add_modinfo_attrs(mod);
 	if (err)
 		goto out_unreg;
-#endif
 
 	return 0;
 
@@ -1193,10 +1185,7 @@ out:
 
 static void mod_kobject_remove(struct module *mod)
 {
-#ifdef CONFIG_MODULE_UNLOAD
 	module_remove_modinfo_attrs(mod);
-#endif
-	module_remove_refcnt_attr(mod);
 	module_param_sysfs_remove(mod);
 
 	kobject_unregister(&mod->mkobj.kobj);
@@ -1474,7 +1463,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
 	return NULL;
 }
 
-#ifdef CONFIG_MODULE_UNLOAD
 static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 			  unsigned int infoindex)
 {
@@ -1489,7 +1477,6 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 						attr->attr.name));
 	}
 }
-#endif
 
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
@@ -1803,10 +1790,8 @@ static struct module *load_module(void __user *umod,
 	if (strcmp(mod->name, "driverloader") == 0)
 		add_taint(TAINT_PROPRIETARY_MODULE);
 
-#ifdef CONFIG_MODULE_UNLOAD
 	/* Set up MODINFO_ATTR fields */
 	setup_modinfo(mod, sechdrs, infoindex);
-#endif
 
 	/* Fix up syms, so that st_value is a pointer to location. */
 	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
diff --git a/kernel/params.c b/kernel/params.c
index c76ad25e6a21..a29150582310 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj,
 	if (!attribute->show)
 		return -EIO;
 
-	if (!try_module_get(mk->mod))
-		return -ENODEV;
-
 	ret = attribute->show(attribute, mk->mod, buf);
 
-	module_put(mk->mod);
-
 	return ret;
 }
 
@@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj,
 	if (!attribute->store)
 		return -EIO;
 
-	if (!try_module_get(mk->mod))
-		return -ENODEV;
-
 	ret = attribute->store(attribute, mk->mod, buf, len);
 
-	module_put(mk->mod);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 4f2928d0a439553f0288d9483faf417430629635 Mon Sep 17 00:00:00 2001
From: Tilman Schmidt <tilman@imap.cc>
Date: Fri, 24 Feb 2006 11:05:45 +0100
Subject: [PATCH] Driver core: add macros notice(), dev_notice()

Both usb.h and device.h have collections of convenience macros for
printk() with the KERN_ERR, KERN_WARNING, and KERN_NOTICE severity
levels. This patch adds macros for the KERN_NOTICE level which was
so far uncatered for.

These macros already exist privately in drivers/isdn/gigaset/gigaset.h
(currently in the process of being submitted for the kernel tree)
but they really belong with their brothers and sisters in
include/linux/{device,usb}.h.

Signed-off-by: Tilman Schmidt <tilman@imap.cc>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 58df18d9cd3e..5b595fdfb672 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -424,6 +424,8 @@ extern void firmware_unregister(struct subsystem *);
 	dev_printk(KERN_INFO , dev , format , ## arg)
 #define dev_warn(dev, format, arg...)		\
 	dev_printk(KERN_WARNING , dev , format , ## arg)
+#define dev_notice(dev, format, arg...)		\
+	dev_printk(KERN_NOTICE , dev , format , ## arg)
 
 /* Create alias, so I can be autoloaded. */
 #define MODULE_ALIAS_CHARDEV(major,minor) \
-- 
cgit v1.2.3


From 22f98c0cd7e003b896ee52ded945081307118745 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 4 Mar 2006 13:15:31 +0100
Subject: [PATCH] Kobject: kobject.h: fix a typo

It shouldn't cause real harm, but it hurts my eyes.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/kobject.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index c374b5fa8d3b..7ece63f8abbd 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -255,7 +255,7 @@ struct subsys_attribute {
 extern int subsys_create_file(struct subsystem * , struct subsys_attribute *);
 extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *);
 
-#if defined(CONFIG_HOTPLUG) & defined(CONFIG_NET)
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 void kobject_uevent(struct kobject *kobj, enum kobject_action action);
 
 int add_uevent_var(char **envp, int num_envp, int *cur_index,
-- 
cgit v1.2.3


From dd308bc355a1aa4f202fe9a3133b6c676cb9606c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Tue, 7 Mar 2006 21:41:59 +1100
Subject: [PATCH] debugfs: Add debugfs_create_blob() helper for exporting
 binary data

I wanted to export a binary blob via debugfs, and although it was pretty easy
it seems like it'd be easier if there was a helper for it. It's a pity we need
the wrapper struct but I can't see a cleaner way to do it.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/file.c       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/debugfs.h | 15 +++++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index d575452cd9f7..40c4fc973fad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -251,3 +251,49 @@ struct dentry *debugfs_create_bool(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_bool);
 
+static ssize_t read_file_blob(struct file *file, char __user *user_buf,
+			      size_t count, loff_t *ppos)
+{
+	struct debugfs_blob_wrapper *blob = file->private_data;
+	return simple_read_from_buffer(user_buf, count, ppos, blob->data,
+			blob->size);
+}
+
+static struct file_operations fops_blob = {
+	.read =		read_file_blob,
+	.open =		default_open,
+};
+
+/**
+ * debugfs_create_blob - create a file in the debugfs filesystem that is
+ * used to read and write a binary blob.
+ *
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this paramater is NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer
+ *        to the blob data and the size of the data.
+ *
+ * This function creates a file in debugfs with the given name that exports
+ * @blob->data as a binary blob. If the @mode variable is so set it can be
+ * read from. Writing is not supported.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -ENODEV will be
+ * returned.  It is not wise to check for this value, but rather, check for
+ * NULL or !NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+				   struct dentry *parent,
+				   struct debugfs_blob_wrapper *blob)
+{
+	return debugfs_create_file(name, mode, parent, blob, &fops_blob);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_blob);
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index a5fa6a6eede8..4b0428e335be 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -21,6 +21,11 @@
 
 struct file_operations;
 
+struct debugfs_blob_wrapper {
+	void *data;
+	unsigned long size;
+};
+
 #if defined(CONFIG_DEBUG_FS)
 struct dentry *debugfs_create_file(const char *name, mode_t mode,
 				   struct dentry *parent, void *data,
@@ -39,6 +44,9 @@ struct dentry *debugfs_create_u32(const char *name, mode_t mode,
 struct dentry *debugfs_create_bool(const char *name, mode_t mode,
 				  struct dentry *parent, u32 *value);
 
+struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+				  struct dentry *parent,
+				  struct debugfs_blob_wrapper *blob);
 #else
 
 #include <linux/err.h>
@@ -94,6 +102,13 @@ static inline struct dentry *debugfs_create_bool(const char *name, mode_t mode,
 	return ERR_PTR(-ENODEV);
 }
 
+static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode,
+				  struct dentry *parent,
+				  struct debugfs_blob_wrapper *blob)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 #endif
 
 #endif
-- 
cgit v1.2.3


From 7423172a50968de1905a61413c52bb070a62f5ce Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Mon, 13 Mar 2006 17:14:25 -0500
Subject: [PATCH] kobject_add_dir

Adding kobject_add_dir() function which creates a subdirectory
for a given kobject.

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/kobject.h |  2 ++
 lib/kobject.c           | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 7ece63f8abbd..4cb1214ec290 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -80,6 +80,8 @@ extern void kobject_unregister(struct kobject *);
 extern struct kobject * kobject_get(struct kobject *);
 extern void kobject_put(struct kobject *);
 
+extern struct kobject *kobject_add_dir(struct kobject *, const char *);
+
 extern char * kobject_get_path(struct kobject *, gfp_t);
 
 struct kobj_type {
diff --git a/lib/kobject.c b/lib/kobject.c
index 36668c8c3ea1..25204a41a9b0 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -385,6 +385,44 @@ void kobject_put(struct kobject * kobj)
 }
 
 
+static void dir_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static struct kobj_type dir_ktype = {
+	.release	= dir_release,
+	.sysfs_ops	= NULL,
+	.default_attrs	= NULL,
+};
+
+/**
+ *	kobject_add_dir - add sub directory of object.
+ *	@parent:	object in which a directory is created.
+ *	@name:	directory name.
+ *
+ *	Add a plain directory object as child of given object.
+ */
+struct kobject *kobject_add_dir(struct kobject *parent, const char *name)
+{
+	struct kobject *k;
+
+	if (!parent)
+		return NULL;
+
+	k = kzalloc(sizeof(*k), GFP_KERNEL);
+	if (!k)
+		return NULL;
+
+	k->parent = parent;
+	k->ktype = &dir_ktype;
+	kobject_set_name(k, name);
+	kobject_register(k);
+
+	return k;
+}
+EXPORT_SYMBOL_GPL(kobject_add_dir);
+
 /**
  *	kset_init - initialize a kset for use
  *	@k:	kset 
-- 
cgit v1.2.3


From a29d642a4aa99c5234314ab2523281139226c231 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 7 Mar 2006 23:53:25 -0800
Subject: [PATCH] get_cpu_sysdev() signedness fix

Doing (int < NR_CPUS) doesn't dtrt if it's negative..

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/cpu.c  | 2 +-
 include/linux/cpu.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 07a7f97e1de9..29f3d7504da1 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -141,7 +141,7 @@ int __devinit register_cpu(struct cpu *cpu, int num, struct node *root)
 	return error;
 }
 
-struct sys_device *get_cpu_sysdev(int cpu)
+struct sys_device *get_cpu_sysdev(unsigned cpu)
 {
 	if (cpu < NR_CPUS)
 		return cpu_sys_devices[cpu];
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 0ed1d4853c69..d612b89dce33 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -32,7 +32,7 @@ struct cpu {
 };
 
 extern int register_cpu(struct cpu *, int, struct node *);
-extern struct sys_device *get_cpu_sysdev(int cpu);
+extern struct sys_device *get_cpu_sysdev(unsigned cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *, struct node *);
 #endif
-- 
cgit v1.2.3


From e266a12492f7ca9142882710bff92e902b7c95c8 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 8 Nov 2005 21:05:43 +0100
Subject: [PATCH] USB: drivers/usb/core/message.c: make usb_get_string() static

After the removal of usb-midi.c, there's no longer any external user of
usb_get_string().

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/message.c | 5 ++---
 include/linux/usb.h        | 2 --
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 7135e542679d..2f6009b0cffc 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -631,8 +631,8 @@ int usb_get_descriptor(struct usb_device *dev, unsigned char type, unsigned char
  * Returns the number of bytes received on success, or else the status code
  * returned by the underlying usb_control_msg() call.
  */
-int usb_get_string(struct usb_device *dev, unsigned short langid,
-		unsigned char index, void *buf, int size)
+static int usb_get_string(struct usb_device *dev, unsigned short langid,
+			  unsigned char index, void *buf, int size)
 {
 	int i;
 	int result;
@@ -1488,7 +1488,6 @@ EXPORT_SYMBOL(usb_sg_wait);
 // synchronous control message convenience routines
 EXPORT_SYMBOL(usb_get_descriptor);
 EXPORT_SYMBOL(usb_get_status);
-EXPORT_SYMBOL(usb_get_string);
 EXPORT_SYMBOL(usb_string);
 
 // synchronous calls that also maintain usbcore state
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 827cc6de5f5c..130d125fda12 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1018,8 +1018,6 @@ extern int usb_get_descriptor(struct usb_device *dev, unsigned char desctype,
 	unsigned char descindex, void *buf, int size);
 extern int usb_get_status(struct usb_device *dev,
 	int type, int target, void *data);
-extern int usb_get_string(struct usb_device *dev,
-	unsigned short langid, unsigned char index, void *buf, int size);
 extern int usb_string(struct usb_device *dev, int index,
 	char *buf, size_t size);
 
-- 
cgit v1.2.3


From 80cb9aee01245b38325dd84f1359b14a3f01f10d Mon Sep 17 00:00:00 2001
From: Randy Vinson <rvinson@mvista.com>
Date: Fri, 20 Jan 2006 13:53:38 -0800
Subject: [PATCH] USB: EHCI for Freescale 83xx

Adding a Host Mode USB driver for the Freescale 83xx.

This driver supports both the Dual-Role (DR) controller and the
Multi-Port-Host (MPH) controller present in the Freescale MPC8349. It has
been tested with the MPC8349CDS reference system. This driver depends on
platform support code for setting up the pins on the device package in a
manner appropriate for the board in use. Note that this patch requires
selecting the EHCI controller option under the USB Host menu.

Signed-off-by: Randy Vinson <rvinson@mvista.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/Kconfig         |   1 +
 drivers/usb/host/Kconfig    |   2 +-
 drivers/usb/host/ehci-fsl.c | 357 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/usb/host/ehci-fsl.h |  37 +++++
 drivers/usb/host/ehci-hcd.c |   8 +-
 include/linux/fsl_devices.h |  27 ++++
 6 files changed, 430 insertions(+), 2 deletions(-)
 create mode 100644 drivers/usb/host/ehci-fsl.c
 create mode 100644 drivers/usb/host/ehci-fsl.h

(limited to 'include/linux')

diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig
index 85dacc92545a..7772e28a303f 100644
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig
@@ -11,6 +11,7 @@ config USB_ARCH_HAS_HCD
 	boolean
 	default y if USB_ARCH_HAS_OHCI
 	default y if ARM				# SL-811
+	default y if PPC_83xx
 	default PCI
 
 # many non-PCI SOC chips embed OHCI
diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
index be3fd9bce573..a45293673906 100644
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig
@@ -6,7 +6,7 @@ comment "USB Host Controller Drivers"
 
 config USB_EHCI_HCD
 	tristate "EHCI HCD (USB 2.0) support"
-	depends on USB && PCI
+	depends on USB && (PCI || PPC_83xx)
 	---help---
 	  The Enhanced Host Controller Interface (EHCI) is standard for USB 2.0
 	  "high speed" (480 Mbit/sec, 60 Mbyte/sec) host controller hardware.
diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c
new file mode 100644
index 000000000000..c6012d6cd527
--- /dev/null
+++ b/drivers/usb/host/ehci-fsl.c
@@ -0,0 +1,357 @@
+/*
+ * (C) Copyright David Brownell 2000-2002
+ * Copyright (c) 2005 MontaVista Software
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Ported to 834x by Randy Vinson <rvinson@mvista.com> using code provided
+ * by Hunter Wu.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/fsl_devices.h>
+
+#include "ehci-fsl.h"
+
+/* FIXME: Power Managment is un-ported so temporarily disable it */
+#undef CONFIG_PM
+
+/* PCI-based HCs are common, but plenty of non-PCI HCs are used too */
+
+/* configure so an HC device and id are always provided */
+/* always called with process context; sleeping is OK */
+
+/**
+ * usb_hcd_fsl_probe - initialize FSL-based HCDs
+ * @drvier: Driver to be used for this HCD
+ * @pdev: USB Host Controller being probed
+ * Context: !in_interrupt()
+ *
+ * Allocates basic resources for this USB host controller.
+ *
+ */
+int usb_hcd_fsl_probe(const struct hc_driver *driver,
+		      struct platform_device *pdev)
+{
+	struct fsl_usb2_platform_data *pdata;
+	struct usb_hcd *hcd;
+	struct resource *res;
+	int irq;
+	int retval;
+	unsigned int temp;
+
+	pr_debug("initializing FSL-SOC USB Controller\n");
+
+	/* Need platform data for setup */
+	pdata = (struct fsl_usb2_platform_data *)pdev->dev.platform_data;
+	if (!pdata) {
+		dev_err(&pdev->dev,
+			"No platform data for %s.\n", pdev->dev.bus_id);
+		return -ENODEV;
+	}
+
+	/*
+	 * This is a host mode driver, verify that we're supposed to be
+	 * in host mode.
+	 */
+	if (!((pdata->operating_mode == FSL_USB2_DR_HOST) ||
+	      (pdata->operating_mode == FSL_USB2_MPH_HOST))) {
+		dev_err(&pdev->dev,
+			"Non Host Mode configured for %s. Wrong driver linked.\n",
+			pdev->dev.bus_id);
+		return -ENODEV;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!res) {
+		dev_err(&pdev->dev,
+			"Found HC with no IRQ. Check %s setup!\n",
+			pdev->dev.bus_id);
+		return -ENODEV;
+	}
+	irq = res->start;
+
+	hcd = usb_create_hcd(driver, &pdev->dev, pdev->dev.bus_id);
+	if (!hcd) {
+		retval = -ENOMEM;
+		goto err1;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev,
+			"Found HC with no register addr. Check %s setup!\n",
+			pdev->dev.bus_id);
+		retval = -ENODEV;
+		goto err2;
+	}
+	hcd->rsrc_start = res->start;
+	hcd->rsrc_len = res->end - res->start + 1;
+	if (!request_mem_region(hcd->rsrc_start, hcd->rsrc_len,
+				driver->description)) {
+		dev_dbg(&pdev->dev, "controller already in use\n");
+		retval = -EBUSY;
+		goto err2;
+	}
+	hcd->regs = ioremap(hcd->rsrc_start, hcd->rsrc_len);
+
+	if (hcd->regs == NULL) {
+		dev_dbg(&pdev->dev, "error mapping memory\n");
+		retval = -EFAULT;
+		goto err3;
+	}
+
+	/* Enable USB controller */
+	temp = in_be32(hcd->regs + 0x500);
+	out_be32(hcd->regs + 0x500, temp | 0x4);
+
+	/* Set to Host mode */
+	temp = in_le32(hcd->regs + 0x1a8);
+	out_le32(hcd->regs + 0x1a8, temp | 0x3);
+
+	retval = usb_add_hcd(hcd, irq, SA_SHIRQ);
+	if (retval != 0)
+		goto err4;
+	return retval;
+
+      err4:
+	iounmap(hcd->regs);
+      err3:
+	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
+      err2:
+	usb_put_hcd(hcd);
+      err1:
+	dev_err(&pdev->dev, "init %s fail, %d\n", pdev->dev.bus_id, retval);
+	return retval;
+}
+
+/* may be called without controller electrically present */
+/* may be called with controller, bus, and devices active */
+
+/**
+ * usb_hcd_fsl_remove - shutdown processing for FSL-based HCDs
+ * @dev: USB Host Controller being removed
+ * Context: !in_interrupt()
+ *
+ * Reverses the effect of usb_hcd_fsl_probe().
+ *
+ */
+void usb_hcd_fsl_remove(struct usb_hcd *hcd, struct platform_device *pdev)
+{
+	usb_remove_hcd(hcd);
+	iounmap(hcd->regs);
+	release_mem_region(hcd->rsrc_start, hcd->rsrc_len);
+	usb_put_hcd(hcd);
+}
+
+static void mpc83xx_setup_phy(struct ehci_hcd *ehci,
+			      enum fsl_usb2_phy_modes phy_mode,
+			      unsigned int port_offset)
+{
+	u32 portsc = readl(&ehci->regs->port_status[port_offset]);
+	portsc &= ~PORT_PTS_MSK;
+	switch (phy_mode) {
+	case FSL_USB2_PHY_ULPI:
+		portsc |= PORT_PTS_ULPI;
+		break;
+	case FSL_USB2_PHY_SERIAL:
+		portsc |= PORT_PTS_SERIAL;
+		break;
+	case FSL_USB2_PHY_UTMI_WIDE:
+		portsc |= PORT_PTS_PTW;
+		/* fall through */
+	case FSL_USB2_PHY_UTMI:
+		portsc |= PORT_PTS_UTMI;
+		break;
+	case FSL_USB2_PHY_NONE:
+		break;
+	}
+	writel(portsc, &ehci->regs->port_status[port_offset]);
+}
+
+static void mpc83xx_usb_setup(struct usb_hcd *hcd)
+{
+	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
+	struct fsl_usb2_platform_data *pdata;
+	void __iomem *non_ehci = hcd->regs;
+
+	pdata =
+	    (struct fsl_usb2_platform_data *)hcd->self.controller->
+	    platform_data;
+	/* Enable PHY interface in the control reg. */
+	out_be32(non_ehci + FSL_SOC_USB_CTRL, 0x00000004);
+	out_be32(non_ehci + FSL_SOC_USB_SNOOP1, 0x0000001b);
+
+	if (pdata->operating_mode == FSL_USB2_DR_HOST)
+		mpc83xx_setup_phy(ehci, pdata->phy_mode, 0);
+
+	if (pdata->operating_mode == FSL_USB2_MPH_HOST) {
+		if (pdata->port_enables & FSL_USB2_PORT0_ENABLED)
+			mpc83xx_setup_phy(ehci, pdata->phy_mode, 0);
+		if (pdata->port_enables & FSL_USB2_PORT1_ENABLED)
+			mpc83xx_setup_phy(ehci, pdata->phy_mode, 1);
+	}
+
+	/* put controller in host mode. */
+	writel(0x00000003, non_ehci + FSL_SOC_USB_USBMODE);
+	out_be32(non_ehci + FSL_SOC_USB_PRICTRL, 0x0000000c);
+	out_be32(non_ehci + FSL_SOC_USB_AGECNTTHRSH, 0x00000040);
+	out_be32(non_ehci + FSL_SOC_USB_SICTRL, 0x00000001);
+}
+
+/* called after powerup, by probe or system-pm "wakeup" */
+static int ehci_fsl_reinit(struct ehci_hcd *ehci)
+{
+	mpc83xx_usb_setup(ehci_to_hcd(ehci));
+	ehci_port_power(ehci, 0);
+
+	return 0;
+}
+
+/* called during probe() after chip reset completes */
+static int ehci_fsl_setup(struct usb_hcd *hcd)
+{
+	struct ehci_hcd *ehci = hcd_to_ehci(hcd);
+	int retval;
+
+	/* EHCI registers start at offset 0x100 */
+	ehci->caps = hcd->regs + 0x100;
+	ehci->regs = hcd->regs + 0x100 +
+	    HC_LENGTH(readl(&ehci->caps->hc_capbase));
+	dbg_hcs_params(ehci, "reset");
+	dbg_hcc_params(ehci, "reset");
+
+	/* cache this readonly data; minimize chip reads */
+	ehci->hcs_params = readl(&ehci->caps->hcs_params);
+
+	retval = ehci_halt(ehci);
+	if (retval)
+		return retval;
+
+	/* data structure init */
+	retval = ehci_init(hcd);
+	if (retval)
+		return retval;
+
+	ehci->is_tdi_rh_tt = 1;
+
+	ehci->sbrn = 0x20;
+
+	ehci_reset(ehci);
+
+	retval = ehci_fsl_reinit(ehci);
+	return retval;
+}
+
+static const struct hc_driver ehci_fsl_hc_driver = {
+	.description = hcd_name,
+	.product_desc = "Freescale On-Chip EHCI Host Controller",
+	.hcd_priv_size = sizeof(struct ehci_hcd),
+
+	/*
+	 * generic hardware linkage
+	 */
+	.irq = ehci_irq,
+	.flags = HCD_USB2,
+
+	/*
+	 * basic lifecycle operations
+	 */
+	.reset = ehci_fsl_setup,
+	.start = ehci_run,
+#ifdef	CONFIG_PM
+	.suspend = ehci_bus_suspend,
+	.resume = ehci_bus_resume,
+#endif
+	.stop = ehci_stop,
+
+	/*
+	 * managing i/o requests and associated device resources
+	 */
+	.urb_enqueue = ehci_urb_enqueue,
+	.urb_dequeue = ehci_urb_dequeue,
+	.endpoint_disable = ehci_endpoint_disable,
+
+	/*
+	 * scheduling support
+	 */
+	.get_frame_number = ehci_get_frame,
+
+	/*
+	 * root hub support
+	 */
+	.hub_status_data = ehci_hub_status_data,
+	.hub_control = ehci_hub_control,
+	.bus_suspend = ehci_bus_suspend,
+	.bus_resume = ehci_bus_resume,
+};
+
+static int ehci_fsl_drv_probe(struct platform_device *pdev)
+{
+	if (usb_disabled())
+		return -ENODEV;
+
+	return usb_hcd_fsl_probe(&ehci_fsl_hc_driver, pdev);
+}
+
+static int ehci_fsl_drv_remove(struct platform_device *pdev)
+{
+	struct usb_hcd *hcd = platform_get_drvdata(pdev);
+
+	usb_hcd_fsl_remove(hcd, pdev);
+
+	return 0;
+}
+
+static struct platform_driver ehci_fsl_dr_driver = {
+	.probe = ehci_fsl_drv_probe,
+	.remove = ehci_fsl_drv_remove,
+	.driver = {
+		   .name = "fsl-usb2-dr",
+		   },
+};
+
+static struct platform_driver ehci_fsl_mph_driver = {
+	.probe = ehci_fsl_drv_probe,
+	.remove = ehci_fsl_drv_remove,
+	.driver = {
+		   .name = "fsl-usb2-mph",
+		   },
+};
+
+static int __init ehci_fsl_init(void)
+{
+	int retval;
+
+	pr_debug("%s: block sizes: qh %Zd qtd %Zd itd %Zd sitd %Zd\n",
+		 hcd_name,
+		 sizeof(struct ehci_qh), sizeof(struct ehci_qtd),
+		 sizeof(struct ehci_itd), sizeof(struct ehci_sitd));
+
+	retval = platform_driver_register(&ehci_fsl_dr_driver);
+	if (retval)
+		return retval;
+
+	return platform_driver_register(&ehci_fsl_mph_driver);
+}
+
+static void __exit ehci_fsl_cleanup(void)
+{
+	platform_driver_unregister(&ehci_fsl_mph_driver);
+	platform_driver_unregister(&ehci_fsl_dr_driver);
+}
+
+module_init(ehci_fsl_init);
+module_exit(ehci_fsl_cleanup);
diff --git a/drivers/usb/host/ehci-fsl.h b/drivers/usb/host/ehci-fsl.h
new file mode 100644
index 000000000000..caac0d1967d0
--- /dev/null
+++ b/drivers/usb/host/ehci-fsl.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2005 freescale semiconductor
+ * Copyright (c) 2005 MontaVista Software
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the  GNU General Public License along
+ * with this program; if not, write  to the Free Software Foundation, Inc.,
+ * 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _EHCI_FSL_H
+#define _EHCI_FSL_H
+
+/* offsets for the non-ehci registers in the FSL SOC USB controller */
+#define FSL_SOC_USB_ULPIVP	0x170
+#define FSL_SOC_USB_PORTSC1	0x184
+#define PORT_PTS_MSK		(3<<30)
+#define PORT_PTS_UTMI		(0<<30)
+#define PORT_PTS_ULPI		(2<<30)
+#define	PORT_PTS_SERIAL		(3<<30)
+#define PORT_PTS_PTW		(1<<28)
+#define FSL_SOC_USB_PORTSC2	0x188
+#define FSL_SOC_USB_USBMODE	0x1a8
+#define FSL_SOC_USB_SNOOP1	0x400	/* NOTE: big-endian */
+#define FSL_SOC_USB_SNOOP2	0x404	/* NOTE: big-endian */
+#define FSL_SOC_USB_AGECNTTHRSH	0x408	/* NOTE: big-endian */
+#define FSL_SOC_USB_SICTRL	0x40c	/* NOTE: big-endian */
+#define FSL_SOC_USB_PRICTRL	0x410	/* NOTE: big-endian */
+#define FSL_SOC_USB_CTRL	0x500	/* NOTE: big-endian */
+#endif				/* _EHCI_FSL_H */
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 9dd3d14c64f3..8730babb5771 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -891,6 +891,12 @@ MODULE_LICENSE ("GPL");
 #include "ehci-pci.c"
 #endif
 
-#if !defined(CONFIG_PCI)
+#ifdef CONFIG_PPC_83xx
+#include "ehci-fsl.c"
+#endif
+
+#if !(defined(CONFIG_PCI) || \
+      defined(CONFIG_PPC_83xx) \
+     )
 #error "missing bus glue for ehci-hcd"
 #endif
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index a9f1cfd096ff..a3a0e078f79d 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -83,5 +83,32 @@ struct fsl_i2c_platform_data {
 #define FSL_I2C_DEV_SEPARATE_DFSRR	0x00000001
 #define FSL_I2C_DEV_CLOCK_5200		0x00000002
 
+
+enum fsl_usb2_operating_modes {
+	FSL_USB2_MPH_HOST,
+	FSL_USB2_DR_HOST,
+	FSL_USB2_DR_DEVICE,
+	FSL_USB2_DR_OTG,
+};
+
+enum fsl_usb2_phy_modes {
+	FSL_USB2_PHY_NONE,
+	FSL_USB2_PHY_ULPI,
+	FSL_USB2_PHY_UTMI,
+	FSL_USB2_PHY_UTMI_WIDE,
+	FSL_USB2_PHY_SERIAL,
+};
+
+struct fsl_usb2_platform_data {
+	/* board specific information */
+	enum fsl_usb2_operating_modes operating_mode;
+	enum fsl_usb2_phy_modes phy_mode;
+	unsigned int port_enables;
+};
+
+/* Flags in fsl_usb2_mph_platform_data */
+#define FSL_USB2_PORT0_ENABLED	0x00000001
+#define FSL_USB2_PORT1_ENABLED	0x00000002
+
 #endif				/* _FSL_DEVICE_H_ */
 #endif				/* __KERNEL__ */
-- 
cgit v1.2.3


From 329af28b141ab4ae847aff1362864c4cc332641f Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sat, 18 Feb 2006 12:31:05 -0800
Subject: [PATCH] USB: gadget driver section fixups

This adds __init section annotations to gadget driver bind() routines to
remove calls from .text into .init sections (for endpoint autoconfig).
Likewise it adds __exit section annotations to their unbind() routines.

The specification of the gadget driver register/unregister functions is
updated to explicitly allow use of those sections.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/ether.c        | 4 ++--
 drivers/usb/gadget/file_storage.c | 4 ++--
 drivers/usb/gadget/serial.c       | 6 +++---
 drivers/usb/gadget/zero.c         | 6 +++---
 include/linux/usb_gadget.h        | 7 +++++--
 5 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c
index 3d2603e31808..0d9d9bbb73f0 100644
--- a/drivers/usb/gadget/ether.c
+++ b/drivers/usb/gadget/ether.c
@@ -2125,7 +2125,7 @@ eth_req_free (struct usb_ep *ep, struct usb_request *req)
 }
 
 
-static void
+static void __exit
 eth_unbind (struct usb_gadget *gadget)
 {
 	struct eth_dev		*dev = get_gadget_data (gadget);
@@ -2532,7 +2532,7 @@ static struct usb_gadget_driver eth_driver = {
 
 	.function	= (char *) driver_desc,
 	.bind		= eth_bind,
-	.unbind		= eth_unbind,
+	.unbind		= __exit_p(eth_unbind),
 
 	.setup		= eth_setup,
 	.disconnect	= eth_disconnect,
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index de59c58896d6..cf3be299e353 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -3678,7 +3678,7 @@ static void lun_release(struct device *dev)
 	kref_put(&fsg->ref, fsg_release);
 }
 
-static void fsg_unbind(struct usb_gadget *gadget)
+static void __exit fsg_unbind(struct usb_gadget *gadget)
 {
 	struct fsg_dev		*fsg = get_gadget_data(gadget);
 	int			i;
@@ -4064,7 +4064,7 @@ static struct usb_gadget_driver		fsg_driver = {
 #endif
 	.function	= (char *) longname,
 	.bind		= fsg_bind,
-	.unbind		= fsg_unbind,
+	.unbind		= __exit_p(fsg_unbind),
 	.disconnect	= fsg_disconnect,
 	.setup		= fsg_setup,
 	.suspend	= fsg_suspend,
diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c
index ba9acd531024..548feaac4553 100644
--- a/drivers/usb/gadget/serial.c
+++ b/drivers/usb/gadget/serial.c
@@ -369,7 +369,7 @@ static struct usb_gadget_driver gs_gadget_driver = {
 #endif /* CONFIG_USB_GADGET_DUALSPEED */
 	.function =		GS_LONG_NAME,
 	.bind =			gs_bind,
-	.unbind =		gs_unbind,
+	.unbind =		__exit_p(gs_unbind),
 	.setup =		gs_setup,
 	.disconnect =		gs_disconnect,
 	.driver = {
@@ -1413,7 +1413,7 @@ requeue:
  * Called on module load.  Allocates and initializes the device
  * structure and a control request.
  */
-static int gs_bind(struct usb_gadget *gadget)
+static int __init gs_bind(struct usb_gadget *gadget)
 {
 	int ret;
 	struct usb_ep *ep;
@@ -1538,7 +1538,7 @@ autoconf_fail:
  * Called on module unload.  Frees the control request and device
  * structure.
  */
-static void gs_unbind(struct usb_gadget *gadget)
+static void __exit gs_unbind(struct usb_gadget *gadget)
 {
 	struct gs_dev *dev = get_gadget_data(gadget);
 
diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c
index 5e9fe8a70543..44d8e5e77da7 100644
--- a/drivers/usb/gadget/zero.c
+++ b/drivers/usb/gadget/zero.c
@@ -1119,7 +1119,7 @@ zero_autoresume (unsigned long _dev)
 
 /*-------------------------------------------------------------------------*/
 
-static void
+static void __exit
 zero_unbind (struct usb_gadget *gadget)
 {
 	struct zero_dev		*dev = get_gadget_data (gadget);
@@ -1136,7 +1136,7 @@ zero_unbind (struct usb_gadget *gadget)
 	set_gadget_data (gadget, NULL);
 }
 
-static int
+static int __init
 zero_bind (struct usb_gadget *gadget)
 {
 	struct zero_dev		*dev;
@@ -1288,7 +1288,7 @@ static struct usb_gadget_driver zero_driver = {
 #endif
 	.function	= (char *) longname,
 	.bind		= zero_bind,
-	.unbind		= zero_unbind,
+	.unbind		= __exit_p(zero_unbind),
 
 	.setup		= zero_setup,
 	.disconnect	= zero_disconnect,
diff --git a/include/linux/usb_gadget.h b/include/linux/usb_gadget.h
index ff81117eb733..1d78870ed8af 100644
--- a/include/linux/usb_gadget.h
+++ b/include/linux/usb_gadget.h
@@ -801,7 +801,9 @@ struct usb_gadget_driver {
  * Call this in your gadget driver's module initialization function,
  * to tell the underlying usb controller driver about your driver.
  * The driver's bind() function will be called to bind it to a
- * gadget.  This function must be called in a context that can sleep.
+ * gadget before this registration call returns.  It's expected that
+ * the bind() functions will be in init sections.
+ * This function must be called in a context that can sleep.
  */
 int usb_gadget_register_driver (struct usb_gadget_driver *driver);
 
@@ -814,7 +816,8 @@ int usb_gadget_register_driver (struct usb_gadget_driver *driver);
  * going away.  If the controller is connected to a USB host,
  * it will first disconnect().  The driver is also requested
  * to unbind() and clean up any device state, before this procedure
- * finally returns.
+ * finally returns.  It's expected that the unbind() functions
+ * will in in exit sections, so may not be linked in some kernels.
  * This function must be called in a context that can sleep.
  */
 int usb_gadget_unregister_driver (struct usb_gadget_driver *driver);
-- 
cgit v1.2.3


From 65f5c7c1143fb8eed5bc7e7d8c926346e00fe3c0 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 20 Mar 2006 16:55:08 -0800
Subject: [IPV6]: ROUTE: Add accept_ra_defrtr sysctl.

This controls whether we accept default router information
in RAs.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  2 ++
 include/linux/sysctl.h                 |  1 +
 net/ipv6/addrconf.c                    | 11 +++++++++++
 net/ipv6/ndisc.c                       |  7 ++++++-
 5 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 26364d06ae92..8001faa76ea2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -717,6 +717,12 @@ accept_ra - BOOLEAN
 	Functional default: enabled if local forwarding is disabled.
 			    disabled if local forwarding is enabled.
 
+accept_ra_defrtr - BOOLEAN
+	Learn default router in Router Advertisement.
+
+	Functional default: enabled if accept_ra is enabled.
+			    disabled if accept_ra is disabled.
+
 accept_redirects - BOOLEAN
 	Accept Redirects.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9c8f4c9ed429..c5131a02869a 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -145,6 +145,7 @@ struct ipv6_devconf {
 	__s32		max_desync_factor;
 #endif
 	__s32		max_addresses;
+	__s32		accept_ra_defrtr;
 	void		*sysctl;
 };
 
@@ -167,6 +168,7 @@ enum {
 	DEVCONF_MAX_DESYNC_FACTOR,
 	DEVCONF_MAX_ADDRESSES,
 	DEVCONF_FORCE_MLD_VERSION,
+	DEVCONF_ACCEPT_RA_DEFRTR,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index bac61db26456..0f494137d037 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -531,6 +531,7 @@ enum {
 	NET_IPV6_MAX_DESYNC_FACTOR=15,
 	NET_IPV6_MAX_ADDRESSES=16,
 	NET_IPV6_FORCE_MLD_VERSION=17,
+	NET_IPV6_ACCEPT_RA_DEFRTR=18,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8a8895ef09a7..fbcdcc6ba93b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -165,6 +165,7 @@ struct ipv6_devconf ipv6_devconf = {
 	.max_desync_factor	= MAX_DESYNC_FACTOR,
 #endif
 	.max_addresses		= IPV6_MAX_ADDRESSES,
+	.accept_ra_defrtr	= 1,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt = {
@@ -186,6 +187,7 @@ static struct ipv6_devconf ipv6_devconf_dflt = {
 	.max_desync_factor	= MAX_DESYNC_FACTOR,
 #endif
 	.max_addresses		= IPV6_MAX_ADDRESSES,
+	.accept_ra_defrtr	= 1,
 };
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
@@ -3116,6 +3118,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
 #endif
 	array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
+	array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
 }
 
 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 
@@ -3568,6 +3571,14 @@ static struct addrconf_sysctl_table
 			.mode		=	0644,
 			.proc_handler	=	&proc_dointvec,
 		},
+		{
+			.ctl_name	=	NET_IPV6_ACCEPT_RA_DEFRTR,
+			.procname	=	"accept_ra_defrtr",
+         		.data		=	&ipv6_devconf.accept_ra_defrtr,
+			.maxlen		=	sizeof(int),
+			.mode		=	0644,
+         		.proc_handler	=	&proc_dointvec,
+		},
 		{
 			.ctl_name	=	0,	/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index cb8856b1d951..e17116796059 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1019,7 +1019,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
         struct ra_msg *ra_msg = (struct ra_msg *) skb->h.raw;
 	struct neighbour *neigh = NULL;
 	struct inet6_dev *in6_dev;
-	struct rt6_info *rt;
+	struct rt6_info *rt = NULL;
 	int lifetime;
 	struct ndisc_options ndopts;
 	int optlen;
@@ -1081,6 +1081,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 				(ra_msg->icmph.icmp6_addrconf_other ?
 					IF_RA_OTHERCONF : 0);
 
+	if (!in6_dev->cnf.accept_ra_defrtr)
+		goto skip_defrtr;
+
 	lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
 
 	rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
@@ -1128,6 +1131,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit;
 	}
 
+skip_defrtr:
+
 	/*
 	 *	Update Reachable Time and Retrans Timer
 	 */
-- 
cgit v1.2.3


From c4fd30eb18666972230689eb30e8f90844bce635 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 20 Mar 2006 16:55:26 -0800
Subject: [IPV6]: ADDRCONF: Add accept_ra_pinfo sysctl.

This controls whether we accept Prefix Information in RAs.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 10 ++++++++--
 include/linux/ipv6.h                   |  2 ++
 include/linux/sysctl.h                 |  1 +
 net/ipv6/addrconf.c                    | 11 +++++++++++
 net/ipv6/ndisc.c                       |  2 +-
 5 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8001faa76ea2..404afacb468d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -723,6 +723,12 @@ accept_ra_defrtr - BOOLEAN
 	Functional default: enabled if accept_ra is enabled.
 			    disabled if accept_ra is disabled.
 
+accept_ra_pinfo - BOOLEAN
+	Learn Prefix Inforamtion in Router Advertisement.
+
+	Functional default: enabled if accept_ra is enabled.
+			    disabled if accept_ra is disabled.
+
 accept_redirects - BOOLEAN
 	Accept Redirects.
 
@@ -733,8 +739,8 @@ autoconf - BOOLEAN
 	Autoconfigure addresses using Prefix Information in Router 
 	Advertisements.
 
-	Functional default: enabled if accept_ra is enabled.
-			    disabled if accept_ra is disabled.
+	Functional default: enabled if accept_ra_pinfo is enabled.
+			    disabled if accept_ra_pinfo is disabled.
 
 dad_transmits - INTEGER
 	The amount of Duplicate Address Detection probes to send.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c5131a02869a..2c3b799480c5 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -146,6 +146,7 @@ struct ipv6_devconf {
 #endif
 	__s32		max_addresses;
 	__s32		accept_ra_defrtr;
+	__s32		accept_ra_pinfo;
 	void		*sysctl;
 };
 
@@ -169,6 +170,7 @@ enum {
 	DEVCONF_MAX_ADDRESSES,
 	DEVCONF_FORCE_MLD_VERSION,
 	DEVCONF_ACCEPT_RA_DEFRTR,
+	DEVCONF_ACCEPT_RA_PINFO,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 0f494137d037..09378ea505bd 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -532,6 +532,7 @@ enum {
 	NET_IPV6_MAX_ADDRESSES=16,
 	NET_IPV6_FORCE_MLD_VERSION=17,
 	NET_IPV6_ACCEPT_RA_DEFRTR=18,
+	NET_IPV6_ACCEPT_RA_PINFO=19,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index fbcdcc6ba93b..631b51d0ccbc 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -166,6 +166,7 @@ struct ipv6_devconf ipv6_devconf = {
 #endif
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
+	.accept_ra_pinfo	= 1,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt = {
@@ -188,6 +189,7 @@ static struct ipv6_devconf ipv6_devconf_dflt = {
 #endif
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
+	.accept_ra_pinfo	= 1,
 };
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
@@ -3119,6 +3121,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 #endif
 	array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
 	array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+	array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
 }
 
 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 
@@ -3579,6 +3582,14 @@ static struct addrconf_sysctl_table
 			.mode		=	0644,
          		.proc_handler	=	&proc_dointvec,
 		},
+		{
+			.ctl_name	=	NET_IPV6_ACCEPT_RA_PINFO,
+			.procname	=	"accept_ra_pinfo",
+         		.data		=	&ipv6_devconf.accept_ra_pinfo,
+			.maxlen		=	sizeof(int),
+			.mode		=	0644,
+         		.proc_handler	=	&proc_dointvec,
+		},
 		{
 			.ctl_name	=	0,	/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index e17116796059..3b56be85234e 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1191,7 +1191,7 @@ skip_defrtr:
 			     NEIGH_UPDATE_F_ISROUTER);
 	}
 
-	if (ndopts.nd_opts_pi) {
+	if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
 		struct nd_opt_hdr *p;
 		for (p = ndopts.nd_opts_pi;
 		     p;
-- 
cgit v1.2.3


From ebacaaa0fdf4402cdf4c8e569f54af36b6f0aa2d Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 20 Mar 2006 17:04:53 -0800
Subject: [IPV6]: ROUTE: Add support for Router Preference (RFC4191).

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/icmpv6.h     | 11 +++++++++--
 include/linux/ipv6_route.h |  8 ++++++++
 include/net/ip6_route.h    |  3 ++-
 net/ipv6/Kconfig           | 11 +++++++++++
 net/ipv6/ndisc.c           | 12 +++++++++++-
 net/ipv6/route.c           | 11 ++++++++---
 6 files changed, 49 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 0cf6c8b12caf..c771a7db9871 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -40,14 +40,16 @@ struct icmp6hdr {
                 struct icmpv6_nd_ra {
 			__u8		hop_limit;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-			__u8		reserved:6,
+			__u8		reserved:4,
+					router_pref:2,
 					other:1,
 					managed:1;
 
 #elif defined(__BIG_ENDIAN_BITFIELD)
 			__u8		managed:1,
 					other:1,
-					reserved:6;
+					router_pref:2,
+					reserved:4;
 #else
 #error	"Please fix <asm/byteorder.h>"
 #endif
@@ -70,8 +72,13 @@ struct icmp6hdr {
 #define icmp6_addrconf_managed	icmp6_dataun.u_nd_ra.managed
 #define icmp6_addrconf_other	icmp6_dataun.u_nd_ra.other
 #define icmp6_rt_lifetime	icmp6_dataun.u_nd_ra.rt_lifetime
+#define icmp6_router_pref	icmp6_dataun.u_nd_ra.router_pref
 };
 
+#define ICMPV6_ROUTER_PREF_LOW		0x3
+#define ICMPV6_ROUTER_PREF_MEDIUM	0x0
+#define ICMPV6_ROUTER_PREF_HIGH		0x1
+#define ICMPV6_ROUTER_PREF_INVALID	0x2
 
 #define ICMPV6_DEST_UNREACH		1
 #define ICMPV6_PKT_TOOBIG		2
diff --git a/include/linux/ipv6_route.h b/include/linux/ipv6_route.h
index d7c41d1d706a..f4b085c91608 100644
--- a/include/linux/ipv6_route.h
+++ b/include/linux/ipv6_route.h
@@ -27,8 +27,16 @@
 #define RTF_FLOW	0x02000000	/* flow significant route	*/
 #define RTF_POLICY	0x04000000	/* policy route			*/
 
+#define RTF_PREF(pref)	((pref) << 27)
+#define RTF_PREF_MASK	0x18000000
+
 #define RTF_LOCAL	0x80000000
 
+#ifdef __KERNEL__
+#define IPV6_EXTRACT_PREF(flag)	(((flag) & RTF_PREF_MASK) >> 27)
+#define IPV6_DECODE_PREF(pref)	((pref) ^ 2)	/* 1:low,2:med,3:high */
+#endif
+
 struct in6_rtmsg {
 	struct in6_addr		rtmsg_dst;
 	struct in6_addr		rtmsg_src;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 01acca06d6dd..50161322b828 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -87,7 +87,8 @@ extern struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 extern struct rt6_info *	rt6_get_dflt_router(struct in6_addr *addr,
 						    struct net_device *dev);
 extern struct rt6_info *	rt6_add_dflt_router(struct in6_addr *gwaddr,
-						    struct net_device *dev);
+						    struct net_device *dev,
+						    unsigned int pref);
 
 extern void			rt6_purge_dflt_routers(void);
 
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index f925f206d8ff..c456ead8a4a3 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -38,6 +38,17 @@ config IPV6_PRIVACY
 
 	  See <file:Documentation/networking/ip-sysctl.txt> for details.
 
+config IPV6_ROUTER_PREF
+	bool "IPv6: Router Preference (RFC 4191) support"
+	depends on IPV6
+	---help---
+	  Router Preference is an optional extension to the Router
+	  Advertisement message to improve the ability of hosts
+	  to pick more appropriate router, especially when the hosts
+	  is placed in a multi-homed network.
+
+	  If unsure, say N.
+
 config INET6_AH
 	tristate "IPv6: AH transformation"
 	depends on IPV6
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 3b56be85234e..966ab6b3022e 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1023,6 +1023,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	int lifetime;
 	struct ndisc_options ndopts;
 	int optlen;
+	unsigned int pref = 0;
 
 	__u8 * opt = (__u8 *)(ra_msg + 1);
 
@@ -1086,6 +1087,13 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
 	lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
 
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	pref = ra_msg->icmph.icmp6_router_pref;
+	/* 10b is handled as if it were 00b (medium) */
+	if (pref == ICMPV6_ROUTER_PREF_INVALID)
+		pref = ICMPV6_ROUTER_PREF_MEDIUM;
+#endif
+
 	rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
 
 	if (rt)
@@ -1101,7 +1109,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		ND_PRINTK3(KERN_DEBUG
 			   "ICMPv6 RA: adding default router.\n");
 
-		rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
+		rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev, pref);
 		if (rt == NULL) {
 			ND_PRINTK0(KERN_ERR
 				   "ICMPv6 RA: %s() failed to add default route.\n",
@@ -1120,6 +1128,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			return;
 		}
 		neigh->flags |= NTF_ROUTER;
+	} else if (rt) {
+		rt->rt6i_flags |= (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 	}
 
 	if (rt)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6a068e7f81f1..a7030fed1a18 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -251,8 +251,11 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
 	int m = rt6_check_dev(rt, oif);
 	if (!m && (strict & RT6_SELECT_F_IFACE))
 		return -1;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+#endif
 	if (rt6_check_neigh(rt))
-		m |= 4;
+		m |= 16;
 	else if (strict & RT6_SELECT_F_REACHABLE)
 		return -1;
 	return m;
@@ -1256,7 +1259,8 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *d
 }
 
 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
-				     struct net_device *dev)
+				     struct net_device *dev,
+				     unsigned int pref)
 {
 	struct in6_rtmsg rtmsg;
 
@@ -1264,7 +1268,8 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
 	rtmsg.rtmsg_metric = 1024;
-	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES;
+	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
+			    RTF_PREF(pref);
 
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 
-- 
cgit v1.2.3


From 930d6ff2e2a5f1538448d3b0b2652a8f0c0f6cba Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 20 Mar 2006 17:05:30 -0800
Subject: [IPV6]: ROUTE: Add accept_ra_rtr_pref sysctl.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  4 ++++
 include/linux/sysctl.h                 |  1 +
 net/ipv6/addrconf.c                    | 19 +++++++++++++++++++
 net/ipv6/ndisc.c                       |  3 ++-
 5 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 404afacb468d..87bbd774c2b2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -729,6 +729,12 @@ accept_ra_pinfo - BOOLEAN
 	Functional default: enabled if accept_ra is enabled.
 			    disabled if accept_ra is disabled.
 
+accept_ra_rtr_pref - BOOLEAN
+	Accept Router Preference in RA.
+
+	Functional default: enabled if accept_ra is enabled.
+			    disabled if accept_ra is disabled.
+
 accept_redirects - BOOLEAN
 	Accept Redirects.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 2c3b799480c5..108b75dccd9f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -147,6 +147,9 @@ struct ipv6_devconf {
 	__s32		max_addresses;
 	__s32		accept_ra_defrtr;
 	__s32		accept_ra_pinfo;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	__s32		accept_ra_rtr_pref;
+#endif
 	void		*sysctl;
 };
 
@@ -171,6 +174,7 @@ enum {
 	DEVCONF_FORCE_MLD_VERSION,
 	DEVCONF_ACCEPT_RA_DEFRTR,
 	DEVCONF_ACCEPT_RA_PINFO,
+	DEVCONF_ACCEPT_RA_RTR_PREF,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 09378ea505bd..236f537b38d2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -533,6 +533,7 @@ enum {
 	NET_IPV6_FORCE_MLD_VERSION=17,
 	NET_IPV6_ACCEPT_RA_DEFRTR=18,
 	NET_IPV6_ACCEPT_RA_PINFO=19,
+	NET_IPV6_ACCEPT_RA_RTR_PREF=20,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 631b51d0ccbc..51edba5fea26 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -167,6 +167,9 @@ struct ipv6_devconf ipv6_devconf = {
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
 	.accept_ra_pinfo	= 1,
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	.accept_ra_rtr_pref	= 1,
+#endif
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt = {
@@ -190,6 +193,9 @@ static struct ipv6_devconf ipv6_devconf_dflt = {
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
 	.accept_ra_pinfo	= 1,
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	.accept_ra_rtr_pref	= 1,
+#endif
 };
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
@@ -3122,6 +3128,9 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
 	array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
 	array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
+#endif
 }
 
 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 
@@ -3590,6 +3599,16 @@ static struct addrconf_sysctl_table
 			.mode		=	0644,
          		.proc_handler	=	&proc_dointvec,
 		},
+#ifdef CONFIG_IPV6_ROUTER_PREF
+		{
+			.ctl_name	=	NET_IPV6_ACCEPT_RA_RTR_PREF,
+			.procname	=	"accept_ra_rtr_pref",
+			.data		=	&ipv6_devconf.accept_ra_rtr_pref,
+			.maxlen		=	sizeof(int),
+			.mode		=	0644,
+			.proc_handler	=	&proc_dointvec,
+		},
+#endif
 		{
 			.ctl_name	=	0,	/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 966ab6b3022e..f4462ee33024 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1090,7 +1090,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	pref = ra_msg->icmph.icmp6_router_pref;
 	/* 10b is handled as if it were 00b (medium) */
-	if (pref == ICMPV6_ROUTER_PREF_INVALID)
+	if (pref == ICMPV6_ROUTER_PREF_INVALID ||
+	    in6_dev->cnf.accept_ra_rtr_pref)
 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
 #endif
 
-- 
cgit v1.2.3


From 52e1635631b342803aecaf81a362c1464e3da2e5 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 20 Mar 2006 17:05:47 -0800
Subject: [IPV6]: ROUTE: Add router_probe_interval sysctl.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  2 ++
 include/linux/sysctl.h                 |  1 +
 net/ipv6/addrconf.c                    | 12 ++++++++++++
 net/ipv6/route.c                       |  2 +-
 5 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 87bbd774c2b2..88efed0a533f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -789,6 +789,12 @@ mtu - INTEGER
 	Default Maximum Transfer Unit
 	Default: 1280 (IPv6 required minimum)
 
+router_probe_interval - INTEGER
+	Minimum interval (in seconds) between Router Probing described
+	in RFC4191.
+
+	Default: 60
+
 router_solicitation_delay - INTEGER
 	Number of seconds to wait after interface is brought up
 	before sending Router Solicitations.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 108b75dccd9f..c609cc702375 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -149,6 +149,7 @@ struct ipv6_devconf {
 	__s32		accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	__s32		accept_ra_rtr_pref;
+	__s32		rtr_probe_interval;
 #endif
 	void		*sysctl;
 };
@@ -175,6 +176,7 @@ enum {
 	DEVCONF_ACCEPT_RA_DEFRTR,
 	DEVCONF_ACCEPT_RA_PINFO,
 	DEVCONF_ACCEPT_RA_RTR_PREF,
+	DEVCONF_RTR_PROBE_INTERVAL,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 236f537b38d2..f49488ffefef 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -534,6 +534,7 @@ enum {
 	NET_IPV6_ACCEPT_RA_DEFRTR=18,
 	NET_IPV6_ACCEPT_RA_PINFO=19,
 	NET_IPV6_ACCEPT_RA_RTR_PREF=20,
+	NET_IPV6_RTR_PROBE_INTERVAL=21,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 51edba5fea26..e7add61e6e39 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -169,6 +169,7 @@ struct ipv6_devconf ipv6_devconf = {
 	.accept_ra_pinfo	= 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
+	.rtr_probe_interval	= 60 * HZ,
 #endif
 };
 
@@ -195,6 +196,7 @@ static struct ipv6_devconf ipv6_devconf_dflt = {
 	.accept_ra_pinfo	= 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
+	.rtr_probe_interval	= 60 * HZ,
 #endif
 };
 
@@ -3130,6 +3132,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
+	array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval;
 #endif
 }
 
@@ -3608,6 +3611,15 @@ static struct addrconf_sysctl_table
 			.mode		=	0644,
 			.proc_handler	=	&proc_dointvec,
 		},
+		{
+			.ctl_name	=	NET_IPV6_RTR_PROBE_INTERVAL,
+			.procname	=	"router_probe_interval",
+			.data		=	&ipv6_devconf.rtr_probe_interval,
+			.maxlen		=	sizeof(int),
+			.mode		=	0644,
+			.proc_handler	=	&proc_dointvec_jiffies,
+			.strategy	=	&sysctl_jiffies,
+		},
 #endif
 		{
 			.ctl_name	=	0,	/* sentinel */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8ba8900c0a5f..c797b9bbb7d1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -234,7 +234,7 @@ static void rt6_probe(struct rt6_info *rt)
 		return;
 	read_lock_bh(&neigh->lock);
 	if (!(neigh->nud_state & NUD_VALID) &&
-	    time_after(jiffies, neigh->updated + 60 * HZ)) {
+	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 		struct in6_addr mcaddr;
 		struct in6_addr *target;
 
-- 
cgit v1.2.3


From 70ceb4f53929f73746be72f73707cd9f8753e2fc Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 20 Mar 2006 17:06:24 -0800
Subject: [IPV6]: ROUTE: Add experimental support for Route Information Option
 in RA (RFC4191).

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6_route.h |   2 +
 include/net/ip6_route.h    |  21 +++++++
 include/net/ndisc.h        |   2 +
 net/ipv6/Kconfig           |   8 +++
 net/ipv6/ndisc.c           |  25 ++++++++-
 net/ipv6/route.c           | 134 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 191 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6_route.h b/include/linux/ipv6_route.h
index f4b085c91608..b323ff577967 100644
--- a/include/linux/ipv6_route.h
+++ b/include/linux/ipv6_route.h
@@ -23,6 +23,8 @@
 #define RTF_NONEXTHOP	0x00200000	/* route with no nexthop	*/
 #define RTF_EXPIRES	0x00400000
 
+#define RTF_ROUTEINFO	0x00800000	/* route information - RA	*/
+
 #define RTF_CACHE	0x01000000	/* cache entry			*/
 #define RTF_FLOW	0x02000000	/* flow significant route	*/
 #define RTF_POLICY	0x04000000	/* policy route			*/
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 50161322b828..a398ae5e30f9 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -7,6 +7,23 @@
 #define IP6_RT_PRIO_KERN	512
 #define IP6_RT_FLOW_MASK	0x00ff
 
+struct route_info {
+	__u8			type;
+	__u8			length;
+	__u8			prefix_len;
+#if defined(__BIG_ENDIAN_BITFIELD)
+	__u8			reserved_h:3,
+				route_pref:2,
+				reserved_l:3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8			reserved_l:3,
+				route_pref:2,
+				reserved_h:3;
+#endif
+	__u32			lifetime;
+	__u8			prefix[0];	/* 0,8 or 16 */
+};
+
 #ifdef __KERNEL__
 
 #include <net/flow.h>
@@ -92,6 +109,10 @@ extern struct rt6_info *	rt6_add_dflt_router(struct in6_addr *gwaddr,
 
 extern void			rt6_purge_dflt_routers(void);
 
+extern int			rt6_route_rcv(struct net_device *dev,
+					      u8 *opt, int len,
+					      struct in6_addr *gwaddr);
+
 extern void			rt6_redirect(struct in6_addr *dest,
 					     struct in6_addr *saddr,
 					     struct neighbour *neigh,
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index bbac87eeb422..91fa271a0064 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -22,6 +22,8 @@ enum {
 	ND_OPT_PREFIX_INFO = 3,		/* RFC2461 */
 	ND_OPT_REDIRECT_HDR = 4,	/* RFC2461 */
 	ND_OPT_MTU = 5,			/* RFC2461 */
+	__ND_OPT_ARRAY_MAX,
+	ND_OPT_ROUTE_INFO = 24,		/* RFC4191 */
 	__ND_OPT_MAX
 };
 
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index c456ead8a4a3..e6f83b6a2b76 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -49,6 +49,14 @@ config IPV6_ROUTER_PREF
 
 	  If unsure, say N.
 
+config IPV6_ROUTE_INFO
+	bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)"
+	depends on IPV6_ROUTER_PREF && EXPERIMENTAL
+	---help---
+	  This is experimental support of Route Information.
+
+	  If unsure, say N.
+
 config INET6_AH
 	tristate "IPv6: AH transformation"
 	depends on IPV6
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f4462ee33024..1f6256909674 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -156,7 +156,11 @@ struct neigh_table nd_tbl = {
 
 /* ND options */
 struct ndisc_options {
-	struct nd_opt_hdr *nd_opt_array[__ND_OPT_MAX];
+	struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX];
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	struct nd_opt_hdr *nd_opts_ri;
+	struct nd_opt_hdr *nd_opts_ri_end;
+#endif
 };
 
 #define nd_opts_src_lladdr	nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
@@ -255,6 +259,13 @@ static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0)
 				ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
 			break;
+#ifdef CONFIG_IPV6_ROUTE_INFO
+		case ND_OPT_ROUTE_INFO:
+			ndopts->nd_opts_ri_end = nd_opt;
+			if (!ndopts->nd_opts_ri)
+				ndopts->nd_opts_ri = nd_opt;
+			break;
+#endif
 		default:
 			/*
 			 * Unknown options must be silently ignored,
@@ -1202,6 +1213,18 @@ skip_defrtr:
 			     NEIGH_UPDATE_F_ISROUTER);
 	}
 
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	if (ndopts.nd_opts_ri) {
+		struct nd_opt_hdr *p;
+		for (p = ndopts.nd_opts_ri;
+		     p;
+		     p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
+			rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3,
+				      &skb->nh.ipv6h->saddr);
+		}
+	}
+#endif
+
 	if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
 		struct nd_opt_hdr *p;
 		for (p = ndopts.nd_opts_pi;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c797b9bbb7d1..0f30ee3d94ea 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -98,6 +98,14 @@ static int		ip6_pkt_discard_out(struct sk_buff *skb);
 static void		ip6_link_failure(struct sk_buff *skb);
 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 
+#ifdef CONFIG_IPV6_ROUTE_INFO
+static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
+					   struct in6_addr *gwaddr, int ifindex,
+					   unsigned pref);
+static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
+					   struct in6_addr *gwaddr, int ifindex);
+#endif
+
 static struct dst_ops ip6_dst_ops = {
 	.family			=	AF_INET6,
 	.protocol		=	__constant_htons(ETH_P_IPV6),
@@ -346,6 +354,84 @@ static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
 	return (match ? match : &ip6_null_entry);
 }
 
+#ifdef CONFIG_IPV6_ROUTE_INFO
+int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
+		  struct in6_addr *gwaddr)
+{
+	struct route_info *rinfo = (struct route_info *) opt;
+	struct in6_addr prefix_buf, *prefix;
+	unsigned int pref;
+	u32 lifetime;
+	struct rt6_info *rt;
+
+	if (len < sizeof(struct route_info)) {
+		return -EINVAL;
+	}
+
+	/* Sanity check for prefix_len and length */
+	if (rinfo->length > 3) {
+		return -EINVAL;
+	} else if (rinfo->prefix_len > 128) {
+		return -EINVAL;
+	} else if (rinfo->prefix_len > 64) {
+		if (rinfo->length < 2) {
+			return -EINVAL;
+		}
+	} else if (rinfo->prefix_len > 0) {
+		if (rinfo->length < 1) {
+			return -EINVAL;
+		}
+	}
+
+	pref = rinfo->route_pref;
+	if (pref == ICMPV6_ROUTER_PREF_INVALID)
+		pref = ICMPV6_ROUTER_PREF_MEDIUM;
+
+	lifetime = htonl(rinfo->lifetime);
+	if (lifetime == 0xffffffff) {
+		/* infinity */
+	} else if (lifetime > 0x7fffffff/HZ) {
+		/* Avoid arithmetic overflow */
+		lifetime = 0x7fffffff/HZ - 1;
+	}
+
+	if (rinfo->length == 3)
+		prefix = (struct in6_addr *)rinfo->prefix;
+	else {
+		/* this function is safe */
+		ipv6_addr_prefix(&prefix_buf,
+				 (struct in6_addr *)rinfo->prefix,
+				 rinfo->prefix_len);
+		prefix = &prefix_buf;
+	}
+
+	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
+
+	if (rt && !lifetime) {
+		ip6_del_rt(rt, NULL, NULL, NULL);
+		rt = NULL;
+	}
+
+	if (!rt && lifetime)
+		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
+					pref);
+	else if (rt)
+		rt->rt6i_flags = RTF_ROUTEINFO |
+				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+
+	if (rt) {
+		if (lifetime == 0xffffffff) {
+			rt->rt6i_flags &= ~RTF_EXPIRES;
+		} else {
+			rt->rt6i_expires = jiffies + HZ * lifetime;
+			rt->rt6i_flags |= RTF_EXPIRES;
+		}
+		dst_release(&rt->u.dst);
+	}
+	return 0;
+}
+#endif
+
 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
 			    int oif, int strict)
 {
@@ -1277,6 +1363,54 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
 	return rt;
 }
 
+#ifdef CONFIG_IPV6_ROUTE_INFO
+static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
+					   struct in6_addr *gwaddr, int ifindex)
+{
+	struct fib6_node *fn;
+	struct rt6_info *rt = NULL;
+
+	write_lock_bh(&rt6_lock);
+	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
+	if (!fn)
+		goto out;
+
+	for (rt = fn->leaf; rt; rt = rt->u.next) {
+		if (rt->rt6i_dev->ifindex != ifindex)
+			continue;
+		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+			continue;
+		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
+			continue;
+		dst_hold(&rt->u.dst);
+		break;
+	}
+out:
+	write_unlock_bh(&rt6_lock);
+	return rt;
+}
+
+static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
+					   struct in6_addr *gwaddr, int ifindex,
+					   unsigned pref)
+{
+	struct in6_rtmsg rtmsg;
+
+	memset(&rtmsg, 0, sizeof(rtmsg));
+	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
+	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
+	rtmsg.rtmsg_dst_len = prefixlen;
+	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
+	rtmsg.rtmsg_metric = 1024;
+	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
+	rtmsg.rtmsg_ifindex = ifindex;
+
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
+
+	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
+}
+#endif
+
 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
 {	
 	struct rt6_info *rt;
-- 
cgit v1.2.3


From 09c884d4c3b45cda904c2291d4723074ff523611 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 20 Mar 2006 17:07:03 -0800
Subject: [IPV6]: ROUTE: Add accept_ra_rt_info_max_plen sysctl.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  9 +++++++++
 include/linux/ipv6.h                   |  4 ++++
 include/linux/sysctl.h                 |  1 +
 net/ipv6/addrconf.c                    | 19 +++++++++++++++++++
 net/ipv6/ndisc.c                       |  4 +++-
 5 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 88efed0a533f..35aed1c6dd98 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -729,6 +729,15 @@ accept_ra_pinfo - BOOLEAN
 	Functional default: enabled if accept_ra is enabled.
 			    disabled if accept_ra is disabled.
 
+accept_ra_rt_info_max_plen - INTEGER
+	Maximum prefix length of Route Information in RA.
+
+	Route Information w/ prefix larger than or equal to this
+	variable shall be ignored.
+
+	Functional default: 0 if accept_ra_rtr_pref is enabled.
+			    -1 if accept_ra_rtr_pref is disabled.
+
 accept_ra_rtr_pref - BOOLEAN
 	Accept Router Preference in RA.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index c609cc702375..1263d8cb3c18 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -150,6 +150,9 @@ struct ipv6_devconf {
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	__s32		accept_ra_rtr_pref;
 	__s32		rtr_probe_interval;
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	__s32		accept_ra_rt_info_max_plen;
+#endif
 #endif
 	void		*sysctl;
 };
@@ -177,6 +180,7 @@ enum {
 	DEVCONF_ACCEPT_RA_PINFO,
 	DEVCONF_ACCEPT_RA_RTR_PREF,
 	DEVCONF_RTR_PROBE_INTERVAL,
+	DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index f49488ffefef..8ad4beab2888 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -535,6 +535,7 @@ enum {
 	NET_IPV6_ACCEPT_RA_PINFO=19,
 	NET_IPV6_ACCEPT_RA_RTR_PREF=20,
 	NET_IPV6_RTR_PROBE_INTERVAL=21,
+	NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN=22,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e7add61e6e39..eb82cd5df8c6 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -170,6 +170,9 @@ struct ipv6_devconf ipv6_devconf = {
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
 	.rtr_probe_interval	= 60 * HZ,
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	.accept_ra_rt_info_max_plen = 0,
+#endif
 #endif
 };
 
@@ -197,6 +200,9 @@ static struct ipv6_devconf ipv6_devconf_dflt = {
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
 	.rtr_probe_interval	= 60 * HZ,
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	.accept_ra_rt_info_max_plen = 0,
+#endif
 #endif
 };
 
@@ -3133,6 +3139,9 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
 	array[DEVCONF_RTR_PROBE_INTERVAL] = cnf->rtr_probe_interval;
+#ifdef CONFIV_IPV6_ROUTE_INFO
+	array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
+#endif
 #endif
 }
 
@@ -3620,6 +3629,16 @@ static struct addrconf_sysctl_table
 			.proc_handler	=	&proc_dointvec_jiffies,
 			.strategy	=	&sysctl_jiffies,
 		},
+#ifdef CONFIV_IPV6_ROUTE_INFO
+		{
+			.ctl_name	=	NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN,
+			.procname	=	"accept_ra_rt_info_max_plen",
+			.data		=	&ipv6_devconf.accept_ra_rt_info_max_plen,
+			.maxlen		=	sizeof(int),
+			.mode		=	0644,
+			.proc_handler	=	&proc_dointvec,
+		},
+#endif
 #endif
 		{
 			.ctl_name	=	0,	/* sentinel */
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 1f6256909674..dfa20d3be9b6 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1214,11 +1214,13 @@ skip_defrtr:
 	}
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
-	if (ndopts.nd_opts_ri) {
+	if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) {
 		struct nd_opt_hdr *p;
 		for (p = ndopts.nd_opts_ri;
 		     p;
 		     p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
+			if (((struct route_info *)p)->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
+				continue;
 			rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3,
 				      &skb->nh.ipv6h->saddr);
 		}
-- 
cgit v1.2.3


From b00055aacdb172c05067612278ba27265fcd05ce Mon Sep 17 00:00:00 2001
From: Stefan Rompf <stefan@loplof.de>
Date: Mon, 20 Mar 2006 17:09:11 -0800
Subject: [NET] core: add RFC2863 operstate

this patch adds a dormant flag to network devices, RFC2863 operstate derived
from these flags and possibility for userspace interaction. It allows drivers
to signal that a device is unusable for user traffic without disabling
queueing (and therefore the possibility for protocol establishment traffic to
flow) and a userspace supplicant (WPA, 802.1X) to mark a device unusable
without changes to the driver.

It is the result of our long discussion. However I must admit that it
represents what Jamal and I agreed on with compromises towards Krzysztof, but
Thomas and Krzysztof still disagree with some parts. Anyway I think it should
be applied.

Signed-off-by: Stefan Rompf <stefan@loplof.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if.h        | 26 +++++++++++++++++++++---
 include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++++--
 include/linux/rtnetlink.h |  2 ++
 net/core/dev.c            | 14 ++++++++++---
 net/core/link_watch.c     | 40 +++++++++++++++++++++++++++++++++++++
 net/core/net-sysfs.c      | 41 ++++++++++++++++++++++++++++++++++++++
 net/core/rtnetlink.c      | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 200 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if.h b/include/linux/if.h
index 12c6f6d157c3..374e20ad8b0d 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -33,7 +33,7 @@
 #define	IFF_LOOPBACK	0x8		/* is a loopback net		*/
 #define	IFF_POINTOPOINT	0x10		/* interface is has p-p link	*/
 #define	IFF_NOTRAILERS	0x20		/* avoid use of trailers	*/
-#define	IFF_RUNNING	0x40		/* interface running and carrier ok */
+#define	IFF_RUNNING	0x40		/* interface RFC2863 OPER_UP	*/
 #define	IFF_NOARP	0x80		/* no ARP protocol		*/
 #define	IFF_PROMISC	0x100		/* receive all packets		*/
 #define	IFF_ALLMULTI	0x200		/* receive all multicast packets*/
@@ -43,12 +43,16 @@
 
 #define IFF_MULTICAST	0x1000		/* Supports multicast		*/
 
-#define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_MASTER|IFF_SLAVE|IFF_RUNNING)
-
 #define IFF_PORTSEL	0x2000          /* can set media type		*/
 #define IFF_AUTOMEDIA	0x4000		/* auto media select active	*/
 #define IFF_DYNAMIC	0x8000		/* dialup device with changing addresses*/
 
+#define IFF_LOWER_UP	0x10000		/* driver signals L1 up		*/
+#define IFF_DORMANT	0x20000		/* driver signals dormant	*/
+
+#define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|\
+		IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
+
 /* Private (from user) interface flags (netdevice->priv_flags). */
 #define IFF_802_1Q_VLAN 0x1             /* 802.1Q VLAN device.          */
 #define IFF_EBRIDGE	0x2		/* Ethernet bridging device.	*/
@@ -83,6 +87,22 @@
 #define IF_PROTO_FR_ETH_PVC 0x200B
 #define IF_PROTO_RAW    0x200C          /* RAW Socket                   */
 
+/* RFC 2863 operational status */
+enum {
+	IF_OPER_UNKNOWN,
+	IF_OPER_NOTPRESENT,
+	IF_OPER_DOWN,
+	IF_OPER_LOWERLAYERDOWN,
+	IF_OPER_TESTING,
+	IF_OPER_DORMANT,
+	IF_OPER_UP,
+};
+
+/* link modes */
+enum {
+	IF_LINK_MODE_DEFAULT,
+	IF_LINK_MODE_DORMANT,	/* limit upward transition to dormant */
+};
 
 /*
  *	Device mapping structure. I'd just gone off and designed a 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7fda03d338d1..b825be201bce 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -230,7 +230,8 @@ enum netdev_state_t
 	__LINK_STATE_SCHED,
 	__LINK_STATE_NOCARRIER,
 	__LINK_STATE_RX_SCHED,
-	__LINK_STATE_LINKWATCH_PENDING
+	__LINK_STATE_LINKWATCH_PENDING,
+	__LINK_STATE_DORMANT,
 };
 
 
@@ -335,11 +336,14 @@ struct net_device
 	 */
 
 
-	unsigned short		flags;	/* interface flags (a la BSD)	*/
+	unsigned int		flags;	/* interface flags (a la BSD)	*/
 	unsigned short		gflags;
         unsigned short          priv_flags; /* Like 'flags' but invisible to userspace. */
 	unsigned short		padded;	/* How much padding added by alloc_netdev() */
 
+	unsigned char		operstate; /* RFC2863 operstate */
+	unsigned char		link_mode; /* mapping policy to operstate */
+
 	unsigned		mtu;	/* interface MTU value		*/
 	unsigned short		type;	/* interface hardware type	*/
 	unsigned short		hard_header_len;	/* hardware hdr length	*/
@@ -714,6 +718,10 @@ static inline void dev_put(struct net_device *dev)
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
  * and _off may be called from IRQ context, but it is caller
  * who is responsible for serialization of these calls.
+ *
+ * The name carrier is inappropriate, these functions should really be
+ * called netif_lowerlayer_*() because they represent the state of any
+ * kind of lower layer not just hardware media.
  */
 
 extern void linkwatch_fire_event(struct net_device *dev);
@@ -729,6 +737,29 @@ extern void netif_carrier_on(struct net_device *dev);
 
 extern void netif_carrier_off(struct net_device *dev);
 
+static inline void netif_dormant_on(struct net_device *dev)
+{
+	if (!test_and_set_bit(__LINK_STATE_DORMANT, &dev->state))
+		linkwatch_fire_event(dev);
+}
+
+static inline void netif_dormant_off(struct net_device *dev)
+{
+	if (test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state))
+		linkwatch_fire_event(dev);
+}
+
+static inline int netif_dormant(const struct net_device *dev)
+{
+	return test_bit(__LINK_STATE_DORMANT, &dev->state);
+}
+
+
+static inline int netif_oper_up(const struct net_device *dev) {
+	return (dev->operstate == IF_OPER_UP ||
+		dev->operstate == IF_OPER_UNKNOWN /* backward compat */);
+}
+
 /* Hot-plugging. */
 static inline int netif_device_present(struct net_device *dev)
 {
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index d50482ba27fe..edccefb45188 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -733,6 +733,8 @@ enum
 #define IFLA_MAP IFLA_MAP
 	IFLA_WEIGHT,
 #define IFLA_WEIGHT IFLA_WEIGHT
+	IFLA_OPERSTATE,
+	IFLA_LINKMODE,
 	__IFLA_MAX
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ef56c035d44e..8763c99fcb84 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2174,12 +2174,20 @@ unsigned dev_get_flags(const struct net_device *dev)
 
 	flags = (dev->flags & ~(IFF_PROMISC |
 				IFF_ALLMULTI |
-				IFF_RUNNING)) | 
+				IFF_RUNNING |
+				IFF_LOWER_UP |
+				IFF_DORMANT)) |
 		(dev->gflags & (IFF_PROMISC |
 				IFF_ALLMULTI));
 
-	if (netif_running(dev) && netif_carrier_ok(dev))
-		flags |= IFF_RUNNING;
+	if (netif_running(dev)) {
+		if (netif_oper_up(dev))
+			flags |= IFF_RUNNING;
+		if (netif_carrier_ok(dev))
+			flags |= IFF_LOWER_UP;
+		if (netif_dormant(dev))
+			flags |= IFF_DORMANT;
+	}
 
 	return flags;
 }
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index d43d1201275c..e82a451d330b 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -49,6 +49,45 @@ struct lw_event {
 /* Avoid kmalloc() for most systems */
 static struct lw_event singleevent;
 
+static unsigned char default_operstate(const struct net_device *dev)
+{
+	if (!netif_carrier_ok(dev))
+		return (dev->ifindex != dev->iflink ?
+			IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
+
+	if (netif_dormant(dev))
+		return IF_OPER_DORMANT;
+
+	return IF_OPER_UP;
+}
+
+
+static void rfc2863_policy(struct net_device *dev)
+{
+	unsigned char operstate = default_operstate(dev);
+
+	if (operstate == dev->operstate)
+		return;
+
+	write_lock_bh(&dev_base_lock);
+
+	switch(dev->link_mode) {
+	case IF_LINK_MODE_DORMANT:
+		if (operstate == IF_OPER_UP)
+			operstate = IF_OPER_DORMANT;
+		break;
+
+	case IF_LINK_MODE_DEFAULT:
+	default:
+		break;
+	};
+
+	dev->operstate = operstate;
+
+	write_unlock_bh(&dev_base_lock);
+}
+
+
 /* Must be called with the rtnl semaphore held */
 void linkwatch_run_queue(void)
 {
@@ -74,6 +113,7 @@ void linkwatch_run_queue(void)
 		 */
 		clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
 
+		rfc2863_policy(dev);
 		if (dev->flags & IFF_UP) {
 			if (netif_carrier_ok(dev)) {
 				WARN_ON(dev->qdisc_sleeping == &noop_qdisc);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e8b2acbc8ea2..21b68464cabb 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -91,6 +91,7 @@ NETDEVICE_SHOW(iflink, fmt_dec);
 NETDEVICE_SHOW(ifindex, fmt_dec);
 NETDEVICE_SHOW(features, fmt_long_hex);
 NETDEVICE_SHOW(type, fmt_dec);
+NETDEVICE_SHOW(link_mode, fmt_dec);
 
 /* use same locking rules as GIFHWADDR ioctl's */
 static ssize_t format_addr(char *buf, const unsigned char *addr, int len)
@@ -133,6 +134,43 @@ static ssize_t show_carrier(struct class_device *dev, char *buf)
 	return -EINVAL;
 }
 
+static ssize_t show_dormant(struct class_device *dev, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	if (netif_running(netdev))
+		return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+
+	return -EINVAL;
+}
+
+static const char *operstates[] = {
+	"unknown",
+	"notpresent", /* currently unused */
+	"down",
+	"lowerlayerdown",
+	"testing", /* currently unused */
+	"dormant",
+	"up"
+};
+
+static ssize_t show_operstate(struct class_device *dev, char *buf)
+{
+	const struct net_device *netdev = to_net_dev(dev);
+	unsigned char operstate;
+
+	read_lock(&dev_base_lock);
+	operstate = netdev->operstate;
+	if (!netif_running(netdev))
+		operstate = IF_OPER_DOWN;
+	read_unlock(&dev_base_lock);
+
+	if (operstate >= sizeof(operstates))
+		return -EINVAL; /* should not happen */
+
+	return sprintf(buf, "%s\n", operstates[operstate]);
+}
+
 /* read-write attributes */
 NETDEVICE_SHOW(mtu, fmt_dec);
 
@@ -190,9 +228,12 @@ static struct class_device_attribute net_class_attributes[] = {
 	__ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
 	__ATTR(features, S_IRUGO, show_features, NULL),
 	__ATTR(type, S_IRUGO, show_type, NULL),
+	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
 	__ATTR(address, S_IRUGO, show_address, NULL),
 	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
 	__ATTR(carrier, S_IRUGO, show_carrier, NULL),
+	__ATTR(dormant, S_IRUGO, show_dormant, NULL),
+	__ATTR(operstate, S_IRUGO, show_operstate, NULL),
 	__ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu),
 	__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
 	__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index eca2976abb25..1c15a907066f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -179,6 +179,33 @@ rtattr_failure:
 }
 
 
+static void set_operstate(struct net_device *dev, unsigned char transition)
+{
+	unsigned char operstate = dev->operstate;
+
+	switch(transition) {
+	case IF_OPER_UP:
+		if ((operstate == IF_OPER_DORMANT ||
+		     operstate == IF_OPER_UNKNOWN) &&
+		    !netif_dormant(dev))
+			operstate = IF_OPER_UP;
+		break;
+
+	case IF_OPER_DORMANT:
+		if (operstate == IF_OPER_UP ||
+		    operstate == IF_OPER_UNKNOWN)
+			operstate = IF_OPER_DORMANT;
+		break;
+	};
+
+	if (dev->operstate != operstate) {
+		write_lock_bh(&dev_base_lock);
+		dev->operstate = operstate;
+		write_unlock_bh(&dev_base_lock);
+		netdev_state_change(dev);
+	}
+}
+
 static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 				 int type, u32 pid, u32 seq, u32 change, 
 				 unsigned int flags)
@@ -208,6 +235,13 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		RTA_PUT(skb, IFLA_WEIGHT, sizeof(weight), &weight);
 	}
 
+	if (1) {
+		u8 operstate = netif_running(dev)?dev->operstate:IF_OPER_DOWN;
+		u8 link_mode = dev->link_mode;
+		RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate);
+		RTA_PUT(skb, IFLA_LINKMODE, sizeof(link_mode), &link_mode);
+	}
+
 	if (1) {
 		struct rtnl_link_ifmap map = {
 			.mem_start   = dev->mem_start,
@@ -399,6 +433,22 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		dev->weight = *((u32 *) RTA_DATA(ida[IFLA_WEIGHT - 1]));
 	}
 
+	if (ida[IFLA_OPERSTATE - 1]) {
+		if (ida[IFLA_OPERSTATE - 1]->rta_len != RTA_LENGTH(sizeof(u8)))
+			goto out;
+
+		set_operstate(dev, *((u8 *) RTA_DATA(ida[IFLA_OPERSTATE - 1])));
+	}
+
+	if (ida[IFLA_LINKMODE - 1]) {
+		if (ida[IFLA_LINKMODE - 1]->rta_len != RTA_LENGTH(sizeof(u8)))
+			goto out;
+
+		write_lock_bh(&dev_base_lock);
+		dev->link_mode = *((u8 *) RTA_DATA(ida[IFLA_LINKMODE - 1]));
+		write_unlock_bh(&dev_base_lock);
+	}
+
 	if (ifm->ifi_index >= 0 && ida[IFLA_IFNAME - 1]) {
 		char ifname[IFNAMSIZ];
 
-- 
cgit v1.2.3


From 77d2ca350018c507815f5d38a40ffb597eb9ae25 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 20 Mar 2006 17:12:12 -0800
Subject: [NET]: Reduce size of struct sk_buff on 64 bit architectures

Move skb->nf_mark next to skb->tc_index to remove a 4 byte hole between
skb->nfmark and skb->nfct and another one between skb->users and skb->head
when CONFIG_NETFILTER, CONFIG_NET_SCHED and CONFIG_NET_CLS_ACT are enabled.
For all other combinations the size stays the same.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ad7cc22bd424..838ce0fdcef7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -270,7 +270,6 @@ struct sk_buff {
 
 	void			(*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_NETFILTER
-	__u32			nfmark;
 	struct nf_conntrack	*nfct;
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct sk_buff		*nfct_reasm;
@@ -278,6 +277,7 @@ struct sk_buff {
 #ifdef CONFIG_BRIDGE_NETFILTER
 	struct nf_bridge_info	*nf_bridge;
 #endif
+	__u32			nfmark;
 #endif /* CONFIG_NETFILTER */
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
-- 
cgit v1.2.3


From 5ee956125a780baf15f2c1d09f2cbf8adcf598fe Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 20 Mar 2006 17:14:12 -0800
Subject: [NETFILTER] NAT sequence adjustment: Save eight bytes per conntrack

This patch reduces the size of 'struct ip_conntrack' on systems with NAT
by eight bytes.  The sequence number delta values can be int16_t, since
we only support one sequence number modification per window anyway, and
one such modification is not going to exceed 32kB ;)

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_nat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h
index 41a107de17cf..e9f5ed1d9f68 100644
--- a/include/linux/netfilter_ipv4/ip_nat.h
+++ b/include/linux/netfilter_ipv4/ip_nat.h
@@ -23,7 +23,7 @@ struct ip_nat_seq {
 	 * modification (if any) */
 	u_int32_t correction_pos;
 	/* sequence number offset before and after last modification */
-	int32_t offset_before, offset_after;
+	int16_t offset_before, offset_after;
 };
 
 /* Single range specification. */
-- 
cgit v1.2.3


From 0af5f6c1eba4a18e6b2ed518b589927d778c6c16 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 20 Mar 2006 17:15:11 -0800
Subject: [NETFILTER] nfnetlink_log: add sequence numbers for log events

By using a sequence number for every logged netfilter event, we can
determine from userspace whether logging information was lots somewhere
downstream.

The user has a choice of either having per-instance local sequence
counters, or using a global sequence counter, or both.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_log.h |  6 +++++
 net/netfilter/nfnetlink_log.c           | 46 +++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h
index b04b03880595..a7497c7436df 100644
--- a/include/linux/netfilter/nfnetlink_log.h
+++ b/include/linux/netfilter/nfnetlink_log.h
@@ -47,6 +47,8 @@ enum nfulnl_attr_type {
 	NFULA_PAYLOAD,			/* opaque data payload */
 	NFULA_PREFIX,			/* string prefix */
 	NFULA_UID,			/* user id of socket */
+	NFULA_SEQ,			/* instance-local sequence number */
+	NFULA_SEQ_GLOBAL,		/* global sequence number */
 
 	__NFULA_MAX
 };
@@ -77,6 +79,7 @@ enum nfulnl_attr_config {
 	NFULA_CFG_NLBUFSIZ,		/* u_int32_t buffer size */
 	NFULA_CFG_TIMEOUT,		/* u_int32_t in 1/100 s */
 	NFULA_CFG_QTHRESH,		/* u_int32_t */
+	NFULA_CFG_FLAGS,		/* u_int16_t */
 	__NFULA_CFG_MAX
 };
 #define NFULA_CFG_MAX (__NFULA_CFG_MAX -1)
@@ -85,4 +88,7 @@ enum nfulnl_attr_config {
 #define NFULNL_COPY_META	0x01
 #define NFULNL_COPY_PACKET	0x02
 
+#define NFULNL_CFG_F_SEQ	0x0001
+#define NFULNL_CFG_F_SEQ_GLOBAL	0x0002
+
 #endif /* _NFNETLINK_LOG_H */
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3b3c781b40c0..54cbbaa712dc 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -11,6 +11,10 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
+ * 2006-01-26 Harald Welte <laforge@netfilter.org>
+ * 	- Add optional local and global sequence number to detect lost
+ * 	  events from userspace
+ *
  */
 #include <linux/module.h>
 #include <linux/skbuff.h>
@@ -68,11 +72,14 @@ struct nfulnl_instance {
 	unsigned int nlbufsiz;		/* netlink buffer allocation size */
 	unsigned int qthreshold;	/* threshold of the queue */
 	u_int32_t copy_range;
+	u_int32_t seq;			/* instance-local sequential counter */
 	u_int16_t group_num;		/* number of this queue */
+	u_int16_t flags;
 	u_int8_t copy_mode;	
 };
 
 static DEFINE_RWLOCK(instances_lock);
+static atomic_t global_seq;
 
 #define INSTANCE_BUCKETS	16
 static struct hlist_head instance_table[INSTANCE_BUCKETS];
@@ -310,6 +317,16 @@ nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
 	return 0;
 }
 
+static int
+nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
+{
+	spin_lock_bh(&inst->lock);
+	inst->flags = ntohs(flags);
+	spin_unlock_bh(&inst->lock);
+
+	return 0;
+}
+
 static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, 
 					unsigned int pkt_size)
 {
@@ -377,6 +394,8 @@ static void nfulnl_timer(unsigned long data)
 	spin_unlock_bh(&inst->lock);
 }
 
+/* This is an inline function, we don't really care about a long
+ * list of arguments */
 static inline int 
 __build_packet_message(struct nfulnl_instance *inst,
 			const struct sk_buff *skb, 
@@ -515,6 +534,17 @@ __build_packet_message(struct nfulnl_instance *inst,
 			read_unlock_bh(&skb->sk->sk_callback_lock);
 	}
 
+	/* local sequence number */
+	if (inst->flags & NFULNL_CFG_F_SEQ) {
+		tmp_uint = htonl(inst->seq++);
+		NFA_PUT(inst->skb, NFULA_SEQ, sizeof(tmp_uint), &tmp_uint);
+	}
+	/* global sequence number */
+	if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) {
+		tmp_uint = atomic_inc_return(&global_seq);
+		NFA_PUT(inst->skb, NFULA_SEQ_GLOBAL, sizeof(tmp_uint), &tmp_uint);
+	}
+
 	if (data_len) {
 		struct nfattr *nfa;
 		int size = NFA_LENGTH(data_len);
@@ -607,6 +637,11 @@ nfulnl_log_packet(unsigned int pf,
 
 	spin_lock_bh(&inst->lock);
 
+	if (inst->flags & NFULNL_CFG_F_SEQ)
+		size += NFA_SPACE(sizeof(u_int32_t));
+	if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
+		size += NFA_SPACE(sizeof(u_int32_t));
+
 	qthreshold = inst->qthreshold;
 	/* per-rule qthreshold overrides per-instance */
 	if (qthreshold > li->u.ulog.qthreshold)
@@ -736,10 +771,14 @@ static const int nfula_min[NFULA_MAX] = {
 	[NFULA_TIMESTAMP-1]	= sizeof(struct nfulnl_msg_packet_timestamp),
 	[NFULA_IFINDEX_INDEV-1]	= sizeof(u_int32_t),
 	[NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t),
+	[NFULA_IFINDEX_PHYSINDEV-1]	= sizeof(u_int32_t),
+	[NFULA_IFINDEX_PHYSOUTDEV-1]	= sizeof(u_int32_t),
 	[NFULA_HWADDR-1]	= sizeof(struct nfulnl_msg_packet_hw),
 	[NFULA_PAYLOAD-1]	= 0,
 	[NFULA_PREFIX-1]	= 0,
 	[NFULA_UID-1]		= sizeof(u_int32_t),
+	[NFULA_SEQ-1]		= sizeof(u_int32_t),
+	[NFULA_SEQ_GLOBAL-1]	= sizeof(u_int32_t),
 };
 
 static const int nfula_cfg_min[NFULA_CFG_MAX] = {
@@ -748,6 +787,7 @@ static const int nfula_cfg_min[NFULA_CFG_MAX] = {
 	[NFULA_CFG_TIMEOUT-1]	= sizeof(u_int32_t),
 	[NFULA_CFG_QTHRESH-1]	= sizeof(u_int32_t),
 	[NFULA_CFG_NLBUFSIZ-1]	= sizeof(u_int32_t),
+	[NFULA_CFG_FLAGS-1]	= sizeof(u_int16_t),
 };
 
 static int
@@ -859,6 +899,12 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		nfulnl_set_qthresh(inst, ntohl(qthresh));
 	}
 
+	if (nfula[NFULA_CFG_FLAGS-1]) {
+		u_int16_t flags =
+			*(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_FLAGS-1]);
+		nfulnl_set_flags(inst, ntohl(flags));
+	}
+
 out_put:
 	instance_put(inst);
 	return ret;
-- 
cgit v1.2.3


From d8dcffee860d6b63996923b10f07c91d3d6c2fab Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Mon, 20 Mar 2006 17:18:05 -0800
Subject: [LIST]: Introduce list_for_each_entry_safe_from

For iterate over list of given type from existing point safe against removal of
list entry.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 47208bd99f9e..beb6e48e116f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -437,6 +437,19 @@ static inline void list_splice_init(struct list_head *list,
 	     &pos->member != (head);						\
 	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
+/**
+ * list_for_each_entry_safe_from - iterate over list of given type
+ *			from existing point safe against removal of list entry
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) 			\
+	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
 /**
  * list_for_each_entry_safe_reverse - iterate backwards over list of given type safe against
  *				      removal of list entry
-- 
cgit v1.2.3


From e229c2fb3370a0c4ebac06cad67ce1cb35abcfe6 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Mon, 20 Mar 2006 17:19:17 -0800
Subject: [LIST]: Introduce list_for_each_entry_from

For iterating over list of given type continuing from existing point.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index beb6e48e116f..67258b47e9ca 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -410,6 +410,17 @@ static inline void list_splice_init(struct list_head *list,
 	     prefetch(pos->member.next), &pos->member != (head);	\
 	     pos = list_entry(pos->member.next, typeof(*pos), member))
 
+/**
+ * list_for_each_entry_from -	iterate over list of given type
+ *			continuing from existing point
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_from(pos, head, member) 			\
+	for (; prefetch(pos->member.next), &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
 /**
  * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
  * @pos:	the type * to use as a loop counter.
-- 
cgit v1.2.3


From 2a91aa3967398fb94eccc8da67c82bce9f67afdf Mon Sep 17 00:00:00 2001
From: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Date: Mon, 20 Mar 2006 17:41:47 -0800
Subject: [DCCP] CCID2: Initial CCID2 (TCP-Like) implementation

Original work by Andrea Bittau, Arnaldo Melo cleaned up and fixed several
issues on the merge process.

For now CCID2 was turned the default for all SOCK_DCCP connections, but this
will be remedied soon with the merge of the feature negotiation code.

Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h    |   8 +-
 net/dccp/Kconfig        |   4 +
 net/dccp/ccids/Kconfig  |  39 ++-
 net/dccp/ccids/Makefile |   4 +
 net/dccp/ccids/ccid2.c  | 838 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/ccids/ccid2.h  |  69 ++++
 net/dccp/ipv4.c         |   1 +
 7 files changed, 957 insertions(+), 6 deletions(-)
 create mode 100644 net/dccp/ccids/ccid2.c
 create mode 100644 net/dccp/ccids/ccid2.h

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 088529f54965..268b4579d7e5 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -314,9 +314,9 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
 
 /* initial values for each feature */
 #define DCCPF_INITIAL_SEQUENCE_WINDOW		100
-/* FIXME: for now we're using CCID 3 (TFRC) */
-#define DCCPF_INITIAL_CCID			3
-#define DCCPF_INITIAL_SEND_ACK_VECTOR		0
+/* FIXME: for now we're using CCID 2 (TCP-Like) */
+#define DCCPF_INITIAL_CCID			2
+#define DCCPF_INITIAL_SEND_ACK_VECTOR		1
 /* FIXME: for now we're default to 1 but it should really be 0 */
 #define DCCPF_INITIAL_SEND_NDP_COUNT		1
 
@@ -430,6 +430,8 @@ struct dccp_sock {
 	struct timeval			dccps_timestamp_time;
 	__u32				dccps_timestamp_echo;
 	__u32				dccps_packet_size;
+	__u16				dccps_l_ack_ratio;
+	__u16				dccps_r_ack_ratio;
 	unsigned long			dccps_ndp_count;
 	__u32				dccps_mss_cache;
 	struct dccp_options		dccps_options;
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 187ac182e24b..24a6981e209a 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -24,6 +24,10 @@ config INET_DCCP_DIAG
 	def_tristate y if (IP_DCCP = y && INET_DIAG = y)
 	def_tristate m
 
+config IP_DCCP_ACKVEC
+	depends on IP_DCCP
+	def_bool N
+
 source "net/dccp/ccids/Kconfig"
 
 menu "DCCP Kernel Hacking"
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 7684d83946a4..422af197171d 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,6 +1,34 @@
 menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
 	depends on IP_DCCP && EXPERIMENTAL
 
+config IP_DCCP_CCID2
+	tristate "CCID2 (TCP) (EXPERIMENTAL)"
+	depends on IP_DCCP
+	select IP_DCCP_ACKVEC
+	---help---
+	  CCID 2, TCP-like Congestion Control, denotes Additive Increase,
+	  Multiplicative Decrease (AIMD) congestion control with behavior
+	  modelled directly on TCP, including congestion window, slow start,
+	  timeouts, and so forth [RFC 2581].  CCID 2 achieves maximum
+	  bandwidth over the long term, consistent with the use of end-to-end
+	  congestion control, but halves its congestion window in response to
+	  each congestion event.  This leads to the abrupt rate changes
+	  typical of TCP.  Applications should use CCID 2 if they prefer
+	  maximum bandwidth utilization to steadiness of rate.  This is often
+	  the case for applications that are not playing their data directly
+	  to the user.  For example, a hypothetical application that
+	  transferred files over DCCP, using application-level retransmissions
+	  for lost packets, would prefer CCID 2 to CCID 3.  On-line games may
+	  also prefer CCID 2.
+
+	  CCID 2 is further described in:
+	  http://www.icir.org/kohler/dccp/draft-ietf-dccp-ccid2-10.txt
+
+	  This text was extracted from:
+	  http://www.icir.org/kohler/dccp/draft-ietf-dccp-spec-13.txt
+
+	  If in doubt, say M.
+
 config IP_DCCP_CCID3
 	tristate "CCID3 (TFRC) (EXPERIMENTAL)"
 	depends on IP_DCCP
@@ -15,10 +43,15 @@ config IP_DCCP_CCID3
 	  suitable than CCID 2 for applications such streaming media where a
 	  relatively smooth sending rate is of importance.
 
-	  CCID 3 is further described in [CCID 3 PROFILE]. The TFRC
-	  congestion control algorithms were initially described in RFC 3448.
+	  CCID 3 is further described in:
+
+	  http://www.icir.org/kohler/dccp/draft-ietf-dccp-ccid3-11.txt.
+
+	  The TFRC congestion control algorithms were initially described in
+	  RFC 3448.
 
-	  This text was extracted from draft-ietf-dccp-spec-11.txt.
+	  This text was extracted from:
+	  http://www.icir.org/kohler/dccp/draft-ietf-dccp-spec-13.txt
 	  
 	  If in doubt, say M.
 
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
index 956f79f50743..438f20bccff7 100644
--- a/net/dccp/ccids/Makefile
+++ b/net/dccp/ccids/Makefile
@@ -2,4 +2,8 @@ obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
 
 dccp_ccid3-y := ccid3.o
 
+obj-$(CONFIG_IP_DCCP_CCID2) += dccp_ccid2.o
+
+dccp_ccid2-y := ccid2.o
+
 obj-y += lib/
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
new file mode 100644
index 000000000000..4a7b87512560
--- /dev/null
+++ b/net/dccp/ccids/ccid2.c
@@ -0,0 +1,838 @@
+/*
+ *  net/dccp/ccids/ccid2.c
+ *
+ *  Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *  Changes to meet Linux coding standards, and DCCP infrastructure fixes.
+ *
+ *  Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * This implementation should follow: draft-ietf-dccp-ccid2-10.txt
+ *
+ * BUGS:
+ * - sequence number wrapping
+ * - jiffies wrapping
+ */
+
+#include <linux/config.h>
+#include "../ccid.h"
+#include "../dccp.h"
+#include "ccid2.h"
+
+static int ccid2_debug;
+
+#if 0
+#define CCID2_DEBUG
+#endif
+
+#ifdef CCID2_DEBUG
+#define ccid2_pr_debug(format, a...) \
+        do { if (ccid2_debug) \
+                printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
+        } while (0)
+#else
+#define ccid2_pr_debug(format, a...)
+#endif
+
+static const int ccid2_seq_len = 128;
+
+static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
+{
+	return dccp_sk(sk)->dccps_hc_tx_ccid_private;
+}
+
+static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
+{
+	return dccp_sk(sk)->dccps_hc_rx_ccid_private;
+}
+
+#ifdef CCID2_DEBUG
+static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
+{
+	int len = 0;
+	struct ccid2_seq *seqp;
+	int pipe = 0;
+
+	seqp = hctx->ccid2hctx_seqh;
+
+	/* there is data in the chain */
+	if (seqp != hctx->ccid2hctx_seqt) {
+		seqp = seqp->ccid2s_prev;
+		len++;
+		if (!seqp->ccid2s_acked)
+			pipe++;
+
+		while (seqp != hctx->ccid2hctx_seqt) {
+			struct ccid2_seq *prev;
+
+			prev = seqp->ccid2s_prev;
+			len++;
+			if (!prev->ccid2s_acked)
+				pipe++;
+
+			/* packets are sent sequentially */
+			BUG_ON(seqp->ccid2s_seq <= prev->ccid2s_seq);
+			BUG_ON(seqp->ccid2s_sent < prev->ccid2s_sent);
+			BUG_ON(len > ccid2_seq_len);
+
+			seqp = prev;
+		}
+	}
+
+	BUG_ON(pipe != hctx->ccid2hctx_pipe);
+	ccid2_pr_debug("len of chain=%d\n", len);
+
+	do {
+		seqp = seqp->ccid2s_prev;
+		len++;
+		BUG_ON(len > ccid2_seq_len);
+	} while(seqp != hctx->ccid2hctx_seqh);
+
+	BUG_ON(len != ccid2_seq_len);
+	ccid2_pr_debug("total len=%d\n", len);
+}
+#else
+#define ccid2_hc_tx_check_sanity(hctx) do {} while (0)
+#endif
+
+static int ccid2_hc_tx_send_packet(struct sock *sk,
+				   struct sk_buff *skb, int len)
+{
+	struct ccid2_hc_tx_sock *hctx;
+
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case 0: /* XXX data packets from userland come through like this */
+	case DCCP_PKT_DATA:
+	case DCCP_PKT_DATAACK:
+		break;
+	/* No congestion control on other packets */
+	default:
+		return 0;
+	}
+
+        hctx = ccid2_hc_tx_sk(sk);
+
+	ccid2_pr_debug("pipe=%d cwnd=%d\n", hctx->ccid2hctx_pipe,
+		       hctx->ccid2hctx_cwnd);
+
+	if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) {
+		/* OK we can send... make sure previous packet was sent off */
+		if (!hctx->ccid2hctx_sendwait) {
+			hctx->ccid2hctx_sendwait = 1;
+			return 0;
+		}
+	}
+
+	return 100; /* XXX */
+}
+
+static void ccid2_change_l_ack_ratio(struct sock *sk, int val)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	/*
+	 * XXX I don't really agree with val != 2.  If cwnd is 1, ack ratio
+	 * should be 1... it shouldn't be allowed to become 2.
+	 * -sorbo.
+	 */
+	if (val != 2) {
+		struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+		int max = hctx->ccid2hctx_cwnd / 2;
+
+		/* round up */
+		if (hctx->ccid2hctx_cwnd & 1)
+			max++;
+
+		if (val > max)
+			val = max;
+	}
+
+	ccid2_pr_debug("changing local ack ratio to %d\n", val);
+	WARN_ON(val <= 0);
+	dp->dccps_l_ack_ratio = val;
+}
+
+static void ccid2_change_cwnd(struct sock *sk, int val)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	if (val == 0)
+		val = 1;
+
+	/* XXX do we need to change ack ratio? */
+	ccid2_pr_debug("change cwnd to %d\n", val);
+
+	BUG_ON(val < 1);
+	hctx->ccid2hctx_cwnd = val;
+}
+
+static void ccid2_start_rto_timer(struct sock *sk);
+
+static void ccid2_hc_tx_rto_expire(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	long s;
+
+	/* XXX I don't think i'm locking correctly
+	 * -sorbo.
+	 */
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
+			       jiffies + HZ / 5);
+		goto out;
+	}
+
+	ccid2_pr_debug("RTO_EXPIRE\n");
+
+	ccid2_hc_tx_check_sanity(hctx);
+
+	/* back-off timer */
+	hctx->ccid2hctx_rto <<= 1;
+
+	s = hctx->ccid2hctx_rto / HZ;
+	if (s > 60)
+		hctx->ccid2hctx_rto = 60 * HZ;
+
+	ccid2_start_rto_timer(sk);
+
+	/* adjust pipe, cwnd etc */
+	hctx->ccid2hctx_pipe = 0;
+	hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd >> 1;
+	if (hctx->ccid2hctx_ssthresh < 2)
+		hctx->ccid2hctx_ssthresh = 2;
+	ccid2_change_cwnd(sk, 1);
+
+	/* clear state about stuff we sent */
+	hctx->ccid2hctx_seqt	= hctx->ccid2hctx_seqh;
+	hctx->ccid2hctx_ssacks	= 0;
+	hctx->ccid2hctx_acks	= 0;
+	hctx->ccid2hctx_sent	= 0;
+
+	/* clear ack ratio state. */
+	hctx->ccid2hctx_arsent	 = 0;
+	hctx->ccid2hctx_ackloss  = 0;
+	hctx->ccid2hctx_rpseq	 = 0;
+	hctx->ccid2hctx_rpdupack = -1;
+	ccid2_change_l_ack_ratio(sk, 1);
+	ccid2_hc_tx_check_sanity(hctx);
+out:
+	bh_unlock_sock(sk);
+/*	sock_put(sk); */
+}
+
+static void ccid2_start_rto_timer(struct sock *sk)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto);
+
+	BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer));
+	sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer,
+		       jiffies + hctx->ccid2hctx_rto);
+}
+
+static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, int len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	u64 seq;
+
+	ccid2_hc_tx_check_sanity(hctx);
+
+	BUG_ON(!hctx->ccid2hctx_sendwait);
+	hctx->ccid2hctx_sendwait = 0;
+	hctx->ccid2hctx_pipe++;
+	BUG_ON(hctx->ccid2hctx_pipe < 0);
+
+	/* There is an issue.  What if another packet is sent between
+	 * packet_send() and packet_sent().  Then the sequence number would be
+	 * wrong.
+	 * -sorbo.
+	 */
+	seq = dp->dccps_gss;
+
+	hctx->ccid2hctx_seqh->ccid2s_seq   = seq;
+	hctx->ccid2hctx_seqh->ccid2s_acked = 0;
+	hctx->ccid2hctx_seqh->ccid2s_sent  = jiffies;
+	hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqh->ccid2s_next;
+
+	ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd,
+		       hctx->ccid2hctx_pipe);
+
+	if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) {
+		/* XXX allocate more space */
+		WARN_ON(1);
+	}
+
+	hctx->ccid2hctx_sent++;
+
+	/* Ack Ratio.  Need to maintain a concept of how many windows we sent */
+	hctx->ccid2hctx_arsent++;
+	/* We had an ack loss in this window... */
+	if (hctx->ccid2hctx_ackloss) {
+		if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) {
+			hctx->ccid2hctx_arsent = 0;
+			hctx->ccid2hctx_ackloss = 0;
+		}
+	}
+	/* No acks lost up to now... */
+	else {
+		/* decrease ack ratio if enough packets were sent */
+		if (dp->dccps_l_ack_ratio > 1) {
+			/* XXX don't calculate denominator each time */
+			int denom;
+
+			denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
+				dp->dccps_l_ack_ratio;
+			denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom;
+
+			if (hctx->ccid2hctx_arsent >= denom) {
+				ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
+				hctx->ccid2hctx_arsent = 0;
+			}
+		}
+		/* we can't increase ack ratio further [1] */
+		else {
+			hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/
+		}
+	}
+
+	/* setup RTO timer */
+	if (!timer_pending(&hctx->ccid2hctx_rtotimer)) {
+		ccid2_start_rto_timer(sk);
+	}
+#ifdef CCID2_DEBUG
+	ccid2_pr_debug("pipe=%d\n", hctx->ccid2hctx_pipe);
+	ccid2_pr_debug("Sent: seq=%llu\n", seq);
+	do {
+		struct ccid2_seq *seqp = hctx->ccid2hctx_seqt;
+
+		while (seqp != hctx->ccid2hctx_seqh) {
+			ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
+			       	       seqp->ccid2s_seq, seqp->ccid2s_acked,
+				       seqp->ccid2s_sent);
+			seqp = seqp->ccid2s_next;
+		}
+	} while(0);
+	ccid2_pr_debug("=========\n");
+	ccid2_hc_tx_check_sanity(hctx);
+#endif
+}
+
+/* XXX Lame code duplication!
+ * returns -1 if none was found.
+ * else returns the next offset to use in the function call.
+ */
+static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
+			   unsigned char **vec, unsigned char *veclen)
+{
+        const struct dccp_hdr *dh = dccp_hdr(skb);
+        unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+        unsigned char *opt_ptr;
+        const unsigned char *opt_end = (unsigned char *)dh +
+                                        (dh->dccph_doff * 4);
+        unsigned char opt, len;
+        unsigned char *value;
+
+	BUG_ON(offset < 0);
+	options += offset;
+	opt_ptr = options;
+	if (opt_ptr >= opt_end)
+		return -1;
+
+	while (opt_ptr != opt_end) {
+                opt   = *opt_ptr++;
+                len   = 0;
+                value = NULL;
+
+                /* Check if this isn't a single byte option */
+                if (opt > DCCPO_MAX_RESERVED) {
+                        if (opt_ptr == opt_end)
+                                goto out_invalid_option;
+
+                        len = *opt_ptr++;
+                        if (len < 3)
+                                goto out_invalid_option;
+                        /*
+                         * Remove the type and len fields, leaving
+                         * just the value size
+                         */
+                        len     -= 2;
+                        value   = opt_ptr;
+                        opt_ptr += len;
+
+                        if (opt_ptr > opt_end)
+                                goto out_invalid_option;
+                }
+
+		switch (opt) {
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			*vec	= value;
+			*veclen = len;
+			return offset + (opt_ptr - options);
+			break;
+		}
+	}
+
+	return -1;
+
+out_invalid_option:
+	BUG_ON(1); /* should never happen... options were previously parsed ! */
+	return -1;
+}
+
+static void ccid2_hc_tx_kill_rto_timer(struct ccid2_hc_tx_sock *hctx)
+{
+	if (del_timer(&hctx->ccid2hctx_rtotimer))
+		ccid2_pr_debug("deleted RTO timer\n");
+}
+
+static inline void ccid2_new_ack(struct sock *sk,
+			         struct ccid2_seq *seqp,
+				 unsigned int *maxincr)
+{
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+
+	/* slow start */
+	if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) {
+		hctx->ccid2hctx_acks = 0;
+
+		/* We can increase cwnd at most maxincr [ack_ratio/2] */
+		if (*maxincr) {
+			/* increase every 2 acks */
+			hctx->ccid2hctx_ssacks++;
+			if (hctx->ccid2hctx_ssacks == 2) {
+				ccid2_change_cwnd(sk, hctx->ccid2hctx_cwnd + 1);
+				hctx->ccid2hctx_ssacks = 0;
+				*maxincr = *maxincr - 1;
+			}
+		}
+		/* increased cwnd enough for this single ack */
+		else {
+			hctx->ccid2hctx_ssacks = 0;
+		}
+	}
+	else {
+		hctx->ccid2hctx_ssacks = 0;
+		hctx->ccid2hctx_acks++;
+
+		if (hctx->ccid2hctx_acks >= hctx->ccid2hctx_cwnd) {
+			ccid2_change_cwnd(sk, hctx->ccid2hctx_cwnd + 1);
+			hctx->ccid2hctx_acks = 0;
+		}
+	}
+
+	/* update RTO */
+	if (hctx->ccid2hctx_srtt == -1 ||
+	    (jiffies - hctx->ccid2hctx_lastrtt) >= hctx->ccid2hctx_srtt) {
+		unsigned long r = jiffies - seqp->ccid2s_sent;
+		int s;
+
+		/* first measurement */
+		if (hctx->ccid2hctx_srtt == -1) {
+			ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
+			       	       r, jiffies, seqp->ccid2s_seq);
+			hctx->ccid2hctx_srtt = r;
+			hctx->ccid2hctx_rttvar = r >> 1;
+		}
+		else {
+			/* RTTVAR */
+			long tmp = hctx->ccid2hctx_srtt - r;
+			if (tmp < 0)
+				tmp *= -1;
+
+			tmp >>= 2;
+			hctx->ccid2hctx_rttvar *= 3;
+			hctx->ccid2hctx_rttvar >>= 2;
+			hctx->ccid2hctx_rttvar += tmp;
+
+			/* SRTT */
+			hctx->ccid2hctx_srtt *= 7;
+			hctx->ccid2hctx_srtt >>= 3;
+			tmp = r >> 3;
+			hctx->ccid2hctx_srtt += tmp;
+		}
+		s = hctx->ccid2hctx_rttvar << 2;
+		/* clock granularity is 1 when based on jiffies */
+		if (!s)
+			s = 1;
+		hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s;
+
+		/* must be at least a second */
+		s = hctx->ccid2hctx_rto / HZ;
+		/* DCCP doesn't require this [but I like it cuz my code sux] */
+#if 1
+		if (s < 1)
+			hctx->ccid2hctx_rto = HZ;
+#endif
+		/* max 60 seconds */
+		if (s > 60)
+			hctx->ccid2hctx_rto = HZ * 60;
+
+		hctx->ccid2hctx_lastrtt = jiffies;
+
+		ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
+		       	       hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar,
+		       	       hctx->ccid2hctx_rto, HZ, r);
+		hctx->ccid2hctx_sent = 0;
+	}
+
+	/* we got a new ack, so re-start RTO timer */
+	ccid2_hc_tx_kill_rto_timer(hctx);
+	ccid2_start_rto_timer(sk);
+}
+
+static void ccid2_hc_tx_dec_pipe(struct ccid2_hc_tx_sock *hctx)
+{
+	hctx->ccid2hctx_pipe--;
+	BUG_ON(hctx->ccid2hctx_pipe < 0);
+
+	if (hctx->ccid2hctx_pipe == 0)
+		ccid2_hc_tx_kill_rto_timer(hctx);
+}
+
+static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
+	u64 ackno, seqno;
+	struct ccid2_seq *seqp;
+	unsigned char *vector;
+	unsigned char veclen;
+	int offset = 0;
+	int done = 0;
+	int loss = 0;
+	unsigned int maxincr = 0;
+
+	ccid2_hc_tx_check_sanity(hctx);
+	/* check reverse path congestion */
+	seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+
+	/* XXX this whole "algorithm" is broken.  Need to fix it to keep track
+	 * of the seqnos of the dupacks so that rpseq and rpdupack are correct
+	 * -sorbo.
+	 */
+	/* need to bootstrap */
+	if (hctx->ccid2hctx_rpdupack == -1) {
+		hctx->ccid2hctx_rpdupack = 0;
+		hctx->ccid2hctx_rpseq = seqno;
+	}
+	else {
+		/* check if packet is consecutive */
+		if ((hctx->ccid2hctx_rpseq + 1) == seqno) {
+			hctx->ccid2hctx_rpseq++;
+		}
+		/* it's a later packet */
+		else if (after48(seqno, hctx->ccid2hctx_rpseq)) {
+			hctx->ccid2hctx_rpdupack++;
+
+			/* check if we got enough dupacks */
+			if (hctx->ccid2hctx_rpdupack >=
+			    hctx->ccid2hctx_numdupack) {
+
+				hctx->ccid2hctx_rpdupack = -1; /* XXX lame */
+				hctx->ccid2hctx_rpseq = 0;
+
+				ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio << 1);
+			}
+		}
+	}
+
+	/* check forward path congestion */
+	/* still didn't send out new data packets */
+	if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt)
+		return;
+
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_ACK:
+	case DCCP_PKT_DATAACK:
+		break;
+
+	default:
+		return;
+	}
+
+	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+	seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+
+	/* If in slow-start, cwnd can increase at most Ack Ratio / 2 packets for
+	 * this single ack.  I round up.
+	 * -sorbo.
+	 */
+	maxincr = dp->dccps_l_ack_ratio >> 1;
+	maxincr++;
+
+	/* go through all ack vectors */
+	while ((offset = ccid2_ackvector(sk, skb, offset,
+					 &vector, &veclen)) != -1) {
+		/* go through this ack vector */
+		while (veclen--) {
+			const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+			u64 ackno_end_rl;
+
+			dccp_set_seqno(&ackno_end_rl, ackno - rl);
+			ccid2_pr_debug("ackvec start:%llu end:%llu\n", ackno,
+				       ackno_end_rl);
+			/* if the seqno we are analyzing is larger than the
+			 * current ackno, then move towards the tail of our
+			 * seqnos.
+			 */
+			while (after48(seqp->ccid2s_seq, ackno)) {
+				if (seqp == hctx->ccid2hctx_seqt) {
+					done = 1;
+					break;
+				}
+				seqp = seqp->ccid2s_prev;
+			}
+			if (done)
+				break;
+
+			/* check all seqnos in the range of the vector
+			 * run length
+			 */
+			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
+				const u8 state = (*vector &
+						  DCCP_ACKVEC_STATE_MASK) >> 6;
+
+				/* new packet received or marked */
+				if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+				    !seqp->ccid2s_acked) {
+				    	if (state ==
+					    DCCP_ACKVEC_STATE_ECN_MARKED) {
+						loss = 1;
+					}
+					else {
+						ccid2_new_ack(sk, seqp,
+							      &maxincr);
+					}
+
+					seqp->ccid2s_acked = 1;
+					ccid2_pr_debug("Got ack for %llu\n",
+					       	       seqp->ccid2s_seq);
+					ccid2_hc_tx_dec_pipe(hctx);
+				}
+				if (seqp == hctx->ccid2hctx_seqt) {
+					done = 1;
+					break;
+				}
+				seqp = seqp->ccid2s_next;
+			}
+			if (done)
+				break;
+
+
+			dccp_set_seqno(&ackno, ackno_end_rl - 1);
+			vector++;
+		}
+		if (done)
+			break;
+	}
+
+	/* The state about what is acked should be correct now
+	 * Check for NUMDUPACK
+	 */
+	seqp = hctx->ccid2hctx_seqh->ccid2s_prev;
+	done = 0;
+	while (1) {
+		if (seqp->ccid2s_acked) {
+			done++;
+			if (done == hctx->ccid2hctx_numdupack) {
+				break;
+			}
+		}
+		if (seqp == hctx->ccid2hctx_seqt) {
+			break;
+		}
+		seqp = seqp->ccid2s_prev;
+	}
+
+	/* If there are at least 3 acknowledgements, anything unacknowledged
+	 * below the last sequence number is considered lost
+	 */
+	if (done == hctx->ccid2hctx_numdupack) {
+		struct ccid2_seq *last_acked = seqp;
+
+		/* check for lost packets */
+		while (1) {
+			if (!seqp->ccid2s_acked) {
+				loss = 1;
+				ccid2_hc_tx_dec_pipe(hctx);
+			}
+			if (seqp == hctx->ccid2hctx_seqt)
+				break;
+			seqp = seqp->ccid2s_prev;
+		}
+
+		hctx->ccid2hctx_seqt = last_acked;
+	}
+
+	/* trim acked packets in tail */
+	while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) {
+		if (!hctx->ccid2hctx_seqt->ccid2s_acked)
+			break;
+
+		hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next;
+	}
+
+	if (loss) {
+		/* XXX do bit shifts guarantee a 0 as the new bit? */
+		ccid2_change_cwnd(sk, hctx->ccid2hctx_cwnd >> 1);
+		hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd;
+		if (hctx->ccid2hctx_ssthresh < 2)
+			hctx->ccid2hctx_ssthresh = 2;
+	}
+
+	ccid2_hc_tx_check_sanity(hctx);
+}
+
+static int ccid2_hc_tx_init(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+        struct ccid2_hc_tx_sock *hctx;
+	int seqcount = ccid2_seq_len;
+	int i;
+
+        dp->dccps_hc_tx_ccid_private = kzalloc(sizeof(*hctx), gfp_any());
+        if (dp->dccps_hc_tx_ccid_private == NULL)
+                return -ENOMEM;
+
+        hctx = ccid2_hc_tx_sk(sk);
+
+	/* XXX init variables with proper values */
+	hctx->ccid2hctx_cwnd	  = 1;
+	hctx->ccid2hctx_ssthresh  = 10;
+	hctx->ccid2hctx_numdupack = 3;
+
+	/* XXX init ~ to window size... */
+	hctx->ccid2hctx_seqbuf = kmalloc(sizeof(*hctx->ccid2hctx_seqbuf) *
+					 seqcount, gfp_any());
+	if (hctx->ccid2hctx_seqbuf == NULL) {
+		kfree(dp->dccps_hc_tx_ccid_private);
+		dp->dccps_hc_tx_ccid_private = NULL;
+		return -ENOMEM;
+	}
+	for (i = 0; i < (seqcount - 1); i++) {
+		hctx->ccid2hctx_seqbuf[i].ccid2s_next =
+					&hctx->ccid2hctx_seqbuf[i + 1];
+		hctx->ccid2hctx_seqbuf[i + 1].ccid2s_prev =
+					&hctx->ccid2hctx_seqbuf[i];
+	}
+	hctx->ccid2hctx_seqbuf[seqcount - 1].ccid2s_next =
+					hctx->ccid2hctx_seqbuf;
+	hctx->ccid2hctx_seqbuf->ccid2s_prev =
+					&hctx->ccid2hctx_seqbuf[seqcount - 1];
+
+	hctx->ccid2hctx_seqh	 = hctx->ccid2hctx_seqbuf;
+	hctx->ccid2hctx_seqt	 = hctx->ccid2hctx_seqh;
+	hctx->ccid2hctx_sent	 = 0;
+	hctx->ccid2hctx_rto	 = 3 * HZ;
+	hctx->ccid2hctx_srtt	 = -1;
+	hctx->ccid2hctx_rttvar	 = -1;
+	hctx->ccid2hctx_lastrtt  = 0;
+	hctx->ccid2hctx_rpdupack = -1;
+
+	hctx->ccid2hctx_rtotimer.function = &ccid2_hc_tx_rto_expire;
+	hctx->ccid2hctx_rtotimer.data	  = (unsigned long)sk;
+	init_timer(&hctx->ccid2hctx_rtotimer);
+
+	ccid2_hc_tx_check_sanity(hctx);
+	return 0;
+}
+
+static void ccid2_hc_tx_exit(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+        struct ccid2_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	ccid2_hc_tx_kill_rto_timer(hctx);
+
+	kfree(hctx->ccid2hctx_seqbuf);
+
+	kfree(dp->dccps_hc_tx_ccid_private);
+	dp->dccps_hc_tx_ccid_private = NULL;
+}
+
+static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid2_hc_rx_sock *hcrx = ccid2_hc_rx_sk(sk);
+
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_DATA:
+	case DCCP_PKT_DATAACK:
+		hcrx->ccid2hcrx_data++;
+		if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) {
+			dccp_send_ack(sk);
+			hcrx->ccid2hcrx_data = 0;
+		}
+		break;
+	}
+}
+
+static int ccid2_hc_rx_init(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+        dp->dccps_hc_rx_ccid_private = kzalloc(sizeof(struct ccid2_hc_rx_sock),
+					       gfp_any());
+        return dp->dccps_hc_rx_ccid_private == NULL ? -ENOMEM : 0;
+}
+
+static void ccid2_hc_rx_exit(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	kfree(dp->dccps_hc_rx_ccid_private);
+	dp->dccps_hc_rx_ccid_private = NULL;
+}
+
+static struct ccid ccid2 = {
+	.ccid_id		= 2,
+	.ccid_name		= "ccid2",
+	.ccid_owner		= THIS_MODULE,
+	.ccid_hc_tx_init	= ccid2_hc_tx_init,
+	.ccid_hc_tx_exit	= ccid2_hc_tx_exit,
+	.ccid_hc_tx_send_packet	= ccid2_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	= ccid2_hc_tx_packet_sent,
+	.ccid_hc_tx_packet_recv	= ccid2_hc_tx_packet_recv,
+	.ccid_hc_rx_init	= ccid2_hc_rx_init,
+	.ccid_hc_rx_exit	= ccid2_hc_rx_exit,
+	.ccid_hc_rx_packet_recv	= ccid2_hc_rx_packet_recv,
+};
+
+module_param(ccid2_debug, int, 0444);
+MODULE_PARM_DESC(ccid2_debug, "Enable debug messages");
+
+static __init int ccid2_module_init(void)
+{
+	return ccid_register(&ccid2);
+}
+module_init(ccid2_module_init);
+
+static __exit void ccid2_module_exit(void)
+{
+	ccid_unregister(&ccid2);
+}
+module_exit(ccid2_module_exit);
+
+MODULE_AUTHOR("Andrea Bittau <a.bittau@cs.ucl.ac.uk>");
+MODULE_DESCRIPTION("DCCP TCP CCID2 CCID");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("net-dccp-ccid-2");
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
new file mode 100644
index 000000000000..0b08c90955a9
--- /dev/null
+++ b/net/dccp/ccids/ccid2.h
@@ -0,0 +1,69 @@
+/*
+ *  net/dccp/ccids/ccid2.h
+ *
+ *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID2_H_
+#define _DCCP_CCID2_H_
+
+struct ccid2_seq {
+	u64			ccid2s_seq;
+	unsigned long		ccid2s_sent;
+	int			ccid2s_acked;
+	struct ccid2_seq	*ccid2s_prev;
+	struct ccid2_seq	*ccid2s_next;
+};
+
+/** struct ccid2_hc_tx_sock - CCID2 TX half connection
+ *
+ * @ccid2hctx_ssacks - ACKs recv in slow start
+ * @ccid2hctx_acks - ACKS recv in AI phase
+ * @ccid2hctx_sent - packets sent in this window
+ * @ccid2hctx_lastrtt -time RTT was last measured
+ * @ccid2hctx_arsent - packets sent [ack ratio]
+ * @ccid2hctx_ackloss - ack was lost in this win
+ * @ccid2hctx_rpseq - last consecutive seqno
+ * @ccid2hctx_rpdupack - dupacks since rpseq
+*/
+struct ccid2_hc_tx_sock {
+	int			ccid2hctx_cwnd;
+	int			ccid2hctx_ssacks;
+	int			ccid2hctx_acks;
+	int			ccid2hctx_ssthresh;
+	int			ccid2hctx_pipe;
+	int			ccid2hctx_numdupack;
+	struct ccid2_seq	*ccid2hctx_seqbuf;
+	struct ccid2_seq	*ccid2hctx_seqh;
+	struct ccid2_seq	*ccid2hctx_seqt;
+	long			ccid2hctx_rto;
+	long			ccid2hctx_srtt;
+	long			ccid2hctx_rttvar;
+	int			ccid2hctx_sent;
+	unsigned long		ccid2hctx_lastrtt;
+	struct timer_list	ccid2hctx_rtotimer;
+	unsigned long		ccid2hctx_arsent;
+	int			ccid2hctx_ackloss;
+	u64			ccid2hctx_rpseq;
+	int			ccid2hctx_rpdupack;
+	int			ccid2hctx_sendwait;
+};
+
+struct ccid2_hc_rx_sock {
+	int	ccid2hcrx_data;
+};
+
+#endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 2ab6f0e6cd62..38321ad81875 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1081,6 +1081,7 @@ int dccp_v4_init_sock(struct sock *sk)
 	dp->dccps_mss_cache = 536;
 	dp->dccps_role = DCCP_ROLE_UNDEFINED;
 	dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
+	dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
 
 	return 0;
 }
-- 
cgit v1.2.3


From afe00251dd9b53d51de91ff0099961f42bbf3754 Mon Sep 17 00:00:00 2001
From: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Date: Mon, 20 Mar 2006 17:43:56 -0800
Subject: [DCCP]: Initial feature negotiation implementation

Still needs more work, but boots and doesn't crashes, even
does some negotiation!

18:38:52.174934  127.0.0.1.43458 > 127.0.0.1.5001: request <change_l ack_ratio 2, change_r ccid 2, change_l ccid 2>
18:38:52.218526  127.0.0.1.5001 > 127.0.0.1.43458: response <nop, nop, change_l ack_ratio 2, confirm_r ccid 2 2, confirm_l ccid 2 2, confirm_r ack_ratio 2>
18:38:52.185398  127.0.0.1.43458 > 127.0.0.1.5001: <nop, confirm_r ack_ratio 2, ack_vector0 0x00, elapsed_time 212>

:-)

Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h |  35 +++-
 net/dccp/Makefile    |   2 +-
 net/dccp/feat.c      | 554 +++++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/feat.h      |  28 +++
 net/dccp/input.c     |   3 +
 net/dccp/ipv4.c      |  19 +-
 net/dccp/minisocks.c |   4 +
 net/dccp/options.c   | 139 ++++++++++++-
 net/dccp/output.c    |   4 +
 net/dccp/proto.c     |  53 +++++
 net/dccp/timer.c     |  12 ++
 11 files changed, 847 insertions(+), 6 deletions(-)
 create mode 100644 net/dccp/feat.c
 create mode 100644 net/dccp/feat.h

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 268b4579d7e5..f91c8a62406d 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -154,6 +154,10 @@ enum {
 	DCCPO_MANDATORY = 1,
 	DCCPO_MIN_RESERVED = 3,
 	DCCPO_MAX_RESERVED = 31,
+	DCCPO_CHANGE_L = 32,
+	DCCPO_CONFIRM_L = 33,
+	DCCPO_CHANGE_R = 34,
+	DCCPO_CONFIRM_R = 35,
 	DCCPO_NDP_COUNT = 37,
 	DCCPO_ACK_VECTOR_0 = 38,
 	DCCPO_ACK_VECTOR_1 = 39,
@@ -168,7 +172,9 @@ enum {
 /* DCCP features */
 enum {
 	DCCPF_RESERVED = 0,
+	DCCPF_CCID = 1,
 	DCCPF_SEQUENCE_WINDOW = 3,
+	DCCPF_ACK_RATIO = 5,
 	DCCPF_SEND_ACK_VECTOR = 6,
 	DCCPF_SEND_NDP_COUNT = 7,
 	/* 10-127 reserved */
@@ -176,9 +182,18 @@ enum {
 	DCCPF_MAX_CCID_SPECIFIC = 255,
 };
 
+/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */
+struct dccp_so_feat {
+	__u8 dccpsf_feat;
+	__u8 *dccpsf_val;
+	__u8 dccpsf_len;
+};
+
 /* DCCP socket options */
 #define DCCP_SOCKOPT_PACKET_SIZE	1
 #define DCCP_SOCKOPT_SERVICE		2
+#define DCCP_SOCKOPT_CHANGE_L		3
+#define DCCP_SOCKOPT_CHANGE_R		4
 #define DCCP_SOCKOPT_CCID_RX_INFO	128
 #define DCCP_SOCKOPT_CCID_TX_INFO	192
 
@@ -314,8 +329,8 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
 
 /* initial values for each feature */
 #define DCCPF_INITIAL_SEQUENCE_WINDOW		100
-/* FIXME: for now we're using CCID 2 (TCP-Like) */
 #define DCCPF_INITIAL_CCID			2
+#define DCCPF_INITIAL_ACK_RATIO			2
 #define DCCPF_INITIAL_SEND_ACK_VECTOR		1
 /* FIXME: for now we're default to 1 but it should really be 0 */
 #define DCCPF_INITIAL_SEND_NDP_COUNT		1
@@ -335,6 +350,24 @@ struct dccp_options {
 	__u8	dccpo_tx_ccid;
 	__u8	dccpo_send_ack_vector;
 	__u8	dccpo_send_ndp_count;
+	__u8			dccpo_ack_ratio;
+	struct list_head	dccpo_pending;
+	struct list_head	dccpo_conf;
+};
+
+struct dccp_opt_conf {
+	__u8			*dccpoc_val;
+	__u8			dccpoc_len;
+};
+
+struct dccp_opt_pend {
+	struct list_head	dccpop_node;
+	__u8			dccpop_type;
+	__u8			dccpop_feat;
+	__u8		        *dccpop_val;
+	__u8			dccpop_len;
+	int			dccpop_conf;
+	struct dccp_opt_conf    *dccpop_sc;
 };
 
 extern void __dccp_options_init(struct dccp_options *dccpo);
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 87b27fff6e3b..5736acea1c86 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -4,7 +4,7 @@ dccp_ipv6-y := ipv6.o
 
 obj-$(CONFIG_IP_DCCP) += dccp.o
 
-dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
+dccp-y := ccid.o feat.o input.o ipv4.o minisocks.o options.o output.o proto.o \
 	  timer.o
 
 dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
new file mode 100644
index 000000000000..99d7b7f9efa9
--- /dev/null
+++ b/net/dccp/feat.c
@@ -0,0 +1,554 @@
+/*
+ *  net/dccp/feat.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include "dccp.h"
+#include "feat.h"
+
+#define DCCP_FEAT_SP_NOAGREE (-123)
+
+int dccp_feat_change(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len,
+		     gfp_t gfp)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_opt_pend *opt;
+
+	dccp_pr_debug("feat change type=%d feat=%d\n", type, feature);
+
+	/* check if that feature is already being negotiated */
+	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
+			    dccpop_node) {
+		/* ok we found a negotiation for this option already */
+		if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
+			dccp_pr_debug("Replacing old\n");
+			/* replace */
+			BUG_ON(opt->dccpop_val == NULL);
+			kfree(opt->dccpop_val);
+			opt->dccpop_val	 = val;
+			opt->dccpop_len	 = len;
+			opt->dccpop_conf = 0;
+			return 0;
+		}
+	}
+
+	/* negotiation for a new feature */
+	opt = kmalloc(sizeof(*opt), gfp);
+	if (opt == NULL)
+		return -ENOMEM;
+
+	opt->dccpop_type = type;
+	opt->dccpop_feat = feature;
+	opt->dccpop_len	 = len;
+	opt->dccpop_val	 = val;
+	opt->dccpop_conf = 0;
+	opt->dccpop_sc	 = NULL;
+
+	BUG_ON(opt->dccpop_val == NULL);
+
+	list_add_tail(&opt->dccpop_node, &dp->dccps_options.dccpo_pending);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_feat_change);
+
+/* XXX taking only u8 vals */
+static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val)
+{
+	/* FIXME implement */
+	dccp_pr_debug("changing [%d] feat %d to %d\n", type, feat, val);
+	return 0;
+}
+
+static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
+			       u8 *rpref, u8 rlen)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	u8 *spref, slen, *res = NULL;
+	int i, j, rc, agree = 1;
+
+	BUG_ON(rpref == NULL);
+
+	/* check if we are the black sheep */
+	if (dp->dccps_role == DCCP_ROLE_CLIENT) {
+		spref = rpref;
+		slen  = rlen;
+		rpref = opt->dccpop_val;
+		rlen  = opt->dccpop_len;
+	} else {
+		spref = opt->dccpop_val;
+		slen  = opt->dccpop_len;
+	}
+	/*
+	 * Now we have server preference list in spref and client preference in
+	 * rpref
+	 */
+	BUG_ON(spref == NULL);
+	BUG_ON(rpref == NULL);
+
+	/* FIXME sanity check vals */
+
+	/* Are values in any order?  XXX Lame "algorithm" here */
+	/* XXX assume values are 1 byte */
+	for (i = 0; i < slen; i++) {
+		for (j = 0; j < rlen; j++) {
+			if (spref[i] == rpref[j]) {
+				res = &spref[i];
+				break;
+			}
+		}
+		if (res)
+			break;
+	}
+
+	/* we didn't agree on anything */
+	if (res == NULL) {
+		/* confirm previous value */
+		switch (opt->dccpop_feat) {
+		case DCCPF_CCID:
+			/* XXX did i get this right? =P */
+			if (opt->dccpop_type == DCCPO_CHANGE_L)
+				res = &dp->dccps_options.dccpo_tx_ccid;
+			else
+				res = &dp->dccps_options.dccpo_rx_ccid;
+			break;
+
+		default:
+			WARN_ON(1); /* XXX implement res */
+			return -EFAULT;
+		}
+
+		dccp_pr_debug("Don't agree... reconfirming %d\n", *res);
+		agree = 0; /* this is used for mandatory options... */
+	}
+
+	/* need to put result and our preference list */
+	/* XXX assume 1 byte vals */
+	rlen = 1 + opt->dccpop_len;
+	rpref = kmalloc(rlen, GFP_ATOMIC);
+	if (rpref == NULL)
+		return -ENOMEM;
+
+	*rpref = *res;
+	memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len);
+
+	/* put it in the "confirm queue" */
+	if (opt->dccpop_sc == NULL) {
+		opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC);
+		if (opt->dccpop_sc == NULL) {
+			kfree(rpref);
+			return -ENOMEM;
+		}
+	} else {
+		/* recycle the confirm slot */
+		BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
+		kfree(opt->dccpop_sc->dccpoc_val);
+		dccp_pr_debug("recycling confirm slot\n");
+	}
+	memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc));
+
+	opt->dccpop_sc->dccpoc_val = rpref;
+	opt->dccpop_sc->dccpoc_len = rlen;
+
+	/* update the option on our side [we are about to send the confirm] */
+	rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res);
+	if (rc) {
+		kfree(opt->dccpop_sc->dccpoc_val);
+		kfree(opt->dccpop_sc);
+		opt->dccpop_sc = 0;
+		return rc;
+	}
+
+	dccp_pr_debug("Will confirm %d\n", *rpref);
+
+	/* say we want to change to X but we just got a confirm X, suppress our
+	 * change
+	 */
+	if (!opt->dccpop_conf) {
+		if (*opt->dccpop_val == *res)
+			opt->dccpop_conf = 1;
+		dccp_pr_debug("won't ask for change of same feature\n");
+	}
+
+	return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */
+}
+
+static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_opt_pend *opt;
+	int rc = 1;
+	u8 t;
+
+	/*
+	 * We received a CHANGE.  We gotta match it against our own preference
+	 * list.  If we got a CHANGE_R it means it's a change for us, so we need
+	 * to compare our CHANGE_L list.
+	 */
+	if (type == DCCPO_CHANGE_L)
+		t = DCCPO_CHANGE_R;
+	else
+		t = DCCPO_CHANGE_L;
+
+	/* find our preference list for this feature */
+	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
+			    dccpop_node) {
+		if (opt->dccpop_type != t || opt->dccpop_feat != feature)
+			continue;
+
+		/* find the winner from the two preference lists */
+		rc = dccp_feat_reconcile(sk, opt, val, len);
+		break;
+	}
+
+	/* We didn't deal with the change.  This can happen if we have no
+	 * preference list for the feature.  In fact, it just shouldn't
+	 * happen---if we understand a feature, we should have a preference list
+	 * with at least the default value.
+	 */
+	BUG_ON(rc == 1);
+
+	return rc;
+}
+
+static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
+{
+	struct dccp_opt_pend *opt;
+	struct dccp_sock *dp = dccp_sk(sk);
+	u8 *copy;
+	int rc;
+
+	/* NN features must be change L */
+	if (type == DCCPO_CHANGE_R) {
+		dccp_pr_debug("received CHANGE_R %d for NN feat %d\n",
+			      type, feature);
+		return -EFAULT;
+	}
+
+	/* XXX sanity check opt val */
+
+	/* copy option so we can confirm it */
+	opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
+	if (opt == NULL)
+		return -ENOMEM;
+
+	copy = kmalloc(len, GFP_ATOMIC);
+	if (copy == NULL) {
+		kfree(opt);
+		return -ENOMEM;
+	}
+	memcpy(copy, val, len);
+
+	opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */
+	opt->dccpop_feat = feature;
+	opt->dccpop_val	 = copy;
+	opt->dccpop_len	 = len;
+
+	/* change feature */
+	rc = dccp_feat_update(sk, type, feature, *val);
+	if (rc) {
+		kfree(opt->dccpop_val);
+		kfree(opt);
+		return rc;
+	}
+
+	dccp_pr_debug("Confirming NN feature %d (val=%d)\n", feature, *copy);
+	list_add_tail(&opt->dccpop_node, &dp->dccps_options.dccpo_conf);
+
+	return 0;
+}
+
+static void dccp_feat_empty_confirm(struct sock *sk, u8 type, u8 feature)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	/* XXX check if other confirms for that are queued and recycle slot */
+	struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
+
+	if (opt == NULL) {
+		/* XXX what do we do?  Ignoring should be fine.  It's a change
+		 * after all =P
+		 */
+		return;
+	}
+
+	opt->dccpop_type = type == DCCPO_CHANGE_L ? DCCPO_CONFIRM_R :
+						    DCCPO_CONFIRM_L;
+	opt->dccpop_feat = feature;
+	opt->dccpop_val	 = 0;
+	opt->dccpop_len	 = 0;
+
+	/* change feature */
+	dccp_pr_debug("Empty confirm feature %d type %d\n", feature, type);
+	list_add_tail(&opt->dccpop_node, &dp->dccps_options.dccpo_conf);
+}
+
+static void dccp_feat_flush_confirm(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	/* Check if there is anything to confirm in the first place */
+	int yes = !list_empty(&dp->dccps_options.dccpo_conf);
+
+	if (!yes) {
+		struct dccp_opt_pend *opt;
+
+		list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
+				    dccpop_node) {
+			if (opt->dccpop_conf) {
+				yes = 1;
+				break;
+			}
+		}
+	}
+
+	if (!yes)
+		return;
+
+	/* OK there is something to confirm... */
+	/* XXX check if packet is in flight?  Send delayed ack?? */
+	if (sk->sk_state == DCCP_OPEN)
+		dccp_send_ack(sk);
+}
+
+int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
+{
+	int rc;
+
+	dccp_pr_debug("got feat change type=%d feat=%d\n", type, feature);
+
+	/* figure out if it's SP or NN feature */
+	switch (feature) {
+	/* deal with SP features */
+	case DCCPF_CCID:
+		rc = dccp_feat_sp(sk, type, feature, val, len);
+		break;
+
+	/* deal with NN features */
+	case DCCPF_ACK_RATIO:
+		rc = dccp_feat_nn(sk, type, feature, val, len);
+		break;
+
+	/* XXX implement other features */
+	default:
+		rc = -EFAULT;
+		break;
+	}
+
+	/* check if there were problems changing features */
+	if (rc) {
+		/* If we don't agree on SP, we sent a confirm for old value.
+		 * However we propagate rc to caller in case option was
+		 * mandatory
+		 */
+		if (rc != DCCP_FEAT_SP_NOAGREE)
+			dccp_feat_empty_confirm(sk, type, feature);
+	}
+
+	/* generate the confirm [if required] */
+	dccp_feat_flush_confirm(sk);
+
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(dccp_feat_change_recv);
+
+int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
+			   u8 *val, u8 len)
+{
+	u8 t;
+	struct dccp_opt_pend *opt;
+	struct dccp_sock *dp = dccp_sk(sk);
+	int rc = 1;
+	int all_confirmed = 1;
+
+	dccp_pr_debug("got feat confirm type=%d feat=%d\n", type, feature);
+
+	/* XXX sanity check type & feat */
+
+	/* locate our change request */
+	t = type == DCCPO_CONFIRM_L ? DCCPO_CHANGE_R : DCCPO_CHANGE_L;
+
+	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
+			    dccpop_node) {
+		if (!opt->dccpop_conf && opt->dccpop_type == t &&
+		    opt->dccpop_feat == feature) {
+			/* we found it */
+			/* XXX do sanity check */
+
+			opt->dccpop_conf = 1;
+
+			/* We got a confirmation---change the option */
+			dccp_feat_update(sk, opt->dccpop_type,
+					 opt->dccpop_feat, *val);
+
+			dccp_pr_debug("feat %d type %d confirmed %d\n",
+				      feature, type, *val);
+			rc = 0;
+			break;
+		}
+
+		if (!opt->dccpop_conf)
+			all_confirmed = 0;
+	}
+
+	/* fix re-transmit timer */
+	/* XXX gotta make sure that no option negotiation occurs during
+	 * connection shutdown.  Consider that the CLOSEREQ is sent and timer is
+	 * on.  if all options are confirmed it might kill timer which should
+	 * remain alive until close is received.
+	 */
+	if (all_confirmed) {
+		dccp_pr_debug("clear feat negotiation timer %p\n", sk);
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+	}
+
+	if (rc)
+		dccp_pr_debug("feat %d type %d never requested\n",
+			      feature, type);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
+
+void dccp_feat_clean(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_opt_pend *opt, *next;
+
+	list_for_each_entry_safe(opt, next, &dp->dccps_options.dccpo_pending,
+				 dccpop_node) {
+                BUG_ON(opt->dccpop_val == NULL);
+                kfree(opt->dccpop_val);
+
+		if (opt->dccpop_sc != NULL) {
+			BUG_ON(opt->dccpop_sc->dccpoc_val == NULL);
+			kfree(opt->dccpop_sc->dccpoc_val);
+			kfree(opt->dccpop_sc);
+		}
+
+                kfree(opt);
+        }
+	INIT_LIST_HEAD(&dp->dccps_options.dccpo_pending);
+
+	list_for_each_entry_safe(opt, next, &dp->dccps_options.dccpo_conf,
+				 dccpop_node) {
+		BUG_ON(opt == NULL);
+		if (opt->dccpop_val != NULL)
+			kfree(opt->dccpop_val);
+		kfree(opt);
+	}
+	INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
+}
+
+EXPORT_SYMBOL_GPL(dccp_feat_clean);
+
+/* this is to be called only when a listening sock creates its child.  It is
+ * assumed by the function---the confirm is not duplicated, but rather it is
+ * "passed on".
+ */
+int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
+{
+	struct dccp_sock *olddp = dccp_sk(oldsk);
+	struct dccp_sock *newdp = dccp_sk(newsk);
+	struct dccp_opt_pend *opt;
+	int rc = 0;
+
+	INIT_LIST_HEAD(&newdp->dccps_options.dccpo_pending);
+	INIT_LIST_HEAD(&newdp->dccps_options.dccpo_conf);
+
+	list_for_each_entry(opt, &olddp->dccps_options.dccpo_pending,
+			    dccpop_node) {
+		struct dccp_opt_pend *newopt;
+		/* copy the value of the option */
+		u8 *val = kmalloc(opt->dccpop_len, GFP_ATOMIC);
+
+		if (val == NULL)
+			goto out_clean;
+		memcpy(val, opt->dccpop_val, opt->dccpop_len);
+
+		newopt = kmalloc(sizeof(*newopt), GFP_ATOMIC);
+		if (newopt == NULL) {
+			kfree(val);
+			goto out_clean;
+		}
+
+		/* insert the option */
+		memcpy(newopt, opt, sizeof(*newopt));
+		newopt->dccpop_val = val;
+		list_add_tail(&newopt->dccpop_node,
+			      &newdp->dccps_options.dccpo_pending);
+
+		/* XXX what happens with backlogs and multiple connections at
+		 * once...
+		 */
+		/* the master socket no longer needs to worry about confirms */
+		opt->dccpop_sc = 0; /* it's not a memleak---new socket has it */
+
+		/* reset state for a new socket */
+		opt->dccpop_conf = 0;
+	}
+
+	/* XXX not doing anything about the conf queue */
+
+out:
+	return rc;
+
+out_clean:
+	dccp_feat_clean(newsk);
+	rc = -ENOMEM;
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_feat_clone);
+
+static int __dccp_feat_init(struct sock *sk, u8 type, u8 feat, u8 *val, u8 len)
+{
+	int rc = -ENOMEM;
+	u8 *copy = kmalloc(len, GFP_KERNEL);
+
+	if (copy != NULL) {
+		memcpy(copy, val, len);
+		rc = dccp_feat_change(sk, type, feat, copy, len, GFP_KERNEL);
+		if (rc)
+			kfree(copy);
+	}
+	return rc;
+}
+
+int dccp_feat_init(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	int rc;
+
+	INIT_LIST_HEAD(&dp->dccps_options.dccpo_pending);
+	INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
+
+	/* CCID L */
+	rc = __dccp_feat_init(sk, DCCPO_CHANGE_L, DCCPF_CCID,
+			      &dp->dccps_options.dccpo_tx_ccid, 1);
+	if (rc)
+		goto out;
+
+	/* CCID R */
+	rc = __dccp_feat_init(sk, DCCPO_CHANGE_R, DCCPF_CCID,
+			      &dp->dccps_options.dccpo_rx_ccid, 1);
+	if (rc)
+		goto out;
+
+	/* Ack ratio */
+	rc = __dccp_feat_init(sk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO,
+			      &dp->dccps_options.dccpo_ack_ratio, 1);
+out:
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(dccp_feat_init);
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
new file mode 100644
index 000000000000..67da06265f14
--- /dev/null
+++ b/net/dccp/feat.h
@@ -0,0 +1,28 @@
+#ifndef _DCCP_FEAT_H
+#define _DCCP_FEAT_H
+/*
+ *  net/dccp/feat.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+
+struct sock;
+
+extern int  dccp_feat_change(struct sock *sk, u8 type, u8 feature,
+			     u8 *val, u8 len, gfp_t gfp);
+extern int  dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
+				  u8 *val, u8 len);
+extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
+				   u8 *val, u8 len);
+extern void dccp_feat_clean(struct sock *sk);
+extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
+extern int  dccp_feat_init(struct sock *sk);
+
+#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index b6cba72b44e8..4b6d43d8b920 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -300,6 +300,9 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			goto out_invalid_packet;
 		}
 
+		if (dccp_parse_options(sk, skb))
+			goto out_invalid_packet;
+
                 if (dp->dccps_options.dccpo_send_ack_vector &&
                     dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
                                     DCCP_SKB_CB(skb)->dccpd_seq,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 38321ad81875..fcfb486f90c2 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -28,6 +28,7 @@
 #include "ackvec.h"
 #include "ccid.h"
 #include "dccp.h"
+#include "feat.h"
 
 struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
@@ -535,7 +536,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-	/* FIXME: process options */
+	if (dccp_parse_options(sk, skb))
+		goto drop;
 
 	dccp_openreq_init(req, &dp, skb);
 
@@ -1049,6 +1051,8 @@ int dccp_v4_init_sock(struct sock *sk)
 	 * setsockopt(CCIDs-I-want/accept). -acme
 	 */
 	if (likely(!dccp_ctl_socket_init)) {
+		int rc;
+
 		if (dp->dccps_options.dccpo_send_ack_vector) {
 			dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
 			if (dp->dccps_hc_rx_ackvec == NULL)
@@ -1069,8 +1073,16 @@ int dccp_v4_init_sock(struct sock *sk)
 			dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
 			return -ENOMEM;
 		}
-	} else
+
+		rc = dccp_feat_init(sk);
+		if (rc)
+			return rc;
+	} else {
+		/* control socket doesn't need feat nego */
+		INIT_LIST_HEAD(&dp->dccps_options.dccpo_pending);
+		INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
 		dccp_ctl_socket_init = 0;
+	}
 
 	dccp_init_xmit_timers(sk);
 	icsk->icsk_rto = DCCP_TIMEOUT_INIT;
@@ -1118,6 +1130,9 @@ int dccp_v4_destroy_sock(struct sock *sk)
 	ccid_exit(dp->dccps_hc_tx_ccid, sk);
 	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
 
+	/* clean up feature negotiation state */
+	dccp_feat_clean(sk);
+
 	return 0;
 }
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index a60a3e948c36..9e1de5919ee5 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -22,6 +22,7 @@
 #include "ackvec.h"
 #include "ccid.h"
 #include "dccp.h"
+#include "feat.h"
 
 struct inet_timewait_death_row dccp_death_row = {
 	.sysctl_max_tw_buckets = NR_FILE * 2,
@@ -114,6 +115,9 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		newicsk->icsk_rto	   = DCCP_TIMEOUT_INIT;
 		do_gettimeofday(&newdp->dccps_epoch);
 
+		if (dccp_feat_clone(sk, newsk))
+			goto out_free;
+
 		if (newdp->dccps_options.dccpo_send_ack_vector) {
 			newdp->dccps_hc_rx_ackvec =
 						dccp_ackvec_alloc(GFP_ATOMIC);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 0a76426c9aea..7f99306c8e99 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -21,12 +21,14 @@
 #include "ackvec.h"
 #include "ccid.h"
 #include "dccp.h"
+#include "feat.h"
 
 /* stores the default values for new connection. may be changed with sysctl */
 static const struct dccp_options dccpo_default_values = {
 	.dccpo_sequence_window	  = DCCPF_INITIAL_SEQUENCE_WINDOW,
 	.dccpo_rx_ccid		  = DCCPF_INITIAL_CCID,
 	.dccpo_tx_ccid		  = DCCPF_INITIAL_CCID,
+	.dccpo_ack_ratio	  = DCCPF_INITIAL_ACK_RATIO,
 	.dccpo_send_ack_vector	  = DCCPF_INITIAL_SEND_ACK_VECTOR,
 	.dccpo_send_ndp_count	  = DCCPF_INITIAL_SEND_NDP_COUNT,
 };
@@ -69,6 +71,8 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 	unsigned char opt, len;
 	unsigned char *value;
 	u32 elapsed_time;
+	int rc;
+	int mandatory = 0;
 
 	memset(opt_recv, 0, sizeof(*opt_recv));
 
@@ -100,6 +104,11 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 		switch (opt) {
 		case DCCPO_PADDING:
 			break;
+		case DCCPO_MANDATORY:
+			if (mandatory)
+				goto out_invalid_option;
+			mandatory = 1;
+			break;
 		case DCCPO_NDP_COUNT:
 			if (len > 3)
 				goto out_invalid_option;
@@ -108,6 +117,31 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 			dccp_pr_debug("%sNDP count=%d\n", debug_prefix,
 				      opt_recv->dccpor_ndp);
 			break;
+		case DCCPO_CHANGE_L:
+			/* fall through */
+		case DCCPO_CHANGE_R:
+			if (len < 2)
+				goto out_invalid_option;
+			rc = dccp_feat_change_recv(sk, opt, *value, value + 1,
+						   len - 1);
+			/*
+			 * When there is a change error, change_recv is
+			 * responsible for dealing with it.  i.e. reply with an
+			 * empty confirm.
+			 * If the change was mandatory, then we need to die.
+			 */
+			if (rc && mandatory)
+				goto out_invalid_option;
+			break;
+		case DCCPO_CONFIRM_L:
+			/* fall through */
+		case DCCPO_CONFIRM_R:
+			if (len < 2)
+				goto out_invalid_option;
+			if (dccp_feat_confirm_recv(sk, opt, *value,
+						   value + 1, len - 1))
+				goto out_invalid_option;
+			break;
 		case DCCPO_ACK_VECTOR_0:
 		case DCCPO_ACK_VECTOR_1:
 			if (pkt_type == DCCP_PKT_DATA)
@@ -208,6 +242,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 				sk, opt, len);
 			break;
 	        }
+
+		if (opt != DCCPO_MANDATORY)
+			mandatory = 0;
 	}
 
 	return 0;
@@ -356,7 +393,7 @@ void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
 {
 	struct timeval tv;
 	u32 now;
-	
+
 	dccp_timestamp(sk, &tv);
 	now = timeval_usecs(&tv) / 10;
 	/* yes this will overflow but that is the point as we want a
@@ -402,7 +439,7 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk,
 	tstamp_echo = htonl(dp->dccps_timestamp_echo);
 	memcpy(to, &tstamp_echo, 4);
 	to += 4;
-	
+
 	if (elapsed_time_len == 2) {
 		const u16 var16 = htons((u16)elapsed_time);
 		memcpy(to, &var16, 2);
@@ -421,6 +458,93 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk,
 	dp->dccps_timestamp_time.tv_usec = 0;
 }
 
+static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
+			        u8 *val, u8 len)
+{
+	u8 *to;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small"
+			       " to insert feature %d option!\n", feat);
+		return -1;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3;
+
+	to    = skb_push(skb, len + 3);
+	*to++ = type;
+	*to++ = len + 3;
+	*to++ = feat;
+
+	if (len)
+		memcpy(to, val, len);
+	dccp_pr_debug("option %d feat %d len %d\n", type, feat, len);
+
+	return 0;
+}
+
+static void dccp_insert_feat(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_opt_pend *opt, *next;
+	int change = 0;
+
+	/* confirm any options [NN opts] */
+	list_for_each_entry_safe(opt, next, &dp->dccps_options.dccpo_conf,
+				 dccpop_node) {
+		dccp_insert_feat_opt(skb, opt->dccpop_type,
+				     opt->dccpop_feat, opt->dccpop_val,
+				     opt->dccpop_len);
+		/* fear empty confirms */
+		if (opt->dccpop_val)
+			kfree(opt->dccpop_val);
+		kfree(opt);
+	}
+	INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
+
+	/* see which features we need to send */
+	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
+			    dccpop_node) {
+		/* see if we need to send any confirm */
+		if (opt->dccpop_sc) {
+			dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
+					     opt->dccpop_feat,
+					     opt->dccpop_sc->dccpoc_val,
+					     opt->dccpop_sc->dccpoc_len);
+
+			BUG_ON(!opt->dccpop_sc->dccpoc_val);
+			kfree(opt->dccpop_sc->dccpoc_val);
+			kfree(opt->dccpop_sc);
+			opt->dccpop_sc = NULL;
+		}
+
+		/* any option not confirmed, re-send it */
+		if (!opt->dccpop_conf) {
+			dccp_insert_feat_opt(skb, opt->dccpop_type,
+					     opt->dccpop_feat, opt->dccpop_val,
+					     opt->dccpop_len);
+			change++;
+		}
+	}
+
+	/* Retransmit timer.
+	 * If this is the master listening sock, we don't set a timer on it.  It
+	 * should be fine because if the dude doesn't receive our RESPONSE
+	 * [which will contain the CHANGE] he will send another REQUEST which
+	 * will "retrnasmit" the change.
+	 */
+	if (change && dp->dccps_role != DCCP_ROLE_LISTEN) {
+		dccp_pr_debug("reset feat negotiation timer %p\n", sk);
+
+		/* XXX don't reset the timer on re-transmissions.  I.e. reset it
+		 * only when sending new stuff i guess.  Currently the timer
+		 * never backs off because on re-transmission it just resets it!
+		 */
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  inet_csk(sk)->icsk_rto, DCCP_RTO_MAX);
+	}
+}
+
 void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -447,6 +571,17 @@ void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 		dp->dccps_hc_tx_insert_options = 0;
 	}
 
+	/* Feature negotiation */
+	switch(DCCP_SKB_CB(skb)->dccpd_type) {
+		/* Data packets can't do feat negotiation */
+	case DCCP_PKT_DATA:
+	case DCCP_PKT_DATAACK:
+		break;
+	default:
+		dccp_insert_feat(sk, skb);
+		break;
+	}
+
 	/* XXX: insert other options when appropriate */
 
 	if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) {
diff --git a/net/dccp/output.c b/net/dccp/output.c
index efd7ffb903a1..0cc2bcf56522 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -64,6 +64,10 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		case DCCP_PKT_DATAACK:
 			break;
 
+		case DCCP_PKT_REQUEST:
+			set_ack = 0;
+			/* fall through */
+
 		case DCCP_PKT_SYNC:
 		case DCCP_PKT_SYNCACK:
 			ackno = dcb->dccpd_seq;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 81ad24953710..1a8cf8ecfe63 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -37,6 +37,7 @@
 
 #include "ccid.h"
 #include "dccp.h"
+#include "feat.h"
 
 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 
@@ -255,6 +256,39 @@ static int dccp_setsockopt_service(struct sock *sk, const u32 service,
 	return 0;
 }
 
+/* byte 1 is feature.  the rest is the preference list */
+static int dccp_setsockopt_change(struct sock *sk, int type,
+				  struct dccp_so_feat __user *optval)
+{
+	struct dccp_so_feat opt;
+	u8 *val;
+	int rc;
+
+	if (copy_from_user(&opt, optval, sizeof(opt)))
+		return -EFAULT;
+
+	val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
+	if (!val)
+		return -ENOMEM;
+
+	if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
+		rc = -EFAULT;
+		goto out_free_val;
+	}
+
+	rc = dccp_feat_change(sk, type, opt.dccpsf_feat, val, opt.dccpsf_len,
+			      GFP_KERNEL);
+	if (rc)
+		goto out_free_val;
+
+out:
+	return rc;
+
+out_free_val:
+	kfree(val);
+	goto out;
+}
+
 int dccp_setsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, int optlen)
 {
@@ -284,6 +318,25 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_PACKET_SIZE:
 		dp->dccps_packet_size = val;
 		break;
+
+	case DCCP_SOCKOPT_CHANGE_L:
+		if (optlen != sizeof(struct dccp_so_feat))
+			err = -EINVAL;
+		else
+			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
+					             (struct dccp_so_feat *)
+						     optval);
+		break;
+
+	case DCCP_SOCKOPT_CHANGE_R:
+		if (optlen != sizeof(struct dccp_so_feat))
+			err = -EINVAL;
+		else
+			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
+						     (struct dccp_so_feat *)
+						     optval);
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index aa34b576e228..d7c786608ec6 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -141,6 +141,17 @@ static void dccp_retransmit_timer(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	/* retransmit timer is used for feature negotiation throughout
+	 * connection.  In this case, no packet is re-transmitted, but rather an
+	 * ack is generated and pending changes are splaced into its options.
+	 */
+	if (sk->sk_send_head == NULL) {
+		dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk);
+		if (sk->sk_state == DCCP_OPEN)
+			dccp_send_ack(sk);
+		goto backoff;
+	}
+
 	/*
 	 * sk->sk_send_head has to have one skb with
 	 * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
@@ -177,6 +188,7 @@ static void dccp_retransmit_timer(struct sock *sk)
 		goto out;
 	}
 
+backoff:
 	icsk->icsk_backoff++;
 	icsk->icsk_retransmits++;
 
-- 
cgit v1.2.3


From d4d2c558fd3e1f5e386b153f194aa8f0be496c77 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Mon, 20 Mar 2006 17:47:20 -0800
Subject: [TG3]: Add support for 5714S and 5715S

Add support for 5714S and 5715S.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 35 ++++++++++++++++++++++++++++++++++-
 include/linux/pci_ids.h |  2 ++
 2 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 6c6c5498899f..b0de6b2754cc 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -223,8 +223,12 @@ static struct pci_device_id tg3_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5714,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5714S,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5715,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5715S,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5780,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5780S,
@@ -2680,6 +2684,12 @@ static int tg3_setup_fiber_mii_phy(struct tg3 *tp, int force_reset)
 
 	err |= tg3_readphy(tp, MII_BMSR, &bmsr);
 	err |= tg3_readphy(tp, MII_BMSR, &bmsr);
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5714) {
+		if (tr32(MAC_TX_STATUS) & TX_STATUS_LINK_UP)
+			bmsr |= BMSR_LSTATUS;
+		else
+			bmsr &= ~BMSR_LSTATUS;
+	}
 
 	err |= tg3_readphy(tp, MII_BMCR, &bmcr);
 
@@ -2748,6 +2758,13 @@ static int tg3_setup_fiber_mii_phy(struct tg3 *tp, int force_reset)
 			bmcr = new_bmcr;
 			err |= tg3_readphy(tp, MII_BMSR, &bmsr);
 			err |= tg3_readphy(tp, MII_BMSR, &bmsr);
+			if (GET_ASIC_REV(tp->pci_chip_rev_id) ==
+			    ASIC_REV_5714) {
+				if (tr32(MAC_TX_STATUS) & TX_STATUS_LINK_UP)
+					bmsr |= BMSR_LSTATUS;
+				else
+					bmsr &= ~BMSR_LSTATUS;
+			}
 			tp->tg3_flags2 &= ~TG3_FLG2_PARALLEL_DETECT;
 		}
 	}
@@ -5585,6 +5602,9 @@ static int tg3_reset_hw(struct tg3 *tp)
 		tg3_abort_hw(tp, 1);
 	}
 
+	if (tp->tg3_flags2 & TG3_FLG2_MII_SERDES)
+		tg3_phy_reset(tp);
+
 	err = tg3_chip_reset(tp);
 	if (err)
 		return err;
@@ -6097,6 +6117,17 @@ static int tg3_reset_hw(struct tg3 *tp)
 		tp->tg3_flags2 |= TG3_FLG2_HW_AUTONEG;
 	}
 
+	if ((tp->tg3_flags2 & TG3_FLG2_MII_SERDES) &&
+	    (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5714)) {
+		u32 tmp;
+
+		tmp = tr32(SERDES_RX_CTRL);
+		tw32(SERDES_RX_CTRL, tmp | SERDES_RX_SIG_DETECT);
+		tp->grc_local_ctrl &= ~GRC_LCLCTRL_USE_EXT_SIG_DETECT;
+		tp->grc_local_ctrl |= GRC_LCLCTRL_USE_SIG_DETECT;
+		tw32(GRC_LOCAL_CTRL, tp->grc_local_ctrl);
+	}
+
 	err = tg3_setup_phy(tp, 1);
 	if (err)
 		return err;
@@ -6476,7 +6507,9 @@ static int tg3_open(struct net_device *dev)
 
 	if ((tp->tg3_flags2 & TG3_FLG2_5750_PLUS) &&
 	    (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5750_AX) &&
-	    (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5750_BX)) {
+	    (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5750_BX) &&
+	    !((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5714) &&
+	      (tp->pdev_peer == tp->pdev))) {
 		/* All MSI supporting chips should support tagged
 		 * status.  Assert that this is the case.
 		 */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 751eea58bde8..a3a09cceb026 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1857,12 +1857,14 @@
 #define PCI_DEVICE_ID_TIGON3_5705M	0x165d
 #define PCI_DEVICE_ID_TIGON3_5705M_2	0x165e
 #define PCI_DEVICE_ID_TIGON3_5714	0x1668
+#define PCI_DEVICE_ID_TIGON3_5714S	0x1669
 #define PCI_DEVICE_ID_TIGON3_5780	0x166a
 #define PCI_DEVICE_ID_TIGON3_5780S	0x166b
 #define PCI_DEVICE_ID_TIGON3_5705F	0x166e
 #define PCI_DEVICE_ID_TIGON3_5750	0x1676
 #define PCI_DEVICE_ID_TIGON3_5751	0x1677
 #define PCI_DEVICE_ID_TIGON3_5715	0x1678
+#define PCI_DEVICE_ID_TIGON3_5715S	0x1679
 #define PCI_DEVICE_ID_TIGON3_5750M	0x167c
 #define PCI_DEVICE_ID_TIGON3_5751M	0x167d
 #define PCI_DEVICE_ID_TIGON3_5751F	0x167e
-- 
cgit v1.2.3


From 5d424d5a674f782d0659a3b66d951f412901faee Mon Sep 17 00:00:00 2001
From: John Heffner <jheffner@psc.edu>
Date: Mon, 20 Mar 2006 17:53:41 -0800
Subject: [TCP]: MTU probing

Implementation of packetization layer path mtu discovery for TCP, based on
the internet-draft currently found at
<http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h             |   2 +
 include/net/inet_connection_sock.h |  13 ++
 include/net/tcp.h                  |   9 ++
 net/ipv4/sysctl_net_ipv4.c         |  16 +++
 net/ipv4/tcp_input.c               |  49 ++++++++
 net/ipv4/tcp_ipv4.c                |   1 +
 net/ipv4/tcp_output.c              | 236 ++++++++++++++++++++++++++++++++++---
 net/ipv4/tcp_timer.c               |  36 +++---
 net/ipv6/tcp_ipv6.c                |   1 +
 9 files changed, 326 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 8ad4beab2888..6e8880ea49e7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -397,6 +397,8 @@ enum
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
 	NET_IPV4_IPFRAG_MAX_DIST=112,
+ 	NET_TCP_MTU_PROBING=113,
+	NET_TCP_BASE_MSS=114,
 };
 
 enum {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index fa587c94e9d0..b3abe33f4e5f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -72,6 +72,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_probes_out:	   unanswered 0 window probes
  * @icsk_ext_hdr_len:	   Network protocol overhead (IP/IPv6 options)
  * @icsk_ack:		   Delayed ACK control data
+ * @icsk_mtup;		   MTU probing control data
  */
 struct inet_connection_sock {
 	/* inet_sock has to be the first member! */
@@ -104,6 +105,18 @@ struct inet_connection_sock {
 		__u16		  last_seg_size; /* Size of last incoming segment	   */
 		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
 	} icsk_ack;
+	struct {
+		int		  enabled;
+
+		/* Range of MTUs to search */
+		int		  search_high;
+		int		  search_low;
+
+		/* Information on the current probe. */
+		int		  probe_size;
+		__u32		  probe_seq_start;
+		__u32		  probe_seq_end;
+	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 77f21c65bbca..16879fa560de 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* Minimal RCV_MSS. */
 #define TCP_MIN_RCVMSS		536U
 
+/* The least MTU to use for probing */
+#define TCP_BASE_MSS		512
+
 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
 #define TCP_FASTRETRANS_THRESH 3
 
@@ -219,6 +222,8 @@ extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
+extern int sysctl_tcp_mtu_probing;
+extern int sysctl_tcp_base_mss;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -447,6 +452,10 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
+extern void tcp_mtup_init(struct sock *sk);
+
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 16984d4a8a06..ebf2e0b363c4 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_TCP_MTU_PROBING,
+		.procname	= "tcp_mtu_probing",
+		.data		= &sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BASE_MSS,
+		.procname	= "tcp_base_mss",
+		.data		= &sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e9a54ae7d690..0ac388e3d01d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
 	}
 }
 
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+	icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/* FIXME: breaks with very large cwnd */
+	tp->prior_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_cwnd = tp->snd_cwnd *
+		       tcp_mss_to_mtu(sk, tp->mss_cache) /
+		       icsk->icsk_mtup.probe_size;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+
+	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+	icsk->icsk_mtup.probe_size = 0;
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			return;
 		}
 
+		/* MTU probe failure: don't reduce cwnd */
+		if (icsk->icsk_ca_state < TCP_CA_CWR &&
+		    icsk->icsk_mtup.probe_size &&
+		    tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
+			tcp_mtup_probe_failed(sk);
+			/* Restores the reduction we did in tcp_mtup_probe() */
+			tp->snd_cwnd++;
+			tcp_simple_retransmit(sk);
+			return;
+		}
+
 		/* Otherwise enter Recovery state */
 
 		if (IsReno(tp))
@@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 			tp->retrans_stamp = 0;
 		}
 
+		/* MTU probing checks */
+		if (icsk->icsk_mtup.probe_size) {
+			if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
+				tcp_mtup_probe_success(sk, skb);
+			}
+		}
+
 		if (sacked) {
 			if (sacked & TCPCB_RETRANS) {
 				if(sacked & TCPCB_SACKED_RETRANS)
@@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
 			tp->rx_opt.sack_ok |= 2;
 
+		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
@@ -4211,6 +4258,7 @@ discard:
 		if (tp->ecn_flags&TCP_ECN_OK)
 			sock_set_flag(sk, SOCK_NO_LARGESEND);
 
+		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
@@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 */
 				tp->lsndtime = tcp_time_stamp;
 
+				tcp_mtup_init(sk);
 				tcp_initialize_rcv_mss(sk);
 				tcp_init_buffer_space(sk);
 				tcp_fast_path_on(tp);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 233bdf259965..57e7a26e8213 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
 	newinet->id = newtp->write_seq ^ jiffies;
 
+	tcp_mtup_init(newsk);
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f498a6c8895..8197b5e12f1f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1;
  */
 int sysctl_tcp_tso_win_divisor = 3;
 
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_base_mss = 512;
+
+EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
+EXPORT_SYMBOL(sysctl_tcp_base_mss);
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	return 0;
 }
 
+/* Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int mss_now;
+
+	/* Calculate base mss without TCP options:
+	   It is MMS_S - sizeof(tcphdr) of rfc1122
+	 */
+	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+	/* Clamp it (mss_clamp does not include tcp options) */
+	if (mss_now > tp->rx_opt.mss_clamp)
+		mss_now = tp->rx_opt.mss_clamp;
+
+	/* Now subtract optional transport overhead */
+	mss_now -= icsk->icsk_ext_hdr_len;
+
+	/* Then reserve room for full set of TCP options and 8 bytes of data */
+	if (mss_now < 48)
+		mss_now = 48;
+
+	/* Now subtract TCP options size, not including SACKs */
+	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+	return mss_now;
+}
+
+/* Inverse of above */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int mtu;
+
+	mtu = mss +
+	      tp->tcp_header_len +
+	      icsk->icsk_ext_hdr_len +
+	      icsk->icsk_af_ops->net_header_len;
+
+	return mtu;
+}
+
+void tcp_mtup_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+	                       icsk->icsk_af_ops->net_header_len;
+	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+	icsk->icsk_mtup.probe_size = 0;
+}
+
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
    tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	/* Calculate base mss without TCP options:
-	   It is MMS_S - sizeof(tcphdr) of rfc1122
-	 */
-	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
-		       sizeof(struct tcphdr));
+	int mss_now;
 
-	/* Clamp it (mss_clamp does not include tcp options) */
-	if (mss_now > tp->rx_opt.mss_clamp)
-		mss_now = tp->rx_opt.mss_clamp;
+	if (icsk->icsk_mtup.search_high > pmtu)
+		icsk->icsk_mtup.search_high = pmtu;
 
-	/* Now subtract optional transport overhead */
-	mss_now -= icsk->icsk_ext_hdr_len;
-
-	/* Then reserve room for full set of TCP options and 8 bytes of data */
-	if (mss_now < 48)
-		mss_now = 48;
-
-	/* Now subtract TCP options size, not including SACKs */
-	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+	mss_now = tcp_mtu_to_mss(sk, pmtu);
 
 	/* Bound mss with half of window */
 	if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	/* And store cached results */
 	icsk->icsk_pmtu_cookie = pmtu;
+	if (icsk->icsk_mtup.enabled)
+		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
 	tp->mss_cache = mss_now;
 
 	return mss_now;
@@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
 	return 1;
 }
 
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd available),
+ *         1 if a probe was sent,
+ *         -1 otherwise */
+static int tcp_mtu_probe(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb, *nskb, *next;
+	int len;
+	int probe_size;
+	unsigned int pif;
+	int copy;
+	int mss_now;
+
+	/* Not currently probing/verifying,
+	 * not in recovery,
+	 * have enough cwnd, and
+	 * not SACKing (the variable headers throw things off) */
+	if (!icsk->icsk_mtup.enabled ||
+	    icsk->icsk_mtup.probe_size ||
+	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+	    tp->snd_cwnd < 11 ||
+	    tp->rx_opt.eff_sacks)
+		return -1;
+
+	/* Very simple search strategy: just double the MSS. */
+	mss_now = tcp_current_mss(sk, 0);
+	probe_size = 2*tp->mss_cache;
+	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+		/* TODO: set timer for probe_converge_event */
+		return -1;
+	}
+
+	/* Have enough data in the send queue to probe? */
+	len = 0;
+	if ((skb = sk->sk_send_head) == NULL)
+		return -1;
+	while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+		skb = skb->next;
+	if (len < probe_size)
+		return -1;
+
+	/* Receive window check. */
+	if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+		if (tp->snd_wnd < probe_size)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* Do we need to wait to drain cwnd? */
+	pif = tcp_packets_in_flight(tp);
+	if (pif + 2 > tp->snd_cwnd) {
+		/* With no packets in flight, don't stall. */
+		if (pif == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* We're allowed to probe.  Build it now. */
+	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+		return -1;
+	sk_charge_skb(sk, nskb);
+
+	skb = sk->sk_send_head;
+	__skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+	sk->sk_send_head = nskb;
+
+	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(nskb)->sacked = 0;
+	nskb->csum = 0;
+	if (skb->ip_summed == CHECKSUM_HW)
+		nskb->ip_summed = CHECKSUM_HW;
+
+	len = 0;
+	while (len < probe_size) {
+		next = skb->next;
+
+		copy = min_t(int, skb->len, probe_size - len);
+		if (nskb->ip_summed)
+			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+		else
+			nskb->csum = skb_copy_and_csum_bits(skb, 0,
+			                 skb_put(nskb, copy), copy, nskb->csum);
+
+		if (skb->len <= copy) {
+			/* We've eaten all the data from this skb.
+			 * Throw it away. */
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+			__skb_unlink(skb, &sk->sk_write_queue);
+			sk_stream_free_skb(sk, skb);
+		} else {
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+			                           ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+			if (!skb_shinfo(skb)->nr_frags) {
+				skb_pull(skb, copy);
+				if (skb->ip_summed != CHECKSUM_HW)
+					skb->csum = csum_partial(skb->data, skb->len, 0);
+			} else {
+				__pskb_trim_head(skb, copy);
+				tcp_set_skb_tso_segs(sk, skb, mss_now);
+			}
+			TCP_SKB_CB(skb)->seq += copy;
+		}
+
+		len += copy;
+		skb = next;
+	}
+	tcp_init_tso_segs(sk, nskb, nskb->len);
+
+	/* We're ready to send.  If this fails, the probe will
+	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+		/* Decrement cwnd here because we are sending
+		* effectively two packets. */
+		tp->snd_cwnd--;
+		update_send_head(sk, tp, nskb);
+
+		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+		icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+		icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+		return 1;
+	}
+
+	return -1;
+}
+
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 	struct sk_buff *skb;
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
+	int result;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 		return 0;
 
 	sent_pkts = 0;
+
+	/* Do MTU probing. */
+	if ((result = tcp_mtu_probe(sk)) == 0) {
+		return 0;
+	} else if (result > 0) {
+		sent_pkts = 1;
+	}
+
 	while ((skb = sk->sk_send_head)) {
 		unsigned int limit;
 
@@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk)
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
  	unsigned int cur_mss = tcp_current_mss(sk, 0);
 	int err;
 
+	/* Inconslusive MTU probe */
+	if (icsk->icsk_mtup.probe_size) {
+		icsk->icsk_mtup.probe_size = 0;
+	}
+
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
 	 */
@@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk)
 	if (tp->rx_opt.user_mss)
 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
 	tp->max_window = 0;
+	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, dst_mtu(dst));
 
 	if (!tp->window_clamp)
@@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack);
 EXPORT_SYMBOL(tcp_simple_retransmit);
 EXPORT_SYMBOL(tcp_sync_mss);
 EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e1880959614a..7c1bde3cd6cb 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
 /* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {
-	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	int retry_until;
+	int mss;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
@@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk)
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
 		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
-			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
-			   hole detection. :-(
-
-			   It is place to make it. It is not made. I do not want
-			   to make it. It is disgusting. It does not work in any
-			   case. Let me to cite the same draft, which requires for
-			   us to implement this:
-
-   "The one security concern raised by this memo is that ICMP black holes
-   are often caused by over-zealous security administrators who block
-   all ICMP messages.  It is vitally important that those who design and
-   deploy security systems understand the impact of strict filtering on
-   upper-layer protocols.  The safest web site in the world is worthless
-   if most TCP implementations cannot transfer data from it.  It would
-   be far nicer to have all of the black holes fixed rather than fixing
-   all of the TCP implementations."
-
-                           Golden words :-).
-		   */
+			/* Black hole detection */
+			if (sysctl_tcp_mtu_probing) {
+				if (!icsk->icsk_mtup.enabled) {
+					icsk->icsk_mtup.enabled = 1;
+					tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+				} else {
+					mss = min(sysctl_tcp_base_mss,
+					          tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
+					mss = max(mss, 68 - tp->tcp_header_len);
+					icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+					tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+				}
+			}
 
 			dst_negative_advice(&sk->sk_dst_cache);
 		}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ca9cf6853755..14de50380f4e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
 						     newnp->opt->opt_flen);
 
+	tcp_mtup_init(newsk);
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
-- 
cgit v1.2.3


From ba66c6e8b292997467128506f39fa6607e959050 Mon Sep 17 00:00:00 2001
From: Ian McDonald <imcdnzl@gmail.com>
Date: Mon, 20 Mar 2006 17:56:56 -0800
Subject: [DCCP]: Set the default CCID according to kernel config selection

Now CCID2 is the default, as stated in the RFC drafts, but we allow
a config where just CCID3 is built, where CCID3 becomes the default.

Signed-off-by: Ian McDonald <imcdnzl@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 include/linux/dccp.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index f91c8a62406d..a70d1a27e7fc 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -329,9 +329,18 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
 
 /* initial values for each feature */
 #define DCCPF_INITIAL_SEQUENCE_WINDOW		100
-#define DCCPF_INITIAL_CCID			2
 #define DCCPF_INITIAL_ACK_RATIO			2
+
+#if defined(CONFIG_IP_DCCP_CCID2) || defined(CONFIG_IP_DCCP_CCID2_MODULE)
+#define DCCPF_INITIAL_CCID			2
 #define DCCPF_INITIAL_SEND_ACK_VECTOR		1
+#elif defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
+#define DCCPF_INITIAL_CCID			3
+#define DCCPF_INITIAL_SEND_ACK_VECTOR		0
+#else
+#error  "At least one CCID must be built as the default"
+#endif
+
 /* FIXME: for now we're default to 1 but it should really be 0 */
 #define DCCPF_INITIAL_SEND_NDP_COUNT		1
 
-- 
cgit v1.2.3


From 37f9f7334b86ffc3b8a1921842ae33cb9aa22ee3 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 20 Mar 2006 17:59:06 -0800
Subject: [NETFILTER]: xt_tables: add centralized error checking

Introduce new functions for common match/target checks (private data
size, valid hooks, valid tables and valid protocols) to get more consistent
error reporting and to avoid each module duplicating them.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h | 23 +++++++++---
 net/netfilter/x_tables.c           | 72 ++++++++++++++++++++++++++++++++++----
 2 files changed, 84 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 6500d4e59d46..b9c37e1e6730 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -92,8 +92,6 @@ struct xt_match
 
 	const char name[XT_FUNCTION_MAXNAMELEN-1];
 
-	u_int8_t revision;
-
 	/* Return true or false: return FALSE and set *hotdrop = 1 to
            force immediate packet drop. */
 	/* Arguments changed since 2.6.9, as this must now handle
@@ -120,6 +118,12 @@ struct xt_match
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
+
+	char *table;
+	unsigned int matchsize;
+	unsigned int hooks;
+	unsigned short proto;
+	u_int8_t revision;
 };
 
 /* Registration hooks for targets. */
@@ -129,8 +133,6 @@ struct xt_target
 
 	const char name[XT_FUNCTION_MAXNAMELEN-1];
 
-	u_int8_t revision;
-
 	/* Returns verdict. Argument order changed since 2.6.9, as this
 	   must now handle non-linear skbs, using skb_copy_bits and
 	   skb_ip_make_writable. */
@@ -156,6 +158,12 @@ struct xt_target
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
+
+	char *table;
+	unsigned int targetsize;
+	unsigned int hooks;
+	unsigned short proto;
+	u_int8_t revision;
 };
 
 /* Furniture shopping... */
@@ -207,6 +215,13 @@ extern void xt_unregister_target(int af, struct xt_target *target);
 extern int xt_register_match(int af, struct xt_match *target);
 extern void xt_unregister_match(int af, struct xt_match *target);
 
+extern int xt_check_match(const struct xt_match *match, unsigned short family,
+			  unsigned int size, const char *table, unsigned int hook,
+			  unsigned short proto, int inv_proto);
+extern int xt_check_target(const struct xt_target *target, unsigned short family,
+			   unsigned int size, const char *table, unsigned int hook,
+			   unsigned short proto, int inv_proto);
+
 extern int xt_register_table(struct xt_table *table,
 			     struct xt_table_info *bootstrap,
 			     struct xt_table_info *newinfo);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d7817afc6b96..750b92829766 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -52,6 +52,12 @@ enum {
 	MATCH,
 };
 
+static const char *xt_prefix[NPROTO] = {
+	[AF_INET] 	= "ip",
+	[AF_INET6] 	= "ip6",
+	[NF_ARP]	= "arp",
+};
+
 /* Registration hooks for targets. */
 int
 xt_register_target(int af, struct xt_target *target)
@@ -158,18 +164,12 @@ struct xt_target *xt_find_target(int af, const char *name, u8 revision)
 }
 EXPORT_SYMBOL(xt_find_target);
 
-static const char *xt_prefix[NPROTO] = {
-	[AF_INET] 	= "ipt_%s",
-	[AF_INET6] 	= "ip6t_%s",
-	[NF_ARP]	= "arpt_%s",
-};
-
 struct xt_target *xt_request_find_target(int af, const char *name, u8 revision)
 {
 	struct xt_target *target;
 
 	target = try_then_request_module(xt_find_target(af, name, revision),
-					 xt_prefix[af], name);
+					 "%st_%s", xt_prefix[af], name);
 	if (IS_ERR(target) || !target)
 		return NULL;
 	return target;
@@ -237,6 +237,64 @@ int xt_find_revision(int af, const char *name, u8 revision, int target,
 }
 EXPORT_SYMBOL_GPL(xt_find_revision);
 
+int xt_check_match(const struct xt_match *match, unsigned short family,
+                   unsigned int size, const char *table, unsigned int hook_mask,
+		   unsigned short proto, int inv_proto)
+{
+	if (XT_ALIGN(match->matchsize) != size) {
+		printk("%s_tables: %s match: invalid size %Zu != %u\n",
+		       xt_prefix[family], match->name,
+		       XT_ALIGN(match->matchsize), size);
+		return -EINVAL;
+	}
+	if (match->table && strcmp(match->table, table)) {
+		printk("%s_tables: %s match: only valid in %s table, not %s\n",
+		       xt_prefix[family], match->name, match->table, table);
+		return -EINVAL;
+	}
+	if (match->hooks && (hook_mask & ~match->hooks) != 0) {
+		printk("%s_tables: %s match: bad hook_mask %u\n",
+		       xt_prefix[family], match->name, hook_mask);
+		return -EINVAL;
+	}
+	if (match->proto && (match->proto != proto || inv_proto)) {
+		printk("%s_tables: %s match: only valid for protocol %u\n",
+		       xt_prefix[family], match->name, match->proto);
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_match);
+
+int xt_check_target(const struct xt_target *target, unsigned short family,
+		    unsigned int size, const char *table, unsigned int hook_mask,
+		    unsigned short proto, int inv_proto)
+{
+	if (XT_ALIGN(target->targetsize) != size) {
+		printk("%s_tables: %s target: invalid size %Zu != %u\n",
+		       xt_prefix[family], target->name,
+		       XT_ALIGN(target->targetsize), size);
+		return -EINVAL;
+	}
+	if (target->table && strcmp(target->table, table)) {
+		printk("%s_tables: %s target: only valid in %s table, not %s\n",
+		       xt_prefix[family], target->name, target->table, table);
+		return -EINVAL;
+	}
+	if (target->hooks && (hook_mask & ~target->hooks) != 0) {
+		printk("%s_tables: %s target: bad hook_mask %u\n",
+		       xt_prefix[family], target->name, hook_mask);
+		return -EINVAL;
+	}
+	if (target->proto && (target->proto != proto || inv_proto)) {
+		printk("%s_tables: %s target: only valid for protocol %u\n",
+		       xt_prefix[family], target->name, target->proto);
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_target);
+
 struct xt_table_info *xt_alloc_table_info(unsigned int size)
 {
 	struct xt_table_info *newinfo;
-- 
cgit v1.2.3


From 1c524830d0b39472f0278989bf1119750a5e234d Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 20 Mar 2006 18:02:15 -0800
Subject: [NETFILTER]: x_tables: pass registered match/target data to
 match/target functions

This allows to make decisions based on the revision (and address family
with a follow-up patch) at runtime.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h | 10 ++++++++--
 net/ipv4/netfilter/arp_tables.c    |  5 +++--
 net/ipv4/netfilter/ip_tables.c     | 13 +++++++------
 net/ipv6/netfilter/ip6_tables.c    | 11 ++++++-----
 net/sched/act_ipt.c                | 10 ++++++----
 5 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index b9c37e1e6730..2fdbc4a446bf 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -100,6 +100,7 @@ struct xt_match
 	int (*match)(const struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
+		     const struct xt_match *match,
 		     const void *matchinfo,
 		     int offset,
 		     unsigned int protoff,
@@ -109,12 +110,14 @@ struct xt_match
 	/* Should return true or false. */
 	int (*checkentry)(const char *tablename,
 			  const void *ip,
+			  const struct xt_match *match,
 			  void *matchinfo,
 			  unsigned int matchinfosize,
 			  unsigned int hook_mask);
 
 	/* Called when entry of this type deleted. */
-	void (*destroy)(void *matchinfo, unsigned int matchinfosize);
+	void (*destroy)(const struct xt_match *match, void *matchinfo,
+			unsigned int matchinfosize);
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
@@ -140,6 +143,7 @@ struct xt_target
 			       const struct net_device *in,
 			       const struct net_device *out,
 			       unsigned int hooknum,
+			       const struct xt_target *target,
 			       const void *targinfo,
 			       void *userdata);
 
@@ -149,12 +153,14 @@ struct xt_target
 	/* Should return true or false. */
 	int (*checkentry)(const char *tablename,
 			  const void *entry,
+			  const struct xt_target *target,
 			  void *targinfo,
 			  unsigned int targinfosize,
 			  unsigned int hook_mask);
 
 	/* Called when entry of this type deleted. */
-	void (*destroy)(void *targinfo, unsigned int targinfosize);
+	void (*destroy)(const struct xt_target *target, void *targinfo,
+			unsigned int targinfosize);
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 6162d0e328ec..87b3b7920101 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -300,6 +300,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 				verdict = t->u.kernel.target->target(pskb,
 								     in, out,
 								     hook,
+								     t->u.kernel.target,
 								     t->data,
 								     userdata);
 
@@ -491,7 +492,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
 			goto out;
 		}
 	} else if (t->u.kernel.target->checkentry
-		   && !t->u.kernel.target->checkentry(name, e, t->data,
+		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
 						      t->u.target_size
 						      - sizeof(*t),
 						      e->comefrom)) {
@@ -560,7 +561,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
 
 	t = arpt_get_target(e);
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->data,
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
 					    t->u.target_size - sizeof(*t));
 	module_put(t->u.kernel.target->me);
 	return 0;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 62f8d639ab9c..2381a4aa71d0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -197,8 +197,8 @@ int do_match(struct ipt_entry_match *m,
 	     int *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
-	if (!m->u.kernel.match->match(skb, in, out, m->data, offset, 
-	    skb->nh.iph->ihl*4, hotdrop))
+	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
+				      offset, skb->nh.iph->ihl*4, hotdrop))
 		return 1;
 	else
 		return 0;
@@ -305,6 +305,7 @@ ipt_do_table(struct sk_buff **pskb,
 				verdict = t->u.kernel.target->target(pskb,
 								     in, out,
 								     hook,
+								     t->u.kernel.target,
 								     t->data,
 								     userdata);
 
@@ -464,7 +465,7 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i)
 		return 1;
 
 	if (m->u.kernel.match->destroy)
-		m->u.kernel.match->destroy(m->data,
+		m->u.kernel.match->destroy(m->u.kernel.match, m->data,
 					   m->u.match_size - sizeof(*m));
 	module_put(m->u.kernel.match->me);
 	return 0;
@@ -517,7 +518,7 @@ check_match(struct ipt_entry_match *m,
 		goto err;
 
 	if (m->u.kernel.match->checkentry
-	    && !m->u.kernel.match->checkentry(name, ip, m->data,
+	    && !m->u.kernel.match->checkentry(name, ip, match, m->data,
 					      m->u.match_size - sizeof(*m),
 					      hookmask)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -578,7 +579,7 @@ check_entry(struct ipt_entry *e, const char *name, unsigned int size,
 			goto cleanup_matches;
 		}
 	} else if (t->u.kernel.target->checkentry
-		   && !t->u.kernel.target->checkentry(name, e, t->data,
+		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
 						      t->u.target_size
 						      - sizeof(*t),
 						      e->comefrom)) {
@@ -652,7 +653,7 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i)
 	IPT_MATCH_ITERATE(e, cleanup_match, NULL);
 	t = ipt_get_target(e);
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->data,
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
 					    t->u.target_size - sizeof(*t));
 	module_put(t->u.kernel.target->me);
 	return 0;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e2e8d0140d7b..1b32a2d1e9e0 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -251,7 +251,7 @@ int do_match(struct ip6t_entry_match *m,
 	     int *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
-	if (!m->u.kernel.match->match(skb, in, out, m->data,
+	if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data,
 				      offset, protoff, hotdrop))
 		return 1;
 	else
@@ -373,6 +373,7 @@ ip6t_do_table(struct sk_buff **pskb,
 				verdict = t->u.kernel.target->target(pskb,
 								     in, out,
 								     hook,
+								     t->u.kernel.target,
 								     t->data,
 								     userdata);
 
@@ -531,7 +532,7 @@ cleanup_match(struct ip6t_entry_match *m, unsigned int *i)
 		return 1;
 
 	if (m->u.kernel.match->destroy)
-		m->u.kernel.match->destroy(m->data,
+		m->u.kernel.match->destroy(m->u.kernel.match, m->data,
 					   m->u.match_size - sizeof(*m));
 	module_put(m->u.kernel.match->me);
 	return 0;
@@ -584,7 +585,7 @@ check_match(struct ip6t_entry_match *m,
 		goto err;
 
 	if (m->u.kernel.match->checkentry
-	    && !m->u.kernel.match->checkentry(name, ipv6, m->data,
+	    && !m->u.kernel.match->checkentry(name, ipv6, match,  m->data,
 					      m->u.match_size - sizeof(*m),
 					      hookmask)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
@@ -645,7 +646,7 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
 			goto cleanup_matches;
 		}
 	} else if (t->u.kernel.target->checkentry
-		   && !t->u.kernel.target->checkentry(name, e, t->data,
+		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
 						      t->u.target_size
 						      - sizeof(*t),
 						      e->comefrom)) {
@@ -719,7 +720,7 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i)
 	IP6T_MATCH_ITERATE(e, cleanup_match, NULL);
 	t = ip6t_get_target(e);
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->data,
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
 					    t->u.target_size - sizeof(*t));
 	module_put(t->u.kernel.target->me);
 	return 0;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 39a22a3ffe78..6056d20ef429 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -70,7 +70,8 @@ ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
 	t->u.kernel.target = target;
 
 	if (t->u.kernel.target->checkentry
-	    && !t->u.kernel.target->checkentry(table, NULL, t->data,
+	    && !t->u.kernel.target->checkentry(table, NULL,
+		    			       t->u.kernel.target, t->data,
 					       t->u.target_size - sizeof(*t),
 					       hook)) {
 		DPRINTK("ipt_init_target: check failed for `%s'.\n",
@@ -86,7 +87,7 @@ static void
 ipt_destroy_target(struct ipt_entry_target *t)
 {
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->data,
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
 		                            t->u.target_size - sizeof(*t));
         module_put(t->u.kernel.target->me);
 }
@@ -224,8 +225,9 @@ tcf_ipt(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 	/* iptables targets take a double skb pointer in case the skb
 	 * needs to be replaced. We don't own the skb, so this must not
 	 * happen. The pskb_expand_head above should make sure of this */
-	ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL,
-					    p->hook, p->t->data, NULL);
+	ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, p->hook,
+					    p->t->u.kernel.target, p->t->data,
+					    NULL);
 	switch (ret) {
 	case NF_ACCEPT:
 		result = TC_ACT_OK;
-- 
cgit v1.2.3


From c4b885139203d37f76662c37ae645fe8e0f4e4e5 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 20 Mar 2006 18:03:40 -0800
Subject: [NETFILTER]: x_tables: replace IPv4/IPv6 policy match by address
 family independant version

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h         |   4 +
 include/linux/netfilter/xt_policy.h        |  58 ++++++++
 include/linux/netfilter_ipv4/ipt_policy.h  |  69 +++-------
 include/linux/netfilter_ipv6/ip6t_policy.h |  69 +++-------
 net/ipv4/netfilter/Kconfig                 |  10 --
 net/ipv4/netfilter/Makefile                |   1 -
 net/ipv4/netfilter/ipt_policy.c            | 174 ------------------------
 net/ipv6/netfilter/Kconfig                 |  10 --
 net/ipv6/netfilter/Makefile                |   1 -
 net/ipv6/netfilter/ip6t_policy.c           | 174 ------------------------
 net/netfilter/Kconfig                      |  10 ++
 net/netfilter/Makefile                     |   1 +
 net/netfilter/xt_policy.c                  | 209 +++++++++++++++++++++++++++++
 13 files changed, 314 insertions(+), 476 deletions(-)
 create mode 100644 include/linux/netfilter/xt_policy.h
 delete mode 100644 net/ipv4/netfilter/ipt_policy.c
 delete mode 100644 net/ipv6/netfilter/ip6t_policy.c
 create mode 100644 net/netfilter/xt_policy.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 2fdbc4a446bf..46a0f974f87c 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -126,6 +126,8 @@ struct xt_match
 	unsigned int matchsize;
 	unsigned int hooks;
 	unsigned short proto;
+
+	unsigned short family;
 	u_int8_t revision;
 };
 
@@ -169,6 +171,8 @@ struct xt_target
 	unsigned int targetsize;
 	unsigned int hooks;
 	unsigned short proto;
+
+	unsigned short family;
 	u_int8_t revision;
 };
 
diff --git a/include/linux/netfilter/xt_policy.h b/include/linux/netfilter/xt_policy.h
new file mode 100644
index 000000000000..a8132ec076fb
--- /dev/null
+++ b/include/linux/netfilter/xt_policy.h
@@ -0,0 +1,58 @@
+#ifndef _XT_POLICY_H
+#define _XT_POLICY_H
+
+#define XT_POLICY_MAX_ELEM	4
+
+enum xt_policy_flags
+{
+	XT_POLICY_MATCH_IN	= 0x1,
+	XT_POLICY_MATCH_OUT	= 0x2,
+	XT_POLICY_MATCH_NONE	= 0x4,
+	XT_POLICY_MATCH_STRICT	= 0x8,
+};
+
+enum xt_policy_modes
+{
+	XT_POLICY_MODE_TRANSPORT,
+	XT_POLICY_MODE_TUNNEL
+};
+
+struct xt_policy_spec
+{
+	u_int8_t	saddr:1,
+			daddr:1,
+			proto:1,
+			mode:1,
+			spi:1,
+			reqid:1;
+};
+
+union xt_policy_addr
+{
+	struct in_addr	a4;
+	struct in6_addr	a6;
+};
+
+struct xt_policy_elem
+{
+	union xt_policy_addr	saddr;
+	union xt_policy_addr	smask;
+	union xt_policy_addr	daddr;
+	union xt_policy_addr	dmask;
+	u_int32_t		spi;
+	u_int32_t		reqid;
+	u_int8_t		proto;
+	u_int8_t		mode;
+
+	struct xt_policy_spec	match;
+	struct xt_policy_spec	invert;
+};
+
+struct xt_policy_info
+{
+	struct xt_policy_elem pol[XT_POLICY_MAX_ELEM];
+	u_int16_t flags;
+	u_int16_t len;
+};
+
+#endif /* _XT_POLICY_H */
diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h
index a3f6eff39d33..b9478a255301 100644
--- a/include/linux/netfilter_ipv4/ipt_policy.h
+++ b/include/linux/netfilter_ipv4/ipt_policy.h
@@ -1,58 +1,21 @@
 #ifndef _IPT_POLICY_H
 #define _IPT_POLICY_H
 
-#define IPT_POLICY_MAX_ELEM	4
-
-enum ipt_policy_flags
-{
-	IPT_POLICY_MATCH_IN	= 0x1,
-	IPT_POLICY_MATCH_OUT	= 0x2,
-	IPT_POLICY_MATCH_NONE	= 0x4,
-	IPT_POLICY_MATCH_STRICT	= 0x8,
-};
-
-enum ipt_policy_modes
-{
-	IPT_POLICY_MODE_TRANSPORT,
-	IPT_POLICY_MODE_TUNNEL
-};
-
-struct ipt_policy_spec
-{
-	u_int8_t	saddr:1,
-			daddr:1,
-			proto:1,
-			mode:1,
-			spi:1,
-			reqid:1;
-};
-
-union ipt_policy_addr
-{
-	struct in_addr	a4;
-	struct in6_addr	a6;
-};
-
-struct ipt_policy_elem
-{
-	union ipt_policy_addr	saddr;
-	union ipt_policy_addr	smask;
-	union ipt_policy_addr	daddr;
-	union ipt_policy_addr	dmask;
-	u_int32_t		spi;
-	u_int32_t		reqid;
-	u_int8_t		proto;
-	u_int8_t		mode;
-
-	struct ipt_policy_spec	match;
-	struct ipt_policy_spec	invert;
-};
-
-struct ipt_policy_info
-{
-	struct ipt_policy_elem pol[IPT_POLICY_MAX_ELEM];
-	u_int16_t flags;
-	u_int16_t len;
-};
+#define IPT_POLICY_MAX_ELEM		XT_POLICY_MAX_ELEM
+
+/* ipt_policy_flags */
+#define IPT_POLICY_MATCH_IN		XT_POLICY_MATCH_IN
+#define IPT_POLICY_MATCH_OUT		XT_POLICY_MATCH_OUT
+#define IPT_POLICY_MATCH_NONE		XT_POLICY_MATCH_NONE
+#define IPT_POLICY_MATCH_STRICT		XT_POLICY_MATCH_STRICT
+
+/* ipt_policy_modes */
+#define IPT_POLICY_MODE_TRANSPORT	XT_POLICY_MODE_TRANSPORT
+#define IPT_POLICY_MODE_TUNNEL		XT_POLICY_MODE_TUNNEL
+
+#define ipt_policy_spec			xt_policy_spec
+#define ipt_policy_addr			xt_policy_addr
+#define ipt_policy_elem			xt_policy_elem
+#define ipt_policy_info			xt_policy_info
 
 #endif /* _IPT_POLICY_H */
diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h
index 671bd818300f..6bab3163d2fb 100644
--- a/include/linux/netfilter_ipv6/ip6t_policy.h
+++ b/include/linux/netfilter_ipv6/ip6t_policy.h
@@ -1,58 +1,21 @@
 #ifndef _IP6T_POLICY_H
 #define _IP6T_POLICY_H
 
-#define IP6T_POLICY_MAX_ELEM	4
-
-enum ip6t_policy_flags
-{
-	IP6T_POLICY_MATCH_IN		= 0x1,
-	IP6T_POLICY_MATCH_OUT		= 0x2,
-	IP6T_POLICY_MATCH_NONE		= 0x4,
-	IP6T_POLICY_MATCH_STRICT	= 0x8,
-};
-
-enum ip6t_policy_modes
-{
-	IP6T_POLICY_MODE_TRANSPORT,
-	IP6T_POLICY_MODE_TUNNEL
-};
-
-struct ip6t_policy_spec
-{
-	u_int8_t	saddr:1,
-			daddr:1,
-			proto:1,
-			mode:1,
-			spi:1,
-			reqid:1;
-};
-
-union ip6t_policy_addr
-{
-	struct in_addr	a4;
-	struct in6_addr	a6;
-};
-
-struct ip6t_policy_elem
-{
-	union ip6t_policy_addr	saddr;
-	union ip6t_policy_addr	smask;
-	union ip6t_policy_addr	daddr;
-	union ip6t_policy_addr	dmask;
-	u_int32_t		spi;
-	u_int32_t		reqid;
-	u_int8_t		proto;
-	u_int8_t		mode;
-
-	struct ip6t_policy_spec	match;
-	struct ip6t_policy_spec	invert;
-};
-
-struct ip6t_policy_info
-{
-	struct ip6t_policy_elem pol[IP6T_POLICY_MAX_ELEM];
-	u_int16_t flags;
-	u_int16_t len;
-};
+#define IP6T_POLICY_MAX_ELEM		XT_POLICY_MAX_ELEM
+
+/* ip6t_policy_flags */
+#define IP6T_POLICY_MATCH_IN		XT_POLICY_MATCH_IN
+#define IP6T_POLICY_MATCH_OUT		XT_POLICY_MATCH_OUT
+#define IP6T_POLICY_MATCH_NONE		XT_POLICY_MATCH_NONE
+#define IP6T_POLICY_MATCH_STRICT	XT_POLICY_MATCH_STRICT
+
+/* ip6t_policy_modes */
+#define IP6T_POLICY_MODE_TRANSPORT	XT_POLICY_MODE_TRANSPORT
+#define IP6T_POLICY_MODE_TUNNEL		XT_POLICY_MODE_TUNNEL
+
+#define ip6t_policy_spec		xt_policy_spec
+#define ip6t_policy_addr		xt_policy_addr
+#define ip6t_policy_elem		xt_policy_elem
+#define ip6t_policy_info		xt_policy_info
 
 #endif /* _IP6T_POLICY_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index db783036e4d8..933ee7a9efdf 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -303,16 +303,6 @@ config IP_NF_MATCH_HASHLIMIT
 	  destination IP' or `500pps from any given source IP'  with a single
 	  IPtables rule.
 
-config IP_NF_MATCH_POLICY
-       tristate "IPsec policy match support"
-       depends on IP_NF_IPTABLES && XFRM
-       help
-         Policy matching allows you to match packets based on the
-         IPsec policy that was used during decapsulation/will
-         be used during encapsulation.
-
-         To compile it as a module, choose M here.  If unsure, say N.
-
 # `filter', generic and specific targets
 config IP_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index e5c5b3202f02..3fe80924ac54 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
 obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
 obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
-obj-$(CONFIG_IP_NF_MATCH_POLICY) += ipt_policy.o
 
 # targets
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
diff --git a/net/ipv4/netfilter/ipt_policy.c b/net/ipv4/netfilter/ipt_policy.c
deleted file mode 100644
index b73f590b226b..000000000000
--- a/net/ipv4/netfilter/ipt_policy.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/* IP tables module for matching IPsec policy
- *
- * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <net/xfrm.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_policy.h>
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("IPtables IPsec policy matching module");
-MODULE_LICENSE("GPL");
-
-
-static inline int
-match_xfrm_state(struct xfrm_state *x, const struct ipt_policy_elem *e)
-{
-#define MATCH_ADDR(x,y,z)	(!e->match.x ||				     \
-		                 ((e->x.a4.s_addr == (e->y.a4.s_addr & (z))) \
-				  ^ e->invert.x))
-#define MATCH(x,y)		(!e->match.x || ((e->x == (y)) ^ e->invert.x))
-
-	return MATCH_ADDR(saddr, smask, x->props.saddr.a4) &&
-	       MATCH_ADDR(daddr, dmask, x->id.daddr.a4) &&
-	       MATCH(proto, x->id.proto) &&
-	       MATCH(mode, x->props.mode) &&
-	       MATCH(spi, x->id.spi) &&
-	       MATCH(reqid, x->props.reqid);
-}
-
-static int
-match_policy_in(const struct sk_buff *skb, const struct ipt_policy_info *info)
-{
-	const struct ipt_policy_elem *e;
-	struct sec_path *sp = skb->sp;
-	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
-	int i, pos;
-
-	if (sp == NULL)
-		return -1;
-	if (strict && info->len != sp->len)
-		return 0;
-
-	for (i = sp->len - 1; i >= 0; i--) {
-		pos = strict ? i - sp->len + 1 : 0;
-		if (pos >= info->len)
-			return 0;
-		e = &info->pol[pos];
-
-		if (match_xfrm_state(sp->x[i].xvec, e)) {
-			if (!strict)
-				return 1;
-		} else if (strict)
-			return 0;
-	}
-
-	return strict ? 1 : 0;
-}
-
-static int
-match_policy_out(const struct sk_buff *skb, const struct ipt_policy_info *info)
-{
-	const struct ipt_policy_elem *e;
-	struct dst_entry *dst = skb->dst;
-	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
-	int i, pos;
-
-	if (dst->xfrm == NULL)
-		return -1;
-
-	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
-		pos = strict ? i : 0;
-		if (pos >= info->len)
-			return 0;
-		e = &info->pol[pos];
-
-		if (match_xfrm_state(dst->xfrm, e)) {
-			if (!strict)
-				return 1;
-		} else if (strict)
-			return 0;
-	}
-
-	return strict ? i == info->len : 0;
-}
-
-static int match(const struct sk_buff *skb,
-                 const struct net_device *in,
-                 const struct net_device *out,
-		 const struct xt_match *match,
-                 const void *matchinfo,
-                 int offset,
-                 unsigned int protoff,
-                 int *hotdrop)
-{
-	const struct ipt_policy_info *info = matchinfo;
-	int ret;
-
-	if (info->flags & IPT_POLICY_MATCH_IN)
-		ret = match_policy_in(skb, info);
-	else
-		ret = match_policy_out(skb, info);
-
-	if (ret < 0)
-		ret = info->flags & IPT_POLICY_MATCH_NONE ? 1 : 0;
-	else if (info->flags & IPT_POLICY_MATCH_NONE)
-		ret = 0;
-
-	return ret;
-}
-
-static int checkentry(const char *tablename, const void *ip_void,
-		      const struct xt_match *match,
-                      void *matchinfo, unsigned int matchsize,
-                      unsigned int hook_mask)
-{
-	struct ipt_policy_info *info = matchinfo;
-
-	if (!(info->flags & (IPT_POLICY_MATCH_IN|IPT_POLICY_MATCH_OUT))) {
-		printk(KERN_ERR "ipt_policy: neither incoming nor "
-		                "outgoing policy selected\n");
-		return 0;
-	}
-	if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN)
-	    && info->flags & IPT_POLICY_MATCH_OUT) {
-		printk(KERN_ERR "ipt_policy: output policy not valid in "
-		                "PRE_ROUTING and INPUT\n");
-		return 0;
-	}
-	if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT)
-	    && info->flags & IPT_POLICY_MATCH_IN) {
-		printk(KERN_ERR "ipt_policy: input policy not valid in "
-		                "POST_ROUTING and OUTPUT\n");
-		return 0;
-	}
-	if (info->len > IPT_POLICY_MAX_ELEM) {
-		printk(KERN_ERR "ipt_policy: too many policy elements\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ipt_match policy_match = {
-	.name		= "policy",
-	.match		= match,
-	.matchsize	= sizeof(struct ipt_policy_info),
-	.checkentry 	= checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ipt_register_match(&policy_match);
-}
-
-static void __exit fini(void)
-{
-	ipt_unregister_match(&policy_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 2d6f8ecbc27b..98f78759f1ab 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -133,16 +133,6 @@ config IP6_NF_MATCH_EUI64
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP6_NF_MATCH_POLICY
-	tristate "IPsec policy match support"
-	depends on IP6_NF_IPTABLES && XFRM
-	help
-	  Policy matching allows you to match packets based on the
-	  IPsec policy that was used during decapsulation/will
-	  be used during encapsulation.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 # The targets
 config IP6_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index db6073c94163..8436a1a1731f 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -9,7 +9,6 @@ obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o
 obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
 obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
 obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o
-obj-$(CONFIG_IP6_NF_MATCH_POLICY) += ip6t_policy.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
 obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o
 obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o
diff --git a/net/ipv6/netfilter/ip6t_policy.c b/net/ipv6/netfilter/ip6t_policy.c
deleted file mode 100644
index f2a59970e007..000000000000
--- a/net/ipv6/netfilter/ip6t_policy.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/* IP tables module for matching IPsec policy
- *
- * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-#include <net/xfrm.h>
-
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter_ipv6/ip6t_policy.h>
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("IPtables IPsec policy matching module");
-MODULE_LICENSE("GPL");
-
-
-static inline int
-match_xfrm_state(struct xfrm_state *x, const struct ip6t_policy_elem *e)
-{
-#define MATCH_ADDR(x,y,z)	(!e->match.x ||				       \
-				 ((!ipv6_masked_addr_cmp(&e->x.a6, &e->y.a6,   \
-							 z))                   \
-				  ^ e->invert.x))
-#define MATCH(x,y)		(!e->match.x || ((e->x == (y)) ^ e->invert.x))
-	
-	return MATCH_ADDR(saddr, smask, (struct in6_addr *)&x->props.saddr.a6) &&
-	       MATCH_ADDR(daddr, dmask, (struct in6_addr *)&x->id.daddr.a6) &&
-	       MATCH(proto, x->id.proto) &&
-	       MATCH(mode, x->props.mode) &&
-	       MATCH(spi, x->id.spi) &&
-	       MATCH(reqid, x->props.reqid);
-}
-
-static int
-match_policy_in(const struct sk_buff *skb, const struct ip6t_policy_info *info)
-{
-	const struct ip6t_policy_elem *e;
-	struct sec_path *sp = skb->sp;
-	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
-	int i, pos;
-
-	if (sp == NULL)
-		return -1;
-	if (strict && info->len != sp->len)
-		return 0;
-
-	for (i = sp->len - 1; i >= 0; i--) {
-		pos = strict ? i - sp->len + 1 : 0;
-		if (pos >= info->len)
-			return 0;
-		e = &info->pol[pos];
-
-		if (match_xfrm_state(sp->x[i].xvec, e)) {
-			if (!strict)
-				return 1;
-		} else if (strict)
-			return 0;
-	}
-
-	return strict ? 1 : 0;
-}
-
-static int
-match_policy_out(const struct sk_buff *skb, const struct ip6t_policy_info *info)
-{
-	const struct ip6t_policy_elem *e;
-	struct dst_entry *dst = skb->dst;
-	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
-	int i, pos;
-
-	if (dst->xfrm == NULL)
-		return -1;
-
-	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
-		pos = strict ? i : 0;
-		if (pos >= info->len)
-			return 0;
-		e = &info->pol[pos];
-
-		if (match_xfrm_state(dst->xfrm, e)) {
-			if (!strict)
-				return 1;
-		} else if (strict)
-			return 0;
-	}
-
-	return strict ? i == info->len : 0;
-}
-
-static int match(const struct sk_buff *skb,
-                 const struct net_device *in,
-                 const struct net_device *out,
-		 const struct xt_match *match,
-                 const void *matchinfo,
-		 int offset,
-		 unsigned int protoff,
-		 int *hotdrop)
-{
-	const struct ip6t_policy_info *info = matchinfo;
-	int ret;
-
-	if (info->flags & IP6T_POLICY_MATCH_IN)
-		ret = match_policy_in(skb, info);
-	else
-		ret = match_policy_out(skb, info);
-
-	if (ret < 0)
-		ret = info->flags & IP6T_POLICY_MATCH_NONE ? 1 : 0;
-	else if (info->flags & IP6T_POLICY_MATCH_NONE)
-		ret = 0;
-
-	return ret;
-}
-
-static int checkentry(const char *tablename, const void *ip_void,
-                      const struct xt_match *match, void *matchinfo,
-		      unsigned int matchsize, unsigned int hook_mask)
-{
-	struct ip6t_policy_info *info = matchinfo;
-
-	if (!(info->flags & (IP6T_POLICY_MATCH_IN|IP6T_POLICY_MATCH_OUT))) {
-		printk(KERN_ERR "ip6t_policy: neither incoming nor "
-		                "outgoing policy selected\n");
-		return 0;
-	}
-	if (hook_mask & (1 << NF_IP6_PRE_ROUTING | 1 << NF_IP6_LOCAL_IN)
-	    && info->flags & IP6T_POLICY_MATCH_OUT) {
-		printk(KERN_ERR "ip6t_policy: output policy not valid in "
-		                "PRE_ROUTING and INPUT\n");
-		return 0;
-	}
-	if (hook_mask & (1 << NF_IP6_POST_ROUTING | 1 << NF_IP6_LOCAL_OUT)
-	    && info->flags & IP6T_POLICY_MATCH_IN) {
-		printk(KERN_ERR "ip6t_policy: input policy not valid in "
-		                "POST_ROUTING and OUTPUT\n");
-		return 0;
-	}
-	if (info->len > IP6T_POLICY_MAX_ELEM) {
-		printk(KERN_ERR "ip6t_policy: too many policy elements\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct ip6t_match policy_match = {
-	.name		= "policy",
-	.match		= match,
-	.matchsize	= sizeof(struct ip6t_policy_info),
-	.checkentry 	= checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init init(void)
-{
-	return ip6t_register_match(&policy_match);
-}
-
-static void __exit fini(void)
-{
-	ip6t_unregister_match(&policy_match);
-}
-
-module_init(init);
-module_exit(fini);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index a8e5544da93e..174027809148 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -279,6 +279,16 @@ config NETFILTER_XT_MATCH_MARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_POLICY
+	tristate 'IPsec "policy" match support'
+	depends on NETFILTER_XTABLES && XFRM
+	help
+	  Policy matching allows you to match packets based on the
+	  IPsec policy that was used during decapsulation/will
+	  be used during encapsulation.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_PHYSDEV
 	tristate '"physdev" match support'
 	depends on NETFILTER_XTABLES && BRIDGE_NETFILTER
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 746172ebc91b..9558727f5e79 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
new file mode 100644
index 000000000000..1ec22082f04d
--- /dev/null
+++ b/net/netfilter/xt_policy.c
@@ -0,0 +1,209 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter/xt_policy.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables IPsec policy matching module");
+MODULE_LICENSE("GPL");
+
+static inline int
+xt_addr_cmp(const union xt_policy_addr *a1, const union xt_policy_addr *m,
+	    const union xt_policy_addr *a2, unsigned short family)
+{
+	switch (family) {
+	case AF_INET:
+		return (a1->a4.s_addr ^ a2->a4.s_addr) & m->a4.s_addr;
+	case AF_INET6:
+		return ipv6_masked_addr_cmp(&a1->a6, &m->a6, &a2->a6);
+	}
+	return 0;
+}
+
+static inline int
+match_xfrm_state(struct xfrm_state *x, const struct xt_policy_elem *e,
+		 unsigned short family)
+{
+#define MATCH_ADDR(x,y,z)	(!e->match.x ||			       \
+				 (xt_addr_cmp(&e->x, &e->y, z, family) \
+				  ^ e->invert.x))
+#define MATCH(x,y)		(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+
+	return MATCH_ADDR(saddr, smask, (union xt_policy_addr *)&x->props.saddr) &&
+	       MATCH_ADDR(daddr, dmask, (union xt_policy_addr *)&x->id.daddr.a4) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info,
+		unsigned short family)
+{
+	const struct xt_policy_elem *e;
+	struct sec_path *sp = skb->sp;
+	int strict = info->flags & XT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)
+		return 0;
+
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->x[i].xvec, e, family)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
+		 unsigned short family)
+{
+	const struct xt_policy_elem *e;
+	struct dst_entry *dst = skb->dst;
+	int strict = info->flags & XT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)
+		return -1;
+
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e, family)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? i == info->len : 0;
+}
+
+static int match(const struct sk_buff *skb,
+                 const struct net_device *in,
+                 const struct net_device *out,
+                 const struct xt_match *match,
+                 const void *matchinfo,
+                 int offset,
+                 unsigned int protoff,
+                 int *hotdrop)
+{
+	const struct xt_policy_info *info = matchinfo;
+	int ret;
+
+	if (info->flags & XT_POLICY_MATCH_IN)
+		ret = match_policy_in(skb, info, match->family);
+	else
+		ret = match_policy_out(skb, info, match->family);
+
+	if (ret < 0)
+		ret = info->flags & XT_POLICY_MATCH_NONE ? 1 : 0;
+	else if (info->flags & XT_POLICY_MATCH_NONE)
+		ret = 0;
+
+	return ret;
+}
+
+static int checkentry(const char *tablename, const void *ip_void,
+                      const struct xt_match *match,
+                      void *matchinfo, unsigned int matchsize,
+                      unsigned int hook_mask)
+{
+	struct xt_policy_info *info = matchinfo;
+
+	if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) {
+		printk(KERN_ERR "xt_policy: neither incoming nor "
+		                "outgoing policy selected\n");
+		return 0;
+	}
+	/* hook values are equal for IPv4 and IPv6 */
+	if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN)
+	    && info->flags & XT_POLICY_MATCH_OUT) {
+		printk(KERN_ERR "xt_policy: output policy not valid in "
+		                "PRE_ROUTING and INPUT\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT)
+	    && info->flags & XT_POLICY_MATCH_IN) {
+		printk(KERN_ERR "xt_policy: input policy not valid in "
+		                "POST_ROUTING and OUTPUT\n");
+		return 0;
+	}
+	if (info->len > XT_POLICY_MAX_ELEM) {
+		printk(KERN_ERR "xt_policy: too many policy elements\n");
+		return 0;
+	}
+	return 1;
+}
+
+static struct xt_match policy_match = {
+	.name		= "policy",
+	.family		= AF_INET,
+	.match		= match,
+	.matchsize	= sizeof(struct xt_policy_info),
+	.checkentry 	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match policy6_match = {
+	.name		= "policy",
+	.family		= AF_INET6,
+	.match		= match,
+	.matchsize	= sizeof(struct xt_policy_info),
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = xt_register_match(AF_INET, &policy_match);
+	if (ret)
+		return ret;
+	ret = xt_register_match(AF_INET6, &policy6_match);
+	if (ret)
+		xt_unregister_match(AF_INET, &policy_match);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	xt_unregister_match(AF_INET6, &policy6_match);
+	xt_unregister_match(AF_INET, &policy_match);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_ALIAS("ipt_policy");
+MODULE_ALIAS("ip6t_policy");
-- 
cgit v1.2.3


From a24276924875802853b5bdc12c56d29f1c1bbc79 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 20 Mar 2006 18:03:59 -0800
Subject: [NETFILTER]: ctnetlink: avoid unneccessary event message generation

Avoid unneccessary event message generation by checking for netlink
listeners before building a message.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink.h       | 1 +
 net/ipv4/netfilter/ip_conntrack_netlink.c | 7 ++++---
 net/netfilter/nf_conntrack_netlink.c      | 7 ++++---
 net/netfilter/nfnetlink.c                 | 6 ++++++
 4 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 934a2479f160..9f5b12cf489b 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -164,6 +164,7 @@ extern void nfattr_parse(struct nfattr *tb[], int maxattr,
  	__res;								\
 })
 
+extern int nfnetlink_has_listeners(unsigned int group);
 extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, 
 			  int echo);
 extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index e0b5926c76f9..5ce2e3fc2c7f 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -327,9 +327,10 @@ static int ctnetlink_conntrack_event(struct notifier_block *this,
 		group = NFNLGRP_CONNTRACK_UPDATE;
 	} else 
 		return NOTIFY_DONE;
-	
-  /* FIXME: Check if there are any listeners before, don't hurt performance */
-	
+
+	if (!nfnetlink_has_listeners(group))
+		return NOTIFY_DONE;
+
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb)
 		return NOTIFY_DONE;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index aef3cb41131f..5eadf009bb15 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -338,9 +338,10 @@ static int ctnetlink_conntrack_event(struct notifier_block *this,
 		group = NFNLGRP_CONNTRACK_UPDATE;
 	} else
 		return NOTIFY_DONE;
-	
-  /* FIXME: Check if there are any listeners before, don't hurt performance */
-	
+
+	if (!nfnetlink_has_listeners(group))
+		return NOTIFY_DONE;
+
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb)
 		return NOTIFY_DONE;
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index f6063e8f0050..b88e82a1a987 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -191,6 +191,12 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
         return 0;
 }
 
+int nfnetlink_has_listeners(unsigned int group)
+{
+	return netlink_has_listeners(nfnl, group);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
+
 int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
 {
 	gfp_t allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
-- 
cgit v1.2.3


From 4277a083ecd2c8771058641132bcecea04ca6608 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 20 Mar 2006 18:52:01 -0800
Subject: [NETLINK]: Add netlink_has_listeners for avoiding unneccessary event
 message generation

Keep a bitmask of multicast groups with subscribed listeners to let
netlink users check for listeners before generating multicast
messages.

Queries don't perform any locking, which may result in false
positives, it is guaranteed however that any new subscriptions are
visible before bind() or setsockopt() return.

Signed-off-by: Patrick McHardy <kaber@trash.net>
ACKed-by: Jamal Hadi Salim<hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  |  1 +
 net/netlink/af_netlink.c | 52 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index c256ebe2a7b4..f8f3d1c927f8 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -151,6 +151,7 @@ struct netlink_skb_parms
 
 extern struct sock *netlink_kernel_create(int unit, unsigned int groups, void (*input)(struct sock *sk, int len), struct module *module);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
+extern int netlink_has_listeners(struct sock *sk, unsigned int group);
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
 			     __u32 group, gfp_t allocation);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 59dc7d140600..d00a9034cb5f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -106,6 +106,7 @@ struct nl_pid_hash {
 struct netlink_table {
 	struct nl_pid_hash hash;
 	struct hlist_head mc_list;
+	unsigned long *listeners;
 	unsigned int nl_nonroot;
 	unsigned int groups;
 	struct module *module;
@@ -296,6 +297,24 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)
 
 static const struct proto_ops netlink_ops;
 
+static void
+netlink_update_listeners(struct sock *sk)
+{
+	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
+	struct hlist_node *node;
+	unsigned long mask;
+	unsigned int i;
+
+	for (i = 0; i < NLGRPSZ(tbl->groups)/sizeof(unsigned long); i++) {
+		mask = 0;
+		sk_for_each_bound(sk, node, &tbl->mc_list)
+			mask |= nlk_sk(sk)->groups[i];
+		tbl->listeners[i] = mask;
+	}
+	/* this function is only called with the netlink table "grabbed", which
+	 * makes sure updates are visible before bind or setsockopt return. */
+}
+
 static int netlink_insert(struct sock *sk, u32 pid)
 {
 	struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash;
@@ -456,12 +475,14 @@ static int netlink_release(struct socket *sock)
 	if (nlk->module)
 		module_put(nlk->module);
 
+	netlink_table_grab();
 	if (nlk->flags & NETLINK_KERNEL_SOCKET) {
-		netlink_table_grab();
+		kfree(nl_table[sk->sk_protocol].listeners);
 		nl_table[sk->sk_protocol].module = NULL;
 		nl_table[sk->sk_protocol].registered = 0;
-		netlink_table_ungrab();
-	}
+	} else if (nlk->subscriptions)
+		netlink_update_listeners(sk);
+	netlink_table_ungrab();
 
 	kfree(nlk->groups);
 	nlk->groups = NULL;
@@ -589,6 +610,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
 	                                 hweight32(nladdr->nl_groups) -
 	                                 hweight32(nlk->groups[0]));
 	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; 
+	netlink_update_listeners(sk);
 	netlink_table_ungrab();
 
 	return 0;
@@ -807,6 +829,17 @@ retry:
 	return netlink_sendskb(sk, skb, ssk->sk_protocol);
 }
 
+int netlink_has_listeners(struct sock *sk, unsigned int group)
+{
+	int res = 0;
+
+	BUG_ON(!(nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET));
+	if (group - 1 < nl_table[sk->sk_protocol].groups)
+		res = test_bit(group - 1, nl_table[sk->sk_protocol].listeners);
+	return res;
+}
+EXPORT_SYMBOL_GPL(netlink_has_listeners);
+
 static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -1011,6 +1044,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		else
 			__clear_bit(val - 1, nlk->groups);
 		netlink_update_subscriptions(sk, subscriptions);
+		netlink_update_listeners(sk);
 		netlink_table_ungrab();
 		err = 0;
 		break;
@@ -1237,6 +1271,7 @@ netlink_kernel_create(int unit, unsigned int groups,
 	struct socket *sock;
 	struct sock *sk;
 	struct netlink_sock *nlk;
+	unsigned long *listeners = NULL;
 
 	if (!nl_table)
 		return NULL;
@@ -1250,6 +1285,13 @@ netlink_kernel_create(int unit, unsigned int groups,
 	if (__netlink_create(sock, unit) < 0)
 		goto out_sock_release;
 
+	if (groups < 32)
+		groups = 32;
+
+	listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL);
+	if (!listeners)
+		goto out_sock_release;
+
 	sk = sock->sk;
 	sk->sk_data_ready = netlink_data_ready;
 	if (input)
@@ -1262,7 +1304,8 @@ netlink_kernel_create(int unit, unsigned int groups,
 	nlk->flags |= NETLINK_KERNEL_SOCKET;
 
 	netlink_table_grab();
-	nl_table[unit].groups = groups < 32 ? 32 : groups;
+	nl_table[unit].groups = groups;
+	nl_table[unit].listeners = listeners;
 	nl_table[unit].module = module;
 	nl_table[unit].registered = 1;
 	netlink_table_ungrab();
@@ -1270,6 +1313,7 @@ netlink_kernel_create(int unit, unsigned int groups,
 	return sk;
 
 out_sock_release:
+	kfree(listeners);
 	sock_release(sock);
 	return NULL;
 }
-- 
cgit v1.2.3


From f8cd54884e675dfaf0c86cc7c088adb6ca9d7638 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Mon, 20 Mar 2006 19:15:11 -0800
Subject: [IPSEC]: Sync series - core changes

This patch provides the core functionality needed for sync events
for ipsec. Derived work of Krisztian KOVACS <hidden@balabit.hu>

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     |  2 ++
 include/linux/xfrm.h       | 30 ++++++++++++++++++
 include/net/xfrm.h         | 44 ++++++++++++++++++++++++++-
 net/core/sysctl_net_core.c | 23 ++++++++++++++
 net/xfrm/xfrm_state.c      | 76 +++++++++++++++++++++++++++++++++++++++++++++-
 net/xfrm/xfrm_user.c       |  4 ++-
 6 files changed, 176 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 6e8880ea49e7..b686548f32e0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -261,6 +261,8 @@ enum
 	NET_CORE_DEV_WEIGHT=17,
 	NET_CORE_SOMAXCONN=18,
 	NET_CORE_BUDGET=19,
+	NET_CORE_AEVENT_ETIME=20,
+	NET_CORE_AEVENT_RSEQTH=21,
 };
 
 /* /proc/sys/net/ethernet */
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 82fbb758e28f..b54a12940ef6 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -156,6 +156,10 @@ enum {
 	XFRM_MSG_FLUSHPOLICY,
 #define XFRM_MSG_FLUSHPOLICY XFRM_MSG_FLUSHPOLICY
 
+	XFRM_MSG_NEWAE,
+#define XFRM_MSG_NEWAE XFRM_MSG_NEWAE
+	XFRM_MSG_GETAE,
+#define XFRM_MSG_GETAE XFRM_MSG_GETAE
 	__XFRM_MSG_MAX
 };
 #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1)
@@ -194,6 +198,21 @@ struct xfrm_encap_tmpl {
 	xfrm_address_t	encap_oa;
 };
 
+/* AEVENT flags  */
+enum xfrm_ae_ftype_t {
+	XFRM_AE_UNSPEC,
+	XFRM_AE_RTHR=1,	/* replay threshold*/
+	XFRM_AE_RVAL=2, /* replay value */
+	XFRM_AE_LVAL=4, /* lifetime value */
+	XFRM_AE_ETHR=8, /* expiry timer threshold */
+	XFRM_AE_CR=16, /* Event cause is replay update */
+	XFRM_AE_CE=32, /* Event cause is timer expiry */
+	XFRM_AE_CU=64, /* Event cause is policy update */
+	__XFRM_AE_MAX
+
+#define XFRM_AE_MAX (__XFRM_AE_MAX - 1)
+};
+
 /* Netlink message attributes.  */
 enum xfrm_attr_type_t {
 	XFRMA_UNSPEC,
@@ -205,6 +224,10 @@ enum xfrm_attr_type_t {
 	XFRMA_SA,
 	XFRMA_POLICY,
 	XFRMA_SEC_CTX,		/* struct xfrm_sec_ctx */
+	XFRMA_LTIME_VAL,
+	XFRMA_REPLAY_VAL,
+	XFRMA_REPLAY_THRESH,
+	XFRMA_ETIMER_THRESH,
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
@@ -235,6 +258,11 @@ struct xfrm_usersa_id {
 	__u8				proto;
 };
 
+struct xfrm_aevent_id {
+	__u32				flags;
+	struct xfrm_usersa_id		sa_id;
+};
+
 struct xfrm_userspi_info {
 	struct xfrm_usersa_info		info;
 	__u32				min;
@@ -306,6 +334,8 @@ enum xfrm_nlgroups {
 #define XFRMNLGRP_SA		XFRMNLGRP_SA
 	XFRMNLGRP_POLICY,
 #define XFRMNLGRP_POLICY	XFRMNLGRP_POLICY
+	XFRMNLGRP_AEVENTS,
+#define XFRMNLGRP_AEVENTS	XFRMNLGRP_AEVENTS
 	__XFRMNLGRP_MAX
 };
 #define XFRMNLGRP_MAX	(__XFRMNLGRP_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 8d362c49b8a9..bc005e62e434 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -20,6 +20,10 @@
 
 #define XFRM_ALIGN8(len)	(((len) + 7) & ~7)
 
+extern struct sock *xfrm_nl;
+extern u32 sysctl_xfrm_aevent_etime;
+extern u32 sysctl_xfrm_aevent_rseqth;
+
 extern struct semaphore xfrm_cfg_sem;
 
 /* Organization of SPD aka "XFRM rules"
@@ -135,6 +139,16 @@ struct xfrm_state
 	/* State for replay detection */
 	struct xfrm_replay_state replay;
 
+	/* Replay detection state at the time we sent the last notification */
+	struct xfrm_replay_state preplay;
+
+	/* Replay detection notification settings */
+	u32			replay_maxage;
+	u32			replay_maxdiff;
+
+	/* Replay detection notification timer */
+	struct timer_list	rtimer;
+
 	/* Statistics */
 	struct xfrm_stats	stats;
 
@@ -169,6 +183,7 @@ struct km_event
 		u32 hard;
 		u32 proto;
 		u32 byid;
+		u32 aevent;
 	} data;
 
 	u32	seq;
@@ -305,7 +320,21 @@ struct xfrm_policy
 	struct xfrm_tmpl       	xfrm_vec[XFRM_MAX_DEPTH];
 };
 
-#define XFRM_KM_TIMEOUT		30
+#define XFRM_KM_TIMEOUT                30
+/* which seqno */
+#define XFRM_REPLAY_SEQ		1
+#define XFRM_REPLAY_OSEQ	2
+#define XFRM_REPLAY_SEQ_MASK	3
+/* what happened */
+#define XFRM_REPLAY_UPDATE	XFRM_AE_CR
+#define XFRM_REPLAY_TIMEOUT	XFRM_AE_CE
+
+/* default aevent timeout in units of 100ms */
+#define XFRM_AE_ETIME			10
+/* Async Event timer multiplier */
+#define XFRM_AE_ETH_M			10
+/* default seq threshold size */
+#define XFRM_AE_SEQT_SIZE		2
 
 struct xfrm_mgr
 {
@@ -865,6 +894,7 @@ extern int xfrm_state_delete(struct xfrm_state *x);
 extern void xfrm_state_flush(u8 proto);
 extern int xfrm_replay_check(struct xfrm_state *x, u32 seq);
 extern void xfrm_replay_advance(struct xfrm_state *x, u32 seq);
+extern void xfrm_replay_notify(struct xfrm_state *x, int event);
 extern int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb);
 extern int xfrm_state_mtu(struct xfrm_state *x, int mtu);
 extern int xfrm_init_state(struct xfrm_state *x);
@@ -965,4 +995,16 @@ static inline int xfrm_policy_id2dir(u32 index)
 	return index & 7;
 }
 
+static inline int xfrm_aevent_is_on(void)
+{
+	return netlink_has_listeners(xfrm_nl,XFRMNLGRP_AEVENTS);
+}
+
+static inline void xfrm_aevent_doreplay(struct xfrm_state *x)
+{
+	if (xfrm_aevent_is_on())
+		xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
+}
+
+
 #endif	/* _NET_XFRM_H */
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 2f278c8e4743..710453656721 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,11 @@ extern int sysctl_core_destroy_delay;
 extern char sysctl_divert_version[];
 #endif /* CONFIG_NET_DIVERT */
 
+#ifdef CONFIG_XFRM
+extern u32 sysctl_xfrm_aevent_etime;
+extern u32 sysctl_xfrm_aevent_rseqth;
+#endif
+
 ctl_table core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -111,6 +116,24 @@ ctl_table core_table[] = {
 		.proc_handler	= &proc_dostring
 	},
 #endif /* CONFIG_NET_DIVERT */
+#ifdef CONFIG_XFRM
+	{
+		.ctl_name	= NET_CORE_AEVENT_ETIME,
+		.procname	= "xfrm_aevent_etime",
+		.data		= &sysctl_xfrm_aevent_etime,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_CORE_AEVENT_RSEQTH,
+		.procname	= "xfrm_aevent_rseqth",
+		.data		= &sysctl_xfrm_aevent_rseqth,
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif /* CONFIG_XFRM */
 #endif /* CONFIG_NET */
 	{
 		.ctl_name	= NET_CORE_SOMAXCONN,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index c656cbaf35e8..8eaee499cad5 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -20,6 +20,8 @@
 #include <linux/module.h>
 #include <asm/uaccess.h>
 
+u32 sysctl_xfrm_aevent_etime = XFRM_AE_ETIME;
+u32 sysctl_xfrm_aevent_rseqth = XFRM_AE_SEQT_SIZE;
 /* Each xfrm_state may be linked to two tables:
 
    1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
@@ -62,6 +64,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 {
 	if (del_timer(&x->timer))
 		BUG();
+	if (del_timer(&x->rtimer))
+		BUG();
 	kfree(x->aalg);
 	kfree(x->ealg);
 	kfree(x->calg);
@@ -190,11 +194,16 @@ struct xfrm_state *xfrm_state_alloc(void)
 		init_timer(&x->timer);
 		x->timer.function = xfrm_timer_handler;
 		x->timer.data	  = (unsigned long)x;
+		init_timer(&x->rtimer);
+		x->rtimer.function = xfrm_replay_timer_handler;
+		x->rtimer.data     = (unsigned long)x;
 		x->curlft.add_time = (unsigned long)xtime.tv_sec;
 		x->lft.soft_byte_limit = XFRM_INF;
 		x->lft.soft_packet_limit = XFRM_INF;
 		x->lft.hard_byte_limit = XFRM_INF;
 		x->lft.hard_packet_limit = XFRM_INF;
+		x->replay_maxage = 0;
+		x->replay_maxdiff = 0;
 		spin_lock_init(&x->lock);
 	}
 	return x;
@@ -228,6 +237,8 @@ static int __xfrm_state_delete(struct xfrm_state *x)
 		spin_unlock(&xfrm_state_lock);
 		if (del_timer(&x->timer))
 			__xfrm_state_put(x);
+		if (del_timer(&x->rtimer))
+			__xfrm_state_put(x);
 
 		/* The number two in this test is the reference
 		 * mentioned in the comment below plus the reference
@@ -426,6 +437,10 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 	if (!mod_timer(&x->timer, jiffies + HZ))
 		xfrm_state_hold(x);
 
+	if (x->replay_maxage &&
+	    !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+		xfrm_state_hold(x);
+
 	wake_up(&km_waitq);
 }
 
@@ -762,6 +777,62 @@ out:
 }
 EXPORT_SYMBOL(xfrm_state_walk);
 
+
+void xfrm_replay_notify(struct xfrm_state *x, int event)
+{
+	struct km_event c;
+	/* we send notify messages in case
+	 *  1. we updated on of the sequence numbers, and the seqno difference
+	 *     is at least x->replay_maxdiff, in this case we also update the
+	 *     timeout of our timer function
+	 *  2. if x->replay_maxage has elapsed since last update,
+	 *     and there were changes
+	 *
+	 *  The state structure must be locked!
+	 */
+
+	switch (event) {
+	case XFRM_REPLAY_UPDATE:
+		if (x->replay_maxdiff &&
+		    (x->replay.seq - x->preplay.seq < x->replay_maxdiff) &&
+		    (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff))
+			return;
+
+		break;
+
+	case XFRM_REPLAY_TIMEOUT:
+		if ((x->replay.seq == x->preplay.seq) &&
+		    (x->replay.bitmap == x->preplay.bitmap) &&
+		    (x->replay.oseq == x->preplay.oseq))
+			return;
+
+		break;
+	}
+
+	memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state));
+	c.event = XFRM_MSG_NEWAE;
+	c.data.aevent = event;
+	km_state_notify(x, &c);
+
+resched:
+	if (x->replay_maxage &&
+	    !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+		xfrm_state_hold(x);
+
+}
+
+static void xfrm_replay_timer_handler(unsigned long data)
+{
+	struct xfrm_state *x = (struct xfrm_state*)data;
+
+	spin_lock(&x->lock);
+
+	if (xfrm_aevent_is_on() && x->km.state == XFRM_STATE_VALID)
+		xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT);
+
+	spin_unlock(&x->lock);
+}
+
 int xfrm_replay_check(struct xfrm_state *x, u32 seq)
 {
 	u32 diff;
@@ -805,6 +876,9 @@ void xfrm_replay_advance(struct xfrm_state *x, u32 seq)
 		diff = x->replay.seq - seq;
 		x->replay.bitmap |= (1U << diff);
 	}
+
+	if (xfrm_aevent_is_on())
+		xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
 }
 EXPORT_SYMBOL(xfrm_replay_advance);
 
@@ -835,7 +909,7 @@ void km_state_notify(struct xfrm_state *x, struct km_event *c)
 EXPORT_SYMBOL(km_policy_notify);
 EXPORT_SYMBOL(km_state_notify);
 
-static void km_state_expired(struct xfrm_state *x, int hard)
+void km_state_expired(struct xfrm_state *x, int hard)
 {
 	struct km_event c;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 7de17559249a..6f643e58e044 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -28,7 +28,7 @@
 #include <net/netlink.h>
 #include <asm/uaccess.h>
 
-static struct sock *xfrm_nl;
+struct sock *xfrm_nl;
 
 static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type)
 {
@@ -1618,3 +1618,5 @@ module_init(xfrm_user_init);
 module_exit(xfrm_user_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
+EXPORT_SYMBOL(xfrm_nl);
+
-- 
cgit v1.2.3


From 91f0ebf7b6d5cb2b6e818d48587566144821babe Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Mon, 20 Mar 2006 19:21:44 -0800
Subject: [DCCP] CCID: Improve CCID infrastructure

1. No need for ->ccid_init nor ->ccid_exit, this is what module_{init,exit}
   does and anynways neither ccid2 nor ccid3 were using it.

2. Rename struct ccid to struct ccid_operations and introduce struct ccid
   with a pointer to ccid_operations and rigth after it the rx or tx
   private state.

3. Remove the pointer to the state of the half connections from struct
   dccp_sock, now its derived thru ccid_priv() from the ccid pointer.

Now we also can implement the setsockopt for changing the CCID easily as
no ccid init routines can affect struct dccp_sock in any way that prevents
other CCIDs from working if a CCID switch operation is asked by apps.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h   |   2 -
 net/dccp/ccid.c        | 188 ++++++++++++++++++++++++++++++++++++++++---------
 net/dccp/ccid.h        | 115 +++++++++++++-----------------
 net/dccp/ccids/ccid2.c |  55 +++------------
 net/dccp/ccids/ccid2.h |  16 +++++
 net/dccp/ccids/ccid3.c |  34 ++-------
 net/dccp/ccids/ccid3.h |   5 +-
 net/dccp/input.c       |   8 ---
 net/dccp/ipv4.c        |  24 +++----
 net/dccp/minisocks.c   |  25 +++----
 10 files changed, 260 insertions(+), 212 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index a70d1a27e7fc..bdd756cc60b1 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -478,8 +478,6 @@ struct dccp_sock {
 	__u32				dccps_mss_cache;
 	struct dccp_options		dccps_options;
 	struct dccp_ackvec		*dccps_hc_rx_ackvec;
-	void				*dccps_hc_rx_ccid_private;
-	void				*dccps_hc_tx_ccid_private;
 	struct ccid			*dccps_hc_rx_ccid;
 	struct ccid			*dccps_hc_tx_ccid;
 	struct dccp_options_received	dccps_options_received;
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 06b191a5740b..ff05e59043cd 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -13,7 +13,7 @@
 
 #include "ccid.h"
 
-static struct ccid *ccids[CCID_MAX];
+static struct ccid_operations *ccids[CCID_MAX];
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 static atomic_t ccids_lockct = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(ccids_lock);
@@ -55,82 +55,202 @@ static inline void ccids_read_unlock(void)
 #define ccids_read_unlock() do { } while(0)
 #endif
 
-int ccid_register(struct ccid *ccid)
+static kmem_cache_t *ccid_kmem_cache_create(int obj_size, const char *fmt,...)
 {
-	int err;
+	kmem_cache_t *slab;
+	char slab_name_fmt[32], *slab_name;
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(slab_name_fmt, sizeof(slab_name_fmt), fmt, args);
+	va_end(args);
+
+	slab_name = kstrdup(slab_name_fmt, GFP_KERNEL);
+	if (slab_name == NULL)
+		return NULL;
+	slab = kmem_cache_create(slab_name, sizeof(struct ccid) + obj_size, 0,
+				 SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (slab == NULL)
+		kfree(slab_name);
+	return slab;
+}
+
+static void ccid_kmem_cache_destroy(kmem_cache_t *slab)
+{
+	if (slab != NULL) {
+		const char *name = kmem_cache_name(slab);
+
+		kmem_cache_destroy(slab);
+		kfree(name);
+	}
+}
+
+int ccid_register(struct ccid_operations *ccid_ops)
+{
+	int err = -ENOBUFS;
+
+	ccid_ops->ccid_hc_rx_slab =
+			ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
+					       "%s_hc_rx_sock",
+					       ccid_ops->ccid_name);
+	if (ccid_ops->ccid_hc_rx_slab == NULL)
+		goto out;
+
+	ccid_ops->ccid_hc_tx_slab =
+			ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
+					       "%s_hc_tx_sock",
+					       ccid_ops->ccid_name);
+	if (ccid_ops->ccid_hc_tx_slab == NULL)
+		goto out_free_rx_slab;
 
 	ccids_write_lock();
 	err = -EEXIST;
-	if (ccids[ccid->ccid_id] == NULL) {
-		ccids[ccid->ccid_id] = ccid;
+	if (ccids[ccid_ops->ccid_id] == NULL) {
+		ccids[ccid_ops->ccid_id] = ccid_ops;
 		err = 0;
 	}
 	ccids_write_unlock();
-	if (err == 0)
-		pr_info("CCID: Registered CCID %d (%s)\n",
-			ccid->ccid_id, ccid->ccid_name);
+	if (err != 0)
+		goto out_free_tx_slab;
+
+	pr_info("CCID: Registered CCID %d (%s)\n",
+		ccid_ops->ccid_id, ccid_ops->ccid_name);
+out:
 	return err;
+out_free_tx_slab:
+	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
+	ccid_ops->ccid_hc_tx_slab = NULL;
+	goto out;
+out_free_rx_slab:
+	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+	ccid_ops->ccid_hc_rx_slab = NULL;
+	goto out;
 }
 
 EXPORT_SYMBOL_GPL(ccid_register);
 
-int ccid_unregister(struct ccid *ccid)
+int ccid_unregister(struct ccid_operations *ccid_ops)
 {
 	ccids_write_lock();
-	ccids[ccid->ccid_id] = NULL;
+	ccids[ccid_ops->ccid_id] = NULL;
 	ccids_write_unlock();
+
+	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
+	ccid_ops->ccid_hc_tx_slab = NULL;
+	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+	ccid_ops->ccid_hc_rx_slab = NULL;
+
 	pr_info("CCID: Unregistered CCID %d (%s)\n",
-		ccid->ccid_id, ccid->ccid_name);
+		ccid_ops->ccid_id, ccid_ops->ccid_name);
 	return 0;
 }
 
 EXPORT_SYMBOL_GPL(ccid_unregister);
 
-struct ccid *ccid_init(unsigned char id, struct sock *sk)
+struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp)
 {
-	struct ccid *ccid;
+	struct ccid_operations *ccid_ops;
+	struct ccid *ccid = NULL;
 
+	ccids_read_lock();
 #ifdef CONFIG_KMOD
-	if (ccids[id] == NULL)
+	if (ccids[id] == NULL) {
+		/* We only try to load if in process context */
+		ccids_read_unlock();
+		if (gfp & GFP_ATOMIC)
+			goto out;
 		request_module("net-dccp-ccid-%d", id);
+		ccids_read_lock();
+	}
 #endif
-	ccids_read_lock();
+	ccid_ops = ccids[id];
+	if (ccid_ops == NULL)
+		goto out_unlock;
 
-	ccid = ccids[id];
-	if (ccid == NULL)
-		goto out;
+	if (!try_module_get(ccid_ops->ccid_owner))
+		goto out_unlock;
 
-	if (!try_module_get(ccid->ccid_owner))
-		goto out_err;
+	ccids_read_unlock();
 
-	if (ccid->ccid_init != NULL && ccid->ccid_init(sk) != 0)
+	ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
+				     ccid_ops->ccid_hc_tx_slab, gfp);
+	if (ccid == NULL)
 		goto out_module_put;
+	ccid->ccid_ops = ccid_ops;
+	if (rx) {
+		memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
+		if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
+		    ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
+			goto out_free_ccid;
+	} else {
+		memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
+		if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
+		    ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
+			goto out_free_ccid;
+	}
 out:
-	ccids_read_unlock();
 	return ccid;
-out_module_put:
-	module_put(ccid->ccid_owner);
-out_err:
+out_unlock:
+	ccids_read_unlock();
+	goto out;
+out_free_ccid:
+	kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
+			ccid_ops->ccid_hc_tx_slab, ccid);
 	ccid = NULL;
+out_module_put:
+	module_put(ccid_ops->ccid_owner);
 	goto out;
 }
 
-EXPORT_SYMBOL_GPL(ccid_init);
+EXPORT_SYMBOL_GPL(ccid_new);
 
-void ccid_exit(struct ccid *ccid, struct sock *sk)
+struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
 {
+	return ccid_new(id, sk, 1, gfp);
+}
+
+EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
+
+struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk,  gfp_t gfp)
+{
+	return ccid_new(id, sk, 0, gfp);
+}
+
+EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
+
+static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
+{
+	struct ccid_operations *ccid_ops;
+
 	if (ccid == NULL)
 		return;
 
+	ccid_ops = ccid->ccid_ops;
+	if (rx) {
+		if (ccid_ops->ccid_hc_rx_exit != NULL)
+			ccid_ops->ccid_hc_rx_exit(sk);
+		kmem_cache_free(ccid_ops->ccid_hc_rx_slab,  ccid);
+	} else {
+		if (ccid_ops->ccid_hc_tx_exit != NULL)
+			ccid_ops->ccid_hc_tx_exit(sk);
+		kmem_cache_free(ccid_ops->ccid_hc_tx_slab,  ccid);
+	}
 	ccids_read_lock();
+	if (ccids[ccid_ops->ccid_id] != NULL)
+		module_put(ccid_ops->ccid_owner);
+	ccids_read_unlock();
+}
 
-	if (ccids[ccid->ccid_id] != NULL) {
-		if (ccid->ccid_exit != NULL)
-			ccid->ccid_exit(sk);
-		module_put(ccid->ccid_owner);
-	}
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
+{
+	ccid_delete(ccid, sk, 1);
+}
 
-	ccids_read_unlock();
+EXPORT_SYMBOL_GPL(ccid_hc_rx_delete);
+
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
+{
+	ccid_delete(ccid, sk, 0);
 }
 
-EXPORT_SYMBOL_GPL(ccid_exit);
+EXPORT_SYMBOL_GPL(ccid_hc_tx_delete);
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index de681c6ad081..3dec50d49731 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -23,14 +23,16 @@
 
 struct tcp_info;
 
-struct ccid {
+struct ccid_operations {
 	unsigned char	ccid_id;
 	const char	*ccid_name;
 	struct module	*ccid_owner;
-	int		(*ccid_init)(struct sock *sk);
-	void		(*ccid_exit)(struct sock *sk);
-	int		(*ccid_hc_rx_init)(struct sock *sk);
-	int		(*ccid_hc_tx_init)(struct sock *sk);
+	kmem_cache_t	*ccid_hc_rx_slab;
+	__u32		ccid_hc_rx_obj_size;
+	kmem_cache_t	*ccid_hc_tx_slab;
+	__u32		ccid_hc_tx_obj_size;
+	int		(*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
+	int		(*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
 	void		(*ccid_hc_rx_exit)(struct sock *sk);
 	void		(*ccid_hc_tx_exit)(struct sock *sk);
 	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
@@ -67,75 +69,58 @@ struct ccid {
 						 int __user *optlen);
 };
 
-extern int	   ccid_register(struct ccid *ccid);
-extern int	   ccid_unregister(struct ccid *ccid);
+extern int ccid_register(struct ccid_operations *ccid_ops);
+extern int ccid_unregister(struct ccid_operations *ccid_ops);
 
-extern struct ccid *ccid_init(unsigned char id, struct sock *sk);
-extern void	   ccid_exit(struct ccid *ccid, struct sock *sk);
+struct ccid {
+	struct ccid_operations *ccid_ops;
+	char		       ccid_priv[0];
+};
 
-static inline void __ccid_get(struct ccid *ccid)
+static inline void *ccid_priv(const struct ccid *ccid)
 {
-	__module_get(ccid->ccid_owner);
+	return (void *)ccid->ccid_priv;
 }
 
+extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
+			     gfp_t gfp);
+
+extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
+				   gfp_t gfp);
+extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
+				   gfp_t gfp);
+
+extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
+extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+
 static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
 					 struct sk_buff *skb, int len)
 {
 	int rc = 0;
-	if (ccid->ccid_hc_tx_send_packet != NULL)
-		rc = ccid->ccid_hc_tx_send_packet(sk, skb, len);
+	if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
+		rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb, len);
 	return rc;
 }
 
 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
 					  int more, int len)
 {
-	if (ccid->ccid_hc_tx_packet_sent != NULL)
-		ccid->ccid_hc_tx_packet_sent(sk, more, len);
-}
-
-static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk)
-{
-	int rc = 0;
-	if (ccid->ccid_hc_rx_init != NULL)
-		rc = ccid->ccid_hc_rx_init(sk);
-	return rc;
-}
-
-static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk)
-{
-	int rc = 0;
-	if (ccid->ccid_hc_tx_init != NULL)
-		rc = ccid->ccid_hc_tx_init(sk);
-	return rc;
-}
-
-static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk)
-{
-	if (ccid != NULL && ccid->ccid_hc_rx_exit != NULL &&
-	    dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL)
-		ccid->ccid_hc_rx_exit(sk);
-}
-
-static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk)
-{
-	if (ccid != NULL && ccid->ccid_hc_tx_exit != NULL &&
-	    dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL)
-		ccid->ccid_hc_tx_exit(sk);
+	if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
+		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
 }
 
 static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
 					  struct sk_buff *skb)
 {
-	if (ccid->ccid_hc_rx_packet_recv != NULL)
-		ccid->ccid_hc_rx_packet_recv(sk, skb);
+	if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
+		ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
 }
 
 static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
 					  struct sk_buff *skb)
 {
-	if (ccid->ccid_hc_tx_packet_recv != NULL)
-		ccid->ccid_hc_tx_packet_recv(sk, skb);
+	if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
+		ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
 }
 
 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
@@ -144,8 +129,8 @@ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
 					   unsigned char* value)
 {
 	int rc = 0;
-	if (ccid->ccid_hc_tx_parse_options != NULL)
-		rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx,
+	if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
+		rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
 						    value);
 	return rc;
 }
@@ -156,37 +141,37 @@ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
 					   unsigned char* value)
 {
 	int rc = 0;
-	if (ccid->ccid_hc_rx_parse_options != NULL)
-		rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value);
+	if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
+		rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
 	return rc;
 }
 
 static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk,
 					     struct sk_buff *skb)
 {
-	if (ccid->ccid_hc_tx_insert_options != NULL)
-		ccid->ccid_hc_tx_insert_options(sk, skb);
+	if (ccid->ccid_ops->ccid_hc_tx_insert_options != NULL)
+		ccid->ccid_ops->ccid_hc_tx_insert_options(sk, skb);
 }
 
 static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
 					     struct sk_buff *skb)
 {
-	if (ccid->ccid_hc_rx_insert_options != NULL)
-		ccid->ccid_hc_rx_insert_options(sk, skb);
+	if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
+		ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
 }
 
 static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
 				       struct tcp_info *info)
 {
-	if (ccid->ccid_hc_rx_get_info != NULL)
-		ccid->ccid_hc_rx_get_info(sk, info);
+	if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
+		ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
 }
 
 static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
 				       struct tcp_info *info)
 {
-	if (ccid->ccid_hc_tx_get_info != NULL)
-		ccid->ccid_hc_tx_get_info(sk, info);
+	if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
+		ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
 }
 
 static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
@@ -194,8 +179,8 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
 					u32 __user *optval, int __user *optlen)
 {
 	int rc = -ENOPROTOOPT;
-	if (ccid->ccid_hc_rx_getsockopt != NULL)
-		rc = ccid->ccid_hc_rx_getsockopt(sk, optname, len,
+	if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
+		rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
 						 optval, optlen);
 	return rc;
 }
@@ -205,8 +190,8 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
 					u32 __user *optval, int __user *optlen)
 {
 	int rc = -ENOPROTOOPT;
-	if (ccid->ccid_hc_tx_getsockopt != NULL)
-		rc = ccid->ccid_hc_tx_getsockopt(sk, optname, len,
+	if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
+		rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
 						 optval, optlen);
 	return rc;
 }
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 3328d23c4be7..b40c4569a820 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -52,16 +52,6 @@ static int ccid2_debug;
 
 static const int ccid2_seq_len = 128;
 
-static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
-{
-	return dccp_sk(sk)->dccps_hc_tx_ccid_private;
-}
-
-static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
-{
-	return dccp_sk(sk)->dccps_hc_rx_ccid_private;
-}
-
 #ifdef CCID2_DEBUG
 static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx)
 {
@@ -707,19 +697,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	ccid2_hc_tx_check_sanity(hctx);
 }
 
-static int ccid2_hc_tx_init(struct sock *sk)
+static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-        struct ccid2_hc_tx_sock *hctx;
+        struct ccid2_hc_tx_sock *hctx = ccid_priv(ccid);
 	int seqcount = ccid2_seq_len;
 	int i;
 
-        dp->dccps_hc_tx_ccid_private = kzalloc(sizeof(*hctx), gfp_any());
-        if (dp->dccps_hc_tx_ccid_private == NULL)
-                return -ENOMEM;
-
-        hctx = ccid2_hc_tx_sk(sk);
-
 	/* XXX init variables with proper values */
 	hctx->ccid2hctx_cwnd	  = 1;
 	hctx->ccid2hctx_ssthresh  = 10;
@@ -728,11 +711,9 @@ static int ccid2_hc_tx_init(struct sock *sk)
 	/* XXX init ~ to window size... */
 	hctx->ccid2hctx_seqbuf = kmalloc(sizeof(*hctx->ccid2hctx_seqbuf) *
 					 seqcount, gfp_any());
-	if (hctx->ccid2hctx_seqbuf == NULL) {
-		kfree(dp->dccps_hc_tx_ccid_private);
-		dp->dccps_hc_tx_ccid_private = NULL;
+	if (hctx->ccid2hctx_seqbuf == NULL)
 		return -ENOMEM;
-	}
+
 	for (i = 0; i < (seqcount - 1); i++) {
 		hctx->ccid2hctx_seqbuf[i].ccid2s_next =
 					&hctx->ccid2hctx_seqbuf[i + 1];
@@ -763,15 +744,11 @@ static int ccid2_hc_tx_init(struct sock *sk)
 
 static void ccid2_hc_tx_exit(struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
         struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk);
 
 	ccid2_hc_tx_kill_rto_timer(sk);
-
 	kfree(hctx->ccid2hctx_seqbuf);
-
-	kfree(dp->dccps_hc_tx_ccid_private);
-	dp->dccps_hc_tx_ccid_private = NULL;
+	hctx->ccid2hctx_seqbuf = NULL;
 }
 
 static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -791,33 +768,17 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-static int ccid2_hc_rx_init(struct sock *sk)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
-        dp->dccps_hc_rx_ccid_private = kzalloc(sizeof(struct ccid2_hc_rx_sock),
-					       gfp_any());
-        return dp->dccps_hc_rx_ccid_private == NULL ? -ENOMEM : 0;
-}
-
-static void ccid2_hc_rx_exit(struct sock *sk)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
-
-	kfree(dp->dccps_hc_rx_ccid_private);
-	dp->dccps_hc_rx_ccid_private = NULL;
-}
-
-static struct ccid ccid2 = {
+static struct ccid_operations ccid2 = {
 	.ccid_id		= 2,
 	.ccid_name		= "ccid2",
 	.ccid_owner		= THIS_MODULE,
+	.ccid_hc_tx_obj_size	= sizeof(struct ccid2_hc_tx_sock),
 	.ccid_hc_tx_init	= ccid2_hc_tx_init,
 	.ccid_hc_tx_exit	= ccid2_hc_tx_exit,
 	.ccid_hc_tx_send_packet	= ccid2_hc_tx_send_packet,
 	.ccid_hc_tx_packet_sent	= ccid2_hc_tx_packet_sent,
 	.ccid_hc_tx_packet_recv	= ccid2_hc_tx_packet_recv,
-	.ccid_hc_rx_init	= ccid2_hc_rx_init,
-	.ccid_hc_rx_exit	= ccid2_hc_rx_exit,
+	.ccid_hc_rx_obj_size	= sizeof(struct ccid2_hc_rx_sock),
 	.ccid_hc_rx_packet_recv	= ccid2_hc_rx_packet_recv,
 };
 
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 0b08c90955a9..451a87464fa5 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -20,6 +20,13 @@
 #ifndef _DCCP_CCID2_H_
 #define _DCCP_CCID2_H_
 
+#include <linux/dccp.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include "../ccid.h"
+
+struct sock;
+
 struct ccid2_seq {
 	u64			ccid2s_seq;
 	unsigned long		ccid2s_sent;
@@ -66,4 +73,13 @@ struct ccid2_hc_rx_sock {
 	int	ccid2hcrx_data;
 };
 
+static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
+{
+	return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
+}
+
+static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
+{
+	return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
+}
 #endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index f9e16db09bef..0587f52e4af1 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -647,17 +647,10 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
 	return rc;
 }
 
-static int ccid3_hc_tx_init(struct sock *sk)
+static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct ccid3_hc_tx_sock *hctx;
-
-	dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any());
-	if (dp->dccps_hc_tx_ccid_private == NULL)
-		return -ENOMEM;
-
-	hctx = ccid3_hc_tx_sk(sk);
-	memset(hctx, 0, sizeof(*hctx));
+	struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid);
 
 	if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
 	    dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
@@ -680,7 +673,6 @@ static int ccid3_hc_tx_init(struct sock *sk)
 
 static void ccid3_hc_tx_exit(struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 
 	BUG_ON(hctx == NULL);
@@ -690,9 +682,6 @@ static void ccid3_hc_tx_exit(struct sock *sk)
 
 	/* Empty packet history */
 	dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
-
-	kfree(dp->dccps_hc_tx_ccid_private);
-	dp->dccps_hc_tx_ccid_private = NULL;
 }
 
 /*
@@ -1039,20 +1028,13 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-static int ccid3_hc_rx_init(struct sock *sk)
+static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct ccid3_hc_rx_sock *hcrx;
+	struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid);
 
 	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
 
-	dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any());
-	if (dp->dccps_hc_rx_ccid_private == NULL)
-		return -ENOMEM;
-
-	hcrx = ccid3_hc_rx_sk(sk);
-	memset(hcrx, 0, sizeof(*hcrx));
-
 	if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
 	    dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
 		hcrx->ccid3hcrx_s = dp->dccps_packet_size;
@@ -1071,7 +1053,6 @@ static int ccid3_hc_rx_init(struct sock *sk)
 static void ccid3_hc_rx_exit(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	struct dccp_sock *dp = dccp_sk(sk);
 
 	BUG_ON(hcrx == NULL);
 
@@ -1082,9 +1063,6 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 
 	/* Empty loss interval history */
 	dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist);
-
-	kfree(dp->dccps_hc_rx_ccid_private);
-	dp->dccps_hc_rx_ccid_private = NULL;
 }
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
@@ -1170,10 +1148,11 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 	return 0;
 }
 
-static struct ccid ccid3 = {
+static struct ccid_operations ccid3 = {
 	.ccid_id		   = 3,
 	.ccid_name		   = "ccid3",
 	.ccid_owner		   = THIS_MODULE,
+	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
 	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
 	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
 	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
@@ -1181,6 +1160,7 @@ static struct ccid ccid3 = {
 	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
 	.ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options,
 	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
+	.ccid_hc_rx_obj_size	   = sizeof(struct ccid3_hc_rx_sock),
 	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
 	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
 	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 0bde4583d091..f18b96d4e5a2 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -41,6 +41,7 @@
 #include <linux/time.h>
 #include <linux/types.h>
 #include <linux/tfrc.h>
+#include "../ccid.h"
 
 #define TFRC_MIN_PACKET_SIZE	   16
 #define TFRC_STD_PACKET_SIZE	  256
@@ -135,12 +136,12 @@ struct ccid3_hc_rx_sock {
 
 static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
 {
-    return dccp_sk(sk)->dccps_hc_tx_ccid_private;
+    return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
 }
 
 static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
 {
-    return dccp_sk(sk)->dccps_hc_rx_ccid_private;
+    return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
 }
 
 #endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 4b6d43d8b920..67691a0592af 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -324,14 +324,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		dccp_set_seqno(&dp->dccps_swl,
 			       max48(dp->dccps_swl, dp->dccps_isr));
 
-		if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 ||
-		    ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) {
-			ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
-			ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
-			/* FIXME: send appropriate RESET code */
-			goto out_invalid_packet;
-		}
-
 		dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 
 		/*
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index fcfb486f90c2..aa7708fed32e 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1058,14 +1058,16 @@ int dccp_v4_init_sock(struct sock *sk)
 			if (dp->dccps_hc_rx_ackvec == NULL)
 				return -ENOMEM;
 		}
-		dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_rx_ccid,
-						 sk);
-		dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_tx_ccid,
-						 sk);
-	    	if (dp->dccps_hc_rx_ccid == NULL ||
-		    dp->dccps_hc_tx_ccid == NULL) {
-			ccid_exit(dp->dccps_hc_rx_ccid, sk);
-			ccid_exit(dp->dccps_hc_tx_ccid, sk);
+		dp->dccps_hc_rx_ccid =
+				ccid_hc_rx_new(dp->dccps_options.dccpo_rx_ccid,
+					       sk, GFP_KERNEL);
+		dp->dccps_hc_tx_ccid =
+				ccid_hc_tx_new(dp->dccps_options.dccpo_tx_ccid,
+					       sk, GFP_KERNEL);
+	    	if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
+			     dp->dccps_hc_tx_ccid == NULL)) {
+			ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+			ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
 			if (dp->dccps_options.dccpo_send_ack_vector) {
 				dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
 				dp->dccps_hc_rx_ackvec = NULL;
@@ -1120,14 +1122,12 @@ int dccp_v4_destroy_sock(struct sock *sk)
 	kfree(dp->dccps_service_list);
 	dp->dccps_service_list = NULL;
 
-	ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
-	ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
 	if (dp->dccps_options.dccpo_send_ack_vector) {
 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
-	ccid_exit(dp->dccps_hc_rx_ccid, sk);
-	ccid_exit(dp->dccps_hc_tx_ccid, sk);
+	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
 	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
 
 	/* clean up feature negotiation state */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 9e1de5919ee5..5324fcacb34d 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -121,23 +121,21 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		if (newdp->dccps_options.dccpo_send_ack_vector) {
 			newdp->dccps_hc_rx_ackvec =
 						dccp_ackvec_alloc(GFP_ATOMIC);
-			/*
-			 * XXX: We're using the same CCIDs set on the parent,
-			 * i.e. sk_clone copied the master sock and left the
-			 * CCID pointers for this child, that is why we do the
-			 * __ccid_get calls.
-			 */
 			if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
 				goto out_free;
 		}
 
-		if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid,
-					     newsk) != 0 ||
-			     ccid_hc_tx_init(newdp->dccps_hc_tx_ccid,
-				     	     newsk) != 0)) {
+		newdp->dccps_hc_rx_ccid =
+			    ccid_hc_rx_new(newdp->dccps_options.dccpo_rx_ccid,
+					   newsk, GFP_ATOMIC);
+		newdp->dccps_hc_tx_ccid =
+			    ccid_hc_tx_new(newdp->dccps_options.dccpo_tx_ccid,
+					   newsk, GFP_ATOMIC);
+		if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
+			     newdp->dccps_hc_tx_ccid == NULL)) {
 			dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
-			ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk);
-			ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk);
+			ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk);
+			ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk);
 out_free:
 			/* It is still raw copy of parent, so invalidate
 			 * destructor and make plain sk_free() */
@@ -146,9 +144,6 @@ out_free:
 			return NULL;
 		}
 
-		__ccid_get(newdp->dccps_hc_rx_ccid);
-		__ccid_get(newdp->dccps_hc_tx_ccid);
-
 		/*
 		 * Step 3: Process LISTEN state
 		 *
-- 
cgit v1.2.3


From a193a4abdd1f742a57f3f70b6a83c3e536876e97 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 20 Mar 2006 19:23:05 -0800
Subject: [NETFILTER]: Fix skb->nf_bridge lifetime issues

The bridge netfilter code simulates the NF_IP_PRE_ROUTING hook and skips
the real hook by registering with high priority and returning NF_STOP if
skb->nf_bridge is present and the BRNF_NF_BRIDGE_PREROUTING flag is not
set. The flag is only set during the simulated hook.

Because skb->nf_bridge is only freed when the packet is destroyed, the
packet will not only skip the first invocation of NF_IP_PRE_ROUTING, but
in the case of tunnel devices on top of the bridge also all further ones.
Forwarded packets from a bridge encapsulated by a tunnel device and sent
as locally outgoing packet will also still have the incorrect bridge
information from the input path attached.

We already have nf_reset calls on all RX/TX paths of tunnel devices,
so simply reset the nf_bridge field there too. As an added bonus,
the bridge information for locally delivered packets is now also freed
when the packet is queued to a socket.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 24 ++++++++++++++----------
 net/ipv4/netfilter/ipt_REJECT.c |  4 ----
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 838ce0fdcef7..1a2611030d36 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1351,16 +1351,6 @@ static inline void nf_conntrack_put_reasm(struct sk_buff *skb)
 		kfree_skb(skb);
 }
 #endif
-static inline void nf_reset(struct sk_buff *skb)
-{
-	nf_conntrack_put(skb->nfct);
-	skb->nfct = NULL;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-	nf_conntrack_put_reasm(skb->nfct_reasm);
-	skb->nfct_reasm = NULL;
-#endif
-}
-
 #ifdef CONFIG_BRIDGE_NETFILTER
 static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
 {
@@ -1373,6 +1363,20 @@ static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge)
 		atomic_inc(&nf_bridge->use);
 }
 #endif /* CONFIG_BRIDGE_NETFILTER */
+static inline void nf_reset(struct sk_buff *skb)
+{
+	nf_conntrack_put(skb->nfct);
+	skb->nfct = NULL;
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	nf_conntrack_put_reasm(skb->nfct_reasm);
+	skb->nfct_reasm = NULL;
+#endif
+#ifdef CONFIG_BRIDGE_NETFILTER
+	nf_bridge_put(skb->nf_bridge);
+	skb->nf_bridge = NULL;
+#endif
+}
+
 #else /* CONFIG_NETFILTER */
 static inline void nf_reset(struct sk_buff *skb) {}
 #endif /* CONFIG_NETFILTER */
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 26ea6c19f5bd..9d3b3579f27c 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -154,10 +154,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	/* This packet will not be the same as the other: clear nf fields */
 	nf_reset(nskb);
 	nskb->nfmark = 0;
-#ifdef CONFIG_BRIDGE_NETFILTER
-	nf_bridge_put(nskb->nf_bridge);
-	nskb->nf_bridge = NULL;
-#endif
 
 	tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
 
-- 
cgit v1.2.3


From 60fe62e789076ae7c13f7ffb35fec4b24802530d Mon Sep 17 00:00:00 2001
From: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Date: Mon, 20 Mar 2006 19:23:32 -0800
Subject: [DCCP]: sparse endianness annotations

This also fixes the layout of dccp_hdr short sequence numbers, problem
was not fatal now as we only support long (48 bits) sequence numbers.

Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h   | 52 ++++++++++++++++++++------------------------------
 net/dccp/ccids/ccid3.c |  6 +++---
 net/dccp/dccp.h        | 23 ++++++----------------
 net/dccp/ipv4.c        | 12 ++++++------
 net/dccp/ipv6.c        |  4 ++--
 net/dccp/options.c     | 27 +++++++++++++-------------
 net/dccp/proto.c       |  6 +++---
 7 files changed, 54 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index bdd756cc60b1..496dbad8e896 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -18,7 +18,7 @@
  * @dccph_seq - sequence number high or low order 24 bits, depends on dccph_x
  */
 struct dccp_hdr {
-	__u16	dccph_sport,
+	__be16	dccph_sport,
 		dccph_dport;
 	__u8	dccph_doff;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
@@ -32,18 +32,18 @@ struct dccp_hdr {
 #endif
 	__u16	dccph_checksum;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u32	dccph_x:1,
+	__u8	dccph_x:1,
 		dccph_type:4,
-		dccph_reserved:3,
-		dccph_seq:24;
+		dccph_reserved:3;
 #elif defined(__BIG_ENDIAN_BITFIELD)
-	__u32	dccph_reserved:3,
+	__u8	dccph_reserved:3,
 		dccph_type:4,
-		dccph_x:1,
-		dccph_seq:24;
+		dccph_x:1;
 #else
 #error  "Adjust your <asm/byteorder.h> defines"
 #endif
+	__u8	dccph_seq2;
+	__be16	dccph_seq;
 };
 
 /**
@@ -52,7 +52,7 @@ struct dccp_hdr {
  * @dccph_seq_low - low 24 bits of a 48 bit seq packet
  */
 struct dccp_hdr_ext {
-	__u32	dccph_seq_low;
+	__be32	dccph_seq_low;
 };
 
 /**
@@ -62,7 +62,7 @@ struct dccp_hdr_ext {
  * @dccph_req_options - list of options (must be a multiple of 32 bits
  */
 struct dccp_hdr_request {
-	__u32	dccph_req_service;
+	__be32	dccph_req_service;
 };
 /**
  * struct dccp_hdr_ack_bits - acknowledgment bits common to most packets
@@ -71,9 +71,9 @@ struct dccp_hdr_request {
  * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR
  */
 struct dccp_hdr_ack_bits {
-	__u32	dccph_reserved1:8,
-		dccph_ack_nr_high:24;
-	__u32	dccph_ack_nr_low;
+	__be16	dccph_reserved1;
+	__be16	dccph_ack_nr_high;
+	__be32	dccph_ack_nr_low;
 };
 /**
  * struct dccp_hdr_response - Conection initiation response header
@@ -85,7 +85,7 @@ struct dccp_hdr_ack_bits {
  */
 struct dccp_hdr_response {
 	struct dccp_hdr_ack_bits	dccph_resp_ack;
-	__u32				dccph_resp_service;
+	__be32				dccph_resp_service;
 };
 
 /**
@@ -269,16 +269,12 @@ static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb)
 static inline __u64 dccp_hdr_seq(const struct sk_buff *skb)
 {
 	const struct dccp_hdr *dh = dccp_hdr(skb);
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	__u64 seq_nr = ntohl(dh->dccph_seq << 8);
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	__u64 seq_nr = ntohl(dh->dccph_seq);
-#else
-#error  "Adjust your <asm/byteorder.h> defines"
-#endif
+	__u64 seq_nr =  ntohs(dh->dccph_seq);
 
 	if (dh->dccph_x != 0)
 		seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(skb)->dccph_seq_low);
+	else
+		seq_nr += (u32)dh->dccph_seq2 << 16;
 
 	return seq_nr;
 }
@@ -296,13 +292,7 @@ static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *
 static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb)
 {
 	const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb);
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	return (((u64)ntohl(dhack->dccph_ack_nr_high << 8)) << 32) + ntohl(dhack->dccph_ack_nr_low);
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	return (((u64)ntohl(dhack->dccph_ack_nr_high)) << 32) + ntohl(dhack->dccph_ack_nr_low);
-#else
-#error  "Adjust your <asm/byteorder.h> defines"
-#endif
+	return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low);
 }
 
 static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb)
@@ -387,7 +377,7 @@ struct dccp_request_sock {
 	struct inet_request_sock dreq_inet_rsk;
 	__u64			 dreq_iss;
 	__u64			 dreq_isr;
-	__u32			 dreq_service;
+	__be32			 dreq_service;
 };
 
 static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req)
@@ -415,13 +405,13 @@ enum dccp_role {
 
 struct dccp_service_list {
 	__u32	dccpsl_nr;
-	__u32	dccpsl_list[0];
+	__be32	dccpsl_list[0];
 };
 
 #define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1)
 
 static inline int dccp_list_has_service(const struct dccp_service_list *sl,
-					const u32 service)
+					const __be32 service)
 {
 	if (likely(sl != NULL)) {
 		u32 i = sl->dccpsl_nr;
@@ -467,7 +457,7 @@ struct dccp_sock {
 	__u64				dccps_gss;
 	__u64				dccps_gsr;
 	__u64				dccps_gar;
-	__u32				dccps_service;
+	__be32				dccps_service;
 	struct dccp_service_list	*dccps_service_list;
 	struct timeval			dccps_timestamp_time;
 	__u32				dccps_timestamp_echo;
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 0587f52e4af1..86201631fb6e 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -615,7 +615,7 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
 				       __FUNCTION__, dccp_role(sk), sk);
 			rc = -EINVAL;
 		} else {
-			opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value);
+			opt_recv->ccid3or_loss_event_rate = ntohl(*(__be32 *)value);
 			ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
 				       dccp_role(sk), sk,
 				       opt_recv->ccid3or_loss_event_rate);
@@ -636,7 +636,7 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
 				       __FUNCTION__, dccp_role(sk), sk);
 			rc = -EINVAL;
 		} else {
-			opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value);
+			opt_recv->ccid3or_receive_rate = ntohl(*(__be32 *)value);
 			ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
 				       dccp_role(sk), sk,
 				       opt_recv->ccid3or_receive_rate);
@@ -777,7 +777,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk)
 static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 {
 	const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	u32 x_recv, pinv;
+	__be32 x_recv, pinv;
 
 	BUG_ON(hcrx == NULL);
 
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 93f26dd6e6cb..1764adb4f15e 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -262,7 +262,7 @@ extern int	   dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
 				   int addr_len);
 
 extern int	   dccp_v4_checksum(const struct sk_buff *skb,
-				    const u32 saddr, const u32 daddr);
+				    const __be32 saddr, const __be32 daddr);
 
 extern int	   dccp_v4_send_reset(struct sock *sk,
 				      enum dccp_reset_codes code);
@@ -270,7 +270,7 @@ extern void	   dccp_send_close(struct sock *sk, const int active);
 extern int	   dccp_invalid_packet(struct sk_buff *skb);
 
 static inline int dccp_bad_service_code(const struct sock *sk,
-					const __u32 service)
+					const __be32 service)
 {
 	const struct dccp_sock *dp = dccp_sk(sk);
 
@@ -334,27 +334,16 @@ static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
 {
 	struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
 							   sizeof(*dh));
-
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	dh->dccph_seq	   = htonl((gss >> 32)) >> 8;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	dh->dccph_seq	   = htonl((gss >> 32));
-#else
-#error  "Adjust your <asm/byteorder.h> defines"
-#endif
+	dh->dccph_seq2 = 0;
+	dh->dccph_seq = htons((gss >> 32) & 0xfffff);
 	dhx->dccph_seq_low = htonl(gss & 0xffffffff);
 }
 
 static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
 				    const u64 gsr)
 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8;
-#elif defined(__BIG_ENDIAN_BITFIELD)
-	dhack->dccph_ack_nr_high = htonl((gsr >> 32));
-#else
-#error  "Adjust your <asm/byteorder.h> defines"
-#endif
+	dhack->dccph_reserved1 = 0;
+	dhack->dccph_ack_nr_high = htons(gsr >> 32);
 	dhack->dccph_ack_nr_low  = htonl(gsr & 0xffffffff);
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index aa7708fed32e..be5ce57b8046 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -498,9 +498,9 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct dccp_sock dp;
 	struct request_sock *req;
 	struct dccp_request_sock *dreq;
-	const __u32 saddr = skb->nh.iph->saddr;
-	const __u32 daddr = skb->nh.iph->daddr;
- 	const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
+	const __be32 saddr = skb->nh.iph->saddr;
+	const __be32 daddr = skb->nh.iph->daddr;
+ 	const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
 	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
 	__u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
 
@@ -662,8 +662,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	return sk;
 }
 
-int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr,
-		     const u32 daddr)
+int dccp_v4_checksum(const struct sk_buff *skb, const __be32 saddr,
+		     const __be32 daddr)
 {
 	const struct dccp_hdr* dh = dccp_hdr(skb);
 	int checksum_len;
@@ -683,7 +683,7 @@ int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr,
 }
 
 static int dccp_v4_verify_checksum(struct sk_buff *skb,
-				   const u32 saddr, const u32 daddr)
+				   const __be32 saddr, const __be32 daddr)
 {
 	struct dccp_hdr *dh = dccp_hdr(skb);
 	int checksum_len;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 80c4d048869e..ad5a1c66362d 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -264,7 +264,7 @@ failure:
 }
 
 static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-			int type, int code, int offset, __u32 info)
+			int type, int code, int offset, __be32 info)
 {
 	struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
 	const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
@@ -678,7 +678,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct dccp_request_sock *dreq;
 	struct inet6_request_sock *ireq6;
 	struct ipv6_pinfo *np = inet6_sk(sk);
- 	const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
+ 	const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
 	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
 	__u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
 
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 7f99306c8e99..7d73b33a6043 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -155,7 +155,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 			if (len != 4)
 				goto out_invalid_option;
 
-			opt_recv->dccpor_timestamp = ntohl(*(u32 *)value);
+			opt_recv->dccpor_timestamp = ntohl(*(__be32 *)value);
 
 			dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
 			dccp_timestamp(sk, &dp->dccps_timestamp_time);
@@ -169,7 +169,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 			if (len != 4 && len != 6 && len != 8)
 				goto out_invalid_option;
 
-			opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value);
+			opt_recv->dccpor_timestamp_echo = ntohl(*(__be32 *)value);
 
 			dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ",
 				      debug_prefix,
@@ -183,9 +183,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 				break;
 
 			if (len == 6)
-				elapsed_time = ntohs(*(u16 *)(value + 4));
+				elapsed_time = ntohs(*(__be16 *)(value + 4));
 			else
-				elapsed_time = ntohl(*(u32 *)(value + 4));
+				elapsed_time = ntohl(*(__be32 *)(value + 4));
 
 			/* Give precedence to the biggest ELAPSED_TIME */
 			if (elapsed_time > opt_recv->dccpor_elapsed_time)
@@ -199,9 +199,9 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 				continue;
 
 			if (len == 2)
-				elapsed_time = ntohs(*(u16 *)value);
+				elapsed_time = ntohs(*(__be16 *)value);
 			else
-				elapsed_time = ntohl(*(u32 *)value);
+				elapsed_time = ntohl(*(__be32 *)value);
 
 			if (elapsed_time > opt_recv->dccpor_elapsed_time)
 				opt_recv->dccpor_elapsed_time = elapsed_time;
@@ -358,10 +358,10 @@ void dccp_insert_option_elapsed_time(struct sock *sk,
 	*to++ = len;
 
 	if (elapsed_time_len == 2) {
-		const u16 var16 = htons((u16)elapsed_time);
+		const __be16 var16 = htons((u16)elapsed_time);
 		memcpy(to, &var16, 2);
 	} else {
-		const u32 var32 = htonl(elapsed_time);
+		const __be32 var32 = htonl(elapsed_time);
 		memcpy(to, &var32, 4);
 	}
 
@@ -392,14 +392,13 @@ EXPORT_SYMBOL_GPL(dccp_timestamp);
 void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
 {
 	struct timeval tv;
-	u32 now;
+	__be32 now;
 
 	dccp_timestamp(sk, &tv);
-	now = timeval_usecs(&tv) / 10;
+	now = htonl(timeval_usecs(&tv) / 10);
 	/* yes this will overflow but that is the point as we want a
 	 * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
 
-	now = htonl(now);
 	dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
 }
 
@@ -414,7 +413,7 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk,
 					"CLIENT TX opt: " : "server TX opt: ";
 #endif
 	struct timeval now;
-	u32 tstamp_echo;
+	__be32 tstamp_echo;
 	u32 elapsed_time;
 	int len, elapsed_time_len;
 	unsigned char *to;
@@ -441,10 +440,10 @@ static void dccp_insert_option_timestamp_echo(struct sock *sk,
 	to += 4;
 
 	if (elapsed_time_len == 2) {
-		const u16 var16 = htons((u16)elapsed_time);
+		const __be16 var16 = htons((u16)elapsed_time);
 		memcpy(to, &var16, 2);
 	} else if (elapsed_time_len == 4) {
-		const u32 var32 = htonl(elapsed_time);
+		const __be32 var32 = htonl(elapsed_time);
 		memcpy(to, &var32, 4);
 	}
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 1a8cf8ecfe63..53735ee2bbd1 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -221,7 +221,7 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 EXPORT_SYMBOL_GPL(dccp_ioctl);
 
-static int dccp_setsockopt_service(struct sock *sk, const u32 service,
+static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
 				   char __user *optval, int optlen)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -349,7 +349,7 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
 EXPORT_SYMBOL_GPL(dccp_setsockopt);
 
 static int dccp_getsockopt_service(struct sock *sk, int len,
-				   u32 __user *optval,
+				   __be32 __user *optval,
 				   int __user *optlen)
 {
 	const struct dccp_sock *dp = dccp_sk(sk);
@@ -404,7 +404,7 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
 		break;
 	case DCCP_SOCKOPT_SERVICE:
 		return dccp_getsockopt_service(sk, len,
-					       (u32 __user *)optval, optlen);
+					       (__be32 __user *)optval, optlen);
 	case 128 ... 191:
 		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
 					     len, (u32 __user *)optval, optlen);
-- 
cgit v1.2.3


From 93ce20928f6e197707add8f670ae0cd029107e8f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Mon, 20 Mar 2006 19:23:58 -0800
Subject: [DCCP]: Make CCID2 be the default

As per the draft. This fixes the build when netfilter dccp components
are built and dccp isn't. Thanks to Reuben Farrelly for reporting
this.

The following changesets will introduce /proc/sys/net/dccp/defaults/
to give more flexibility to DCCP developers and testers while apps
doesn't use setsockopt to specify the desired CCID, etc.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 496dbad8e896..e35f680f909b 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -320,17 +320,8 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
 /* initial values for each feature */
 #define DCCPF_INITIAL_SEQUENCE_WINDOW		100
 #define DCCPF_INITIAL_ACK_RATIO			2
-
-#if defined(CONFIG_IP_DCCP_CCID2) || defined(CONFIG_IP_DCCP_CCID2_MODULE)
 #define DCCPF_INITIAL_CCID			2
 #define DCCPF_INITIAL_SEND_ACK_VECTOR		1
-#elif defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
-#define DCCPF_INITIAL_CCID			3
-#define DCCPF_INITIAL_SEND_ACK_VECTOR		0
-#else
-#error  "At least one CCID must be built as the default"
-#endif
-
 /* FIXME: for now we're default to 1 but it should really be 0 */
 #define DCCPF_INITIAL_SEND_NDP_COUNT		1
 
-- 
cgit v1.2.3


From e55d912f5b75723159348a7fc7692f869a86636a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Mon, 20 Mar 2006 19:25:02 -0800
Subject: [DCCP] feat: Introduce sysctls for the default features

[root@qemu ~]# for a in /proc/sys/net/dccp/default/* ; do echo $a ; cat $a ; done
/proc/sys/net/dccp/default/ack_ratio
2
/proc/sys/net/dccp/default/rx_ccid
3
/proc/sys/net/dccp/default/send_ackvec
1
/proc/sys/net/dccp/default/send_ndp
1
/proc/sys/net/dccp/default/seq_window
100
/proc/sys/net/dccp/default/tx_ccid
3
[root@qemu ~]#

So if wanting to test ccid3 as the tx CCID one can just do:

[root@qemu ~]# echo 3 > /proc/sys/net/dccp/default/tx_ccid
[root@qemu ~]# echo 2 > /proc/sys/net/dccp/default/rx_ccid
[root@qemu ~]# cat /proc/sys/net/dccp/default/[tr]x_ccid
2
3
[root@qemu ~]#

Of course we also need the setsockopt for each app to tell its preferences, but
for testing or defining something other than CCID2 as the default for apps that
don't explicitely set their preference the sysctl interface is handy.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h |  16 +++++++
 net/dccp/Makefile      |   2 +
 net/dccp/dccp.h        |  14 ++++++
 net/dccp/options.c     |  22 +++++----
 net/dccp/proto.c       |   9 +++-
 net/dccp/sysctl.c      | 124 +++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 176 insertions(+), 11 deletions(-)
 create mode 100644 net/dccp/sysctl.c

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b686548f32e0..dfcf449afc7c 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -211,6 +211,7 @@ enum
 	NET_SCTP=17,
 	NET_LLC=18,
 	NET_NETFILTER=19,
+	NET_DCCP=20,
 };
 
 /* /proc/sys/kernel/random */
@@ -571,6 +572,21 @@ enum {
 	__NET_NEIGH_MAX
 };
 
+/* /proc/sys/net/dccp */
+enum {
+	NET_DCCP_DEFAULT=1,
+};
+
+/* /proc/sys/net/dccp/default */
+enum {
+	NET_DCCP_DEFAULT_SEQ_WINDOW  = 1,
+	NET_DCCP_DEFAULT_RX_CCID     = 2,
+	NET_DCCP_DEFAULT_TX_CCID     = 3,
+	NET_DCCP_DEFAULT_ACK_RATIO   = 4,
+	NET_DCCP_DEFAULT_SEND_ACKVEC = 5,
+	NET_DCCP_DEFAULT_SEND_NDP    = 6,
+};
+
 /* /proc/sys/net/ipx */
 enum {
 	NET_IPX_PPROP_BROADCASTING=1,
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 5736acea1c86..7af0569fe4cb 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -11,6 +11,8 @@ dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
 
 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
 
+dccp-$(CONFIG_SYSCTL) += sysctl.o
+
 dccp_diag-y := diag.o
 
 obj-y += ccids/
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 1764adb4f15e..f059541f5a1d 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -433,4 +433,18 @@ static inline void timeval_sub_usecs(struct timeval *tv,
 	}
 }
 
+#ifdef CONFIG_SYSCTL
+extern int dccp_sysctl_init(void);
+extern void dccp_sysctl_exit(void);
+#else
+static inline int dccp_sysctl_init(void)
+{
+	return 0;
+}
+
+static inline void dccp_sysctl_exit(void)
+{
+}
+#endif
+
 #endif /* _DCCP_H */
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 7d73b33a6043..3ecd319c0f59 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -23,19 +23,21 @@
 #include "dccp.h"
 #include "feat.h"
 
-/* stores the default values for new connection. may be changed with sysctl */
-static const struct dccp_options dccpo_default_values = {
-	.dccpo_sequence_window	  = DCCPF_INITIAL_SEQUENCE_WINDOW,
-	.dccpo_rx_ccid		  = DCCPF_INITIAL_CCID,
-	.dccpo_tx_ccid		  = DCCPF_INITIAL_CCID,
-	.dccpo_ack_ratio	  = DCCPF_INITIAL_ACK_RATIO,
-	.dccpo_send_ack_vector	  = DCCPF_INITIAL_SEND_ACK_VECTOR,
-	.dccpo_send_ndp_count	  = DCCPF_INITIAL_SEND_NDP_COUNT,
-};
+int dccp_feat_default_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
+int dccp_feat_default_rx_ccid	      = DCCPF_INITIAL_CCID;
+int dccp_feat_default_tx_ccid	      = DCCPF_INITIAL_CCID;
+int dccp_feat_default_ack_ratio	      = DCCPF_INITIAL_ACK_RATIO;
+int dccp_feat_default_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
+int dccp_feat_default_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
 
 void dccp_options_init(struct dccp_options *dccpo)
 {
-	memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo));
+	dccpo->dccpo_sequence_window = dccp_feat_default_sequence_window;
+	dccpo->dccpo_rx_ccid	     = dccp_feat_default_rx_ccid;
+	dccpo->dccpo_tx_ccid	     = dccp_feat_default_tx_ccid;
+	dccpo->dccpo_ack_ratio	     = dccp_feat_default_ack_ratio;
+	dccpo->dccpo_send_ack_vector = dccp_feat_default_send_ack_vector;
+	dccpo->dccpo_send_ndp_count  = dccp_feat_default_send_ndp_count;
 }
 
 static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 53735ee2bbd1..6403e9306ddb 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -934,11 +934,17 @@ static int __init dccp_init(void)
 	if (rc)
 		goto out_unregister_protosw;
 
-	rc = dccp_ctl_sock_init();
+	rc = dccp_sysctl_init();
 	if (rc)
 		goto out_ackvec_exit;
+
+	rc = dccp_ctl_sock_init();
+	if (rc)
+		goto out_sysctl_exit;
 out:
 	return rc;
+out_sysctl_exit:
+	dccp_sysctl_exit();
 out_ackvec_exit:
 	dccp_ackvec_exit();
 out_unregister_protosw:
@@ -983,6 +989,7 @@ static void __exit dccp_fini(void)
 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 	proto_unregister(&dccp_prot);
 	dccp_ackvec_exit();
+	dccp_sysctl_exit();
 }
 
 module_init(dccp_init);
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
new file mode 100644
index 000000000000..64c89e9c229e
--- /dev/null
+++ b/net/dccp/sysctl.c
@@ -0,0 +1,124 @@
+/*
+ *  net/dccp/sysctl.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License v2
+ *	as published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+extern int dccp_feat_default_sequence_window;
+extern int dccp_feat_default_rx_ccid;
+extern int dccp_feat_default_tx_ccid;
+extern int dccp_feat_default_ack_ratio;
+extern int dccp_feat_default_send_ack_vector;
+extern int dccp_feat_default_send_ndp_count;
+
+static struct ctl_table dccp_default_table[] = {
+	{
+		.ctl_name	= NET_DCCP_DEFAULT_SEQ_WINDOW,
+		.procname	= "seq_window",
+		.data		= &dccp_feat_default_sequence_window,
+		.maxlen		= sizeof(dccp_feat_default_sequence_window),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_DCCP_DEFAULT_RX_CCID,
+		.procname	= "rx_ccid",
+		.data		= &dccp_feat_default_rx_ccid,
+		.maxlen		= sizeof(dccp_feat_default_rx_ccid),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_DCCP_DEFAULT_TX_CCID,
+		.procname	= "tx_ccid",
+		.data		= &dccp_feat_default_tx_ccid,
+		.maxlen		= sizeof(dccp_feat_default_tx_ccid),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_DCCP_DEFAULT_ACK_RATIO,
+		.procname	= "ack_ratio",
+		.data		= &dccp_feat_default_ack_ratio,
+		.maxlen		= sizeof(dccp_feat_default_ack_ratio),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_DCCP_DEFAULT_SEND_ACKVEC,
+		.procname	= "send_ackvec",
+		.data		= &dccp_feat_default_send_ack_vector,
+		.maxlen		= sizeof(dccp_feat_default_send_ack_vector),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_DCCP_DEFAULT_SEND_NDP,
+		.procname	= "send_ndp",
+		.data		= &dccp_feat_default_send_ndp_count,
+		.maxlen		= sizeof(dccp_feat_default_send_ndp_count),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ .ctl_name = 0, }
+};
+
+static struct ctl_table dccp_table[] = {
+	{
+		.ctl_name	= NET_DCCP_DEFAULT,
+		.procname	= "default",
+		.mode		= 0555,
+		.child		= dccp_default_table,
+	},
+	{ .ctl_name = 0, },
+};
+
+static struct ctl_table dccp_dir_table[] = {
+	{
+		.ctl_name	= NET_DCCP,
+		.procname	= "dccp",
+		.mode		= 0555,
+		.child		= dccp_table,
+	},
+	{ .ctl_name = 0, },
+};
+
+static struct ctl_table dccp_root_table[] = {
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net",
+		.mode		= 0555,
+		.child		= dccp_dir_table,
+	},
+	{ .ctl_name = 0, },
+};
+
+static struct ctl_table_header *dccp_table_header;
+
+int __init dccp_sysctl_init(void)
+{
+	dccp_table_header = register_sysctl_table(dccp_root_table, 1);
+
+	return dccp_table_header != NULL ? 0 : -ENOMEM;
+}
+
+void dccp_sysctl_exit(void)
+{
+	if (dccp_table_header != NULL) {
+		unregister_sysctl_table(dccp_table_header);
+		dccp_table_header = NULL;
+	}
+}
-- 
cgit v1.2.3


From eaa82edf20d738a7ae31f4b0a5f72f64c14a58df Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Mon, 20 Mar 2006 23:24:04 -0500
Subject: SUNRPC,RPCSEC_GSS: fix krb5 sequence numbers.

Use a spinlock to ensure unique sequence numbers when creating krb5 gss tokens.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/gss_krb5.h     |  2 ++
 net/sunrpc/auth_gss/gss_krb5_seal.c | 11 ++++++++---
 net/sunrpc/auth_gss/gss_krb5_wrap.c |  9 ++++++---
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h
index 2c3601d31045..1279280d7196 100644
--- a/include/linux/sunrpc/gss_krb5.h
+++ b/include/linux/sunrpc/gss_krb5.h
@@ -53,6 +53,8 @@ struct krb5_ctx {
 	struct xdr_netobj	mech_used;
 };
 
+extern spinlock_t krb5_seq_lock;
+
 #define KG_TOK_MIC_MSG    0x0101
 #define KG_TOK_WRAP_MSG   0x0201
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 58f9721980e2..f43311221a72 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,6 +70,8 @@
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
+spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+
 u32
 gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
 		struct xdr_netobj *token)
@@ -80,6 +82,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
 	struct xdr_netobj	md5cksum = {.len = 0, .data = cksumdata};
 	unsigned char		*ptr, *krb5_hdr, *msg_start;
 	s32			now;
+	u32			seq_send;
 
 	dprintk("RPC:     gss_krb5_seal\n");
 
@@ -134,12 +137,14 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
 		BUG();
 	}
 
+	spin_lock(&krb5_seq_lock);
+	seq_send = ctx->seq_send++;
+	spin_unlock(&krb5_seq_lock);
+
 	if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
-			       ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+			       seq_send, krb5_hdr + 16, krb5_hdr + 8)))
 		goto out_err;
 
-	ctx->seq_send++;
-
 	return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
 out_err:
 	return GSS_S_FAILURE;
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 346133e446cb..89d1f3e14128 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -128,6 +128,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
 	s32			now;
 	int			headlen;
 	struct page		**tmp_pages;
+	u32			seq_send;
 
 	dprintk("RPC:     gss_wrap_kerberos\n");
 
@@ -206,18 +207,20 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
 		BUG();
 	}
 
+	spin_lock(&krb5_seq_lock);
+	seq_send = kctx->seq_send++;
+	spin_unlock(&krb5_seq_lock);
+
 	/* XXX would probably be more efficient to compute checksum
 	 * and encrypt at the same time: */
 	if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
-			       kctx->seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+			       seq_send, krb5_hdr + 16, krb5_hdr + 8)))
 		goto out_err;
 
 	if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
 									pages))
 		goto out_err;
 
-	kctx->seq_send++;
-
 	return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
 out_err:
 	return GSS_S_FAILURE;
-- 
cgit v1.2.3


From f3ee439f43381e45b191cf721b4a51d41f33301f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Mon, 20 Mar 2006 23:24:13 -0500
Subject: LOCKD: nlmsvc_traverse_blocks return is unused

Note that we never return non-zero.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svclock.c          | 3 +--
 fs/lockd/svcsubs.c          | 5 +++--
 include/linux/lockd/lockd.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ce754efe2841..d2b66bad7d50 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -309,14 +309,13 @@ restart:
  * Loop over all blocks and perform the action specified.
  * (NLM_ACT_CHECK handled by nlmsvc_inspect_file).
  */
-int
+void
 nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
 {
 	if (action == NLM_ACT_MARK)
 		nlmsvc_act_mark(host, file);
 	else
 		nlmsvc_act_unlock(host, file);
-	return 0;
 }
 
 /*
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 601e5b3dfe20..2043011a1a2d 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -224,8 +224,9 @@ nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, int action)
 		if (file->f_count || file->f_blocks || file->f_shares)
 			return 1;
 	} else {
-		if (nlmsvc_traverse_blocks(host, file, action)
-		 || nlmsvc_traverse_shares(host, file, action))
+		nlmsvc_traverse_blocks(host, file, action);
+
+		if (nlmsvc_traverse_shares(host, file, action))
 			return 1;
 	}
 	return nlm_traverse_locks(host, file, action);
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index a04137d0c5de..995f89dc8c04 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -183,7 +183,7 @@ u32		  nlmsvc_testlock(struct nlm_file *, struct nlm_lock *,
 					struct nlm_lock *);
 u32		  nlmsvc_cancel_blocked(struct nlm_file *, struct nlm_lock *);
 unsigned long	  nlmsvc_retry_blocked(void);
-int		  nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
+void		  nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *,
 					int action);
 void	  nlmsvc_grant_reply(struct svc_rqst *, struct nlm_cookie *, u32);
 
-- 
cgit v1.2.3


From 5f12191bc000ea31970339a5f54c11087506711c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Mon, 20 Mar 2006 23:24:25 -0500
Subject: LOCKD: Make nlmsvc_traverse_shares return void

The nlmsvc_traverse_shares return value is always zero, hence useless.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svcshare.c         | 4 +---
 fs/lockd/svcsubs.c          | 4 +---
 include/linux/lockd/share.h | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c
index 4943fb7836ce..27288c83da96 100644
--- a/fs/lockd/svcshare.c
+++ b/fs/lockd/svcshare.c
@@ -88,7 +88,7 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
  * Traverse all shares for a given file (and host).
  * NLM_ACT_CHECK is handled by nlmsvc_inspect_file.
  */
-int
+void
 nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, int action)
 {
 	struct nlm_share	*share, **shpp;
@@ -106,6 +106,4 @@ nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, int action)
 		}
 		shpp = &share->s_next;
 	}
-
-	return 0;
 }
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 2043011a1a2d..c7a6e3ae44d6 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -225,9 +225,7 @@ nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, int action)
 			return 1;
 	} else {
 		nlmsvc_traverse_blocks(host, file, action);
-
-		if (nlmsvc_traverse_shares(host, file, action))
-			return 1;
+		nlmsvc_traverse_shares(host, file, action);
 	}
 	return nlm_traverse_locks(host, file, action);
 }
diff --git a/include/linux/lockd/share.h b/include/linux/lockd/share.h
index 5d8aa325f140..c75a424ebe4c 100644
--- a/include/linux/lockd/share.h
+++ b/include/linux/lockd/share.h
@@ -25,6 +25,6 @@ u32	nlmsvc_share_file(struct nlm_host *, struct nlm_file *,
 					       struct nlm_args *);
 u32	nlmsvc_unshare_file(struct nlm_host *, struct nlm_file *,
 					       struct nlm_args *);
-int	nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, int);
+void	nlmsvc_traverse_shares(struct nlm_host *, struct nlm_file *, int);
 
 #endif /* LINUX_LOCKD_SHARE_H */
-- 
cgit v1.2.3


From 4bf07ef3fd5db2df7d1899fcf9c67d2263ead2e2 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Mon, 20 Mar 2006 21:25:50 -0800
Subject: [XFRM]: Rearrange struct xfrm_aevent_id for better compatibility.

struct xfrm_aevent_id needs to be 32-bit + 64-bit align friendly.

Based upon suggestions from Yoshifuji.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index b54a12940ef6..6b42cc474c01 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -259,8 +259,8 @@ struct xfrm_usersa_id {
 };
 
 struct xfrm_aevent_id {
-	__u32				flags;
 	struct xfrm_usersa_id		sa_id;
+	__u32				flags;
 };
 
 struct xfrm_userspi_info {
-- 
cgit v1.2.3


From 231d06ae826664b83369166449144304859a62fa Mon Sep 17 00:00:00 2001
From: J�rn Engel <joern@wohnheim.fh-wedel.de>
Date: Mon, 20 Mar 2006 21:28:35 -0800
Subject: [NET]: Uninline kfree_skb and allow NULL argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

o Uninline kfree_skb, which saves some 15k of object code on my notebook.

o Allow kfree_skb to be called with a NULL argument.

  Subsequent patches can remove conditional from drivers and further
  reduce source and object size.

Signed-off-by: J�rn Engel <joern@wohnheim.fh-wedel.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 17 +----------------
 net/core/skbuff.c      | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1a2611030d36..75c963103b9f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -304,6 +304,7 @@ struct sk_buff {
 
 #include <asm/system.h>
 
+extern void kfree_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
 				   gfp_t priority, int fclone);
@@ -403,22 +404,6 @@ static inline struct sk_buff *skb_get(struct sk_buff *skb)
  * atomic change.
  */
 
-/**
- *	kfree_skb - free an sk_buff
- *	@skb: buffer to free
- *
- *	Drop a reference to the buffer and free it if the usage count has
- *	hit zero.
- */
-static inline void kfree_skb(struct sk_buff *skb)
-{
-	if (likely(atomic_read(&skb->users) == 1))
-		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
-		return;
-	__kfree_skb(skb);
-}
-
 /**
  *	skb_cloned - is the buffer a clone
  *	@skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2144952d1c6c..01abf1e8990b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -355,6 +355,24 @@ void __kfree_skb(struct sk_buff *skb)
 	kfree_skbmem(skb);
 }
 
+/**
+ *	kfree_skb - free an sk_buff
+ *	@skb: buffer to free
+ *
+ *	Drop a reference to the buffer and free it if the usage count has
+ *	hit zero.
+ */
+void kfree_skb(struct sk_buff *skb)
+{
+	if (unlikely(!skb))
+		return;
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return;
+	__kfree_skb(skb);
+}
+
 /**
  *	skb_clone	-	duplicate an sk_buff
  *	@skb: buffer to clone
@@ -1799,6 +1817,7 @@ void __init skb_init(void)
 
 EXPORT_SYMBOL(___pskb_trim);
 EXPORT_SYMBOL(__kfree_skb);
+EXPORT_SYMBOL(kfree_skb);
 EXPORT_SYMBOL(__pskb_pull_tail);
 EXPORT_SYMBOL(__alloc_skb);
 EXPORT_SYMBOL(pskb_copy);
-- 
cgit v1.2.3


From 0e7b13685f9a06949ea3070c97c0f0085a08cd37 Mon Sep 17 00:00:00 2001
From: John Heffner <jheffner@psc.edu>
Date: Mon, 20 Mar 2006 21:32:58 -0800
Subject: [TCP] mtu probing: move tcp-specific data out of inet_connection_sock

This moves some TCP-specific MTU probing state out of
inet_connection_sock back to tcp_sock.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h                | 6 ++++++
 include/net/inet_connection_sock.h | 2 --
 net/ipv4/tcp_input.c               | 4 ++--
 net/ipv4/tcp_output.c              | 4 ++--
 4 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f2bb2396853f..542d39596bd8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -343,6 +343,12 @@ struct tcp_sock {
 		__u32	seq;
 		__u32	time;
 	} rcvq_space;
+
+/* TCP-specific MTU probe information. */
+	struct {
+		__u32		  probe_seq_start;
+		__u32		  probe_seq_end;
+	} mtu_probe;
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b3abe33f4e5f..4e5a9ff99fc3 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -114,8 +114,6 @@ struct inet_connection_sock {
 
 		/* Information on the current probe. */
 		int		  probe_size;
-		__u32		  probe_seq_start;
-		__u32		  probe_seq_end;
 	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0ac388e3d01d..195d83584558 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2054,7 +2054,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		/* MTU probe failure: don't reduce cwnd */
 		if (icsk->icsk_ca_state < TCP_CA_CWR &&
 		    icsk->icsk_mtup.probe_size &&
-		    tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
+		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
 			tcp_mtup_probe_failed(sk);
 			/* Restores the reduction we did in tcp_mtup_probe() */
 			tp->snd_cwnd++;
@@ -2284,7 +2284,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 
 		/* MTU probing checks */
 		if (icsk->icsk_mtup.probe_size) {
-			if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
+			if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
 				tcp_mtup_probe_success(sk, skb);
 			}
 		}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8197b5e12f1f..518e568b53f3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1238,8 +1238,8 @@ static int tcp_mtu_probe(struct sock *sk)
 		update_send_head(sk, tp, nskb);
 
 		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
-		icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
-		icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
 
 		return 1;
 	}
-- 
cgit v1.2.3


From 0ac81ae34ec8898e7eb1388fe21e3cee7b626a88 Mon Sep 17 00:00:00 2001
From: David Basden <davidb-irda@rcpt.to>
Date: Mon, 20 Mar 2006 22:21:10 -0800
Subject: [IRDA]: TOIM3232 dongle support

Here goes a patch for supporting TOIM3232 based serial IrDA dongles.
The code is based on the tekram dongle code.

It's been tested with a TOIM3232 based IRWave 320S dongle. It may work
for TOIM4232 dongles, although it's not been tested.

Signed-off-by: David Basden <davidb-irda@rcpt.to>
Signed-off-by: Samuel Ortiz <samuel.ortiz@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/irda/Kconfig        |   8 +
 drivers/net/irda/Makefile       |   1 +
 drivers/net/irda/toim3232-sir.c | 375 ++++++++++++++++++++++++++++++++++++++++
 include/linux/irda.h            |   1 +
 4 files changed, 385 insertions(+)
 create mode 100644 drivers/net/irda/toim3232-sir.c

(limited to 'include/linux')

diff --git a/drivers/net/irda/Kconfig b/drivers/net/irda/Kconfig
index c81fe1c382d5..5e6d00752990 100644
--- a/drivers/net/irda/Kconfig
+++ b/drivers/net/irda/Kconfig
@@ -64,6 +64,14 @@ config TEKRAM_DONGLE
 	  dongles you will have to start irattach like this:
 	  "irattach -d tekram".
 
+config TOIM3232_DONGLE
+	tristate "TOIM3232 IrDa dongle"
+	depends on DONGLE && IRDA
+	help
+	  Say Y here if you want to build support for the Vishay/Temic
+	  TOIM3232 and TOIM4232 based dongles.
+	  To compile it as a module, choose M here.
+
 config LITELINK_DONGLE
 	tristate "Parallax LiteLink dongle"
 	depends on DONGLE && IRDA
diff --git a/drivers/net/irda/Makefile b/drivers/net/irda/Makefile
index 72cbfdc9cfcc..27ab75f20799 100644
--- a/drivers/net/irda/Makefile
+++ b/drivers/net/irda/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_OLD_BELKIN_DONGLE)	+= old_belkin-sir.o
 obj-$(CONFIG_MCP2120_DONGLE)	+= mcp2120-sir.o
 obj-$(CONFIG_ACT200L_DONGLE)	+= act200l-sir.o
 obj-$(CONFIG_MA600_DONGLE)	+= ma600-sir.o
+obj-$(CONFIG_TOIM3232_DONGLE)	+= toim3232-sir.o
 
 # The SIR helper module
 sir-dev-objs := sir_dev.o sir_dongle.o sir_kthread.o
diff --git a/drivers/net/irda/toim3232-sir.c b/drivers/net/irda/toim3232-sir.c
new file mode 100644
index 000000000000..f6e5fed6a36d
--- /dev/null
+++ b/drivers/net/irda/toim3232-sir.c
@@ -0,0 +1,375 @@
+/*********************************************************************
+ *
+ * Filename:      toim3232-sir.c
+ * Version:       1.0
+ * Description:   Implementation of dongles based on the Vishay/Temic
+ * 		  TOIM3232 SIR Endec chipset. Currently only the
+ * 		  IRWave IR320ST-2 is tested, although it should work
+ * 		  with any TOIM3232 or TOIM4232 chipset based RS232
+ * 		  dongle with minimal modification.
+ * 		  Based heavily on the Tekram driver (tekram.c),
+ * 		  with thanks to Dag Brattli and Martin Diehl.
+ * Status:        Experimental.
+ * Author:        David Basden <davidb-irda@rcpt.to>
+ * Created at:    Thu Feb 09 23:47:32 2006
+ *
+ *     Copyright (c) 2006 David Basden.
+ *     Copyright (c) 1998-1999 Dag Brattli,
+ *     Copyright (c) 2002 Martin Diehl,
+ *     All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+/*
+ * This driver has currently only been tested on the IRWave IR320ST-2
+ *
+ * PROTOCOL:
+ *
+ * The protocol for talking to the TOIM3232 is quite easy, and is
+ * designed to interface with RS232 with only level convertors. The
+ * BR/~D line on the chip is brought high to signal 'command mode',
+ * where a command byte is sent to select the baudrate of the RS232
+ * interface and the pulse length of the IRDA output. When BR/~D
+ * is brought low, the dongle then changes to the selected baudrate,
+ * and the RS232 interface is used for data until BR/~D is brought
+ * high again. The initial speed for the TOIMx323 after RESET is
+ * 9600 baud.  The baudrate for command-mode is the last selected
+ * baud-rate, or 9600 after a RESET.
+ *
+ * The  dongle I have (below) adds some extra hardware on the front end,
+ * but this is mostly directed towards pariasitic power from the RS232
+ * line rather than changing very much about how to communicate with
+ * the TOIM3232.
+ *
+ * The protocol to talk to the TOIM4232 chipset seems to be almost
+ * identical to the TOIM3232 (and the 4232 datasheet is more detailed)
+ * so this code will probably work on that as well, although I haven't
+ * tested it on that hardware.
+ *
+ * Target dongle variations that might be common:
+ *
+ * DTR and RTS function:
+ *   The data sheet for the 4232 has a sample implementation that hooks the
+ *   DTR and RTS lines to the RESET and BaudRate/~Data lines of the
+ *   chip (through line-converters). Given both DTR and RTS would have to
+ *   be held low in normal operation, and the TOIMx232 requires +5V to
+ *   signal ground, most dongle designers would almost certainly choose
+ *   an implementation that kept at least one of DTR or RTS high in
+ *   normal operation to provide power to the dongle, but will likely
+ *   vary between designs.
+ *
+ * User specified command bits:
+ *  There are two user-controllable output lines from the TOIMx232 that
+ *  can be set low or high by setting the appropriate bits in the
+ *  high-nibble of the command byte (when setting speed and pulse length).
+ *  These might be used to switch on and off added hardware or extra
+ *  dongle features.
+ *
+ *
+ * Target hardware: IRWave IR320ST-2
+ *
+ * 	The IRWave IR320ST-2 is a simple dongle based on the Vishay/Temic
+ * 	TOIM3232 SIR Endec and the Vishay/Temic TFDS4500 SIR IRDA transciever.
+ * 	It uses a hex inverter and some discrete components to buffer and
+ * 	line convert the RS232 down to 5V.
+ *
+ * 	The dongle is powered through a voltage regulator, fed by a large
+ * 	capacitor. To switch the dongle on, DTR is brought high to charge
+ * 	the capacitor and drive the voltage regulator. DTR isn't associated
+ * 	with any control lines on the TOIM3232. Parisitic power is also taken
+ * 	from the RTS, TD and RD lines when brought high, but through resistors.
+ * 	When DTR is low, the circuit might lose power even with RTS high.
+ *
+ * 	RTS is inverted and attached to the BR/~D input pin. When RTS
+ * 	is high, BR/~D is low, and the TOIM3232 is in the normal 'data' mode.
+ * 	RTS is brought low, BR/~D is high, and the TOIM3232 is in 'command
+ * 	mode'.
+ *
+ * 	For some unknown reason, the RESET line isn't actually connected
+ * 	to anything. This means to reset the dongle to get it to a known
+ * 	state (9600 baud) you must drop DTR and RTS low, wait for the power
+ * 	capacitor to discharge, and then bring DTR (and RTS for data mode)
+ * 	high again, and wait for the capacitor to charge, the power supply
+ * 	to stabilise, and the oscillator clock to stabilise.
+ *
+ * 	Fortunately, if the current baudrate is known, the chipset can
+ * 	easily change speed by entering command mode without having to
+ * 	reset the dongle first.
+ *
+ * 	Major Components:
+ *
+ * 	- Vishay/Temic TOIM3232 SIR Endec to change RS232 pulse timings
+ * 	  to IRDA pulse timings
+ * 	- 3.6864MHz crystal to drive TOIM3232 clock oscillator
+ * 	- DM74lS04M Inverting Hex line buffer for RS232 input buffering
+ * 	  and level conversion
+ * 	- PJ2951AC 150mA voltage regulator
+ * 	- Vishay/Temic TFDS4500	SIR IRDA front-end transceiver
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+
+#include <net/irda/irda.h>
+
+#include "sir-dev.h"
+
+MODULE_PARM(toim3232delay, "i");
+MODULE_PARM_DESC(toim3232delay, "toim3232 dongle write complete delay");
+static int toim3232delay = 150;	/* default is 150 ms */
+
+#if 0
+MODULE_PARM(toim3232flipdtr, "i");
+MODULE_PARM_DESC(toim3232flipdtr, "toim3232 dongle invert DTR (Reset)");
+static int toim3232flipdtr = 0;	/* default is DTR high to reset */
+
+MODULE_PARM(toim3232fliptrs, "i");
+MODULE_PARM_DESC(toim3232fliprts, "toim3232 dongle invert RTS (BR/D)");
+static int toim3232fliprts = 0;	/* default is RTS high for baud change */
+#endif
+
+static int toim3232_open(struct sir_dev *);
+static int toim3232_close(struct sir_dev *);
+static int toim3232_change_speed(struct sir_dev *, unsigned);
+static int toim3232_reset(struct sir_dev *);
+
+#define TOIM3232_115200 0x00
+#define TOIM3232_57600  0x01
+#define TOIM3232_38400  0x02
+#define TOIM3232_19200  0x03
+#define TOIM3232_9600   0x06
+#define TOIM3232_2400   0x0A
+
+#define TOIM3232_PW     0x10 /* Pulse select bit */
+
+static struct dongle_driver toim3232 = {
+	.owner		= THIS_MODULE,
+	.driver_name	= "Vishay TOIM3232",
+	.type		= IRDA_TOIM3232_DONGLE,
+	.open		= toim3232_open,
+	.close		= toim3232_close,
+	.reset		= toim3232_reset,
+	.set_speed	= toim3232_change_speed,
+};
+
+static int __init toim3232_sir_init(void)
+{
+	if (toim3232delay < 1  ||  toim3232delay > 500)
+		toim3232delay = 200;
+	IRDA_DEBUG(1, "%s - using %d ms delay\n",
+		toim3232.driver_name, toim3232delay);
+	return irda_register_dongle(&toim3232);
+}
+
+static void __exit toim3232_sir_cleanup(void)
+{
+	irda_unregister_dongle(&toim3232);
+}
+
+static int toim3232_open(struct sir_dev *dev)
+{
+	struct qos_info *qos = &dev->qos;
+
+	IRDA_DEBUG(2, "%s()\n", __FUNCTION__);
+
+	/* Pull the lines high to start with.
+	 *
+	 * For the IR320ST-2, we need to charge the main supply capacitor to
+	 * switch the device on. We keep DTR high throughout to do this.
+	 * When RTS, TD and RD are high, they will also trickle-charge the
+	 * cap. RTS is high for data transmission, and low for baud rate select.
+	 * 	-- DGB
+	 */
+	sirdev_set_dtr_rts(dev, TRUE, TRUE);
+
+	/* The TOI3232 supports many speeds between 1200bps and 115000bps.
+	 * We really only care about those supported by the IRDA spec, but
+	 * 38400 seems to be implemented in many places */
+	qos->baud_rate.bits &= IR_2400|IR_9600|IR_19200|IR_38400|IR_57600|IR_115200;
+
+	/* From the tekram driver. Not sure what a reasonable value is -- DGB */
+	qos->min_turn_time.bits = 0x01; /* Needs at least 10 ms */
+	irda_qos_bits_to_value(qos);
+
+	/* irda thread waits 50 msec for power settling */
+
+	return 0;
+}
+
+static int toim3232_close(struct sir_dev *dev)
+{
+	IRDA_DEBUG(2, "%s()\n", __FUNCTION__);
+
+	/* Power off dongle */
+	sirdev_set_dtr_rts(dev, FALSE, FALSE);
+
+	return 0;
+}
+
+/*
+ * Function toim3232change_speed (dev, state, speed)
+ *
+ *    Set the speed for the TOIM3232 based dongle. Warning, this
+ *    function must be called with a process context!
+ *
+ *    Algorithm
+ *    1. keep DTR high but clear RTS to bring into baud programming mode
+ *    2. wait at least 7us to enter programming mode
+ *    3. send control word to set baud rate and timing
+ *    4. wait at least 1us
+ *    5. bring RTS high to enter DATA mode (RS232 is passed through to transceiver)
+ *    6. should take effect immediately (although probably worth waiting)
+ */
+
+#define TOIM3232_STATE_WAIT_SPEED	(SIRDEV_STATE_DONGLE_SPEED + 1)
+
+static int toim3232_change_speed(struct sir_dev *dev, unsigned speed)
+{
+	unsigned state = dev->fsm.substate;
+	unsigned delay = 0;
+	u8 byte;
+	static int ret = 0;
+
+	IRDA_DEBUG(2, "%s()\n", __FUNCTION__);
+
+	switch(state) {
+	case SIRDEV_STATE_DONGLE_SPEED:
+
+		/* Figure out what we are going to send as a control byte */
+		switch (speed) {
+		case 2400:
+			byte = TOIM3232_PW|TOIM3232_2400;
+			break;
+		default:
+			speed = 9600;
+			ret = -EINVAL;
+			/* fall thru */
+		case 9600:
+			byte = TOIM3232_PW|TOIM3232_9600;
+			break;
+		case 19200:
+			byte = TOIM3232_PW|TOIM3232_19200;
+			break;
+		case 38400:
+			byte = TOIM3232_PW|TOIM3232_38400;
+			break;
+		case 57600:
+			byte = TOIM3232_PW|TOIM3232_57600;
+			break;
+		case 115200:
+			byte = TOIM3232_115200;
+			break;
+		}
+
+		/* Set DTR, Clear RTS: Go into baud programming mode */
+		sirdev_set_dtr_rts(dev, TRUE, FALSE);
+
+		/* Wait at least 7us */
+		udelay(14);
+
+		/* Write control byte */
+		sirdev_raw_write(dev, &byte, 1);
+
+		dev->speed = speed;
+
+		state = TOIM3232_STATE_WAIT_SPEED;
+		delay = toim3232delay;
+		break;
+
+	case TOIM3232_STATE_WAIT_SPEED:
+		/* Have transmitted control byte * Wait for 'at least 1us' */
+		udelay(14);
+
+		/* Set DTR, Set RTS: Go into normal data mode */
+		sirdev_set_dtr_rts(dev, TRUE, TRUE);
+
+		/* Wait (TODO: check this is needed) */
+		udelay(50);
+		break;
+
+	default:
+		printk(KERN_ERR "%s - undefined state %d\n", __FUNCTION__, state);
+		ret = -EINVAL;
+		break;
+	}
+
+	dev->fsm.substate = state;
+	return (delay > 0) ? delay : ret;
+}
+
+/*
+ * Function toim3232reset (driver)
+ *
+ *      This function resets the toim3232 dongle. Warning, this function
+ *      must be called with a process context!!
+ *
+ * What we should do is:
+ * 	0. Pull RESET high
+ * 	1. Wait for at least 7us
+ * 	2. Pull RESET low
+ * 	3. Wait for at least 7us
+ * 	4. Pull BR/~D high
+ * 	5. Wait for at least 7us
+ * 	6. Send control byte to set baud rate
+ * 	7. Wait at least 1us after stop bit
+ * 	8. Pull BR/~D low
+ * 	9. Should then be in data mode
+ *
+ * Because the IR320ST-2 doesn't have the RESET line connected for some reason,
+ * we'll have to do something else.
+ *
+ * The default speed after a RESET is 9600, so lets try just bringing it up in
+ * data mode after switching it off, waiting for the supply capacitor to
+ * discharge, and then switch it back on. This isn't actually pulling RESET
+ * high, but it seems to have the same effect.
+ *
+ * This behaviour will probably work on dongles that have the RESET line connected,
+ * but if not, add a flag for the IR320ST-2, and implment the above-listed proper
+ * behaviour.
+ *
+ * RTS is inverted and then fed to BR/~D, so to put it in programming mode, we
+ * need to have pull RTS low
+ */
+
+static int toim3232_reset(struct sir_dev *dev)
+{
+	IRDA_DEBUG(2, "%s()\n", __FUNCTION__);
+
+	/* Switch off both DTR and RTS to switch off dongle */
+	sirdev_set_dtr_rts(dev, FALSE, FALSE);
+
+	/* Should sleep a while. This might be evil doing it this way.*/
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(msecs_to_jiffies(50));
+
+	/* Set DTR, Set RTS (data mode) */
+	sirdev_set_dtr_rts(dev, TRUE, TRUE);
+
+	/* Wait at least 10 ms for power to stabilize again */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(msecs_to_jiffies(10));
+
+	/* Speed should now be 9600 */
+	dev->speed = 9600;
+
+	return 0;
+}
+
+MODULE_AUTHOR("David Basden <davidb-linux@rcpt.to>");
+MODULE_DESCRIPTION("Vishay/Temic TOIM3232 based dongle driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("irda-dongle-12"); /* IRDA_TOIM3232_DONGLE */
+
+module_init(toim3232_sir_init);
+module_exit(toim3232_sir_cleanup);
diff --git a/include/linux/irda.h b/include/linux/irda.h
index 95dee174cdc5..09d8f105a5a8 100644
--- a/include/linux/irda.h
+++ b/include/linux/irda.h
@@ -76,6 +76,7 @@ typedef enum {
 	IRDA_MCP2120_DONGLE      = 9,
 	IRDA_ACT200L_DONGLE      = 10,
 	IRDA_MA600_DONGLE        = 11,
+	IRDA_TOIM3232_DONGLE     = 12,
 } IRDA_DONGLE;
 
 /* Protocol types to be used for SOCK_DGRAM */
-- 
cgit v1.2.3


From 6756ae4b4e97aba48c042b4aa6b77a18f507d2cb Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 20 Mar 2006 22:23:58 -0800
Subject: [NET]: Convert RTNL to mutex.

This patch turns the RTNL from a semaphore to a new 2.6.16 mutex and
gets rid of some of the leftover legacy.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/8139too.c     |  2 +-
 include/linux/rtnetlink.h | 20 +++++++-------------
 net/core/dev.c            |  8 ++++----
 net/core/link_watch.c     |  4 ++--
 net/core/netpoll.c        |  6 +++---
 net/core/rtnetlink.c      | 28 +++++++++++++++++-----------
 net/ipv4/igmp.c           | 24 ++++++++++++------------
 net/ipv4/ipconfig.c       | 10 +++++-----
 8 files changed, 51 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index e58d4c50c2e1..f5ee064ab6b2 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -1605,7 +1605,7 @@ static void rtl8139_thread (void *_data)
 	if (tp->watchdog_fired) {
 		tp->watchdog_fired = 0;
 		rtl8139_tx_timeout_task(_data);
-	} else if (rtnl_shlock_nowait() == 0) {
+	} else if (rtnl_trylock()) {
 		rtl8139_thread_iter (dev, tp, tp->mmio_addr);
 		rtnl_unlock ();
 	} else {
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index edccefb45188..d263853a8f1c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -907,6 +907,7 @@ struct tcamsg
 #ifdef __KERNEL__
 
 #include <linux/config.h>
+#include <linux/mutex.h>
 
 extern size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size);
 static __inline__ int rtattr_strcmp(const struct rtattr *rta, const char *str)
@@ -1038,24 +1039,17 @@ __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 
 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
 
-extern struct semaphore rtnl_sem;
-
-#define rtnl_shlock()		down(&rtnl_sem)
-#define rtnl_shlock_nowait()	down_trylock(&rtnl_sem)
-
-#define rtnl_shunlock()	do { up(&rtnl_sem); \
-		             if (rtnl && rtnl->sk_receive_queue.qlen) \
-				     rtnl->sk_data_ready(rtnl, 0); \
-		        } while(0)
-
+/* RTNL is used as a global lock for all changes to network configuration  */
 extern void rtnl_lock(void);
-extern int rtnl_lock_interruptible(void);
 extern void rtnl_unlock(void);
+extern int rtnl_trylock(void);
+
 extern void rtnetlink_init(void);
+extern void __rtnl_unlock(void);
 
 #define ASSERT_RTNL() do { \
-	if (unlikely(down_trylock(&rtnl_sem) == 0)) { \
-		up(&rtnl_sem); \
+	if (unlikely(rtnl_trylock())) { \
+		rtnl_unlock(); \
 		printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \
 		       __FILE__,  __LINE__); \
 		dump_stack(); \
diff --git a/net/core/dev.c b/net/core/dev.c
index 8763c99fcb84..be1d896cc5b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2466,9 +2466,9 @@ int dev_ioctl(unsigned int cmd, void __user *arg)
 	 */
 
 	if (cmd == SIOCGIFCONF) {
-		rtnl_shlock();
+		rtnl_lock();
 		ret = dev_ifconf((char __user *) arg);
-		rtnl_shunlock();
+		rtnl_unlock();
 		return ret;
 	}
 	if (cmd == SIOCGIFNAME)
@@ -2877,7 +2877,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 	rebroadcast_time = warning_time = jiffies;
 	while (atomic_read(&dev->refcnt) != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
-			rtnl_shlock();
+			rtnl_lock();
 
 			/* Rebroadcast unregister notification */
 			notifier_call_chain(&netdev_chain,
@@ -2894,7 +2894,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 				linkwatch_run_queue();
 			}
 
-			rtnl_shunlock();
+			__rtnl_unlock();
 
 			rebroadcast_time = jiffies;
 		}
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index e82a451d330b..341de44c7ed1 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -139,9 +139,9 @@ static void linkwatch_event(void *dummy)
 	linkwatch_nextevent = jiffies + HZ;
 	clear_bit(LW_RUNNING, &linkwatch_flags);
 
-	rtnl_shlock();
+	rtnl_lock();
 	linkwatch_run_queue();
-	rtnl_shunlock();
+	rtnl_unlock();
 }
 
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index ea51f8d02eb8..e8e05cebd95a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -669,14 +669,14 @@ int netpoll_setup(struct netpoll *np)
 		printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
 		       np->name, np->dev_name);
 
-		rtnl_shlock();
+		rtnl_lock();
 		if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) {
 			printk(KERN_ERR "%s: failed to open %s\n",
 			       np->name, np->dev_name);
-			rtnl_shunlock();
+			rtnl_unlock();
 			goto release;
 		}
-		rtnl_shunlock();
+		rtnl_unlock();
 
 		atleast = jiffies + HZ/10;
  		atmost = jiffies + 4*HZ;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1c15a907066f..ae10d3740faa 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -35,6 +35,7 @@
 #include <linux/skbuff.h>
 #include <linux/init.h>
 #include <linux/security.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -51,25 +52,31 @@
 #include <net/pkt_sched.h>
 #include <net/netlink.h>
 
-DECLARE_MUTEX(rtnl_sem);
+static DEFINE_MUTEX(rtnl_mutex);
 
 void rtnl_lock(void)
 {
-	rtnl_shlock();
+	mutex_lock(&rtnl_mutex);
 }
 
-int rtnl_lock_interruptible(void)
+void __rtnl_unlock(void)
 {
-	return down_interruptible(&rtnl_sem);
+	mutex_unlock(&rtnl_mutex);
 }
- 
+
 void rtnl_unlock(void)
 {
-	rtnl_shunlock();
-
+	mutex_unlock(&rtnl_mutex);
+	if (rtnl && rtnl->sk_receive_queue.qlen)
+		rtnl->sk_data_ready(rtnl, 0);
 	netdev_run_todo();
 }
 
+int rtnl_trylock(void)
+{
+	return mutex_trylock(&rtnl_mutex);
+}
+
 int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
 {
 	memset(tb, 0, sizeof(struct rtattr*)*maxattr);
@@ -625,9 +632,9 @@ static void rtnetlink_rcv(struct sock *sk, int len)
 	unsigned int qlen = 0;
 
 	do {
-		rtnl_lock();
+		mutex_lock(&rtnl_mutex);
 		netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg);
-		up(&rtnl_sem);
+		mutex_unlock(&rtnl_mutex);
 
 		netdev_run_todo();
 	} while (qlen);
@@ -704,6 +711,5 @@ EXPORT_SYMBOL(rtnetlink_links);
 EXPORT_SYMBOL(rtnetlink_put_metrics);
 EXPORT_SYMBOL(rtnl);
 EXPORT_SYMBOL(rtnl_lock);
-EXPORT_SYMBOL(rtnl_lock_interruptible);
-EXPORT_SYMBOL(rtnl_sem);
+EXPORT_SYMBOL(rtnl_trylock);
 EXPORT_SYMBOL(rtnl_unlock);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 64ce52bf0485..3ec502f19da0 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1730,7 +1730,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 	if (!MULTICAST(addr))
 		return -EINVAL;
 
-	rtnl_shlock();
+	rtnl_lock();
 
 	in_dev = ip_mc_find_dev(imr);
 
@@ -1763,7 +1763,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 	ip_mc_inc_group(in_dev, addr);
 	err = 0;
 done:
-	rtnl_shunlock();
+	rtnl_unlock();
 	return err;
 }
 
@@ -1837,7 +1837,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	if (!MULTICAST(addr))
 		return -EINVAL;
 
-	rtnl_shlock();
+	rtnl_lock();
 
 	imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
 	imr.imr_address.s_addr = mreqs->imr_interface;
@@ -1947,7 +1947,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 
 		&mreqs->imr_sourceaddr, 1);
 done:
-	rtnl_shunlock();
+	rtnl_unlock();
 	if (leavegroup)
 		return ip_mc_leave_group(sk, &imr);
 	return err;
@@ -1970,7 +1970,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 	    msf->imsf_fmode != MCAST_EXCLUDE)
 		return -EINVAL;
 
-	rtnl_shlock();
+	rtnl_lock();
 
 	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
 	imr.imr_address.s_addr = msf->imsf_interface;
@@ -2030,7 +2030,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
 	pmc->sfmode = msf->imsf_fmode;
 	err = 0;
 done:
-	rtnl_shunlock();
+	rtnl_unlock();
 	if (leavegroup)
 		err = ip_mc_leave_group(sk, &imr);
 	return err;
@@ -2050,7 +2050,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 	if (!MULTICAST(addr))
 		return -EINVAL;
 
-	rtnl_shlock();
+	rtnl_lock();
 
 	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
 	imr.imr_address.s_addr = msf->imsf_interface;
@@ -2072,7 +2072,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 		goto done;
 	msf->imsf_fmode = pmc->sfmode;
 	psl = pmc->sflist;
-	rtnl_shunlock();
+	rtnl_unlock();
 	if (!psl) {
 		len = 0;
 		count = 0;
@@ -2091,7 +2091,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 		return -EFAULT;
 	return 0;
 done:
-	rtnl_shunlock();
+	rtnl_unlock();
 	return err;
 }
 
@@ -2112,7 +2112,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	if (!MULTICAST(addr))
 		return -EINVAL;
 
-	rtnl_shlock();
+	rtnl_lock();
 
 	err = -EADDRNOTAVAIL;
 
@@ -2125,7 +2125,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 		goto done;
 	gsf->gf_fmode = pmc->sfmode;
 	psl = pmc->sflist;
-	rtnl_shunlock();
+	rtnl_unlock();
 	count = psl ? psl->sl_count : 0;
 	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
 	gsf->gf_numsrc = count;
@@ -2146,7 +2146,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	}
 	return 0;
 done:
-	rtnl_shunlock();
+	rtnl_unlock();
 	return err;
 }
 
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index bb3613ec448c..cb8a92f18ef6 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -186,7 +186,7 @@ static int __init ic_open_devs(void)
 	unsigned short oflags;
 
 	last = &ic_first_dev;
-	rtnl_shlock();
+	rtnl_lock();
 
 	/* bring loopback device up first */
 	if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
@@ -215,7 +215,7 @@ static int __init ic_open_devs(void)
 				continue;
 			}
 			if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
-				rtnl_shunlock();
+				rtnl_unlock();
 				return -1;
 			}
 			d->dev = dev;
@@ -232,7 +232,7 @@ static int __init ic_open_devs(void)
 				dev->name, able, d->xid));
 		}
 	}
-	rtnl_shunlock();
+	rtnl_unlock();
 
 	*last = NULL;
 
@@ -251,7 +251,7 @@ static void __init ic_close_devs(void)
 	struct ic_device *d, *next;
 	struct net_device *dev;
 
-	rtnl_shlock();
+	rtnl_lock();
 	next = ic_first_dev;
 	while ((d = next)) {
 		next = d->next;
@@ -262,7 +262,7 @@ static void __init ic_close_devs(void)
 		}
 		kfree(d);
 	}
-	rtnl_shunlock();
+	rtnl_unlock();
 }
 
 /*
-- 
cgit v1.2.3


From 99cae7fca1311573f2777b8ceaa8a5abd6e9b04e Mon Sep 17 00:00:00 2001
From: Alpt <alpt@freaknet.org>
Date: Mon, 20 Mar 2006 22:26:17 -0800
Subject: [NET] rtnetlink: Add RTPROT entry for Netsukuku.

The Netsukuku daemon is using the same number to mark its routes, you
can see it here:
http://hinezumilabs.org/cgi-bin/viewcvs.cgi/netsukuku/src/krnl_route.h?rev=HEAD&content-type=text/vnd.viewcvs-markup

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index d263853a8f1c..d572d5376319 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -199,6 +199,7 @@ enum
 #define RTPROT_BIRD	12	/* BIRD */
 #define RTPROT_DNROUTED	13	/* DECnet routing daemon */
 #define RTPROT_XORP	14	/* XORP */
+#define RTPROT_NTK	15	/* Netsukuku */
 
 /* rtm_scope
 
-- 
cgit v1.2.3


From d9ab5ad12b0d865bdb1b750d81192d34465541e9 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Mon, 20 Mar 2006 22:27:35 -0800
Subject: [TG3]: Add 5787 and 5754 basic support

Add basic support for 2 new chips 5787 and 5754.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 27 +++++++++++++++++++++++++--
 drivers/net/tg3.h       |  6 +++++-
 include/linux/pci_ids.h |  4 ++++
 3 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index d4f5b58e1dd4..e6ed940d7714 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -221,6 +221,14 @@ static struct pci_device_id tg3_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5753F,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5754,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5754M,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787M,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5714,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5714S,
@@ -4388,6 +4396,10 @@ static int tg3_chip_reset(struct tg3 *tp)
 		tp->nvram_lock_cnt = 0;
 	}
 
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
+		tw32(GRC_FASTBOOT_PC, 0);
+
 	/*
 	 * We must avoid the readl() that normally takes place.
 	 * It locks machines, causes machine checks, and other
@@ -6018,6 +6030,10 @@ static int tg3_reset_hw(struct tg3 *tp)
 		}
 	}
 
+	/* Enable host coalescing bug fix */
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
+		val |= (1 << 29);
+
 	tw32_f(WDMAC_MODE, val);
 	udelay(40);
 
@@ -8326,6 +8342,9 @@ static void tg3_self_test(struct net_device *dev, struct ethtool_test *etest,
 		if (!err)
 			tg3_nvram_unlock(tp);
 
+		if (tp->tg3_flags2 & TG3_FLG2_MII_SERDES)
+			tg3_phy_reset(tp);
+
 		if (tg3_test_registers(tp) != 0) {
 			etest->flags |= ETH_TEST_FL_FAILED;
 			data[2] = 1;
@@ -9681,6 +9700,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5750 ||
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787 ||
 	    (tp->tg3_flags2 & TG3_FLG2_5780_CLASS))
 		tp->tg3_flags2 |= TG3_FLG2_5750_PLUS;
 
@@ -9693,7 +9713,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5705 &&
 	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5750 &&
-	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5752)
+	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5752 &&
+	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5787)
 		tp->tg3_flags2 |= TG3_FLG2_JUMBO_CAPABLE;
 
 	if (pci_find_capability(tp->pdev, PCI_CAP_ID_EXP) != 0)
@@ -9903,7 +9924,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	if (tp->pci_chip_rev_id == CHIPREV_ID_5704_A0)
 		tp->tg3_flags2 |= TG3_FLG2_PHY_5704_A0_BUG;
 
-	if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS)
+	if ((tp->tg3_flags2 & TG3_FLG2_5705_PLUS) &&
+	    (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5787))
 		tp->tg3_flags2 |= TG3_FLG2_PHY_BER_BUG;
 
 	tp->coalesce_mode = 0;
@@ -10628,6 +10650,7 @@ static char * __devinit tg3_phy_string(struct tg3 *tp)
 	case PHY_ID_BCM5752:	return "5752";
 	case PHY_ID_BCM5714:	return "5714";
 	case PHY_ID_BCM5780:	return "5780";
+	case PHY_ID_BCM5787:	return "5787";
 	case PHY_ID_BCM8002:	return "8002/serdes";
 	case 0:			return "serdes";
 	default:		return "unknown";
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 7e3b613afb29..8d32fbe0d303 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -138,6 +138,7 @@
 #define   ASIC_REV_5752			 0x06
 #define   ASIC_REV_5780			 0x08
 #define   ASIC_REV_5714			 0x09
+#define   ASIC_REV_5787			 0x0b
 #define  GET_CHIP_REV(CHIP_REV_ID)	((CHIP_REV_ID) >> 8)
 #define   CHIPREV_5700_AX		 0x70
 #define   CHIPREV_5700_BX		 0x71
@@ -1393,6 +1394,7 @@
 #define GRC_MDI_CTRL			0x00006844
 #define GRC_SEEPROM_DELAY		0x00006848
 /* 0x684c --> 0x6c00 unused */
+#define GRC_FASTBOOT_PC			0x00006894	/* 5752, 5755, 5787 */
 
 /* 0x6c00 --> 0x7000 unused */
 
@@ -2247,6 +2249,7 @@ struct tg3 {
 #define PHY_ID_BCM5752			0x60008100
 #define PHY_ID_BCM5714			0x60008340
 #define PHY_ID_BCM5780			0x60008350
+#define PHY_ID_BCM5787			0xbc050ce0
 #define PHY_ID_BCM8002			0x60010140
 #define PHY_ID_INVALID			0xffffffff
 #define PHY_ID_REV_MASK			0x0000000f
@@ -2271,7 +2274,8 @@ struct tg3 {
 	 (X) == PHY_ID_BCM5703 || (X) == PHY_ID_BCM5704 || \
 	 (X) == PHY_ID_BCM5705 || (X) == PHY_ID_BCM5750 || \
 	 (X) == PHY_ID_BCM5752 || (X) == PHY_ID_BCM5714 || \
-	 (X) == PHY_ID_BCM5780 || (X) == PHY_ID_BCM8002)
+	 (X) == PHY_ID_BCM5780 || (X) == PHY_ID_BCM5787 || \
+	 (X) == PHY_ID_BCM8002)
 
 	struct tg3_hw_stats		*hw_stats;
 	dma_addr_t			stats_mapping;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a3a09cceb026..b9810ddf435a 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1861,14 +1861,18 @@
 #define PCI_DEVICE_ID_TIGON3_5780	0x166a
 #define PCI_DEVICE_ID_TIGON3_5780S	0x166b
 #define PCI_DEVICE_ID_TIGON3_5705F	0x166e
+#define PCI_DEVICE_ID_TIGON3_5754M	0x1672
 #define PCI_DEVICE_ID_TIGON3_5750	0x1676
 #define PCI_DEVICE_ID_TIGON3_5751	0x1677
 #define PCI_DEVICE_ID_TIGON3_5715	0x1678
 #define PCI_DEVICE_ID_TIGON3_5715S	0x1679
+#define PCI_DEVICE_ID_TIGON3_5754	0x167a
 #define PCI_DEVICE_ID_TIGON3_5750M	0x167c
 #define PCI_DEVICE_ID_TIGON3_5751M	0x167d
 #define PCI_DEVICE_ID_TIGON3_5751F	0x167e
+#define PCI_DEVICE_ID_TIGON3_5787M	0x1693
 #define PCI_DEVICE_ID_TIGON3_5782	0x1696
+#define PCI_DEVICE_ID_TIGON3_5787	0x169b
 #define PCI_DEVICE_ID_TIGON3_5788	0x169c
 #define PCI_DEVICE_ID_TIGON3_5789	0x169d
 #define PCI_DEVICE_ID_TIGON3_5702X	0x16a6
-- 
cgit v1.2.3


From 153330618691694af64f39fb56c9de051862380e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 20 Mar 2006 22:32:28 -0800
Subject: [NET]: dev_put/dev_hold cleanup

Get rid of the old __dev_put macro that is just a hold over from pre 2.6
kernel.  And turn dev_hold into an inline instead of a macro.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/e1000/e1000_main.c | 2 +-
 include/linux/netdevice.h      | 6 ++++--
 net/ipv4/igmp.c                | 2 +-
 net/ipv4/ipmr.c                | 4 ++--
 net/sched/sch_generic.c        | 2 +-
 5 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index f39de16e6b97..49cd096a3c3d 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -920,7 +920,7 @@ e1000_remove(struct pci_dev *pdev)
 	unregister_netdev(netdev);
 #ifdef CONFIG_E1000_NAPI
 	for (i = 0; i < adapter->num_rx_queues; i++)
-		__dev_put(&adapter->polling_netdev[i]);
+		dev_put(&adapter->polling_netdev[i]);
 #endif
 
 	if (!e1000_check_phy_reset_block(&adapter->hw))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b825be201bce..950dc55e5192 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -712,8 +712,10 @@ static inline void dev_put(struct net_device *dev)
 	atomic_dec(&dev->refcnt);
 }
 
-#define __dev_put(dev) atomic_dec(&(dev)->refcnt)
-#define dev_hold(dev) atomic_inc(&(dev)->refcnt)
+static inline void dev_hold(struct net_device *dev)
+{
+	atomic_inc(&dev->refcnt);
+}
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
  * and _off may be called from IRQ context, but it is caller
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 3ec502f19da0..d512239a1473 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1382,7 +1382,7 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
 		dev = ip_dev_find(imr->imr_address.s_addr);
 		if (!dev)
 			return NULL;
-		__dev_put(dev);
+		dev_put(dev);
 	}
 
 	if (!dev && !ip_route_output_key(&rt, &fl)) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5c94c222e3f3..717ab7d6d7b6 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -415,10 +415,10 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
 			return -ENOBUFS;
 		break;
 	case 0:
-		dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
+		dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
 		if (!dev)
 			return -EADDRNOTAVAIL;
-		__dev_put(dev);
+		dev_put(dev);
 		break;
 	default:
 		return -EINVAL;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 99ceb91f0150..31eb83717c26 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -234,7 +234,7 @@ static void dev_watchdog_down(struct net_device *dev)
 {
 	spin_lock_bh(&dev->xmit_lock);
 	if (del_timer(&dev->watchdog_timer))
-		__dev_put(dev);
+		dev_put(dev);
 	spin_unlock_bh(&dev->xmit_lock);
 }
 
-- 
cgit v1.2.3


From 57b47a53ec4a67691ba32cff5768e8d78fa6c67f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Mar 2006 22:35:41 -0800
Subject: [NET]: sem2mutex part 2

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sunrpc/svcsock.h    |  2 +-
 include/net/af_unix.h             |  3 +-
 net/atm/common.c                  |  4 +-
 net/atm/resources.c               | 32 +++++++-------
 net/atm/resources.h               |  3 +-
 net/bridge/netfilter/ebtables.c   | 91 ++++++++++++++++++++-------------------
 net/ipv4/ipvs/ip_vs_app.c         | 19 ++++----
 net/ipv4/netfilter/arp_tables.c   |  2 +-
 net/ipv4/netfilter/ip_tables.c    |  2 +-
 net/ipv6/netfilter/ip6_tables.c   |  2 +-
 net/netfilter/nf_conntrack_core.c | 16 ++++---
 net/sunrpc/svcsock.c              |  8 ++--
 net/unix/af_unix.c                | 22 +++++-----
 13 files changed, 107 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index d33c6face032..b4acb3d37c3f 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -36,7 +36,7 @@ struct svc_sock {
 
 	struct list_head	sk_deferred;	/* deferred requests that need to
 						 * be revisted */
-	struct semaphore        sk_sem;		/* to serialize sending data */
+	struct mutex		sk_mutex;	/* to serialize sending data */
 
 	int			(*sk_recvfrom)(struct svc_rqst *rqstp);
 	int			(*sk_sendto)(struct svc_rqst *rqstp);
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index bfc1779fc753..427dac94bc7e 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -4,6 +4,7 @@
 #include <linux/config.h>
 #include <linux/socket.h>
 #include <linux/un.h>
+#include <linux/mutex.h>
 #include <net/sock.h>
 
 extern void unix_inflight(struct file *fp);
@@ -71,7 +72,7 @@ struct unix_sock {
         struct unix_address     *addr;
         struct dentry		*dentry;
         struct vfsmount		*mnt;
-        struct semaphore        readsem;
+	struct mutex		readlock;
         struct sock		*peer;
         struct sock		*other;
         struct sock		*gc_tree;
diff --git a/net/atm/common.c b/net/atm/common.c
index 6656b111cc05..ae002220fa99 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -451,12 +451,12 @@ int vcc_connect(struct socket *sock, int itf, short vpi, int vci)
 		dev = try_then_request_module(atm_dev_lookup(itf), "atm-device-%d", itf);
 	} else {
 		dev = NULL;
-		down(&atm_dev_mutex);
+		mutex_lock(&atm_dev_mutex);
 		if (!list_empty(&atm_devs)) {
 			dev = list_entry(atm_devs.next, struct atm_dev, dev_list);
 			atm_dev_hold(dev);
 		}
-		up(&atm_dev_mutex);
+		mutex_unlock(&atm_dev_mutex);
 	}
 	if (!dev)
 		return -ENODEV;
diff --git a/net/atm/resources.c b/net/atm/resources.c
index 224190537c90..18ac80698f83 100644
--- a/net/atm/resources.c
+++ b/net/atm/resources.c
@@ -18,6 +18,8 @@
 #include <linux/bitops.h>
 #include <linux/capability.h>
 #include <linux/delay.h>
+#include <linux/mutex.h>
+
 #include <net/sock.h>	 /* for struct sock */
 
 #include "common.h"
@@ -26,7 +28,7 @@
 
 
 LIST_HEAD(atm_devs);
-DECLARE_MUTEX(atm_dev_mutex);
+DEFINE_MUTEX(atm_dev_mutex);
 
 static struct atm_dev *__alloc_atm_dev(const char *type)
 {
@@ -65,9 +67,9 @@ struct atm_dev *atm_dev_lookup(int number)
 {
 	struct atm_dev *dev;
 
-	down(&atm_dev_mutex);
+	mutex_lock(&atm_dev_mutex);
 	dev = __atm_dev_lookup(number);
-	up(&atm_dev_mutex);
+	mutex_unlock(&atm_dev_mutex);
 	return dev;
 }
 
@@ -83,11 +85,11 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
 		    type);
 		return NULL;
 	}
-	down(&atm_dev_mutex);
+	mutex_lock(&atm_dev_mutex);
 	if (number != -1) {
 		if ((inuse = __atm_dev_lookup(number))) {
 			atm_dev_put(inuse);
-			up(&atm_dev_mutex);
+			mutex_unlock(&atm_dev_mutex);
 			kfree(dev);
 			return NULL;
 		}
@@ -112,12 +114,12 @@ struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops,
 		printk(KERN_ERR "atm_dev_register: "
 		       "atm_proc_dev_register failed for dev %s\n",
 		       type);
-		up(&atm_dev_mutex);
+		mutex_unlock(&atm_dev_mutex);
 		kfree(dev);
 		return NULL;
 	}
 	list_add_tail(&dev->dev_list, &atm_devs);
-	up(&atm_dev_mutex);
+	mutex_unlock(&atm_dev_mutex);
 
 	return dev;
 }
@@ -133,9 +135,9 @@ void atm_dev_deregister(struct atm_dev *dev)
 	 * with same number can appear, such we need deregister proc, 
 	 * release async all vccs and remove them from vccs list too
 	 */
-	down(&atm_dev_mutex);
+	mutex_lock(&atm_dev_mutex);
 	list_del(&dev->dev_list);
-	up(&atm_dev_mutex);
+	mutex_unlock(&atm_dev_mutex);
 
 	atm_dev_release_vccs(dev);
 	atm_proc_dev_deregister(dev);
@@ -196,16 +198,16 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
 				return -EFAULT;
 			if (get_user(len, &iobuf->length))
 				return -EFAULT;
-			down(&atm_dev_mutex);
+			mutex_lock(&atm_dev_mutex);
 			list_for_each(p, &atm_devs)
 				size += sizeof(int);
 			if (size > len) {
-				up(&atm_dev_mutex);
+				mutex_unlock(&atm_dev_mutex);
 				return -E2BIG;
 			}
 			tmp_buf = kmalloc(size, GFP_ATOMIC);
 			if (!tmp_buf) {
-				up(&atm_dev_mutex);
+				mutex_unlock(&atm_dev_mutex);
 				return -ENOMEM;
 			}
 			tmp_p = tmp_buf;
@@ -213,7 +215,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
 				dev = list_entry(p, struct atm_dev, dev_list);
 				*tmp_p++ = dev->number;
 			}
-			up(&atm_dev_mutex);
+			mutex_unlock(&atm_dev_mutex);
 		        error = ((copy_to_user(buf, tmp_buf, size)) ||
 					put_user(size, &iobuf->length))
 						? -EFAULT : 0;
@@ -400,13 +402,13 @@ static __inline__ void *dev_get_idx(loff_t left)
 
 void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos)
 {
- 	down(&atm_dev_mutex);
+ 	mutex_lock(&atm_dev_mutex);
 	return *pos ? dev_get_idx(*pos) : (void *) 1;
 }
 
 void atm_dev_seq_stop(struct seq_file *seq, void *v)
 {
- 	up(&atm_dev_mutex);
+ 	mutex_unlock(&atm_dev_mutex);
 }
  
 void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
diff --git a/net/atm/resources.h b/net/atm/resources.h
index b7fb82a93b42..ac7222fee7a8 100644
--- a/net/atm/resources.h
+++ b/net/atm/resources.h
@@ -8,10 +8,11 @@
 
 #include <linux/config.h>
 #include <linux/atmdev.h>
+#include <linux/mutex.h>
 
 
 extern struct list_head atm_devs;
-extern struct semaphore atm_dev_mutex;
+extern struct mutex atm_dev_mutex;
 
 int atm_dev_ioctl(unsigned int cmd, void __user *arg);
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index cbd4020cc84d..4b178b4a2a95 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -35,6 +35,7 @@
 #define ASSERT_READ_LOCK(x)
 #define ASSERT_WRITE_LOCK(x)
 #include <linux/netfilter_ipv4/listhelp.h>
+#include <linux/mutex.h>
 
 #if 0
 /* use this for remote debugging
@@ -81,7 +82,7 @@ static void print_string(char *str)
 
 
-static DECLARE_MUTEX(ebt_mutex);
+static DEFINE_MUTEX(ebt_mutex);
 static LIST_HEAD(ebt_tables);
 static LIST_HEAD(ebt_targets);
 static LIST_HEAD(ebt_matches);
@@ -296,18 +297,18 @@ letscontinue:
 /* If it succeeds, returns element and locks mutex */
 static inline void *
 find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
-   struct semaphore *mutex)
+   struct mutex *mutex)
 {
 	void *ret;
 
-	*error = down_interruptible(mutex);
+	*error = mutex_lock_interruptible(mutex);
 	if (*error != 0)
 		return NULL;
 
 	ret = list_named_find(head, name);
 	if (!ret) {
 		*error = -ENOENT;
-		up(mutex);
+		mutex_unlock(mutex);
 	}
 	return ret;
 }
@@ -317,7 +318,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
 #else
 static void *
 find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
-   int *error, struct semaphore *mutex)
+   int *error, struct mutex *mutex)
 {
 	void *ret;
 
@@ -331,25 +332,25 @@ find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
 #endif
 
 static inline struct ebt_table *
-find_table_lock(const char *name, int *error, struct semaphore *mutex)
+find_table_lock(const char *name, int *error, struct mutex *mutex)
 {
 	return find_inlist_lock(&ebt_tables, name, "ebtable_", error, mutex);
 }
 
 static inline struct ebt_match *
-find_match_lock(const char *name, int *error, struct semaphore *mutex)
+find_match_lock(const char *name, int *error, struct mutex *mutex)
 {
 	return find_inlist_lock(&ebt_matches, name, "ebt_", error, mutex);
 }
 
 static inline struct ebt_watcher *
-find_watcher_lock(const char *name, int *error, struct semaphore *mutex)
+find_watcher_lock(const char *name, int *error, struct mutex *mutex)
 {
 	return find_inlist_lock(&ebt_watchers, name, "ebt_", error, mutex);
 }
 
 static inline struct ebt_target *
-find_target_lock(const char *name, int *error, struct semaphore *mutex)
+find_target_lock(const char *name, int *error, struct mutex *mutex)
 {
 	return find_inlist_lock(&ebt_targets, name, "ebt_", error, mutex);
 }
@@ -369,10 +370,10 @@ ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e,
 		return ret;
 	m->u.match = match;
 	if (!try_module_get(match->me)) {
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		return -ENOENT;
 	}
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 	if (match->check &&
 	   match->check(name, hookmask, e, m->data, m->match_size) != 0) {
 		BUGPRINT("match->check failed\n");
@@ -398,10 +399,10 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e,
 		return ret;
 	w->u.watcher = watcher;
 	if (!try_module_get(watcher->me)) {
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		return -ENOENT;
 	}
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 	if (watcher->check &&
 	   watcher->check(name, hookmask, e, w->data, w->watcher_size) != 0) {
 		BUGPRINT("watcher->check failed\n");
@@ -638,11 +639,11 @@ ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo,
 	if (!target)
 		goto cleanup_watchers;
 	if (!try_module_get(target->me)) {
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		ret = -ENOENT;
 		goto cleanup_watchers;
 	}
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 
 	t->u.target = target;
 	if (t->u.target == &ebt_standard_target) {
@@ -1015,7 +1016,7 @@ static int do_replace(void __user *user, unsigned int len)
 
 	t->private = newinfo;
 	write_unlock_bh(&t->lock);
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 	/* so, a user can change the chains while having messed up her counter
 	   allocation. Only reason why this is done is because this way the lock
 	   is held only once, while this doesn't bring the kernel into a
@@ -1045,7 +1046,7 @@ static int do_replace(void __user *user, unsigned int len)
 	return ret;
 
 free_unlock:
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 free_iterate:
 	EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
 	   ebt_cleanup_entry, NULL);
@@ -1068,69 +1069,69 @@ int ebt_register_target(struct ebt_target *target)
 {
 	int ret;
 
-	ret = down_interruptible(&ebt_mutex);
+	ret = mutex_lock_interruptible(&ebt_mutex);
 	if (ret != 0)
 		return ret;
 	if (!list_named_insert(&ebt_targets, target)) {
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		return -EEXIST;
 	}
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 
 	return 0;
 }
 
 void ebt_unregister_target(struct ebt_target *target)
 {
-	down(&ebt_mutex);
+	mutex_lock(&ebt_mutex);
 	LIST_DELETE(&ebt_targets, target);
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 }
 
 int ebt_register_match(struct ebt_match *match)
 {
 	int ret;
 
-	ret = down_interruptible(&ebt_mutex);
+	ret = mutex_lock_interruptible(&ebt_mutex);
 	if (ret != 0)
 		return ret;
 	if (!list_named_insert(&ebt_matches, match)) {
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		return -EEXIST;
 	}
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 
 	return 0;
 }
 
 void ebt_unregister_match(struct ebt_match *match)
 {
-	down(&ebt_mutex);
+	mutex_lock(&ebt_mutex);
 	LIST_DELETE(&ebt_matches, match);
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 }
 
 int ebt_register_watcher(struct ebt_watcher *watcher)
 {
 	int ret;
 
-	ret = down_interruptible(&ebt_mutex);
+	ret = mutex_lock_interruptible(&ebt_mutex);
 	if (ret != 0)
 		return ret;
 	if (!list_named_insert(&ebt_watchers, watcher)) {
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		return -EEXIST;
 	}
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 
 	return 0;
 }
 
 void ebt_unregister_watcher(struct ebt_watcher *watcher)
 {
-	down(&ebt_mutex);
+	mutex_lock(&ebt_mutex);
 	LIST_DELETE(&ebt_watchers, watcher);
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 }
 
 int ebt_register_table(struct ebt_table *table)
@@ -1178,7 +1179,7 @@ int ebt_register_table(struct ebt_table *table)
 
 	table->private = newinfo;
 	rwlock_init(&table->lock);
-	ret = down_interruptible(&ebt_mutex);
+	ret = mutex_lock_interruptible(&ebt_mutex);
 	if (ret != 0)
 		goto free_chainstack;
 
@@ -1194,10 +1195,10 @@ int ebt_register_table(struct ebt_table *table)
 		goto free_unlock;
 	}
 	list_prepend(&ebt_tables, table);
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 	return 0;
 free_unlock:
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 free_chainstack:
 	if (newinfo->chainstack) {
 		for_each_cpu(i)
@@ -1218,9 +1219,9 @@ void ebt_unregister_table(struct ebt_table *table)
 		BUGPRINT("Request to unregister NULL table!!!\n");
 		return;
 	}
-	down(&ebt_mutex);
+	mutex_lock(&ebt_mutex);
 	LIST_DELETE(&ebt_tables, table);
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 	vfree(table->private->entries);
 	if (table->private->chainstack) {
 		for_each_cpu(i)
@@ -1281,7 +1282,7 @@ static int update_counters(void __user *user, unsigned int len)
 	write_unlock_bh(&t->lock);
 	ret = 0;
 unlock_mutex:
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 free_tmp:
 	vfree(tmp);
 	return ret;
@@ -1328,7 +1329,7 @@ static inline int ebt_make_names(struct ebt_entry *e, char *base, char *ubase)
 	return 0;
 }
 
-/* called with ebt_mutex down */
+/* called with ebt_mutex locked */
 static int copy_everything_to_user(struct ebt_table *t, void __user *user,
    int *len, int cmd)
 {
@@ -1440,7 +1441,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	case EBT_SO_GET_INIT_INFO:
 		if (*len != sizeof(struct ebt_replace)){
 			ret = -EINVAL;
-			up(&ebt_mutex);
+			mutex_unlock(&ebt_mutex);
 			break;
 		}
 		if (cmd == EBT_SO_GET_INFO) {
@@ -1452,7 +1453,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 			tmp.entries_size = t->table->entries_size;
 			tmp.valid_hooks = t->table->valid_hooks;
 		}
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		if (copy_to_user(user, &tmp, *len) != 0){
 			BUGPRINT("c2u Didn't work\n");
 			ret = -EFAULT;
@@ -1464,11 +1465,11 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	case EBT_SO_GET_ENTRIES:
 	case EBT_SO_GET_INIT_ENTRIES:
 		ret = copy_everything_to_user(t, user, len, cmd);
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		break;
 
 	default:
-		up(&ebt_mutex);
+		mutex_unlock(&ebt_mutex);
 		ret = -EINVAL;
 	}
 
@@ -1484,9 +1485,9 @@ static int __init init(void)
 {
 	int ret;
 
-	down(&ebt_mutex);
+	mutex_lock(&ebt_mutex);
 	list_named_insert(&ebt_targets, &ebt_standard_target);
-	up(&ebt_mutex);
+	mutex_unlock(&ebt_mutex);
 	if ((ret = nf_register_sockopt(&ebt_sockopts)) < 0)
 		return ret;
 
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index 9b176a942ac5..e7752334d296 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -31,6 +31,7 @@
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/mutex.h>
 
 #include <net/ip_vs.h>
 
@@ -40,7 +41,7 @@ EXPORT_SYMBOL(register_ip_vs_app_inc);
 
 /* ipvs application list head */
 static LIST_HEAD(ip_vs_app_list);
-static DECLARE_MUTEX(__ip_vs_app_mutex);
+static DEFINE_MUTEX(__ip_vs_app_mutex);
 
 
 /*
@@ -173,11 +174,11 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
 {
 	int result;
 
-	down(&__ip_vs_app_mutex);
+	mutex_lock(&__ip_vs_app_mutex);
 
 	result = ip_vs_app_inc_new(app, proto, port);
 
-	up(&__ip_vs_app_mutex);
+	mutex_unlock(&__ip_vs_app_mutex);
 
 	return result;
 }
@@ -191,11 +192,11 @@ int register_ip_vs_app(struct ip_vs_app *app)
 	/* increase the module use count */
 	ip_vs_use_count_inc();
 
-	down(&__ip_vs_app_mutex);
+	mutex_lock(&__ip_vs_app_mutex);
 
 	list_add(&app->a_list, &ip_vs_app_list);
 
-	up(&__ip_vs_app_mutex);
+	mutex_unlock(&__ip_vs_app_mutex);
 
 	return 0;
 }
@@ -209,7 +210,7 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
 {
 	struct ip_vs_app *inc, *nxt;
 
-	down(&__ip_vs_app_mutex);
+	mutex_lock(&__ip_vs_app_mutex);
 
 	list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
 		ip_vs_app_inc_release(inc);
@@ -217,7 +218,7 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
 
 	list_del(&app->a_list);
 
-	up(&__ip_vs_app_mutex);
+	mutex_unlock(&__ip_vs_app_mutex);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
@@ -498,7 +499,7 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
 
 static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	down(&__ip_vs_app_mutex);
+	mutex_lock(&__ip_vs_app_mutex);
 
 	return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
 }
@@ -530,7 +531,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
 {
-	up(&__ip_vs_app_mutex);
+	mutex_unlock(&__ip_vs_app_mutex);
 }
 
 static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 9423bd0f070a..f7efb3f27bf5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -22,7 +22,7 @@
 #include <linux/init.h>
 
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp/arp_tables.h>
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cf5b9db05371..39705f9bc154 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -25,7 +25,7 @@
 #include <linux/icmp.h>
 #include <net/ip.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 #include <linux/proc_fs.h>
 #include <linux/err.h>
 #include <linux/cpumask.h>
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index d74ec335743e..5a2063bda676 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -29,7 +29,7 @@
 #include <linux/icmpv6.h>
 #include <net/ipv6.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 #include <linux/proc_fs.h>
 #include <linux/cpumask.h>
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index dc68d0022218..f6498234e264 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -185,7 +185,7 @@ static struct {
 DEFINE_RWLOCK(nf_ct_cache_lock);
 
 /* This avoids calling kmem_cache_create() with same name simultaneously */
-DECLARE_MUTEX(nf_ct_cache_mutex);
+static DEFINE_MUTEX(nf_ct_cache_mutex);
 
 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
 struct nf_conntrack_protocol *
@@ -278,7 +278,7 @@ int nf_conntrack_register_cache(u_int32_t features, const char *name,
 		return -EINVAL;
 	}
 
-	down(&nf_ct_cache_mutex);
+	mutex_lock(&nf_ct_cache_mutex);
 
 	write_lock_bh(&nf_ct_cache_lock);
 	/* e.g: multiple helpers are loaded */
@@ -294,7 +294,7 @@ int nf_conntrack_register_cache(u_int32_t features, const char *name,
 			ret = -EBUSY;
 
 		write_unlock_bh(&nf_ct_cache_lock);
-		up(&nf_ct_cache_mutex);
+		mutex_unlock(&nf_ct_cache_mutex);
 		return ret;
 	}
 	write_unlock_bh(&nf_ct_cache_lock);
@@ -338,7 +338,7 @@ int nf_conntrack_register_cache(u_int32_t features, const char *name,
 out_free_name:
 	kfree(cache_name);
 out_up_mutex:
-	up(&nf_ct_cache_mutex);
+	mutex_unlock(&nf_ct_cache_mutex);
 	return ret;
 }
 
@@ -353,12 +353,12 @@ void nf_conntrack_unregister_cache(u_int32_t features)
 	 * slab cache.
 	 */
 	DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
-	down(&nf_ct_cache_mutex);
+	mutex_lock(&nf_ct_cache_mutex);
 
 	write_lock_bh(&nf_ct_cache_lock);
 	if (--nf_ct_cache[features].use > 0) {
 		write_unlock_bh(&nf_ct_cache_lock);
-		up(&nf_ct_cache_mutex);
+		mutex_unlock(&nf_ct_cache_mutex);
 		return;
 	}
 	cachep = nf_ct_cache[features].cachep;
@@ -373,7 +373,7 @@ void nf_conntrack_unregister_cache(u_int32_t features)
 	kmem_cache_destroy(cachep);
 	kfree(name);
 
-	up(&nf_ct_cache_mutex);
+	mutex_unlock(&nf_ct_cache_mutex);
 }
 
 int
@@ -1408,6 +1408,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/mutex.h>
+
 
 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
  * in ip_conntrack_core, since we don't want the protocols to autoload
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 50580620e897..a27905a0ad27 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1296,13 +1296,13 @@ svc_send(struct svc_rqst *rqstp)
 		xb->page_len +
 		xb->tail[0].iov_len;
 
-	/* Grab svsk->sk_sem to serialize outgoing data. */
-	down(&svsk->sk_sem);
+	/* Grab svsk->sk_mutex to serialize outgoing data. */
+	mutex_lock(&svsk->sk_mutex);
 	if (test_bit(SK_DEAD, &svsk->sk_flags))
 		len = -ENOTCONN;
 	else
 		len = svsk->sk_sendto(rqstp);
-	up(&svsk->sk_sem);
+	mutex_unlock(&svsk->sk_mutex);
 	svc_sock_release(rqstp);
 
 	if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
@@ -1351,7 +1351,7 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock,
 	svsk->sk_lastrecv = get_seconds();
 	INIT_LIST_HEAD(&svsk->sk_deferred);
 	INIT_LIST_HEAD(&svsk->sk_ready);
-	sema_init(&svsk->sk_sem, 1);
+	mutex_init(&svsk->sk_mutex);
 
 	/* Initialize the socket */
 	if (sock->type == SOCK_DGRAM)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2b00460f2886..2b4cc2eea5b3 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -566,7 +566,7 @@ static struct sock * unix_create1(struct socket *sock)
 	u->mnt	  = NULL;
 	spin_lock_init(&u->lock);
 	atomic_set(&u->inflight, sock ? 0 : -1);
-	init_MUTEX(&u->readsem); /* single task reading lock */
+	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
@@ -623,7 +623,7 @@ static int unix_autobind(struct socket *sock)
 	struct unix_address * addr;
 	int err;
 
-	down(&u->readsem);
+	mutex_lock(&u->readlock);
 
 	err = 0;
 	if (u->addr)
@@ -661,7 +661,7 @@ retry:
 	spin_unlock(&unix_table_lock);
 	err = 0;
 
-out:	up(&u->readsem);
+out:	mutex_unlock(&u->readlock);
 	return err;
 }
 
@@ -744,7 +744,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;
 	addr_len = err;
 
-	down(&u->readsem);
+	mutex_lock(&u->readlock);
 
 	err = -EINVAL;
 	if (u->addr)
@@ -816,7 +816,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 out_unlock:
 	spin_unlock(&unix_table_lock);
 out_up:
-	up(&u->readsem);
+	mutex_unlock(&u->readlock);
 out:
 	return err;
 
@@ -1545,7 +1545,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	msg->msg_namelen = 0;
 
-	down(&u->readsem);
+	mutex_lock(&u->readlock);
 
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
@@ -1600,7 +1600,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 out_free:
 	skb_free_datagram(sk,skb);
 out_unlock:
-	up(&u->readsem);
+	mutex_unlock(&u->readlock);
 out:
 	return err;
 }
@@ -1676,7 +1676,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
 
-	down(&u->readsem);
+	mutex_lock(&u->readlock);
 
 	do
 	{
@@ -1700,7 +1700,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			err = -EAGAIN;
 			if (!timeo)
 				break;
-			up(&u->readsem);
+			mutex_unlock(&u->readlock);
 
 			timeo = unix_stream_data_wait(sk, timeo);
 
@@ -1708,7 +1708,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 				err = sock_intr_errno(timeo);
 				goto out;
 			}
-			down(&u->readsem);
+			mutex_lock(&u->readlock);
 			continue;
 		}
 
@@ -1774,7 +1774,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 	} while (size);
 
-	up(&u->readsem);
+	mutex_unlock(&u->readlock);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
 	return copied ? : err;
-- 
cgit v1.2.3


From abd596a4b68b6526c2676233e10602dd9660e9d7 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 20 Mar 2006 22:39:47 -0800
Subject: [IPV4] ARP: Alloc acceptance of unsolicited ARP via netdevice sysctl.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h |  1 +
 include/linux/sysctl.h     |  1 +
 net/ipv4/arp.c             | 20 ++++++++++----------
 net/ipv4/devinet.c         |  8 ++++++++
 4 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index fd7af86151b1..92297ff24e85 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -25,6 +25,7 @@ struct ipv4_devconf
 	int     arp_filter;
 	int	arp_announce;
 	int	arp_ignore;
+	int	arp_accept;
 	int	medium_id;
 	int	no_xfrm;
 	int	no_policy;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index dfcf449afc7c..8754568a75d7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -456,6 +456,7 @@ enum
 	NET_IPV4_CONF_ARP_ANNOUNCE=18,
 	NET_IPV4_CONF_ARP_IGNORE=19,
 	NET_IPV4_CONF_PROMOTE_SECONDARIES=20,
+	NET_IPV4_CONF_ARP_ACCEPT=21,
 	__NET_IPV4_CONF_MAX
 };
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index accdefedfed7..041dadde31af 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -879,16 +879,16 @@ static int arp_process(struct sk_buff *skb)
 
 	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
 
-#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
-	/* Unsolicited ARP is not accepted by default.
-	   It is possible, that this option should be enabled for some
-	   devices (strip is candidate)
-	 */
-	if (n == NULL &&
-	    arp->ar_op == htons(ARPOP_REPLY) &&
-	    inet_addr_type(sip) == RTN_UNICAST)
-		n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
-#endif
+	if (ipv4_devconf.arp_accept) {
+		/* Unsolicited ARP is not accepted by default.
+		   It is possible, that this option should be enabled for some
+		   devices (strip is candidate)
+		 */
+		if (n == NULL &&
+		    arp->ar_op == htons(ARPOP_REPLY) &&
+		    inet_addr_type(sip) == RTN_UNICAST)
+			n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
+	}
 
 	if (n) {
 		int state = NUD_REACHABLE;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3ffa60dadc0c..44fdf1413e2c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1393,6 +1393,14 @@ static struct devinet_sysctl_table {
 			.mode		= 0644,
 			.proc_handler	= &proc_dointvec,
 		},
+		{
+			.ctl_name	= NET_IPV4_CONF_ARP_ACCEPT,
+			.procname	= "arp_accept",
+			.data		= &ipv4_devconf.arp_accept,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
 		{
 			.ctl_name	= NET_IPV4_CONF_NOXFRM,
 			.procname	= "disable_xfrm",
-- 
cgit v1.2.3


From 15d99e02babae8bc20b836917ace07d93e318149 Mon Sep 17 00:00:00 2001
From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 20 Mar 2006 22:40:29 -0800
Subject: [TCP]: sysctl to allow TCP window > 32767 sans wscale

Back in the dark ages, we had to be conservative and only allow 15-bit
window fields if the window scale option was not negotiated.  Some
ancient stacks used a signed 16-bit quantity for the window field of
the TCP header and would get confused.

Those days are long gone, so we can use the full 16-bits by default
now.

There is a sysctl added so that we can still interact with such old
stacks

Signed-off-by: Rick Jones <rick.jones2@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 include/linux/sysctl.h                 |  1 +
 include/net/tcp.h                      |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  9 ++++++++-
 net/ipv4/tcp_output.c                  | 23 +++++++++++++++++------
 5 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 651298ba019a..f12007b80a46 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -355,6 +355,13 @@ somaxconn - INTEGER
 	Defaults to 128.  See also tcp_max_syn_backlog for additional tuning
 	for TCP sockets.
 
+tcp_workaround_signed_windows - BOOLEAN
+	If set, assume no receipt of a window scaling option means the
+	remote TCP is broken and treats the window as a signed quantity.
+	If unset, assume the remote TCP is not broken even if we do
+	not receive a window scaling option from them.
+	Default: 0
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 8754568a75d7..76eaeff76f82 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -402,6 +402,7 @@ enum
 	NET_IPV4_IPFRAG_MAX_DIST=112,
  	NET_TCP_MTU_PROBING=113,
 	NET_TCP_BASE_MSS=114,
+	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 16879fa560de..457e224de468 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -224,6 +224,7 @@ extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
 extern int sysctl_tcp_mtu_probing;
 extern int sysctl_tcp_base_mss;
+extern int sysctl_tcp_workaround_signed_windows;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ebf2e0b363c4..6b6c3adfcf00 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -680,7 +680,14 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-
+        {
+		.ctl_name	= NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,
+		.procname	= "tcp_workaround_signed_windows",
+		.data		= &sysctl_tcp_workaround_signed_windows,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 518e568b53f3..9d79546d384e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,11 @@
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse = 1;
 
+/* People can turn this on to  work with those rare, broken TCPs that
+ * interpret the window field as a signed quantity.
+ */
+int sysctl_tcp_workaround_signed_windows = 0;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
@@ -177,12 +182,18 @@ void tcp_select_initial_window(int __space, __u32 mss,
 		space = (space / mss) * mss;
 
 	/* NOTE: offering an initial window larger than 32767
-	 * will break some buggy TCP stacks. We try to be nice.
-	 * If we are not window scaling, then this truncates
-	 * our initial window offering to 32k. There should also
-	 * be a sysctl option to stop being nice.
+	 * will break some buggy TCP stacks. If the admin tells us
+	 * it is likely we could be speaking with such a buggy stack
+	 * we will truncate our initial window offering to 32K-1
+	 * unless the remote has sent us a window scaling option,
+	 * which we interpret as a sign the remote TCP is not
+	 * misinterpreting the window field as a signed quantity.
 	 */
-	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+	if (sysctl_tcp_workaround_signed_windows)
+		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+	else
+		(*rcv_wnd) = space;
+
 	(*rcv_wscale) = 0;
 	if (wscale_ok) {
 		/* Set window scaling on max possible window
@@ -241,7 +252,7 @@ static u16 tcp_select_window(struct sock *sk)
 	/* Make sure we do not exceed the maximum possible
 	 * scaled window.
 	 */
-	if (!tp->rx_opt.rcv_wscale)
+	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
 		new_win = min(new_win, MAX_TCP_WINDOW);
 	else
 		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
-- 
cgit v1.2.3


From 2c7946a7bf45ae86736ab3b43d0085e43947945c Mon Sep 17 00:00:00 2001
From: Catherine Zhang <cxzhang@watson.ibm.com>
Date: Mon, 20 Mar 2006 22:41:23 -0800
Subject: [SECURITY]: TCP/UDP getpeersec

This patch implements an application of the LSM-IPSec networking
controls whereby an application can determine the label of the
security association its TCP or UDP sockets are currently connected to
via getsockopt and the auxiliary data mechanism of recvmsg.

Patch purpose:

This patch enables a security-aware application to retrieve the
security context of an IPSec security association a particular TCP or
UDP socket is using.  The application can then use this security
context to determine the security context for processing on behalf of
the peer at the other end of this connection.  In the case of UDP, the
security context is for each individual packet.  An example
application is the inetd daemon, which could be modified to start
daemons running at security contexts dependent on the remote client.

Patch design approach:

- Design for TCP
The patch enables the SELinux LSM to set the peer security context for
a socket based on the security context of the IPSec security
association.  The application may retrieve this context using
getsockopt.  When called, the kernel determines if the socket is a
connected (TCP_ESTABLISHED) TCP socket and, if so, uses the dst_entry
cache on the socket to retrieve the security associations.  If a
security association has a security context, the context string is
returned, as for UNIX domain sockets.

- Design for UDP
Unlike TCP, UDP is connectionless.  This requires a somewhat different
API to retrieve the peer security context.  With TCP, the peer
security context stays the same throughout the connection, thus it can
be retrieved at any time between when the connection is established
and when it is torn down.  With UDP, each read/write can have
different peer and thus the security context might change every time.
As a result the security context retrieval must be done TOGETHER with
the packet retrieval.

The solution is to build upon the existing Unix domain socket API for
retrieving user credentials.  Linux offers the API for obtaining user
credentials via ancillary messages (i.e., out of band/control messages
that are bundled together with a normal message).

Patch implementation details:

- Implementation for TCP
The security context can be retrieved by applications using getsockopt
with the existing SO_PEERSEC flag.  As an example (ignoring error
checking):

getsockopt(sockfd, SOL_SOCKET, SO_PEERSEC, optbuf, &optlen);
printf("Socket peer context is: %s\n", optbuf);

The SELinux function, selinux_socket_getpeersec, is extended to check
for labeled security associations for connected (TCP_ESTABLISHED ==
sk->sk_state) TCP sockets only.  If so, the socket has a dst_cache of
struct dst_entry values that may refer to security associations.  If
these have security associations with security contexts, the security
context is returned.

getsockopt returns a buffer that contains a security context string or
the buffer is unmodified.

- Implementation for UDP
To retrieve the security context, the application first indicates to
the kernel such desire by setting the IP_PASSSEC option via
getsockopt.  Then the application retrieves the security context using
the auxiliary data mechanism.

An example server application for UDP should look like this:

toggle = 1;
toggle_len = sizeof(toggle);

setsockopt(sockfd, SOL_IP, IP_PASSSEC, &toggle, &toggle_len);
recvmsg(sockfd, &msg_hdr, 0);
if (msg_hdr.msg_controllen > sizeof(struct cmsghdr)) {
    cmsg_hdr = CMSG_FIRSTHDR(&msg_hdr);
    if (cmsg_hdr->cmsg_len <= CMSG_LEN(sizeof(scontext)) &&
        cmsg_hdr->cmsg_level == SOL_IP &&
        cmsg_hdr->cmsg_type == SCM_SECURITY) {
        memcpy(&scontext, CMSG_DATA(cmsg_hdr), sizeof(scontext));
    }
}

ip_setsockopt is enhanced with a new socket option IP_PASSSEC to allow
a server socket to receive security context of the peer.  A new
ancillary message type SCM_SECURITY.

When the packet is received we get the security context from the
sec_path pointer which is contained in the sk_buff, and copy it to the
ancillary message space.  An additional LSM hook,
selinux_socket_getpeersec_udp, is defined to retrieve the security
context from the SELinux space.  The existing function,
selinux_socket_getpeersec does not suit our purpose, because the
security context is copied directly to user space, rather than to
kernel space.

Testing:

We have tested the patch by setting up TCP and UDP connections between
applications on two machines using the IPSec policies that result in
labeled security associations being built.  For TCP, we can then
extract the peer security context using getsockopt on either end.  For
UDP, the receiving end can retrieve the security context using the
auxiliary data mechanism of recvmsg.

Signed-off-by: Catherine Zhang <cxzhang@watson.ibm.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h              |  1 +
 include/linux/security.h        | 25 +++++++++++----
 include/linux/socket.h          |  1 +
 net/core/sock.c                 |  2 +-
 net/ipv4/ip_sockglue.c          | 31 ++++++++++++++++++-
 security/dummy.c                | 10 ++++--
 security/selinux/hooks.c        | 46 +++++++++++++++++++++++-----
 security/selinux/include/xfrm.h |  2 ++
 security/selinux/xfrm.c         | 68 +++++++++++++++++++++++++++++++++++++++++
 9 files changed, 169 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/in.h b/include/linux/in.h
index ba355384016a..94f557fa4636 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -72,6 +72,7 @@ struct in_addr {
 #define IP_FREEBIND	15
 #define IP_IPSEC_POLICY	16
 #define IP_XFRM_POLICY	17
+#define IP_PASSSEC	18
 
 /* BSD compatibility */
 #define IP_RECVRETOPTS	IP_RETOPTS
diff --git a/include/linux/security.h b/include/linux/security.h
index 7cbef482e13a..b18eb8cfa639 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1286,7 +1286,8 @@ struct security_operations {
 	int (*socket_setsockopt) (struct socket * sock, int level, int optname);
 	int (*socket_shutdown) (struct socket * sock, int how);
 	int (*socket_sock_rcv_skb) (struct sock * sk, struct sk_buff * skb);
-	int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len);
+	int (*socket_getpeersec_stream) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len);
+	int (*socket_getpeersec_dgram) (struct sk_buff *skb, char **secdata, u32 *seclen);
 	int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
 	void (*sk_free_security) (struct sock *sk);
 	unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir);
@@ -2741,10 +2742,16 @@ static inline int security_sock_rcv_skb (struct sock * sk,
 	return security_ops->socket_sock_rcv_skb (sk, skb);
 }
 
-static inline int security_socket_getpeersec(struct socket *sock, char __user *optval,
-					     int __user *optlen, unsigned len)
+static inline int security_socket_getpeersec_stream(struct socket *sock, char __user *optval,
+						    int __user *optlen, unsigned len)
 {
-	return security_ops->socket_getpeersec(sock, optval, optlen, len);
+	return security_ops->socket_getpeersec_stream(sock, optval, optlen, len);
+}
+
+static inline int security_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata,
+						   u32 *seclen)
+{
+	return security_ops->socket_getpeersec_dgram(skb, secdata, seclen);
 }
 
 static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
@@ -2863,8 +2870,14 @@ static inline int security_sock_rcv_skb (struct sock * sk,
 	return 0;
 }
 
-static inline int security_socket_getpeersec(struct socket *sock, char __user *optval,
-					     int __user *optlen, unsigned len)
+static inline int security_socket_getpeersec_stream(struct socket *sock, char __user *optval,
+						    int __user *optlen, unsigned len)
+{
+	return -ENOPROTOOPT;
+}
+
+static inline int security_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata,
+						   u32 *seclen)
 {
 	return -ENOPROTOOPT;
 }
diff --git a/include/linux/socket.h b/include/linux/socket.h
index b02dda4ee83d..9ab2ddd80221 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -150,6 +150,7 @@ __KINLINE struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__
 
 #define	SCM_RIGHTS	0x01		/* rw: access rights (array of int) */
 #define SCM_CREDENTIALS 0x02		/* rw: struct ucred		*/
+#define SCM_SECURITY	0x03		/* rw: security label		*/
 
 struct ucred {
 	__u32	pid;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6e00811d44bc..5038a5a7bd84 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -616,7 +616,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 			break;
 
 		case SO_PEERSEC:
-			return security_socket_getpeersec(sock, optval, optlen, len);
+			return security_socket_getpeersec_stream(sock, optval, optlen, len);
 
 		default:
 			return(-ENOPROTOOPT);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 2bf8d782f678..b5c4f61518e8 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -50,6 +50,7 @@
 #define IP_CMSG_TOS		4
 #define IP_CMSG_RECVOPTS	8
 #define IP_CMSG_RETOPTS		16
+#define IP_CMSG_PASSSEC		32
 
 /*
  *	SOL_IP control messages.
@@ -109,6 +110,19 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
 }
 
+static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
+{
+	char *secdata;
+	u32 seclen;
+	int err;
+
+	err = security_socket_getpeersec_dgram(skb, &secdata, &seclen);
+	if (err)
+		return;
+
+	put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
+}
+
 
 void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 {
@@ -138,6 +152,11 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 
 	if (flags & 1)
 		ip_cmsg_recv_retopts(msg, skb);
+	if ((flags>>=1) == 0)
+		return;
+
+	if (flags & 1)
+		ip_cmsg_recv_security(msg, skb);
 }
 
 int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
@@ -393,7 +412,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			    (1<<IP_RETOPTS) | (1<<IP_TOS) | 
 			    (1<<IP_TTL) | (1<<IP_HDRINCL) | 
 			    (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | 
-			    (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND))) || 
+			    (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
+			    (1<<IP_PASSSEC))) ||
 				optname == IP_MULTICAST_TTL || 
 				optname == IP_MULTICAST_LOOP) { 
 		if (optlen >= sizeof(int)) {
@@ -478,6 +498,12 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			else
 				inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
 			break;
+		case IP_PASSSEC:
+			if (val)
+				inet->cmsg_flags |= IP_CMSG_PASSSEC;
+			else
+				inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
+			break;
 		case IP_TOS:	/* This sets both TOS and Precedence */
 			if (sk->sk_type == SOCK_STREAM) {
 				val &= ~3;
@@ -932,6 +958,9 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		case IP_RETOPTS:
 			val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
 			break;
+		case IP_PASSSEC:
+			val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
+			break;
 		case IP_TOS:
 			val = inet->tos;
 			break;
diff --git a/security/dummy.c b/security/dummy.c
index f1a5bd98bf10..a326d6e0b6d7 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -763,8 +763,14 @@ static int dummy_socket_sock_rcv_skb (struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static int dummy_socket_getpeersec(struct socket *sock, char __user *optval,
-				   int __user *optlen, unsigned len)
+static int dummy_socket_getpeersec_stream(struct socket *sock, char __user *optval,
+					  int __user *optlen, unsigned len)
+{
+	return -ENOPROTOOPT;
+}
+
+static int dummy_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata,
+					 u32 *seclen)
 {
 	return -ENOPROTOOPT;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b65c201e9ff5..5b16196f2823 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3318,24 +3318,38 @@ out:
 	return err;
 }
 
-static int selinux_socket_getpeersec(struct socket *sock, char __user *optval,
-				     int __user *optlen, unsigned len)
+static int selinux_socket_getpeersec_stream(struct socket *sock, char __user *optval,
+					    int __user *optlen, unsigned len)
 {
 	int err = 0;
 	char *scontext;
 	u32 scontext_len;
 	struct sk_security_struct *ssec;
 	struct inode_security_struct *isec;
+	u32 peer_sid = 0;
 
 	isec = SOCK_INODE(sock)->i_security;
-	if (isec->sclass != SECCLASS_UNIX_STREAM_SOCKET) {
+
+	/* if UNIX_STREAM check peer_sid, if TCP check dst for labelled sa */
+	if (isec->sclass == SECCLASS_UNIX_STREAM_SOCKET) {
+		ssec = sock->sk->sk_security;
+		peer_sid = ssec->peer_sid;
+	}
+	else if (isec->sclass == SECCLASS_TCP_SOCKET) {
+		peer_sid = selinux_socket_getpeer_stream(sock->sk);
+
+		if (peer_sid == SECSID_NULL) {
+			err = -ENOPROTOOPT;
+			goto out;
+		}
+	}
+	else {
 		err = -ENOPROTOOPT;
 		goto out;
 	}
 
-	ssec = sock->sk->sk_security;
-	
-	err = security_sid_to_context(ssec->peer_sid, &scontext, &scontext_len);
+	err = security_sid_to_context(peer_sid, &scontext, &scontext_len);
+
 	if (err)
 		goto out;
 
@@ -3356,6 +3370,23 @@ out:
 	return err;
 }
 
+static int selinux_socket_getpeersec_dgram(struct sk_buff *skb, char **secdata, u32 *seclen)
+{
+	int err = 0;
+	u32 peer_sid = selinux_socket_getpeer_dgram(skb);
+
+	if (peer_sid == SECSID_NULL)
+		return -EINVAL;
+
+	err = security_sid_to_context(peer_sid, secdata, seclen);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+
+
 static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
 {
 	return sk_alloc_security(sk, family, priority);
@@ -4344,7 +4375,8 @@ static struct security_operations selinux_ops = {
 	.socket_setsockopt =		selinux_socket_setsockopt,
 	.socket_shutdown =		selinux_socket_shutdown,
 	.socket_sock_rcv_skb =		selinux_socket_sock_rcv_skb,
-	.socket_getpeersec =		selinux_socket_getpeersec,
+	.socket_getpeersec_stream =	selinux_socket_getpeersec_stream,
+	.socket_getpeersec_dgram =	selinux_socket_getpeersec_dgram,
 	.sk_alloc_security =		selinux_sk_alloc_security,
 	.sk_free_security =		selinux_sk_free_security,
 	.sk_getsid = 			selinux_sk_getsid_security,
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 8e87996c6dd5..a7f388bff3f2 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -39,6 +39,8 @@ static inline u32 selinux_no_sk_sid(struct flowi *fl)
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb);
 int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb);
+u32 selinux_socket_getpeer_stream(struct sock *sk);
+u32 selinux_socket_getpeer_dgram(struct sk_buff *skb);
 #else
 static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
 {
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index b2af7ca496c1..dfab6c886698 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -224,6 +224,74 @@ void selinux_xfrm_state_free(struct xfrm_state *x)
 		kfree(ctx);
 }
 
+/*
+ * SELinux internal function to retrieve the context of a connected
+ * (sk->sk_state == TCP_ESTABLISHED) TCP socket based on its security
+ * association used to connect to the remote socket.
+ *
+ * Retrieve via getsockopt SO_PEERSEC.
+ */
+u32 selinux_socket_getpeer_stream(struct sock *sk)
+{
+	struct dst_entry *dst, *dst_test;
+	u32 peer_sid = SECSID_NULL;
+
+	if (sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	dst = sk_dst_get(sk);
+	if (!dst)
+		goto out;
+
+ 	for (dst_test = dst; dst_test != 0;
+      	     dst_test = dst_test->child) {
+		struct xfrm_state *x = dst_test->xfrm;
+
+ 		if (x && selinux_authorizable_xfrm(x)) {
+	 	 	struct xfrm_sec_ctx *ctx = x->security;
+			peer_sid = ctx->ctx_sid;
+			break;
+		}
+	}
+	dst_release(dst);
+
+out:
+	return peer_sid;
+}
+
+/*
+ * SELinux internal function to retrieve the context of a UDP packet
+ * based on its security association used to connect to the remote socket.
+ *
+ * Retrieve via setsockopt IP_PASSSEC and recvmsg with control message
+ * type SCM_SECURITY.
+ */
+u32 selinux_socket_getpeer_dgram(struct sk_buff *skb)
+{
+	struct sec_path *sp;
+
+	if (skb == NULL)
+		return SECSID_NULL;
+
+	if (skb->sk->sk_protocol != IPPROTO_UDP)
+		return SECSID_NULL;
+
+	sp = skb->sp;
+	if (sp) {
+		int i;
+
+		for (i = sp->len-1; i >= 0; i--) {
+			struct xfrm_state *x = sp->x[i].xvec;
+			if (selinux_authorizable_xfrm(x)) {
+				struct xfrm_sec_ctx *ctx = x->security;
+				return ctx->ctx_sid;
+			}
+		}
+	}
+
+	return SECSID_NULL;
+}
+
 /*
  * LSM hook that controls access to unlabelled packets.  If
  * a xfrm_state is authorizable (defined by macro) then it was
-- 
cgit v1.2.3


From c4ea94ab3710eb2434abe2eab1a479c2dc01f8ac Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <steve@chygwyn.com>
Date: Mon, 20 Mar 2006 22:42:39 -0800
Subject: [DECnet]: Endian annotation and fixes for DECnet.

The typedef for dn_address has been removed in favour of using __le16
or __u16 directly as appropriate. All the DECnet header files are
updated accordingly.

The byte ordering of dn_eth2dn() and dn_dn2eth() are both changed
since just about all their callers wanted network order rather than
host order, so the conversion is now done in the functions themselves.

Several missed endianess conversions have been picked up during the
conversion process. The nh_gw field in struct dn_fib_info has been
changed from a 32 bit field to 16 bits as it ought to be.

One or two cases of using htons rather than dn_htons in the routing
code have been found and fixed.

There are still a few warnings to fix, but this patch deals with the
important cases.

Signed-off-by: Steven Whitehouse <steve@chygwyn.com>
Signed-off-by: Patrick Caulfield <patrick@tykepenguin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dn.h             |  44 ++++++++---------
 include/net/dn.h               | 105 ++++++++++++++++++++---------------------
 include/net/dn_dev.h           |  88 +++++++++++++++++-----------------
 include/net/dn_fib.h           |  22 ++++-----
 include/net/dn_neigh.h         |   4 +-
 include/net/dn_nsp.h           |  72 ++++++++++++++--------------
 include/net/dn_route.h         |  12 ++---
 include/net/flow.h             |   8 ++--
 net/decnet/af_decnet.c         |  16 +++----
 net/decnet/dn_dev.c            |  34 ++++++-------
 net/decnet/dn_fib.c            |   8 ++--
 net/decnet/dn_neigh.c          |  24 +++++-----
 net/decnet/dn_nsp_in.c         |  28 +++++------
 net/decnet/dn_nsp_out.c        |  38 +++++++--------
 net/decnet/dn_route.c          |  57 +++++++++++-----------
 net/decnet/dn_rules.c          |  18 +++----
 net/decnet/dn_table.c          |  12 ++---
 net/decnet/sysctl_net_decnet.c |  12 ++---
 18 files changed, 302 insertions(+), 300 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dn.h b/include/linux/dn.h
index 782cae49e64c..10b6a6fd5837 100644
--- a/include/linux/dn.h
+++ b/include/linux/dn.h
@@ -71,17 +71,17 @@
 
 struct dn_naddr 
 {
-	unsigned short		a_len;
-	unsigned char a_addr[DN_MAXADDL];
+	__le16		a_len;
+	__u8 a_addr[DN_MAXADDL]; /* Two bytes little endian */
 };
 
 struct sockaddr_dn
 {
-	unsigned short		sdn_family;
-	unsigned char		sdn_flags;
-	unsigned char		sdn_objnum;
-	unsigned short		sdn_objnamel;
-	unsigned char		sdn_objname[DN_MAXOBJL];
+	__u16		sdn_family;
+	__u8		sdn_flags;
+	__u8		sdn_objnum;
+	__le16		sdn_objnamel;
+	__u8		sdn_objname[DN_MAXOBJL];
 	struct   dn_naddr	sdn_add;
 };
 #define sdn_nodeaddrl   sdn_add.a_len   /* Node address length  */
@@ -93,38 +93,38 @@ struct sockaddr_dn
  * DECnet set/get DSO_CONDATA, DSO_DISDATA (optional data) structure
  */
 struct optdata_dn {
-        unsigned short  opt_status;     /* Extended status return */
+        __le16  opt_status;     /* Extended status return */
 #define opt_sts opt_status
-        unsigned short  opt_optl;       /* Length of user data    */
-        unsigned char   opt_data[16];   /* User data              */
+        __le16  opt_optl;       /* Length of user data    */
+        __u8   opt_data[16];   /* User data              */
 };
 
 struct accessdata_dn
 {
-	unsigned char		acc_accl;
-	unsigned char		acc_acc[DN_MAXACCL];
-	unsigned char 		acc_passl;
-	unsigned char		acc_pass[DN_MAXACCL];
-	unsigned char 		acc_userl;
-	unsigned char		acc_user[DN_MAXACCL];
+	__u8		acc_accl;
+	__u8		acc_acc[DN_MAXACCL];
+	__u8 		acc_passl;
+	__u8		acc_pass[DN_MAXACCL];
+	__u8 		acc_userl;
+	__u8		acc_user[DN_MAXACCL];
 };
 
 /*
  * DECnet logical link information structure
  */
 struct linkinfo_dn {
-        unsigned short  idn_segsize;    /* Segment size for link */
-        unsigned char   idn_linkstate;  /* Logical link state    */
+        __le16  idn_segsize;    /* Segment size for link */
+        __u8   idn_linkstate;  /* Logical link state    */
 };
 
 /*
  * Ethernet address format (for DECnet)
  */
 union etheraddress {
-        unsigned char dne_addr[6];             /* Full ethernet address */
+        __u8 dne_addr[6];             /* Full ethernet address */
   struct {
-                unsigned char dne_hiord[4];    /* DECnet HIORD prefix   */
-                unsigned char dne_nodeaddr[2]; /* DECnet node address   */
+                __u8 dne_hiord[4];    /* DECnet HIORD prefix   */
+                __u8 dne_nodeaddr[2]; /* DECnet node address   */
   } dne_remote;
 };
 
@@ -133,7 +133,7 @@ union etheraddress {
  * DECnet physical socket address format
  */
 struct dn_addr {
-        unsigned short dna_family;      /* AF_DECnet               */
+        __le16 dna_family;      /* AF_DECnet               */
         union etheraddress dna_netaddr; /* DECnet ethernet address */
 };
 
diff --git a/include/net/dn.h b/include/net/dn.h
index a4b6168e1e25..465b78302782 100644
--- a/include/net/dn.h
+++ b/include/net/dn.h
@@ -6,10 +6,8 @@
 #include <net/tcp.h>
 #include <asm/byteorder.h>
 
-typedef unsigned short dn_address;
-
-#define dn_ntohs(x) le16_to_cpu((unsigned short)(x))
-#define dn_htons(x) cpu_to_le16((unsigned short)(x))
+#define dn_ntohs(x) le16_to_cpu(x)
+#define dn_htons(x) cpu_to_le16(x)
 
 struct dn_scp                                   /* Session Control Port */
 {
@@ -31,36 +29,36 @@ struct dn_scp                                   /* Session Control Port */
 #define DN_CL    15                     /* Closed               */
 #define DN_CN    16                     /* Closed Notification  */
 
-        unsigned short          addrloc;
-        unsigned short          addrrem;
-        unsigned short          numdat;
-        unsigned short          numoth;
-        unsigned short          numoth_rcv;
-        unsigned short          numdat_rcv;
-        unsigned short          ackxmt_dat;
-        unsigned short          ackxmt_oth;
-        unsigned short          ackrcv_dat;
-        unsigned short          ackrcv_oth;
-        unsigned char           flowrem_sw;
-	unsigned char		flowloc_sw;
+        __le16          addrloc;
+        __le16          addrrem;
+        __u16          numdat;
+        __u16          numoth;
+        __u16          numoth_rcv;
+        __u16          numdat_rcv;
+        __u16          ackxmt_dat;
+        __u16          ackxmt_oth;
+        __u16          ackrcv_dat;
+        __u16          ackrcv_oth;
+        __u8           flowrem_sw;
+	__u8           flowloc_sw;
 #define DN_SEND         2
 #define DN_DONTSEND     1
 #define DN_NOCHANGE     0
-	unsigned short		flowrem_dat;
-	unsigned short		flowrem_oth;
-	unsigned short		flowloc_dat;
-	unsigned short		flowloc_oth;
-	unsigned char		services_rem;
-	unsigned char		services_loc;
-	unsigned char		info_rem;
-	unsigned char		info_loc;
-
-	unsigned short		segsize_rem;
-	unsigned short		segsize_loc;
-
-	unsigned char		nonagle;
-	unsigned char		multi_ireq;
-	unsigned char		accept_mode;
+	__u16		flowrem_dat;
+	__u16		flowrem_oth;
+	__u16		flowloc_dat;
+	__u16		flowloc_oth;
+	__u8		services_rem;
+	__u8		services_loc;
+	__u8		info_rem;
+	__u8		info_loc;
+
+	__u16		segsize_rem;
+	__u16		segsize_loc;
+
+	__u8		nonagle;
+	__u8		multi_ireq;
+	__u8		accept_mode;
 	unsigned long		seg_total; /* Running total of current segment */
 
 	struct optdata_dn     conndata_in;
@@ -160,40 +158,41 @@ static inline struct dn_scp *DN_SK(struct sock *sk)
  */
 #define DN_SKB_CB(skb) ((struct dn_skb_cb *)(skb)->cb)
 struct dn_skb_cb {
-	unsigned short dst;
-	unsigned short src;
-	unsigned short hops;
-	unsigned short dst_port;
-	unsigned short src_port;
-	unsigned char services;
-	unsigned char info;
-	unsigned char rt_flags;
-	unsigned char nsp_flags;
-	unsigned short segsize;
-	unsigned short segnum;
-	unsigned short xmit_count;
+	__le16 dst;
+	__le16 src;
+	__u16 hops;
+	__le16 dst_port;
+	__le16 src_port;
+	__u8 services;
+	__u8 info;
+	__u8 rt_flags;
+	__u8 nsp_flags;
+	__u16 segsize;
+	__u16 segnum;
+	__u16 xmit_count;
 	unsigned long stamp;
 	int iif;
 };
 
-static inline dn_address dn_eth2dn(unsigned char *ethaddr)
+static inline __le16 dn_eth2dn(unsigned char *ethaddr)
 {
-	return ethaddr[4] | (ethaddr[5] << 8);
+	return dn_htons(ethaddr[4] | (ethaddr[5] << 8));
 }
 
-static inline dn_address dn_saddr2dn(struct sockaddr_dn *saddr)
+static inline __le16 dn_saddr2dn(struct sockaddr_dn *saddr)
 {
-	return *(dn_address *)saddr->sdn_nodeaddr;
+	return *(__le16 *)saddr->sdn_nodeaddr;
 }
 
-static inline void dn_dn2eth(unsigned char *ethaddr, dn_address addr)
+static inline void dn_dn2eth(unsigned char *ethaddr, __le16 addr)
 {
+	__u16 a = dn_ntohs(addr);
 	ethaddr[0] = 0xAA;
 	ethaddr[1] = 0x00;
 	ethaddr[2] = 0x04;
 	ethaddr[3] = 0x00;
-	ethaddr[4] = (unsigned char)(addr & 0xff);
-	ethaddr[5] = (unsigned char)(addr >> 8);
+	ethaddr[4] = (__u8)(a & 0xff);
+	ethaddr[5] = (__u8)(a >> 8);
 }
 
 static inline void dn_sk_ports_copy(struct flowi *fl, struct dn_scp *scp)
@@ -202,7 +201,7 @@ static inline void dn_sk_ports_copy(struct flowi *fl, struct dn_scp *scp)
 	fl->uli_u.dnports.dport = scp->addrrem;
 	fl->uli_u.dnports.objnum = scp->addr.sdn_objnum;
 	if (fl->uli_u.dnports.objnum == 0) {
-		fl->uli_u.dnports.objnamel = scp->addr.sdn_objnamel;
+		fl->uli_u.dnports.objnamel = (__u8)dn_ntohs(scp->addr.sdn_objnamel);
 		memcpy(fl->uli_u.dnports.objname, scp->addr.sdn_objname, 16);
 	}
 }
@@ -217,7 +216,7 @@ extern unsigned dn_mss_from_pmtu(struct net_device *dev, int mtu);
 extern struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr);
 extern struct sock *dn_find_by_skb(struct sk_buff *skb);
 #define DN_ASCBUF_LEN 9
-extern char *dn_addr2asc(dn_address, char *);
+extern char *dn_addr2asc(__u16, char *);
 extern int dn_destroy_timer(struct sock *sk);
 
 extern int dn_sockaddr2username(struct sockaddr_dn *addr, unsigned char *buf, unsigned char type);
@@ -226,7 +225,7 @@ extern int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn
 extern void dn_start_slow_timer(struct sock *sk);
 extern void dn_stop_slow_timer(struct sock *sk);
 
-extern dn_address decnet_address;
+extern __le16 decnet_address;
 extern int decnet_debug_level;
 extern int decnet_time_wait;
 extern int decnet_dn_count;
diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h
index 5a86e78081bf..cee46821dc53 100644
--- a/include/net/dn_dev.h
+++ b/include/net/dn_dev.h
@@ -7,11 +7,11 @@ struct dn_dev;
 struct dn_ifaddr {
 	struct dn_ifaddr *ifa_next;
 	struct dn_dev    *ifa_dev;
-	dn_address       ifa_local;
-	dn_address       ifa_address;
-	unsigned char    ifa_flags;
-	unsigned char    ifa_scope;
-	char             ifa_label[IFNAMSIZ];
+	__le16            ifa_local;
+	__le16            ifa_address;
+	__u8              ifa_flags;
+	__u8              ifa_scope;
+	char              ifa_label[IFNAMSIZ];
 };
 
 #define DN_DEV_S_RU  0 /* Run - working normally   */
@@ -91,7 +91,7 @@ struct dn_dev {
 	struct timer_list timer;
 	unsigned long t3;
 	struct neigh_parms *neigh_parms;
-	unsigned char addr[ETH_ALEN];
+	__u8 addr[ETH_ALEN];
 	struct neighbour *router; /* Default router on circuit */
 	struct neighbour *peer;   /* Peer on pointopoint links */
 	unsigned long uptime;     /* Time device went up in jiffies */
@@ -99,56 +99,56 @@ struct dn_dev {
 
 struct dn_short_packet
 {
-	unsigned char   msgflg;
-	unsigned short  dstnode;
-	unsigned short  srcnode;
-	unsigned char   forward;
+	__u8    msgflg;
+	__le16 dstnode;
+	__le16 srcnode;
+	__u8   forward;
 } __attribute__((packed));
 
 struct dn_long_packet
 {
-	unsigned char   msgflg;
-	unsigned char   d_area;
-	unsigned char   d_subarea;
-	unsigned char   d_id[6];
-	unsigned char   s_area;
-	unsigned char   s_subarea;
-	unsigned char   s_id[6];
-	unsigned char   nl2;
-	unsigned char   visit_ct;
-	unsigned char   s_class;
-	unsigned char   pt;
+	__u8   msgflg;
+	__u8   d_area;
+	__u8   d_subarea;
+	__u8   d_id[6];
+	__u8   s_area;
+	__u8   s_subarea;
+	__u8   s_id[6];
+	__u8   nl2;
+	__u8   visit_ct;
+	__u8   s_class;
+	__u8   pt;
 } __attribute__((packed));
 
 /*------------------------- DRP - Routing messages ---------------------*/
 
 struct endnode_hello_message
 {
-	unsigned char   msgflg;
-	unsigned char   tiver[3];
-	unsigned char   id[6];
-	unsigned char   iinfo;
-	unsigned short  blksize;
-	unsigned char   area;
-	unsigned char   seed[8];
-	unsigned char   neighbor[6];
-	unsigned short  timer;
-	unsigned char   mpd;
-	unsigned char   datalen;
-	unsigned char   data[2];
+	__u8   msgflg;
+	__u8   tiver[3];
+	__u8   id[6];
+	__u8   iinfo;
+	__le16 blksize;
+	__u8   area;
+	__u8   seed[8];
+	__u8   neighbor[6];
+	__le16 timer;
+	__u8   mpd;
+	__u8   datalen;
+	__u8   data[2];
 } __attribute__((packed));
 
 struct rtnode_hello_message
 {
-	unsigned char   msgflg;
-	unsigned char   tiver[3];
-	unsigned char   id[6];
-	unsigned char   iinfo;
-	unsigned short  blksize;
-	unsigned char   priority;
-	unsigned char   area;
-	unsigned short  timer;
-	unsigned char   mpd;
+	__u8   msgflg;
+	__u8   tiver[3];
+	__u8   id[6];
+	__u8   iinfo;
+	__le16  blksize;
+	__u8   priority;
+	__u8   area;
+	__le16  timer;
+	__u8   mpd;
 } __attribute__((packed));
 
 
@@ -169,12 +169,12 @@ extern void dn_dev_down(struct net_device *);
 
 extern int dn_dev_set_default(struct net_device *dev, int force);
 extern struct net_device *dn_dev_get_default(void);
-extern int dn_dev_bind_default(dn_address *addr);
+extern int dn_dev_bind_default(__le16 *addr);
 
 extern int register_dnaddr_notifier(struct notifier_block *nb);
 extern int unregister_dnaddr_notifier(struct notifier_block *nb);
 
-static inline int dn_dev_islocal(struct net_device *dev, dn_address addr)
+static inline int dn_dev_islocal(struct net_device *dev, __le16 addr)
 {
 	struct dn_dev *dn_db = dev->dn_ptr;
 	struct dn_ifaddr *ifa;
diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h
index cd3c96d9601b..a15dcf0d5c1e 100644
--- a/include/net/dn_fib.h
+++ b/include/net/dn_fib.h
@@ -37,7 +37,7 @@ struct dn_fib_nh {
 	int			nh_weight;
 	int			nh_power;
 	int			nh_oif;
-	u32			nh_gw;
+	__le16			nh_gw;
 };
 
 struct dn_fib_info {
@@ -48,7 +48,7 @@ struct dn_fib_info {
 	int			fib_dead;
 	unsigned		fib_flags;
 	int			fib_protocol;
-	dn_address		fib_prefsrc;
+	__le16			fib_prefsrc;
 	__u32			fib_priority;
 	__u32			fib_metrics[RTAX_MAX];
 #define dn_fib_mtu  fib_metrics[RTAX_MTU-1]
@@ -71,15 +71,15 @@ struct dn_fib_info {
 #define DN_FIB_RES_OIF(res)	(DN_FIB_RES_NH(res).nh_oif)
 
 typedef struct {
-	u16	datum;
+	__le16	datum;
 } dn_fib_key_t;
 
 typedef struct {
-	u16	datum;
+	__le16	datum;
 } dn_fib_hash_t;
 
 typedef struct {
-	u16	datum;
+	__u16	datum;
 } dn_fib_idx_t;
 
 struct dn_fib_node {
@@ -126,11 +126,11 @@ extern int dn_fib_semantic_match(int type, struct dn_fib_info *fi,
 			const struct flowi *fl,
 			struct dn_fib_res *res);
 extern void dn_fib_release_info(struct dn_fib_info *fi);
-extern u16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type);
+extern __le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type);
 extern void dn_fib_flush(void);
 extern void dn_fib_select_multipath(const struct flowi *fl,
 					struct dn_fib_res *res);
-extern int dn_fib_sync_down(dn_address local, struct net_device *dev, 
+extern int dn_fib_sync_down(__le16 local, struct net_device *dev,
 				int force);
 extern int dn_fib_sync_up(struct net_device *dev);
 
@@ -148,8 +148,8 @@ extern void dn_fib_table_cleanup(void);
 extern void dn_fib_rules_init(void);
 extern void dn_fib_rules_cleanup(void);
 extern void dn_fib_rule_put(struct dn_fib_rule *);
-extern __u16 dn_fib_rules_policy(__u16 saddr, struct dn_fib_res *res, unsigned *flags);
-extern unsigned dnet_addr_type(__u16 addr);
+extern __le16 dn_fib_rules_policy(__le16 saddr, struct dn_fib_res *res, unsigned *flags);
+extern unsigned dnet_addr_type(__le16 addr);
 extern int dn_fib_lookup(const struct flowi *fl, struct dn_fib_res *res);
 
 /*
@@ -194,10 +194,10 @@ extern struct dn_fib_table *dn_fib_tables[];
 
 #endif /* CONFIG_DECNET_ROUTER */
 
-static inline u16 dnet_make_mask(int n)
+static inline __le16 dnet_make_mask(int n)
 {
         if (n)
-                return htons(~((1<<(16-n))-1));
+                return dn_htons(~((1<<(16-n))-1));
         return 0;
 }
 
diff --git a/include/net/dn_neigh.h b/include/net/dn_neigh.h
index 4b1eb038d637..4cb4ae7fb81f 100644
--- a/include/net/dn_neigh.h
+++ b/include/net/dn_neigh.h
@@ -7,13 +7,13 @@
  */
 struct dn_neigh {
         struct neighbour n;
-	dn_address addr;
+	__le16 addr;
         unsigned long flags;
 #define DN_NDFLAG_R1    0x0001 /* Router L1      */
 #define DN_NDFLAG_R2    0x0002 /* Router L2      */
 #define DN_NDFLAG_P3    0x0004 /* Phase III Node */
         unsigned long blksize;
-	unsigned char priority;
+	__u8 priority;
 };
 
 extern void dn_neigh_init(void);
diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h
index e6182b86262b..96e816b6974d 100644
--- a/include/net/dn_nsp.h
+++ b/include/net/dn_nsp.h
@@ -72,77 +72,77 @@ extern struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int nobl
 
 struct nsp_data_seg_msg
 {
-	unsigned char   msgflg;
-	unsigned short  dstaddr;
-	unsigned short  srcaddr;
+	__u8   msgflg;
+	__le16 dstaddr;
+	__le16 srcaddr;
 } __attribute__((packed));
 
 struct nsp_data_opt_msg
 {
-	unsigned short  acknum;
-	unsigned short  segnum;
-	unsigned short  lsflgs;
+	__le16 acknum;
+	__le16 segnum;
+	__le16 lsflgs;
 } __attribute__((packed));
 
 struct nsp_data_opt_msg1
 {
-	unsigned short  acknum;
-	unsigned short  segnum;
+	__le16 acknum;
+	__le16 segnum;
 } __attribute__((packed));
 
 
 /* Acknowledgment Message (data/other data)                             */
 struct nsp_data_ack_msg
 {
-	unsigned char   msgflg;
-	unsigned short  dstaddr;
-	unsigned short  srcaddr;
-	unsigned short  acknum;
+	__u8   msgflg;
+	__le16 dstaddr;
+	__le16 srcaddr;
+	__le16 acknum;
 } __attribute__((packed));
 
 /* Connect Acknowledgment Message */
 struct  nsp_conn_ack_msg
 {
-	unsigned char   msgflg;
-	unsigned short  dstaddr;
+	__u8 msgflg;
+	__le16 dstaddr;
 } __attribute__((packed));
 
 
 /* Connect Initiate/Retransmit Initiate/Connect Confirm */
 struct  nsp_conn_init_msg
 {
-	unsigned char   msgflg;
+	__u8   msgflg;
 #define NSP_CI      0x18            /* Connect Initiate     */
 #define NSP_RCI     0x68            /* Retrans. Conn Init   */
-	unsigned short  dstaddr;
-	unsigned short  srcaddr;
-	unsigned char   services;
+	__le16 dstaddr;
+	__le16 srcaddr;
+	__u8   services;
 #define NSP_FC_NONE   0x00            /* Flow Control None    */
 #define NSP_FC_SRC    0x04            /* Seg Req. Count       */
 #define NSP_FC_SCMC   0x08            /* Sess. Control Mess   */
 #define NSP_FC_MASK   0x0c            /* FC type mask         */
-	unsigned char   info;
-	unsigned short  segsize;
+	__u8   info;
+	__le16 segsize;
 } __attribute__((packed));
 
 /* Disconnect Initiate/Disconnect Confirm */
 struct  nsp_disconn_init_msg
 {
-	unsigned char   msgflg;
-	unsigned short  dstaddr;
-	unsigned short  srcaddr;
-	unsigned short  reason;
+	__u8   msgflg;
+	__le16 dstaddr;
+	__le16 srcaddr;
+	__le16 reason;
 } __attribute__((packed));
 
 
 struct  srcobj_fmt
 {
-	char            format;
-	unsigned char   task;
-	unsigned short  grpcode;
-	unsigned short  usrcode;
-	char            dlen;
+	__u8   format;
+	__u8   task;
+	__le16 grpcode;
+	__le16 usrcode;
+	__u8   dlen;
 } __attribute__((packed));
 
 /*
@@ -150,7 +150,7 @@ struct  srcobj_fmt
  * numbers used in NSP. Similar in operation to the functions
  * of the same name in TCP.
  */
-static __inline__ int dn_before(unsigned short seq1, unsigned short seq2)
+static __inline__ int dn_before(__u16 seq1, __u16 seq2)
 {
         seq1 &= 0x0fff;
         seq2 &= 0x0fff;
@@ -159,7 +159,7 @@ static __inline__ int dn_before(unsigned short seq1, unsigned short seq2)
 }
 
 
-static __inline__ int dn_after(unsigned short seq1, unsigned short seq2)
+static __inline__ int dn_after(__u16 seq1, __u16 seq2)
 {
         seq1 &= 0x0fff;
         seq2 &= 0x0fff;
@@ -167,23 +167,23 @@ static __inline__ int dn_after(unsigned short seq1, unsigned short seq2)
         return (int)((seq2 - seq1) & 0x0fff) > 2048;
 }
 
-static __inline__ int dn_equal(unsigned short seq1, unsigned short seq2)
+static __inline__ int dn_equal(__u16 seq1, __u16 seq2)
 {
         return ((seq1 ^ seq2) & 0x0fff) == 0;
 }
 
-static __inline__ int dn_before_or_equal(unsigned short seq1, unsigned short seq2)
+static __inline__ int dn_before_or_equal(__u16 seq1, __u16 seq2)
 {
 	return (dn_before(seq1, seq2) || dn_equal(seq1, seq2));
 }
 
-static __inline__ void seq_add(unsigned short *seq, unsigned short off)
+static __inline__ void seq_add(__u16 *seq, __u16 off)
 {
         (*seq) += off;
         (*seq) &= 0x0fff;
 }
 
-static __inline__ int seq_next(unsigned short seq1, unsigned short seq2)
+static __inline__ int seq_next(__u16 seq1, __u16 seq2)
 {
 	return dn_equal(seq1 + 1, seq2);
 }
@@ -191,7 +191,7 @@ static __inline__ int seq_next(unsigned short seq1, unsigned short seq2)
 /*
  * Can we delay the ack ?
  */
-static __inline__ int sendack(unsigned short seq)
+static __inline__ int sendack(__u16 seq)
 {
         return (int)((seq & 0x1000) ? 0 : 1);
 }
diff --git a/include/net/dn_route.h b/include/net/dn_route.h
index 5122da3f2eb3..76f957e258b0 100644
--- a/include/net/dn_route.h
+++ b/include/net/dn_route.h
@@ -71,12 +71,12 @@ struct dn_route {
 		struct dn_route *rt_next;
 	} u;
 
-	__u16 rt_saddr;
-	__u16 rt_daddr;
-	__u16 rt_gateway;
-	__u16 rt_local_src;	/* Source used for forwarding packets */
-	__u16 rt_src_map;
-	__u16 rt_dst_map;
+	__le16 rt_saddr;
+	__le16 rt_daddr;
+	__le16 rt_gateway;
+	__le16 rt_local_src;	/* Source used for forwarding packets */
+	__le16 rt_src_map;
+	__le16 rt_dst_map;
 
 	unsigned rt_flags;
 	unsigned rt_type;
diff --git a/include/net/flow.h b/include/net/flow.h
index ec7eb86eb203..04d89f763451 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -30,8 +30,8 @@ struct flowi {
 		} ip6_u;
 
 		struct {
-			__u16			daddr;
-			__u16			saddr;
+			__le16			daddr;
+			__le16			saddr;
 			__u32			fwmark;
 			__u8			scope;
 		} dn_u;
@@ -64,8 +64,8 @@ struct flowi {
 		} icmpt;
 
 		struct {
-			__u16	sport;
-			__u16	dport;
+			__le16	sport;
+			__le16	dport;
 			__u8	objnum;
 			__u8	objnamel; /* Not 16 bits since max val is 16 */
 			__u8	objname[16]; /* Not zero terminated */
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index ce4aaf94860d..824981eed225 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -172,7 +172,7 @@ static struct hlist_head *dn_find_list(struct sock *sk)
 /* 
  * Valid ports are those greater than zero and not already in use.
  */
-static int check_port(unsigned short port)
+static int check_port(__le16 port)
 {
 	struct sock *sk;
 	struct hlist_node *node;
@@ -661,7 +661,7 @@ disc_reject:
 	}
 }
 
-char *dn_addr2asc(dn_address addr, char *buf)
+char *dn_addr2asc(__u16 addr, char *buf)
 {
 	unsigned short node, area;
 
@@ -801,7 +801,7 @@ static int dn_auto_bind(struct socket *sock)
 	/* End of compatibility stuff */
 
 	scp->addr.sdn_add.a_len = dn_htons(2);
-	rv = dn_dev_bind_default((dn_address *)scp->addr.sdn_add.a_addr);
+	rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr);
 	if (rv == 0) {
 		rv = dn_hash_sock(sk);
 		if (rv)
@@ -1021,7 +1021,7 @@ static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt)
         opt->opt_optl   = *ptr++;
         opt->opt_status = 0;
         memcpy(opt->opt_data, ptr, opt->opt_optl);
-        skb_pull(skb, opt->opt_optl + 1);
+        skb_pull(skb, dn_ntohs(opt->opt_optl) + 1);
 
 }
 
@@ -1121,8 +1121,8 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type));
 	skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type));
-	*(dn_address *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src;
-	*(dn_address *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst;
+	*(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src;
+	*(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst;
 
 	menuver = *skb->data;
 	skb_pull(skb, 1);
@@ -1365,7 +1365,7 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us
 			if (optlen != sizeof(struct optdata_dn))
 				return -EINVAL;
 
-			if (u.opt.opt_optl > 16)
+			if (dn_ntohs(u.opt.opt_optl) > 16)
 				return -EINVAL;
 
 			memcpy(&scp->conndata_out, &u.opt, optlen);
@@ -1378,7 +1378,7 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us
 			if (optlen != sizeof(struct optdata_dn))
 				return -EINVAL;
 
-			if (u.opt.opt_optl > 16)
+			if (dn_ntohs(u.opt.opt_optl) > 16)
 				return -EINVAL;
 
 			memcpy(&scp->discdata_out, &u.opt, optlen);
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index efbead83ba7f..cc7b9d9255ef 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -64,7 +64,7 @@ extern struct neigh_table dn_neigh_table;
 /*
  * decnet_address is kept in network order.
  */
-dn_address decnet_address = 0;
+__le16 decnet_address = 0;
 
 static DEFINE_RWLOCK(dndev_lock);
 static struct net_device *decnet_default_device;
@@ -439,7 +439,7 @@ static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr **ifap, int de
 	*ifap = ifa1->ifa_next;
 
 	if (dn_db->dev->type == ARPHRD_ETHER) {
-		if (ifa1->ifa_local != dn_htons(dn_eth2dn(dev->dev_addr))) {
+		if (ifa1->ifa_local != dn_eth2dn(dev->dev_addr)) {
 			dn_dn2eth(mac_addr, ifa1->ifa_local);
 			dev_mc_delete(dev, mac_addr, ETH_ALEN, 0);
 		}
@@ -470,7 +470,7 @@ static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa)
 	}
 
 	if (dev->type == ARPHRD_ETHER) {
-		if (ifa->ifa_local != dn_htons(dn_eth2dn(dev->dev_addr))) {
+		if (ifa->ifa_local != dn_eth2dn(dev->dev_addr)) {
 			dn_dn2eth(mac_addr, ifa->ifa_local);
 			dev_mc_add(dev, mac_addr, ETH_ALEN, 0);
 			dev_mc_upload(dev);
@@ -561,7 +561,7 @@ int dn_dev_ioctl(unsigned int cmd, void __user *arg)
 
 	switch(cmd) {
 		case SIOCGIFADDR:
-			*((dn_address *)sdn->sdn_nodeaddr) = ifa->ifa_local;
+			*((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local;
 			goto rarok;
 
 		case SIOCSIFADDR:
@@ -804,7 +804,7 @@ done:
 	return skb->len;
 }
 
-static int dn_dev_get_first(struct net_device *dev, dn_address *addr)
+static int dn_dev_get_first(struct net_device *dev, __le16 *addr)
 {
 	struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
 	struct dn_ifaddr *ifa;
@@ -830,7 +830,7 @@ out:
  * a sensible default. Eventually the routing code will take care of all the
  * nasties for us I hope.
  */
-int dn_dev_bind_default(dn_address *addr)
+int dn_dev_bind_default(__le16 *addr)
 {
 	struct net_device *dev;
 	int rv;
@@ -853,7 +853,7 @@ static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa)
 {
         struct endnode_hello_message *msg;
         struct sk_buff *skb = NULL;
-        unsigned short int *pktlen;
+        __le16 *pktlen;
 	struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
 
         if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL)
@@ -882,7 +882,7 @@ static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa)
         msg->datalen = 0x02;
         memset(msg->data, 0xAA, 2);
         
-        pktlen = (unsigned short *)skb_push(skb,2);
+        pktlen = (__le16 *)skb_push(skb,2);
         *pktlen = dn_htons(skb->len - 2);
 
 	skb->nh.raw = skb->data;
@@ -926,7 +926,7 @@ static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa)
 	size_t size;
 	unsigned char *ptr;
 	unsigned char *i1, *i2;
-	unsigned short *pktlen;
+	__le16 *pktlen;
 	char *src;
 
 	if (mtu2blksize(dev) < (26 + 7))
@@ -955,11 +955,11 @@ static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa)
 	ptr += ETH_ALEN;
 	*ptr++ = dn_db->parms.forwarding == 1 ? 
 			DN_RT_INFO_L1RT : DN_RT_INFO_L2RT;
-	*((unsigned short *)ptr) = dn_htons(mtu2blksize(dev));
+	*((__le16 *)ptr) = dn_htons(mtu2blksize(dev));
 	ptr += 2;
 	*ptr++ = dn_db->parms.priority; /* Priority */ 
 	*ptr++ = 0; /* Area: Reserved */
-	*((unsigned short *)ptr) = dn_htons((unsigned short)dn_db->parms.t3);
+	*((__le16 *)ptr) = dn_htons((unsigned short)dn_db->parms.t3);
 	ptr += 2;
 	*ptr++ = 0; /* MPD: Reserved */
 	i1 = ptr++;
@@ -974,7 +974,7 @@ static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa)
 
 	skb_trim(skb, (27 + *i2));
 
-	pktlen = (unsigned short *)skb_push(skb, 2);
+	pktlen = (__le16 *)skb_push(skb, 2);
 	*pktlen = dn_htons(skb->len - 2);
 
 	skb->nh.raw = skb->data;
@@ -1016,7 +1016,7 @@ static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa)
 	ptr = skb_put(skb, 2 + 4 + tdlen);
 
 	*ptr++ = DN_RT_PKT_HELO;
-	*((dn_address *)ptr) = ifa->ifa_local;
+	*((__le16 *)ptr) = ifa->ifa_local;
 	ptr += 2;
 	*ptr++ = tdlen;
 
@@ -1150,7 +1150,7 @@ struct dn_dev *dn_dev_create(struct net_device *dev, int *err)
 void dn_dev_up(struct net_device *dev)
 {
 	struct dn_ifaddr *ifa;
-	dn_address addr = decnet_address;
+	__le16 addr = decnet_address;
 	int maybe_default = 0;
 	struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr;
 
@@ -1173,7 +1173,7 @@ void dn_dev_up(struct net_device *dev)
 	if (dev->type == ARPHRD_ETHER) {
 		if (memcmp(dev->dev_addr, dn_hiord, 4) != 0)
 			return;
-		addr = dn_htons(dn_eth2dn(dev->dev_addr));
+		addr = dn_eth2dn(dev->dev_addr);
 		maybe_default = 1;
 	}
 
@@ -1385,8 +1385,8 @@ static int dn_dev_seq_show(struct seq_file *seq, void *v)
 				mtu2blksize(dev),
 				dn_db->parms.priority,
 				dn_db->parms.state, dn_db->parms.name,
-				dn_db->router ? dn_addr2asc(dn_ntohs(*(dn_address *)dn_db->router->primary_key), router_buf) : "",
-				dn_db->peer ? dn_addr2asc(dn_ntohs(*(dn_address *)dn_db->peer->primary_key), peer_buf) : "");
+				dn_db->router ? dn_addr2asc(dn_ntohs(*(__le16 *)dn_db->router->primary_key), router_buf) : "",
+				dn_db->peer ? dn_addr2asc(dn_ntohs(*(__le16 *)dn_db->peer->primary_key), peer_buf) : "");
 	}
 	return 0;
 }
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 99bc061759c3..bd4ce8681a12 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -143,11 +143,11 @@ static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi
 	return NULL;
 }
 
-u16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type)
+__le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type)
 {
 	while(RTA_OK(attr,attrlen)) {
 		if (attr->rta_type == type)
-			return *(u16*)RTA_DATA(attr);
+			return *(__le16*)RTA_DATA(attr);
 		attr = RTA_NEXT(attr, attrlen);
 	}
 
@@ -565,7 +565,7 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-static void fib_magic(int cmd, int type, __u16 dst, int dst_len, struct dn_ifaddr *ifa)
+static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa)
 {
 	struct dn_fib_table *tb;
 	struct {
@@ -684,7 +684,7 @@ static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event,
 	return NOTIFY_DONE;
 }
 
-int dn_fib_sync_down(dn_address local, struct net_device *dev, int force)
+int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
 {
         int ret = 0;
         int scope = RT_SCOPE_NOWHERE;
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 33ab256cfd4a..7c8692c26bfe 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -95,7 +95,7 @@ static struct neigh_ops dn_phase3_ops = {
 struct neigh_table dn_neigh_table = {
 	.family =			PF_DECnet,
 	.entry_size =			sizeof(struct dn_neigh),
-	.key_len =			sizeof(dn_address),
+	.key_len =			sizeof(__le16),
 	.hash =				dn_neigh_hash,
 	.constructor =			dn_neigh_construct,
 	.id =				"dn_neigh_cache",
@@ -123,7 +123,7 @@ struct neigh_table dn_neigh_table = {
 
 static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev)
 {
-	return jhash_2words(*(dn_address *)pkey, 0, dn_neigh_table.hash_rnd);
+	return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd);
 }
 
 static int dn_neigh_construct(struct neighbour *neigh)
@@ -249,14 +249,14 @@ static int dn_long_output(struct sk_buff *skb)
 	data = skb_push(skb, sizeof(struct dn_long_packet) + 3);
 	lp = (struct dn_long_packet *)(data+3);
 
-	*((unsigned short *)data) = dn_htons(skb->len - 2);
+	*((__le16 *)data) = dn_htons(skb->len - 2);
 	*(data + 2) = 1 | DN_RT_F_PF; /* Padding */
 
 	lp->msgflg   = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS));
 	lp->d_area   = lp->d_subarea = 0;
-	dn_dn2eth(lp->d_id, dn_ntohs(cb->dst));
+	dn_dn2eth(lp->d_id, cb->dst);
 	lp->s_area   = lp->s_subarea = 0;
-	dn_dn2eth(lp->s_id, dn_ntohs(cb->src));
+	dn_dn2eth(lp->s_id, cb->src);
 	lp->nl2      = 0;
 	lp->visit_ct = cb->hops & 0x3f;
 	lp->s_class  = 0;
@@ -293,7 +293,7 @@ static int dn_short_output(struct sk_buff *skb)
         }
 
 	data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
-	*((unsigned short *)data) = dn_htons(skb->len - 2);
+	*((__le16 *)data) = dn_htons(skb->len - 2);
 	sp = (struct dn_short_packet *)(data+2);
 
 	sp->msgflg     = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
@@ -335,7 +335,7 @@ static int dn_phase3_output(struct sk_buff *skb)
 	}
 
 	data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
-	*((unsigned short *)data) = dn_htons(skb->len - 2);
+	*((__le16 *)data) = dn_htons(skb->len - 2);
 	sp = (struct dn_short_packet *)(data + 2);
 
 	sp->msgflg   = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
@@ -373,9 +373,9 @@ int dn_neigh_router_hello(struct sk_buff *skb)
 	struct neighbour *neigh;
 	struct dn_neigh *dn;
 	struct dn_dev *dn_db;
-	dn_address src;
+	__le16 src;
 
-	src = dn_htons(dn_eth2dn(msg->id));
+	src = dn_eth2dn(msg->id);
 
 	neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
 
@@ -409,7 +409,7 @@ int dn_neigh_router_hello(struct sk_buff *skb)
 		}
 
 		/* Only use routers in our area */
-		if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) {
+		if ((dn_ntohs(src)>>10) == (dn_ntohs((decnet_address))>>10)) {
 			if (!dn_db->router) {
 				dn_db->router = neigh_clone(neigh);
 			} else {
@@ -433,9 +433,9 @@ int dn_neigh_endnode_hello(struct sk_buff *skb)
 	struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data;
 	struct neighbour *neigh;
 	struct dn_neigh *dn;
-	dn_address src;
+	__le16 src;
 
-	src = dn_htons(dn_eth2dn(msg->id));
+	src = dn_eth2dn(msg->id);
 
 	neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
 
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 44bda85e678f..547523b41c81 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -85,7 +85,7 @@ static void dn_log_martian(struct sk_buff *skb, const char *msg)
 	if (decnet_log_martians && net_ratelimit()) {
 		char *devname = skb->dev ? skb->dev->name : "???";
 		struct dn_skb_cb *cb = DN_SKB_CB(skb);
-		printk(KERN_INFO "DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n", msg, devname, cb->src, cb->dst, cb->src_port, cb->dst_port);
+		printk(KERN_INFO "DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n", msg, devname, dn_ntohs(cb->src), dn_ntohs(cb->dst), dn_ntohs(cb->src_port), dn_ntohs(cb->dst_port));
 	}
 }
 
@@ -128,7 +128,7 @@ static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack)
  */
 static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth)
 {
-	unsigned short *ptr = (unsigned short *)skb->data;
+	__le16 *ptr = (__le16 *)skb->data;
 	int len = 0;
 	unsigned short ack;
 
@@ -346,7 +346,7 @@ static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb)
 	ptr = skb->data;
 	cb->services = *ptr++;
 	cb->info = *ptr++;
-	cb->segsize = dn_ntohs(*(__u16 *)ptr);
+	cb->segsize = dn_ntohs(*(__le16 *)ptr);
 
 	if ((scp->state == DN_CI) || (scp->state == DN_CD)) {
 		scp->persist = 0;
@@ -363,7 +363,7 @@ static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb)
 		if (skb->len > 0) {
 			unsigned char dlen = *skb->data;
 			if ((dlen <= 16) && (dlen <= skb->len)) {
-				scp->conndata_in.opt_optl = dlen;
+				scp->conndata_in.opt_optl = dn_htons((__u16)dlen);
 				memcpy(scp->conndata_in.opt_data, skb->data + 1, dlen);
 			}
 		}
@@ -397,17 +397,17 @@ static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb)
 	if (skb->len < 2)
 		goto out;
 
-	reason = dn_ntohs(*(__u16 *)skb->data);
+	reason = dn_ntohs(*(__le16 *)skb->data);
 	skb_pull(skb, 2);
 
-	scp->discdata_in.opt_status = reason;
+	scp->discdata_in.opt_status = dn_htons(reason);
 	scp->discdata_in.opt_optl   = 0;
 	memset(scp->discdata_in.opt_data, 0, 16);
 
 	if (skb->len > 0) {
 		unsigned char dlen = *skb->data;
 		if ((dlen <= 16) && (dlen <= skb->len)) {
-			scp->discdata_in.opt_optl = dlen;
+			scp->discdata_in.opt_optl = dn_htons((__u16)dlen);
 			memcpy(scp->discdata_in.opt_data, skb->data + 1, dlen);
 		}
 	}
@@ -464,7 +464,7 @@ static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
 	if (skb->len != 2)
 		goto out;
 
-	reason = dn_ntohs(*(__u16 *)skb->data);
+	reason = dn_ntohs(*(__le16 *)skb->data);
 
 	sk->sk_state = TCP_CLOSE;
 
@@ -513,7 +513,7 @@ static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb)
 	if (skb->len != 4)
 		goto out;
 
-	segnum = dn_ntohs(*(__u16 *)ptr);
+	segnum = dn_ntohs(*(__le16 *)ptr);
 	ptr += 2;
 	lsflags = *(unsigned char *)ptr++;
 	fcval = *ptr;
@@ -621,7 +621,7 @@ static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb)
 	if (skb->len < 2)
 		goto out;
 
-	cb->segnum = segnum = dn_ntohs(*(__u16 *)skb->data);
+	cb->segnum = segnum = dn_ntohs(*(__le16 *)skb->data);
 	skb_pull(skb, 2);
 
 	if (seq_next(scp->numoth_rcv, segnum)) {
@@ -649,7 +649,7 @@ static void dn_nsp_data(struct sock *sk, struct sk_buff *skb)
 	if (skb->len < 2)
 		goto out;
 
-	cb->segnum = segnum = dn_ntohs(*(__u16 *)skb->data);
+	cb->segnum = segnum = dn_ntohs(*(__le16 *)skb->data);
 	skb_pull(skb, 2);
 
 	if (seq_next(scp->numdat_rcv, segnum)) {
@@ -760,7 +760,7 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
 	/*
 	 * Grab the destination address.
 	 */
-	cb->dst_port = *(unsigned short *)ptr;
+	cb->dst_port = *(__le16 *)ptr;
 	cb->src_port = 0;
 	ptr += 2;
 
@@ -768,7 +768,7 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
 	 * If not a connack, grab the source address too.
 	 */
 	if (pskb_may_pull(skb, 5)) {
-		cb->src_port = *(unsigned short *)ptr;
+		cb->src_port = *(__le16 *)ptr;
 		ptr += 2;
 		skb_pull(skb, 5);
 	}
@@ -778,7 +778,7 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
 	 * Swap src & dst and look up in the normal way.
 	 */
 	if (unlikely(cb->rt_flags & DN_RT_F_RTS)) {
-		unsigned short tmp = cb->dst_port;
+		__le16 tmp = cb->dst_port;
 		cb->dst_port = cb->src_port;
 		cb->src_port = tmp;
 		tmp = cb->dst;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index c96c767b1f74..c2e21cd89b3c 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -287,26 +287,26 @@ int dn_nsp_xmit_timeout(struct sock *sk)
 	return 0;
 }
 
-static inline unsigned char *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len)
+static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len)
 {
 	unsigned char *ptr = skb_push(skb, len);
 
 	BUG_ON(len < 5);
 
 	*ptr++ = msgflag;
-	*((unsigned short *)ptr) = scp->addrrem;
+	*((__le16 *)ptr) = scp->addrrem;
 	ptr += 2;
-	*((unsigned short *)ptr) = scp->addrloc;
+	*((__le16 *)ptr) = scp->addrloc;
 	ptr += 2;
-	return ptr;
+	return (__le16 __force *)ptr;
 }
 
-static unsigned short *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other)
+static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other)
 {
 	struct dn_scp *scp = DN_SK(sk);
 	unsigned short acknum = scp->numdat_rcv & 0x0FFF;
 	unsigned short ackcrs = scp->numoth_rcv & 0x0FFF;
-	unsigned short *ptr;
+	__le16 *ptr;
 
 	BUG_ON(hlen < 9);
 
@@ -325,7 +325,7 @@ static unsigned short *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, un
 	/* Set "cross subchannel" bit in ackcrs */
 	ackcrs |= 0x2000;
 
-	ptr = (unsigned short *)dn_mk_common_header(scp, skb, msgflag, hlen);
+	ptr = (__le16 *)dn_mk_common_header(scp, skb, msgflag, hlen);
 
 	*ptr++ = dn_htons(acknum);
 	*ptr++ = dn_htons(ackcrs);
@@ -333,11 +333,11 @@ static unsigned short *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, un
 	return ptr;
 }
 
-static unsigned short *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth)
+static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth)
 {
 	struct dn_scp *scp = DN_SK(sk);
 	struct dn_skb_cb *cb = DN_SKB_CB(skb);
-	unsigned short *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth);
+	__le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth);
 
 	if (unlikely(oth)) {
 		cb->segnum = scp->numoth;
@@ -524,9 +524,9 @@ void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
 	struct dn_scp *scp = DN_SK(sk);
 	struct sk_buff *skb = NULL;
         struct nsp_conn_init_msg *msg;
-	unsigned char len = scp->conndata_out.opt_optl;
+	__u8 len = (__u8)dn_ntohs(scp->conndata_out.opt_optl);
 
-	if ((skb = dn_alloc_skb(sk, 50 + scp->conndata_out.opt_optl, gfp)) == NULL)
+	if ((skb = dn_alloc_skb(sk, 50 + dn_ntohs(scp->conndata_out.opt_optl), gfp)) == NULL)
 		return;
 
         msg = (struct nsp_conn_init_msg *)skb_put(skb, sizeof(*msg));
@@ -553,7 +553,7 @@ void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
 static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, 
 			unsigned short reason, gfp_t gfp,
 			struct dst_entry *dst,
-			int ddl, unsigned char *dd, __u16 rem, __u16 loc)
+			int ddl, unsigned char *dd, __le16 rem, __le16 loc)
 {
 	struct sk_buff *skb = NULL;
 	int size = 7 + ddl + ((msgflg == NSP_DISCINIT) ? 1 : 0);
@@ -561,7 +561,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
 
 	if ((dst == NULL) || (rem == 0)) {
 		if (net_ratelimit())
-			printk(KERN_DEBUG "DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n", (unsigned)rem, dst);
+			printk(KERN_DEBUG "DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n", dn_ntohs(rem), dst);
 		return;
 	}
 
@@ -570,11 +570,11 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
 
 	msg = skb_put(skb, size);
 	*msg++ = msgflg;
-	*(__u16 *)msg = rem;
+	*(__le16 *)msg = rem;
 	msg += 2;
-	*(__u16 *)msg = loc;
+	*(__le16 *)msg = loc;
 	msg += 2;
-	*(__u16 *)msg = dn_htons(reason);
+	*(__le16 *)msg = dn_htons(reason);
 	msg += 2;
 	if (msgflg == NSP_DISCINIT)
 		*msg++ = ddl;
@@ -600,10 +600,10 @@ void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg,
 	int ddl = 0;
 
 	if (msgflg == NSP_DISCINIT)
-		ddl = scp->discdata_out.opt_optl;
+		ddl = dn_ntohs(scp->discdata_out.opt_optl);
 
 	if (reason == 0)
-		reason = scp->discdata_out.opt_status;
+		reason = dn_ntohs(scp->discdata_out.opt_status);
 
 	dn_nsp_do_disc(sk, msgflg, reason, gfp, sk->sk_dst_cache, ddl, 
 		scp->discdata_out.opt_data, scp->addrrem, scp->addrloc);
@@ -708,7 +708,7 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
 	if (aux > 0)
 	memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
 
-	aux = scp->conndata_out.opt_optl;
+	aux = (__u8)dn_ntohs(scp->conndata_out.opt_optl);
 	*skb_put(skb, 1) = aux;
 	if (aux > 0)
 	memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux);
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 3407f190afe8..d7037fe3cafe 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -133,9 +133,9 @@ static struct dst_ops dn_dst_ops = {
 	.entries =		ATOMIC_INIT(0),
 };
 
-static __inline__ unsigned dn_hash(unsigned short src, unsigned short dst)
+static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
 {
-	unsigned short tmp = src ^ dst;
+	__u16 tmp = (__u16 __force)(src ^ dst);
 	tmp ^= (tmp >> 3);
 	tmp ^= (tmp >> 5);
 	tmp ^= (tmp >> 10);
@@ -379,9 +379,9 @@ static int dn_return_short(struct sk_buff *skb)
 {
 	struct dn_skb_cb *cb;
 	unsigned char *ptr;
-	dn_address *src;
-	dn_address *dst;
-	dn_address tmp;
+	__le16 *src;
+	__le16 *dst;
+	__le16 tmp;
 
 	/* Add back headers */
 	skb_push(skb, skb->data - skb->nh.raw);
@@ -394,9 +394,9 @@ static int dn_return_short(struct sk_buff *skb)
 	ptr = skb->data + 2;
 	*ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
 
-	dst = (dn_address *)ptr;
+	dst = (__le16 *)ptr;
 	ptr += 2;
-	src = (dn_address *)ptr;
+	src = (__le16 *)ptr;
 	ptr += 2;
 	*ptr = 0; /* Zero hop count */
 
@@ -475,7 +475,8 @@ static int dn_route_rx_packet(struct sk_buff *skb)
 		struct dn_skb_cb *cb = DN_SKB_CB(skb);
 		printk(KERN_DEBUG
 			"DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n",
-			(int)cb->rt_flags, devname, skb->len, cb->src, cb->dst, 
+			(int)cb->rt_flags, devname, skb->len,
+			dn_ntohs(cb->src), dn_ntohs(cb->dst),
 			err, skb->pkt_type);
 	}
 
@@ -505,7 +506,7 @@ static int dn_route_rx_long(struct sk_buff *skb)
 
         /* Destination info */
         ptr += 2;
-	cb->dst = dn_htons(dn_eth2dn(ptr));
+	cb->dst = dn_eth2dn(ptr);
         if (memcmp(ptr, dn_hiord_addr, 4) != 0)
                 goto drop_it;
         ptr += 6;
@@ -513,7 +514,7 @@ static int dn_route_rx_long(struct sk_buff *skb)
 
         /* Source info */
         ptr += 2;
-	cb->src = dn_htons(dn_eth2dn(ptr));
+	cb->src = dn_eth2dn(ptr);
         if (memcmp(ptr, dn_hiord_addr, 4) != 0)
                 goto drop_it;
         ptr += 6;
@@ -541,9 +542,9 @@ static int dn_route_rx_short(struct sk_buff *skb)
 	skb_pull(skb, 5);
 	skb->h.raw = skb->data;
 
-	cb->dst = *(dn_address *)ptr;
+	cb->dst = *(__le16 *)ptr;
         ptr += 2;
-        cb->src = *(dn_address *)ptr;
+        cb->src = *(__le16 *)ptr;
         ptr += 2;
         cb->hops = *ptr & 0x3f;
 
@@ -575,7 +576,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type
 {
 	struct dn_skb_cb *cb;
 	unsigned char flags = 0;
-	__u16 len = dn_ntohs(*(__u16 *)skb->data);
+	__u16 len = dn_ntohs(*(__le16 *)skb->data);
 	struct dn_dev *dn = (struct dn_dev *)dev->dn_ptr;
 	unsigned char padlen = 0;
 
@@ -782,7 +783,7 @@ static int dn_rt_bug(struct sk_buff *skb)
 		struct dn_skb_cb *cb = DN_SKB_CB(skb);
 
 		printk(KERN_DEBUG "dn_rt_bug: skb from:%04x to:%04x\n",
-				cb->src, cb->dst);
+				dn_ntohs(cb->src), dn_ntohs(cb->dst));
 	}
 
 	kfree_skb(skb);
@@ -823,7 +824,7 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
 	return 0;
 }
 
-static inline int dn_match_addr(__u16 addr1, __u16 addr2)
+static inline int dn_match_addr(__le16 addr1, __le16 addr2)
 {
 	__u16 tmp = dn_ntohs(addr1) ^ dn_ntohs(addr2);
 	int match = 16;
@@ -834,9 +835,9 @@ static inline int dn_match_addr(__u16 addr1, __u16 addr2)
 	return match;
 }
 
-static __u16 dnet_select_source(const struct net_device *dev, __u16 daddr, int scope)
+static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope)
 {
-	__u16 saddr = 0;
+	__le16 saddr = 0;
 	struct dn_dev *dn_db = dev->dn_ptr;
 	struct dn_ifaddr *ifa;
 	int best_match = 0;
@@ -861,14 +862,14 @@ static __u16 dnet_select_source(const struct net_device *dev, __u16 daddr, int s
 	return saddr;
 }
 
-static inline __u16 __dn_fib_res_prefsrc(struct dn_fib_res *res)
+static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res)
 {
 	return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope);
 }
 
-static inline __u16 dn_fib_rules_map_destination(__u16 daddr, struct dn_fib_res *res)
+static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res)
 {
-	__u16 mask = dnet_make_mask(res->prefixlen);
+	__le16 mask = dnet_make_mask(res->prefixlen);
 	return (daddr&~mask)|res->fi->fib_nh->nh_gw;
 }
 
@@ -892,12 +893,13 @@ static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *old
 	struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST };
 	int err;
 	int free_res = 0;
-	__u16 gateway = 0;
+	__le16 gateway = 0;
 
 	if (decnet_debug_level & 16)
 		printk(KERN_DEBUG
 		       "dn_route_output_slow: dst=%04x src=%04x mark=%d"
-		       " iif=%d oif=%d\n", oldflp->fld_dst, oldflp->fld_src,
+		       " iif=%d oif=%d\n", dn_ntohs(oldflp->fld_dst),
+		       dn_ntohs(oldflp->fld_src),
                        oldflp->fld_fwmark, loopback_dev.ifindex, oldflp->oif);
 
 	/* If we have an output interface, verify its a DECnet device */
@@ -961,8 +963,9 @@ source_ok:
 	if (decnet_debug_level & 16)
 		printk(KERN_DEBUG
 		       "dn_route_output_slow: initial checks complete."
-		       " dst=%o4x src=%04x oif=%d try_hard=%d\n", fl.fld_dst,
-		       fl.fld_src, fl.oif, try_hard);
+		       " dst=%o4x src=%04x oif=%d try_hard=%d\n",
+		       dn_ntohs(fl.fld_dst), dn_ntohs(fl.fld_src),
+		       fl.oif, try_hard);
 
 	/*
 	 * N.B. If the kernel is compiled without router support then
@@ -1218,8 +1221,8 @@ static int dn_route_input_slow(struct sk_buff *skb)
 	struct neighbour *neigh = NULL;
 	unsigned hash;
 	int flags = 0;
-	__u16 gateway = 0;
-	__u16 local_src = 0;
+	__le16 gateway = 0;
+	__le16 local_src = 0;
 	struct flowi fl = { .nl_u = { .dn_u = 
 				     { .daddr = cb->dst,
 				       .saddr = cb->src,
@@ -1266,7 +1269,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
 		res.type = RTN_LOCAL;
 		flags |= RTCF_DIRECTSRC;
 	} else {
-		__u16 src_map = fl.fld_src;
+		__le16 src_map = fl.fld_src;
 		free_res = 1;
 
 		out_dev = DN_FIB_RES_DEV(res);
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 1060de70bc0c..f2c299dfe030 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -46,11 +46,11 @@ struct dn_fib_rule
 	unsigned char		r_action;
 	unsigned char		r_dst_len;
 	unsigned char		r_src_len;
-	dn_address		r_src;
-	dn_address		r_srcmask;
-	dn_address		r_dst;
-	dn_address		r_dstmask;
-	dn_address		r_srcmap;
+	__le16			r_src;
+	__le16			r_srcmask;
+	__le16			r_dst;
+	__le16			r_dstmask;
+	__le16			r_srcmap;
 	u8			r_flags;
 #ifdef CONFIG_DECNET_ROUTE_FWMARK
 	u32			r_fwmark;
@@ -208,8 +208,8 @@ int dn_fib_lookup(const struct flowi *flp, struct dn_fib_res *res)
 {
 	struct dn_fib_rule *r, *policy;
 	struct dn_fib_table *tb;
-	dn_address saddr = flp->fld_src;
-	dn_address daddr = flp->fld_dst;
+	__le16 saddr = flp->fld_src;
+	__le16 daddr = flp->fld_dst;
 	int err;
 
 	read_lock(&dn_fib_rules_lock);
@@ -259,7 +259,7 @@ int dn_fib_lookup(const struct flowi *flp, struct dn_fib_res *res)
 	return -ESRCH;
 }
 
-unsigned dnet_addr_type(__u16 addr)
+unsigned dnet_addr_type(__le16 addr)
 {
 	struct flowi fl = { .nl_u = { .dn_u = { .daddr = addr } } };
 	struct dn_fib_res res;
@@ -277,7 +277,7 @@ unsigned dnet_addr_type(__u16 addr)
 	return ret;
 }
 
-__u16 dn_fib_rules_policy(__u16 saddr, struct dn_fib_res *res, unsigned *flags)
+__le16 dn_fib_rules_policy(__le16 saddr, struct dn_fib_res *res, unsigned *flags)
 {
 	struct dn_fib_rule *r = res->r;
 
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 6f8b5658cb4e..0ebc46af1bdd 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -46,7 +46,7 @@ struct dn_zone
 	u32			dz_hashmask;
 #define DZ_HASHMASK(dz)	((dz)->dz_hashmask)
 	int			dz_order;
-	u16			dz_mask;
+	__le16			dz_mask;
 #define DZ_MASK(dz)	((dz)->dz_mask)
 };
 
@@ -84,14 +84,14 @@ static int dn_fib_hash_zombies;
 
 static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
 {
-	u16 h = ntohs(key.datum)>>(16 - dz->dz_order);
+	u16 h = dn_ntohs(key.datum)>>(16 - dz->dz_order);
 	h ^= (h >> 10);
 	h ^= (h >> 6);
 	h &= DZ_HASHMASK(dz);
 	return *(dn_fib_idx_t *)&h;
 }
 
-static inline dn_fib_key_t dz_key(u16 dst, struct dn_zone *dz)
+static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz)
 {
 	dn_fib_key_t k;
 	k.datum = dst & DZ_MASK(dz);
@@ -250,7 +250,7 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern
 
 	for_nexthops(fi) {
 		int attrlen = nhlen - sizeof(struct rtnexthop);
-		dn_address gw;
+		__le16 gw;
 
 		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
 			return -EINVAL;
@@ -457,7 +457,7 @@ static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct
 
 	dz_key_0(key);
 	if (rta->rta_dst) {
-		dn_address dst;
+		__le16 dst;
 		memcpy(&dst, rta->rta_dst, 2);
 		if (dst & ~DZ_MASK(dz))
 			return -EINVAL;
@@ -593,7 +593,7 @@ static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct
 
 	dz_key_0(key);
 	if (rta->rta_dst) {
-		dn_address dst;
+		__le16 dst;
 		memcpy(&dst, rta->rta_dst, 2);
 		if (dst & ~DZ_MASK(dz))
 			return -EINVAL;
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 0e9d2c571165..bda5920215fd 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -86,9 +86,9 @@ static void strip_it(char *str)
  * Simple routine to parse an ascii DECnet address
  * into a network order address.
  */
-static int parse_addr(dn_address *addr, char *str)
+static int parse_addr(__le16 *addr, char *str)
 {
-	dn_address area, node;
+	__u16 area, node;
 
 	while(*str && !ISNUM(*str)) str++;
 
@@ -139,7 +139,7 @@ static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen
 				void **context)
 {
 	size_t len;
-	dn_address addr;
+	__le16 addr;
 
 	if (oldval && oldlenp) {
 		if (get_user(len, oldlenp))
@@ -147,14 +147,14 @@ static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen
 		if (len) {
 			if (len != sizeof(unsigned short))
 				return -EINVAL;
-			if (put_user(decnet_address, (unsigned short __user *)oldval))
+			if (put_user(decnet_address, (__le16 __user *)oldval))
 				return -EFAULT;
 		}
 	}
 	if (newval && newlen) {
 		if (newlen != sizeof(unsigned short))
 			return -EINVAL;
-		if (get_user(addr, (unsigned short __user *)newval))
+		if (get_user(addr, (__le16 __user *)newval))
 			return -EFAULT;
 
 		dn_dev_devices_off();
@@ -173,7 +173,7 @@ static int dn_node_address_handler(ctl_table *table, int write,
 {
 	char addr[DN_ASCBUF_LEN];
 	size_t len;
-	dn_address dnaddr;
+	__le16 dnaddr;
 
 	if (!*lenp || (*ppos && !write)) {
 		*lenp = 0;
-- 
cgit v1.2.3


From cbb042f9e1292434e3cacb90e67d8d381aeac5a9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 20 Mar 2006 22:43:56 -0800
Subject: [NET]: Replace skb_pull/skb_postpull_rcsum with skb_pull_rcsum

We're now starting to have quite a number of places that do skb_pull
followed immediately by an skb_postpull_rcsum.  We can merge these two
operations into one function with skb_pull_rcsum.  This makes sense
since most pull operations on receive skb's need to update the
checksum.

I've decided to make this out-of-line since it is fairly big and the
fast path where hardware checksums are enabled need to call
csum_partial anyway.

Since this is a brand new function we get to add an extra check on the
len argument.  As it is most callers of skb_pull ignore its return
value which essentially means that there is no check on the len
argument.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp_generic.c |  4 ++--
 drivers/net/pppoe.c       |  3 +--
 include/linux/skbuff.h    |  4 +++-
 net/802/psnap.c           |  3 +--
 net/8021q/vlan_dev.c      |  6 ++----
 net/bridge/br_netfilter.c |  6 ++----
 net/core/skbuff.c         | 21 +++++++++++++++++++++
 7 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 0245e40b51a1..f608c12e3e8b 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1691,8 +1691,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 		    || ppp->npmode[npi] != NPMODE_PASS) {
 			kfree_skb(skb);
 		} else {
-			skb_pull(skb, 2);	/* chop off protocol */
-			skb_postpull_rcsum(skb, skb->data - 2, 2);
+			/* chop off protocol */
+			skb_pull_rcsum(skb, 2);
 			skb->dev = ppp->dev;
 			skb->protocol = htons(npindex_to_ethertype[npi]);
 			skb->mac.raw = skb->data;
diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 9369f811075d..475dc930380f 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -337,8 +337,7 @@ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb)
 	if (sk->sk_state & PPPOX_BOUND) {
 		struct pppoe_hdr *ph = (struct pppoe_hdr *) skb->nh.raw;
 		int len = ntohs(ph->length);
-		skb_pull(skb, sizeof(struct pppoe_hdr));
-		skb_postpull_rcsum(skb, ph, sizeof(*ph));
+		skb_pull_rcsum(skb, sizeof(struct pppoe_hdr));
 		if (pskb_trim_rcsum(skb, len))
 			goto abort_kfree;
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 75c963103b9f..613b9513f8b9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1159,12 +1159,14 @@ static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp)
  */
 
 static inline void skb_postpull_rcsum(struct sk_buff *skb,
-					 const void *start, int len)
+				      const void *start, unsigned int len)
 {
 	if (skb->ip_summed == CHECKSUM_HW)
 		skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0));
 }
 
+unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);
+
 /**
  *	pskb_trim_rcsum - trim received skb and update checksum
  *	@skb: buffer to trim
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 34e42968b477..97c95eec79ed 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -61,8 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
 		/* Pass the frame on. */
 		u8 *hdr = skb->data;
 		skb->h.raw  += 5;
-		skb_pull(skb, 5);
-		skb_postpull_rcsum(skb, hdr, 5);
+		skb_pull_rcsum(skb, 5);
 		rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
 	} else {
 		skb->sk = NULL;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 0f604d227da2..da9cfe927158 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -163,10 +163,8 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	stats->rx_packets++;
 	stats->rx_bytes += skb->len;
 
-	skb_pull(skb, VLAN_HLEN); /* take off the VLAN header (4 bytes currently) */
-
-	/* Need to correct hardware checksum */
-	skb_postpull_rcsum(skb, vhdr, VLAN_HLEN);
+	/* Take off the VLAN header (4 bytes currently) */
+	skb_pull_rcsum(skb, VLAN_HLEN);
 
 	/* Ok, lets check to make sure the device (dev) we
 	 * came in on is what this VLAN is attached to.
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index e060aad8624d..6787bc5197f7 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -425,8 +425,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 
 		if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
 			u8 *vhdr = skb->data;
-			skb_pull(skb, VLAN_HLEN);
-			skb_postpull_rcsum(skb, vhdr, VLAN_HLEN);
+			skb_pull_rcsum(skb, VLAN_HLEN);
 			skb->nh.raw += VLAN_HLEN;
 		}
 		return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
@@ -444,8 +443,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 
 	if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
 		u8 *vhdr = skb->data;
-		skb_pull(skb, VLAN_HLEN);
-		skb_postpull_rcsum(skb, vhdr, VLAN_HLEN);
+		skb_pull_rcsum(skb, VLAN_HLEN);
 		skb->nh.raw += VLAN_HLEN;
 	}
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 01abf1e8990b..2960c8b82b23 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1795,6 +1795,27 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 	return 0;
 }
 
+/**
+ *	skb_pull_rcsum - pull skb and update receive checksum
+ *	@skb: buffer to update
+ *	@start: start of data before pull
+ *	@len: length of data pulled
+ *
+ *	This function performs an skb_pull on the packet and updates
+ *	update the CHECKSUM_HW checksum.  It should be used on receive
+ *	path processing instead of skb_pull unless you know that the
+ *	checksum difference is zero (e.g., a valid IP header) or you
+ *	are setting ip_summed to CHECKSUM_NONE.
+ */
+unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+{
+	BUG_ON(len > skb->len);
+	skb->len -= len;
+	BUG_ON(skb->len < skb->data_len);
+	skb_postpull_rcsum(skb, skb->data, len);
+	return skb->data += len;
+}
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
-- 
cgit v1.2.3


From 3fdadf7d27e3fbcf72930941884387d1f4936f04 Mon Sep 17 00:00:00 2001
From: Dmitry Mishin <dim@openvz.org>
Date: Mon, 20 Mar 2006 22:45:21 -0800
Subject: [NET]: {get|set}sockopt compatibility layer

This patch extends {get|set}sockopt compatibility layer in order to
move protocol specific parts to their place and avoid huge universal
net/compat.c file in the future.

Signed-off-by: Dmitry Mishin <dim@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h                |   4 +
 include/linux/netfilter.h          |   9 ++
 include/net/inet_connection_sock.h |   6 ++
 include/net/ip.h                   |   4 +
 include/net/ipv6.h                 |  10 +++
 include/net/sctp/structs.h         |  10 +++
 include/net/sock.h                 |  12 +++
 include/net/tcp.h                  |   6 ++
 net/compat.c                       |  95 +++++++++++++++++----
 net/core/sock.c                    |  28 +++++++
 net/dccp/dccp.h                    |   8 ++
 net/dccp/ipv4.c                    |  12 +++
 net/dccp/ipv6.c                    |  16 ++++
 net/dccp/proto.c                   |  67 ++++++++++++---
 net/ipv4/af_inet.c                 |  12 +++
 net/ipv4/ip_sockglue.c             | 142 +++++++++++++++++++++++++++-----
 net/ipv4/raw.c                     |  50 ++++++++++--
 net/ipv4/tcp.c                     |  77 +++++++++++++++---
 net/ipv4/tcp_ipv4.c                |   8 ++
 net/ipv4/udp.c                     |  51 ++++++++++--
 net/ipv6/af_inet6.c                |  12 +++
 net/ipv6/ipv6_sockglue.c           | 163 ++++++++++++++++++++++++++++++-------
 net/ipv6/ipv6_syms.c               |   4 +
 net/ipv6/raw.c                     | 112 +++++++++++++++++++------
 net/ipv6/tcp_ipv6.c                |  12 +++
 net/ipv6/udp.c                     |  52 ++++++++++--
 net/netfilter/nf_sockopt.c         |  69 ++++++++++++++++
 net/sctp/ipv6.c                    |   8 ++
 net/sctp/protocol.c                |   8 ++
 29 files changed, 928 insertions(+), 139 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 28195a2d8ff0..152fa6551fd8 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -149,6 +149,10 @@ struct proto_ops {
 				      int optname, char __user *optval, int optlen);
 	int		(*getsockopt)(struct socket *sock, int level,
 				      int optname, char __user *optval, int __user *optlen);
+	int		(*compat_setsockopt)(struct socket *sock, int level,
+				      int optname, char __user *optval, int optlen);
+	int		(*compat_getsockopt)(struct socket *sock, int level,
+				      int optname, char __user *optval, int __user *optlen);
 	int		(*sendmsg)   (struct kiocb *iocb, struct socket *sock,
 				      struct msghdr *m, size_t total_len);
 	int		(*recvmsg)   (struct kiocb *iocb, struct socket *sock,
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 468896939843..412e52ca9720 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -80,10 +80,14 @@ struct nf_sockopt_ops
 	int set_optmin;
 	int set_optmax;
 	int (*set)(struct sock *sk, int optval, void __user *user, unsigned int len);
+	int (*compat_set)(struct sock *sk, int optval,
+			void __user *user, unsigned int len);
 
 	int get_optmin;
 	int get_optmax;
 	int (*get)(struct sock *sk, int optval, void __user *user, int *len);
+	int (*compat_get)(struct sock *sk, int optval,
+			void __user *user, int *len);
 
 	/* Number of users inside set() or get(). */
 	unsigned int use;
@@ -246,6 +250,11 @@ int nf_setsockopt(struct sock *sk, int pf, int optval, char __user *opt,
 int nf_getsockopt(struct sock *sk, int pf, int optval, char __user *opt,
 		  int *len);
 
+int compat_nf_setsockopt(struct sock *sk, int pf, int optval,
+		char __user *opt, int len);
+int compat_nf_getsockopt(struct sock *sk, int pf, int optval,
+		char __user *opt, int *len);
+
 /* Packet queuing */
 struct nf_queue_handler {
 	int (*outfn)(struct sk_buff *skb, struct nf_info *info,
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 363a067403ee..ae61331366f0 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -50,6 +50,12 @@ struct inet_connection_sock_af_ops {
 				  char __user *optval, int optlen);
 	int	    (*getsockopt)(struct sock *sk, int level, int optname, 
 				  char __user *optval, int __user *optlen);
+	int	    (*compat_setsockopt)(struct sock *sk,
+				int level, int optname,
+				char __user *optval, int optlen);
+	int	    (*compat_getsockopt)(struct sock *sk,
+				int level, int optname,
+				char __user *optval, int __user *optlen);
 	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
 	int sockaddr_len;
 };
diff --git a/include/net/ip.h b/include/net/ip.h
index fab3d5b3ab1c..8fe6156ca9b0 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -356,6 +356,10 @@ extern void	ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
 extern int	ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc);
 extern int	ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen);
 extern int	ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen);
+extern int	compat_ip_setsockopt(struct sock *sk, int level,
+			int optname, char __user *optval, int optlen);
+extern int	compat_ip_getsockopt(struct sock *sk, int level,
+			int optname, char __user *optval, int __user *optlen);
 extern int	ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *));
 
 extern int 	ip_recv_error(struct sock *sk, struct msghdr *msg, int len);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index c893a1ce4b39..6d6f0634ae41 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -520,6 +520,16 @@ extern int			ipv6_getsockopt(struct sock *sk, int level,
 						int optname,
 						char __user *optval, 
 						int __user *optlen);
+extern int			compat_ipv6_setsockopt(struct sock *sk,
+						int level,
+						int optname,
+						char __user *optval,
+						int optlen);
+extern int			compat_ipv6_getsockopt(struct sock *sk,
+						int level,
+						int optname,
+						char __user *optval,
+						int __user *optlen);
 
 extern void			ipv6_packet_init(void);
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 072f407848a6..eba99f375517 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -514,6 +514,16 @@ struct sctp_af {
 					 int optname,
 					 char __user *optval,
 					 int __user *optlen);
+	int		(*compat_setsockopt)	(struct sock *sk,
+					 int level,
+					 int optname,
+					 char __user *optval,
+					 int optlen);
+	int		(*compat_getsockopt)	(struct sock *sk,
+					 int level,
+					 int optname,
+					 char __user *optval,
+					 int __user *optlen);
 	struct dst_entry *(*get_dst)	(struct sctp_association *asoc,
 					 union sctp_addr *daddr,
 					 union sctp_addr *saddr);
diff --git a/include/net/sock.h b/include/net/sock.h
index f63d0d56712c..ec226f31dc2a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -520,6 +520,14 @@ struct proto {
 	int			(*getsockopt)(struct sock *sk, int level, 
 					int optname, char __user *optval, 
 					int __user *option);  	 
+	int			(*compat_setsockopt)(struct sock *sk,
+					int level,
+					int optname, char __user *optval,
+					int optlen);
+	int			(*compat_getsockopt)(struct sock *sk,
+					int level,
+					int optname, char __user *optval,
+					int __user *option);
 	int			(*sendmsg)(struct kiocb *iocb, struct sock *sk,
 					   struct msghdr *msg, size_t len);
 	int			(*recvmsg)(struct kiocb *iocb, struct sock *sk,
@@ -816,6 +824,10 @@ extern int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
 			       struct msghdr *msg, size_t size, int flags);
 extern int sock_common_setsockopt(struct socket *sock, int level, int optname,
 				  char __user *optval, int optlen);
+extern int compat_sock_common_getsockopt(struct socket *sock, int level,
+		int optname, char __user *optval, int __user *optlen);
+extern int compat_sock_common_setsockopt(struct socket *sock, int level,
+		int optname, char __user *optval, int optlen);
 
 extern void sk_common_release(struct sock *sk);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 457e224de468..9418f4d1afbb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -353,6 +353,12 @@ extern int			tcp_getsockopt(struct sock *sk, int level,
 extern int			tcp_setsockopt(struct sock *sk, int level, 
 					       int optname, char __user *optval, 
 					       int optlen);
+extern int			compat_tcp_getsockopt(struct sock *sk,
+					int level, int optname,
+					char __user *optval, int __user *optlen);
+extern int			compat_tcp_setsockopt(struct sock *sk,
+					int level, int optname,
+					char __user *optval, int optlen);
 extern void			tcp_set_keepalive(struct sock *sk, int val);
 extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg,
diff --git a/net/compat.c b/net/compat.c
index e593dace2fdb..13177a1a4b39 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -416,7 +416,7 @@ struct compat_sock_fprog {
 	compat_uptr_t	filter;		/* struct sock_filter * */
 };
 
-static int do_set_attach_filter(int fd, int level, int optname,
+static int do_set_attach_filter(struct socket *sock, int level, int optname,
 				char __user *optval, int optlen)
 {
 	struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
@@ -432,11 +432,12 @@ static int do_set_attach_filter(int fd, int level, int optname,
 	    __put_user(compat_ptr(ptr), &kfprog->filter))
 		return -EFAULT;
 
-	return sys_setsockopt(fd, level, optname, (char __user *)kfprog, 
+	return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
 			      sizeof(struct sock_fprog));
 }
 
-static int do_set_sock_timeout(int fd, int level, int optname, char __user *optval, int optlen)
+static int do_set_sock_timeout(struct socket *sock, int level,
+		int optname, char __user *optval, int optlen)
 {
 	struct compat_timeval __user *up = (struct compat_timeval __user *) optval;
 	struct timeval ktime;
@@ -451,30 +452,61 @@ static int do_set_sock_timeout(int fd, int level, int optname, char __user *optv
 		return -EFAULT;
 	old_fs = get_fs();
 	set_fs(KERNEL_DS);
-	err = sys_setsockopt(fd, level, optname, (char *) &ktime, sizeof(ktime));
+	err = sock_setsockopt(sock, level, optname, (char *) &ktime, sizeof(ktime));
 	set_fs(old_fs);
 
 	return err;
 }
 
+static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
+				char __user *optval, int optlen)
+{
+	if (optname == SO_ATTACH_FILTER)
+		return do_set_attach_filter(sock, level, optname,
+					    optval, optlen);
+	if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+		return do_set_sock_timeout(sock, level, optname, optval, optlen);
+
+	return sock_setsockopt(sock, level, optname, optval, optlen);
+}
+
 asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
 				char __user *optval, int optlen)
 {
+	int err;
+	struct socket *sock;
+
 	/* SO_SET_REPLACE seems to be the same in all levels */
 	if (optname == IPT_SO_SET_REPLACE)
 		return do_netfilter_replace(fd, level, optname,
 					    optval, optlen);
-	if (level == SOL_SOCKET && optname == SO_ATTACH_FILTER)
-		return do_set_attach_filter(fd, level, optname,
-					    optval, optlen);
-	if (level == SOL_SOCKET &&
-	    (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
-		return do_set_sock_timeout(fd, level, optname, optval, optlen);
 
-	return sys_setsockopt(fd, level, optname, optval, optlen);
+	if (optlen < 0)
+		return -EINVAL;
+
+	if ((sock = sockfd_lookup(fd, &err))!=NULL)
+	{
+		err = security_socket_setsockopt(sock,level,optname);
+		if (err) {
+			sockfd_put(sock);
+			return err;
+		}
+
+		if (level == SOL_SOCKET)
+			err = compat_sock_setsockopt(sock, level,
+					optname, optval, optlen);
+		else if (sock->ops->compat_setsockopt)
+			err = sock->ops->compat_setsockopt(sock, level,
+					optname, optval, optlen);
+		else
+			err = sock->ops->setsockopt(sock, level,
+					optname, optval, optlen);
+		sockfd_put(sock);
+	}
+	return err;
 }
 
-static int do_get_sock_timeout(int fd, int level, int optname,
+static int do_get_sock_timeout(struct socket *sock, int level, int optname,
 		char __user *optval, int __user *optlen)
 {
 	struct compat_timeval __user *up;
@@ -490,7 +522,7 @@ static int do_get_sock_timeout(int fd, int level, int optname,
 	len = sizeof(ktime);
 	old_fs = get_fs();
 	set_fs(KERNEL_DS);
-	err = sys_getsockopt(fd, level, optname, (char *) &ktime, &len);
+	err = sock_getsockopt(sock, level, optname, (char *) &ktime, &len);
 	set_fs(old_fs);
 
 	if (!err) {
@@ -503,15 +535,42 @@ static int do_get_sock_timeout(int fd, int level, int optname,
 	return err;
 }
 
-asmlinkage long compat_sys_getsockopt(int fd, int level, int optname,
+static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
-	if (level == SOL_SOCKET &&
-	    (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
-		return do_get_sock_timeout(fd, level, optname, optval, optlen);
-	return sys_getsockopt(fd, level, optname, optval, optlen);
+	if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+		return do_get_sock_timeout(sock, level, optname, optval, optlen);
+	return sock_getsockopt(sock, level, optname, optval, optlen);
 }
 
+asmlinkage long compat_sys_getsockopt(int fd, int level, int optname,
+				char __user *optval, int __user *optlen)
+{
+	int err;
+	struct socket *sock;
+
+	if ((sock = sockfd_lookup(fd, &err))!=NULL)
+	{
+		err = security_socket_getsockopt(sock, level,
+							   optname);
+		if (err) {
+			sockfd_put(sock);
+			return err;
+		}
+
+		if (level == SOL_SOCKET)
+			err = compat_sock_getsockopt(sock, level,
+					optname, optval, optlen);
+		else if (sock->ops->compat_getsockopt)
+			err = sock->ops->compat_getsockopt(sock, level,
+					optname, optval, optlen);
+		else
+			err = sock->ops->getsockopt(sock, level,
+					optname, optval, optlen);
+		sockfd_put(sock);
+	}
+	return err;
+}
 /* Argument list sizes for compat_sys_socketcall */
 #define AL(x) ((x) * sizeof(u32))
 static unsigned char nas[18]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
diff --git a/net/core/sock.c b/net/core/sock.c
index 5038a5a7bd84..dd63cdea3fe7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1385,6 +1385,20 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
 
 EXPORT_SYMBOL(sock_common_getsockopt);
 
+#ifdef CONFIG_COMPAT
+int compat_sock_common_getsockopt(struct socket *sock, int level,
+		int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk->sk_prot->compat_setsockopt)
+		return sk->sk_prot->compat_getsockopt(sk, level,
+			optname, optval, optlen);
+	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_getsockopt);
+#endif
+
 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
 			struct msghdr *msg, size_t size, int flags)
 {
@@ -1414,6 +1428,20 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname,
 
 EXPORT_SYMBOL(sock_common_setsockopt);
 
+#ifdef CONFIG_COMPAT
+int compat_sock_common_setsockopt(struct socket *sock,
+		int level, int optname, char __user *optval, int optlen)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk->sk_prot->compat_setsockopt)
+		return sk->sk_prot->compat_setsockopt(sk, level,
+			optname, optval, optlen);
+	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_setsockopt);
+#endif
+
 void sk_common_release(struct sock *sk)
 {
 	if (sk->sk_prot->destroy)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 34e70fb89d4a..47de17208d7a 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -192,6 +192,14 @@ extern int	   dccp_getsockopt(struct sock *sk, int level, int optname,
 				   char __user *optval, int __user *optlen);
 extern int	   dccp_setsockopt(struct sock *sk, int level, int optname,
 				   char __user *optval, int optlen);
+#ifdef CONFIG_COMPAT
+extern int	   compat_dccp_getsockopt(struct sock *sk,
+				int level, int optname,
+				char __user *optval, int __user *optlen);
+extern int	   compat_dccp_setsockopt(struct sock *sk,
+				int level, int optname,
+				char __user *optval, int optlen);
+#endif
 extern int	   dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int	   dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
 				struct msghdr *msg, size_t size);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 80d450ba6219..8a33c8498d9c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -994,6 +994,10 @@ static struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 	.net_header_len	= sizeof(struct iphdr),
 	.setsockopt	= ip_setsockopt,
 	.getsockopt	= ip_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_ip_setsockopt,
+	.compat_getsockopt	= compat_ip_getsockopt,
+#endif
 	.addr2sockaddr	= inet_csk_addr2sockaddr,
 	.sockaddr_len	= sizeof(struct sockaddr_in),
 };
@@ -1040,6 +1044,10 @@ static struct proto dccp_v4_prot = {
 	.init			= dccp_v4_init_sock,
 	.setsockopt		= dccp_setsockopt,
 	.getsockopt		= dccp_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_dccp_setsockopt,
+	.compat_getsockopt	= compat_dccp_getsockopt,
+#endif
 	.sendmsg		= dccp_sendmsg,
 	.recvmsg		= dccp_recvmsg,
 	.backlog_rcv		= dccp_v4_do_rcv,
@@ -1079,6 +1087,10 @@ static const struct proto_ops inet_dccp_ops = {
 	.shutdown	= inet_shutdown,
 	.setsockopt	= sock_common_setsockopt,
 	.getsockopt	= sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg	= inet_sendmsg,
 	.recvmsg	= sock_common_recvmsg,
 	.mmap		= sock_no_mmap,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 7c8233f6d3c2..89106c7d3247 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1114,6 +1114,10 @@ static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
 	.net_header_len	= sizeof(struct ipv6hdr),
 	.setsockopt	= ipv6_setsockopt,
 	.getsockopt	= ipv6_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_ipv6_setsockopt,
+	.compat_getsockopt	= compat_ipv6_getsockopt,
+#endif
 	.addr2sockaddr	= inet6_csk_addr2sockaddr,
 	.sockaddr_len	= sizeof(struct sockaddr_in6)
 };
@@ -1130,6 +1134,10 @@ static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
 	.net_header_len	= sizeof(struct iphdr),
 	.setsockopt	= ipv6_setsockopt,
 	.getsockopt	= ipv6_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_ipv6_setsockopt,
+	.compat_getsockopt	= compat_ipv6_getsockopt,
+#endif
 	.addr2sockaddr	= inet6_csk_addr2sockaddr,
 	.sockaddr_len	= sizeof(struct sockaddr_in6)
 };
@@ -1167,6 +1175,10 @@ static struct proto dccp_v6_prot = {
 	.init		= dccp_v6_init_sock,
 	.setsockopt	= dccp_setsockopt,
 	.getsockopt	= dccp_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_dccp_setsockopt,
+	.compat_getsockopt	= compat_dccp_getsockopt,
+#endif
 	.sendmsg	= dccp_sendmsg,
 	.recvmsg	= dccp_recvmsg,
 	.backlog_rcv	= dccp_v6_do_rcv,
@@ -1204,6 +1216,10 @@ static struct proto_ops inet6_dccp_ops = {
 	.shutdown	= inet_shutdown,
 	.setsockopt	= sock_common_setsockopt,
 	.getsockopt	= sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg	= inet_sendmsg,
 	.recvmsg	= sock_common_recvmsg,
 	.mmap		= sock_no_mmap,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index baccaf35ffbd..59b214995f28 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -455,18 +455,13 @@ out_free_val:
 	goto out;
 }
 
-int dccp_setsockopt(struct sock *sk, int level, int optname,
-		    char __user *optval, int optlen)
+static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
+		char __user *optval, int optlen)
 {
 	struct dccp_sock *dp;
 	int err;
 	int val;
 
-	if (level != SOL_DCCP)
-		return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
-							     optname, optval,
-							     optlen);
-
 	if (optlen < sizeof(int))
 		return -EINVAL;
 
@@ -512,8 +507,34 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
 	return err;
 }
 
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int optlen)
+{
+	if (level != SOL_DCCP)
+		return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
+							     optname, optval,
+							     optlen);
+	return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
 EXPORT_SYMBOL_GPL(dccp_setsockopt);
 
+#ifdef CONFIG_COMPAT
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int optlen)
+{
+	if (level != SOL_DCCP) {
+		if (inet_csk(sk)->icsk_af_ops->compat_setsockopt)
+			return inet_csk(sk)->icsk_af_ops->compat_setsockopt(sk,
+				level, optname, optval, optlen);
+		else
+			return inet_csk(sk)->icsk_af_ops->setsockopt(sk,
+				level, optname, optval, optlen);
+	}
+	return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
+#endif
+
 static int dccp_getsockopt_service(struct sock *sk, int len,
 				   __be32 __user *optval,
 				   int __user *optlen)
@@ -545,16 +566,12 @@ out:
 	return err;
 }
 
-int dccp_getsockopt(struct sock *sk, int level, int optname,
+static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, int __user *optlen)
 {
 	struct dccp_sock *dp;
 	int val, len;
 
-	if (level != SOL_DCCP)
-		return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
-							     optname, optval,
-							     optlen);
 	if (get_user(len, optlen))
 		return -EFAULT;
 
@@ -587,8 +604,34 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
 	return 0;
 }
 
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	if (level != SOL_DCCP)
+		return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
+							     optname, optval,
+							     optlen);
+	return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+}
 EXPORT_SYMBOL_GPL(dccp_getsockopt);
 
+#ifdef CONFIG_COMPAT
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	if (level != SOL_DCCP) {
+		if (inet_csk(sk)->icsk_af_ops->compat_setsockopt)
+			return inet_csk(sk)->icsk_af_ops->compat_getsockopt(sk,
+				level, optname, optval, optlen);
+		else
+			return inet_csk(sk)->icsk_af_ops->getsockopt(sk,
+				level, optname, optval, optlen);
+	}
+	return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
+#endif
+
 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		 size_t len)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 97c276f95b35..454e523b506a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -802,6 +802,10 @@ const struct proto_ops inet_stream_ops = {
 	.shutdown =	inet_shutdown,
 	.setsockopt =	sock_common_setsockopt,
 	.getsockopt =	sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg =	inet_sendmsg,
 	.recvmsg =	sock_common_recvmsg,
 	.mmap =		sock_no_mmap,
@@ -823,6 +827,10 @@ const struct proto_ops inet_dgram_ops = {
 	.shutdown =	inet_shutdown,
 	.setsockopt =	sock_common_setsockopt,
 	.getsockopt =	sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg =	inet_sendmsg,
 	.recvmsg =	sock_common_recvmsg,
 	.mmap =		sock_no_mmap,
@@ -848,6 +856,10 @@ static const struct proto_ops inet_sockraw_ops = {
 	.shutdown =	inet_shutdown,
 	.setsockopt =	sock_common_setsockopt,
 	.getsockopt =	sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg =	inet_sendmsg,
 	.recvmsg =	sock_common_recvmsg,
 	.mmap =		sock_no_mmap,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b5c4f61518e8..49ff1cd4e1c9 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -399,14 +399,12 @@ out:
  *	an IP socket.
  */
 
-int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen)
+static int do_ip_setsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int optlen)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	int val=0,err;
 
-	if (level != SOL_IP)
-		return -ENOPROTOOPT;
-
 	if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | 
 			    (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | 
 			    (1<<IP_RETOPTS) | (1<<IP_TOS) | 
@@ -875,12 +873,7 @@ mc_msf_out:
 			break;
 
 		default:
-#ifdef CONFIG_NETFILTER
-			err = nf_setsockopt(sk, PF_INET, optname, optval, 
-					    optlen);
-#else
 			err = -ENOPROTOOPT;
-#endif
 			break;
 	}
 	release_sock(sk);
@@ -891,12 +884,66 @@ e_inval:
 	return -EINVAL;
 }
 
+int ip_setsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int optlen)
+{
+	int err;
+
+	if (level != SOL_IP)
+		return -ENOPROTOOPT;
+
+	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
+		optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY
+#ifdef CONFIG_IP_MROUTE
+		&& (optname < MRT_BASE || optname > (MRT_BASE + 10))
+#endif
+	   ) {
+		lock_sock(sk);
+		err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
+		release_sock(sk);
+	}
+#endif
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_ip_setsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int optlen)
+{
+	int err;
+
+	if (level != SOL_IP)
+		return -ENOPROTOOPT;
+
+	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
+		optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY
+#ifdef CONFIG_IP_MROUTE
+		&& (optname < MRT_BASE || optname > (MRT_BASE + 10))
+#endif
+	   ) {
+		lock_sock(sk);
+		err = compat_nf_setsockopt(sk, PF_INET,
+				optname, optval, optlen);
+		release_sock(sk);
+	}
+#endif
+	return err;
+}
+#endif
+
 /*
  *	Get the options. Note for future reference. The GET of IP options gets the
  *	_received_ ones. The set sets the _sent_ ones.
  */
 
-int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen)
+static int do_ip_getsockopt(struct sock *sk, int level, int optname,
+		char __user *optval, int __user *optlen)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	int val;
@@ -1080,17 +1127,8 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			val = inet->freebind; 
 			break; 
 		default:
-#ifdef CONFIG_NETFILTER
-			val = nf_getsockopt(sk, PF_INET, optname, optval, 
-					    &len);
-			release_sock(sk);
-			if (val >= 0)
-				val = put_user(len, optlen);
-			return val;
-#else
 			release_sock(sk);
 			return -ENOPROTOOPT;
-#endif
 	}
 	release_sock(sk);
 	
@@ -1111,7 +1149,73 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	return 0;
 }
 
+int ip_getsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int __user *optlen)
+{
+	int err;
+
+	err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS
+#ifdef CONFIG_IP_MROUTE
+		&& (optname < MRT_BASE || optname > MRT_BASE+10)
+#endif
+	   ) {
+	   	int len;
+
+		if(get_user(len,optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		err = nf_getsockopt(sk, PF_INET, optname, optval,
+				&len);
+		release_sock(sk);
+		if (err >= 0)
+			err = put_user(len, optlen);
+		return err;
+	}
+#endif
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_ip_getsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int __user *optlen)
+{
+	int err;
+
+	err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS
+#ifdef CONFIG_IP_MROUTE
+		&& (optname < MRT_BASE || optname > MRT_BASE+10)
+#endif
+	   ) {
+	   	int len;
+
+		if(get_user(len,optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		err = compat_nf_getsockopt(sk, PF_INET,
+				optname, optval, &len);
+		release_sock(sk);
+		if (err >= 0)
+			err = put_user(len, optlen);
+		return err;
+	}
+#endif
+	return err;
+}
+#endif
+
 EXPORT_SYMBOL(ip_cmsg_recv);
 
 EXPORT_SYMBOL(ip_getsockopt);
 EXPORT_SYMBOL(ip_setsockopt);
+#ifdef CONFIG_COMPAT
+EXPORT_SYMBOL(compat_ip_getsockopt);
+EXPORT_SYMBOL(compat_ip_setsockopt);
+#endif
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f29a12da5109..f1b02b34fc0a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -660,12 +660,9 @@ static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *o
 out:	return ret;
 }
 
-static int raw_setsockopt(struct sock *sk, int level, int optname, 
+static int do_raw_setsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, int optlen)
 {
-	if (level != SOL_RAW)
-		return ip_setsockopt(sk, level, optname, optval, optlen);
-
 	if (optname == ICMP_FILTER) {
 		if (inet_sk(sk)->num != IPPROTO_ICMP)
 			return -EOPNOTSUPP;
@@ -675,12 +672,28 @@ static int raw_setsockopt(struct sock *sk, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
-static int raw_getsockopt(struct sock *sk, int level, int optname, 
-			  char __user *optval, int __user *optlen)
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int optlen)
 {
 	if (level != SOL_RAW)
-		return ip_getsockopt(sk, level, optname, optval, optlen);
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+	return do_raw_setsockopt(sk, level, optname, optval, optlen);
+}
 
+#ifdef CONFIG_COMPAT
+static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int optlen)
+{
+	if (level != SOL_RAW)
+		return compat_ip_setsockopt(sk, level,
+				optname, optval, optlen);
+	return do_raw_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int do_raw_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
 	if (optname == ICMP_FILTER) {
 		if (inet_sk(sk)->num != IPPROTO_ICMP)
 			return -EOPNOTSUPP;
@@ -690,6 +703,25 @@ static int raw_getsockopt(struct sock *sk, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level != SOL_RAW)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+	return do_raw_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level != SOL_RAW)
+		return compat_ip_getsockopt(sk, level,
+				optname, optval, optlen);
+	return do_raw_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
 static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -728,6 +760,10 @@ struct proto raw_prot = {
 	.init =		raw_init,
 	.setsockopt =	raw_setsockopt,
 	.getsockopt =	raw_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt =	compat_raw_setsockopt,
+	.compat_getsockopt =	compat_raw_getsockopt,
+#endif
 	.sendmsg =	raw_sendmsg,
 	.recvmsg =	raw_recvmsg,
 	.bind =		raw_bind,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 00aa80e93243..31b0123a9699 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1687,18 +1687,14 @@ int tcp_disconnect(struct sock *sk, int flags)
 /*
  *	Socket option code for TCP.
  */
-int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
-		   int optlen)
+static int do_tcp_setsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int optlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int val;
 	int err = 0;
 
-	if (level != SOL_TCP)
-		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
-						     optval, optlen);
-
 	/* This is a string value all the others are int's */
 	if (optname == TCP_CONGESTION) {
 		char name[TCP_CA_NAME_MAX];
@@ -1871,6 +1867,35 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	return err;
 }
 
+int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+		   int optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (level != SOL_TCP)
+		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+						     optval, optlen);
+	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_tcp_setsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (level != SOL_TCP) {
+		if (icsk->icsk_af_ops->compat_setsockopt)
+			return icsk->icsk_af_ops->compat_setsockopt(sk,
+				level, optname, optval, optlen);
+		else
+			return icsk->icsk_af_ops->setsockopt(sk,
+				level, optname, optval, optlen);
+	}
+	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
 /* Return information about state of tcp endpoint in API format. */
 void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
@@ -1931,17 +1956,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
-int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
-		   int __user *optlen)
+static int do_tcp_getsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int __user *optlen)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int val, len;
 
-	if (level != SOL_TCP)
-		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
-						     optval, optlen);
-
 	if (get_user(len, optlen))
 		return -EFAULT;
 
@@ -2025,6 +2046,34 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	return 0;
 }
 
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+		   int __user *optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (level != SOL_TCP)
+		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+						     optval, optlen);
+	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_tcp_getsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int __user *optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (level != SOL_TCP) {
+		if (icsk->icsk_af_ops->compat_getsockopt)
+			return icsk->icsk_af_ops->compat_getsockopt(sk,
+				level, optname, optval, optlen);
+		else
+			return icsk->icsk_af_ops->getsockopt(sk,
+				level, optname, optval, optlen);
+	}
+	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
 
 extern void __skb_cb_too_small_for_tcp(int, int);
 extern struct tcp_congestion_ops tcp_reno;
@@ -2142,3 +2191,7 @@ EXPORT_SYMBOL(tcp_sendpage);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
 EXPORT_SYMBOL(tcp_statistics);
+#ifdef CONFIG_COMPAT
+EXPORT_SYMBOL(compat_tcp_setsockopt);
+EXPORT_SYMBOL(compat_tcp_getsockopt);
+#endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4eb903db1b12..249ef6c88959 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1226,6 +1226,10 @@ struct inet_connection_sock_af_ops ipv4_specific = {
 	.net_header_len	=	sizeof(struct iphdr),
 	.setsockopt	=	ip_setsockopt,
 	.getsockopt	=	ip_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt =	compat_ip_setsockopt,
+	.compat_getsockopt =	compat_ip_getsockopt,
+#endif
 	.addr2sockaddr	=	inet_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in),
 };
@@ -1808,6 +1812,10 @@ struct proto tcp_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_tcp_setsockopt,
+	.compat_getsockopt	= compat_tcp_getsockopt,
+#endif
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
 	.backlog_rcv		= tcp_v4_do_rcv,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 00840474a449..0b0721bd45c6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1207,16 +1207,13 @@ static int udp_destroy_sock(struct sock *sk)
 /*
  *	Socket option code for UDP
  */
-static int udp_setsockopt(struct sock *sk, int level, int optname, 
+static int do_udp_setsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, int optlen)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int val;
 	int err = 0;
 
-	if (level != SOL_UDP)
-		return ip_setsockopt(sk, level, optname, optval, optlen);
-
 	if(optlen<sizeof(int))
 		return -EINVAL;
 
@@ -1256,15 +1253,31 @@ static int udp_setsockopt(struct sock *sk, int level, int optname,
 	return err;
 }
 
-static int udp_getsockopt(struct sock *sk, int level, int optname, 
+static int udp_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int optlen)
+{
+	if (level != SOL_UDP)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+	return do_udp_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+		char __user *optval, int optlen)
+{
+	if (level != SOL_UDP)
+		return compat_ip_setsockopt(sk, level,
+				optname, optval, optlen);
+	return do_udp_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int do_udp_getsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, int __user *optlen)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int val, len;
 
-	if (level != SOL_UDP)
-		return ip_getsockopt(sk, level, optname, optval, optlen);
-
 	if(get_user(len,optlen))
 		return -EFAULT;
 
@@ -1293,6 +1306,24 @@ static int udp_getsockopt(struct sock *sk, int level, int optname,
   	return 0;
 }
 
+static int udp_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+	return do_udp_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP)
+		return compat_ip_getsockopt(sk, level,
+				optname, optval, optlen);
+	return do_udp_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
 /**
  * 	udp_poll - wait for a UDP event.
  *	@file - file struct
@@ -1350,6 +1381,10 @@ struct proto udp_prot = {
 	.destroy =	udp_destroy_sock,
 	.setsockopt =	udp_setsockopt,
 	.getsockopt =	udp_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt =	compat_udp_setsockopt,
+	.compat_getsockopt =	compat_udp_getsockopt,
+#endif
 	.sendmsg =	udp_sendmsg,
 	.recvmsg =	udp_recvmsg,
 	.sendpage =	udp_sendpage,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6c9711ac1c03..97844c4cd9b1 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -470,6 +470,10 @@ const struct proto_ops inet6_stream_ops = {
 	.shutdown =	inet_shutdown,			/* ok		*/
 	.setsockopt =	sock_common_setsockopt,		/* ok		*/
 	.getsockopt =	sock_common_getsockopt,		/* ok		*/
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg =	inet_sendmsg,			/* ok		*/
 	.recvmsg =	sock_common_recvmsg,		/* ok		*/
 	.mmap =		sock_no_mmap,
@@ -491,6 +495,10 @@ const struct proto_ops inet6_dgram_ops = {
 	.shutdown =	inet_shutdown,			/* ok		*/
 	.setsockopt =	sock_common_setsockopt,		/* ok		*/
 	.getsockopt =	sock_common_getsockopt,		/* ok		*/
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg =	inet_sendmsg,			/* ok		*/
 	.recvmsg =	sock_common_recvmsg,		/* ok		*/
 	.mmap =		sock_no_mmap,
@@ -519,6 +527,10 @@ static const struct proto_ops inet6_sockraw_ops = {
 	.shutdown =	inet_shutdown,			/* ok		*/
 	.setsockopt =	sock_common_setsockopt,		/* ok		*/
 	.getsockopt =	sock_common_getsockopt,		/* ok		*/
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg =	inet_sendmsg,			/* ok		*/
 	.recvmsg =	sock_common_recvmsg,		/* ok		*/
 	.mmap =		sock_no_mmap,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index f7142ba519ab..988eac58e9d1 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -109,19 +109,13 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
 	return 0;
 }
 
-int ipv6_setsockopt(struct sock *sk, int level, int optname,
+static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, int optlen)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	int val, valbool;
 	int retv = -ENOPROTOOPT;
 
-	if (level == SOL_IP && sk->sk_type != SOCK_RAW)
-		return udp_prot.setsockopt(sk, level, optname, optval, optlen);
-
-	if(level!=SOL_IPV6)
-		goto out;
-
 	if (optval == NULL)
 		val=0;
 	else if (get_user(val, (int __user *) optval))
@@ -613,17 +607,9 @@ done:
 		retv = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
-#ifdef CONFIG_NETFILTER
-	default:
-		retv = nf_setsockopt(sk, PF_INET6, optname, optval, 
-					    optlen);
-		break;
-#endif
-
 	}
 	release_sock(sk);
 
-out:
 	return retv;
 
 e_inval:
@@ -631,6 +617,65 @@ e_inval:
 	return -EINVAL;
 }
 
+int ipv6_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int optlen)
+{
+	int err;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW)
+		return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+
+	if (level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	err = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
+			optname != IPV6_XFRM_POLICY) {
+		lock_sock(sk);
+		err = nf_setsockopt(sk, PF_INET6, optname, optval,
+				optlen);
+		release_sock(sk);
+	}
+#endif
+	return err;
+}
+
+
+#ifdef CONFIG_COMPAT
+int compat_ipv6_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int optlen)
+{
+	int err;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
+		if (udp_prot.compat_setsockopt)
+			return udp_prot.compat_setsockopt(sk, level,
+					optname, optval, optlen);
+		else
+			return udp_prot.setsockopt(sk, level,
+					optname, optval, optlen);
+	}
+
+	if (level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	err = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
+			optname != IPV6_XFRM_POLICY) {
+		lock_sock(sk);
+		err = compat_nf_setsockopt(sk, PF_INET6, optname, optval,
+				optlen);
+		release_sock(sk);
+	}
+#endif
+	return err;
+}
+#endif
+
 static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr,
 				  char __user *optval, int len)
 {
@@ -642,17 +687,13 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_opt_hdr *hdr,
 	return len;
 }
 
-int ipv6_getsockopt(struct sock *sk, int level, int optname,
+static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, int __user *optlen)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	int len;
 	int val;
 
-	if (level == SOL_IP && sk->sk_type != SOCK_RAW)
-		return udp_prot.getsockopt(sk, level, optname, optval, optlen);
-	if(level!=SOL_IPV6)
-		return -ENOPROTOOPT;
 	if (get_user(len, optlen))
 		return -EFAULT;
 	switch (optname) {
@@ -842,17 +883,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	default:
-#ifdef CONFIG_NETFILTER
-		lock_sock(sk);
-		val = nf_getsockopt(sk, PF_INET6, optname, optval, 
-				    &len);
-		release_sock(sk);
-		if (val >= 0)
-			val = put_user(len, optlen);
-		return val;
-#else
 		return -EINVAL;
-#endif
 	}
 	len = min_t(unsigned int, sizeof(int), len);
 	if(put_user(len, optlen))
@@ -862,6 +893,78 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
 	return 0;
 }
 
+int ipv6_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	int err;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW)
+		return udp_prot.getsockopt(sk, level, optname, optval, optlen);
+
+	if(level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	err = do_ipv6_getsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible EINVALs except default case */
+	if (err == -ENOPROTOOPT && optname != IPV6_ADDRFORM &&
+			optname != MCAST_MSFILTER) {
+		int len;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		err = nf_getsockopt(sk, PF_INET6, optname, optval,
+				&len);
+		release_sock(sk);
+		if (err >= 0)
+			err = put_user(len, optlen);
+	}
+#endif
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	int err;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
+		if (udp_prot.compat_getsockopt)
+			return udp_prot.compat_getsockopt(sk, level,
+					optname, optval, optlen);
+		else
+			return udp_prot.getsockopt(sk, level,
+					optname, optval, optlen);
+	}
+
+	if(level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	err = do_ipv6_getsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible EINVALs except default case */
+	if (err == -ENOPROTOOPT && optname != IPV6_ADDRFORM &&
+			optname != MCAST_MSFILTER) {
+		int len;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		err = compat_nf_getsockopt(sk, PF_INET6, optname, optval,
+				&len);
+		release_sock(sk);
+		if (err >= 0)
+			err = put_user(len, optlen);
+	}
+#endif
+	return err;
+}
+#endif
+
 void __init ipv6_packet_init(void)
 {
 	dev_add_pack(&ipv6_packet_type);
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 16482785bdfd..61419e11e35d 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -18,6 +18,10 @@ EXPORT_SYMBOL(ip6_route_output);
 EXPORT_SYMBOL(addrconf_lock);
 EXPORT_SYMBOL(ipv6_setsockopt);
 EXPORT_SYMBOL(ipv6_getsockopt);
+#ifdef CONFIG_COMPAT
+EXPORT_SYMBOL(compat_ipv6_setsockopt);
+EXPORT_SYMBOL(compat_ipv6_getsockopt);
+#endif
 EXPORT_SYMBOL(inet6_register_protosw);
 EXPORT_SYMBOL(inet6_unregister_protosw);
 EXPORT_SYMBOL(inet6_add_protocol);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ae20a0ec9bd8..8de5a8e59149 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -859,29 +859,12 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
 }
 
 
-static int rawv6_setsockopt(struct sock *sk, int level, int optname, 
+static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 			    char __user *optval, int optlen)
 {
 	struct raw6_sock *rp = raw6_sk(sk);
 	int val;
 
-	switch(level) {
-		case SOL_RAW:
-			break;
-
-		case SOL_ICMPV6:
-			if (inet_sk(sk)->num != IPPROTO_ICMPV6)
-				return -EOPNOTSUPP;
-			return rawv6_seticmpfilter(sk, level, optname, optval,
-						   optlen);
-		case SOL_IPV6:
-			if (optname == IPV6_CHECKSUM)
-				break;
-		default:
-			return ipv6_setsockopt(sk, level, optname, optval,
-					       optlen);
-	};
-
   	if (get_user(val, (int __user *)optval))
 		return -EFAULT;
 
@@ -906,12 +889,9 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
 	}
 }
 
-static int rawv6_getsockopt(struct sock *sk, int level, int optname, 
-			    char __user *optval, int __user *optlen)
+static int rawv6_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int optlen)
 {
-	struct raw6_sock *rp = raw6_sk(sk);
-	int val, len;
-
 	switch(level) {
 		case SOL_RAW:
 			break;
@@ -919,15 +899,47 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
 		case SOL_ICMPV6:
 			if (inet_sk(sk)->num != IPPROTO_ICMPV6)
 				return -EOPNOTSUPP;
-			return rawv6_geticmpfilter(sk, level, optname, optval,
+			return rawv6_seticmpfilter(sk, level, optname, optval,
 						   optlen);
 		case SOL_IPV6:
 			if (optname == IPV6_CHECKSUM)
 				break;
 		default:
-			return ipv6_getsockopt(sk, level, optname, optval,
+			return ipv6_setsockopt(sk, level, optname, optval,
 					       optlen);
 	};
+	return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int optlen)
+{
+	switch(level) {
+		case SOL_RAW:
+			break;
+
+		case SOL_ICMPV6:
+			if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+				return -EOPNOTSUPP;
+			return rawv6_seticmpfilter(sk, level, optname, optval,
+						   optlen);
+		case SOL_IPV6:
+			if (optname == IPV6_CHECKSUM)
+				break;
+		default:
+			return compat_ipv6_setsockopt(sk, level,
+					optname, optval, optlen);
+	};
+	return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, int __user *optlen)
+{
+	struct raw6_sock *rp = raw6_sk(sk);
+	int val, len;
 
 	if (get_user(len,optlen))
 		return -EFAULT;
@@ -953,6 +965,52 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
 	return 0;
 }
 
+static int rawv6_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	switch(level) {
+		case SOL_RAW:
+			break;
+
+		case SOL_ICMPV6:
+			if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+				return -EOPNOTSUPP;
+			return rawv6_geticmpfilter(sk, level, optname, optval,
+						   optlen);
+		case SOL_IPV6:
+			if (optname == IPV6_CHECKSUM)
+				break;
+		default:
+			return ipv6_getsockopt(sk, level, optname, optval,
+					       optlen);
+	};
+	return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	switch(level) {
+		case SOL_RAW:
+			break;
+
+		case SOL_ICMPV6:
+			if (inet_sk(sk)->num != IPPROTO_ICMPV6)
+				return -EOPNOTSUPP;
+			return rawv6_geticmpfilter(sk, level, optname, optval,
+						   optlen);
+		case SOL_IPV6:
+			if (optname == IPV6_CHECKSUM)
+				break;
+		default:
+			return compat_ipv6_getsockopt(sk, level,
+					optname, optval, optlen);
+	};
+	return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
 static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
 	switch(cmd) {
@@ -1008,6 +1066,10 @@ struct proto rawv6_prot = {
 	.destroy =	inet6_destroy_sock,
 	.setsockopt =	rawv6_setsockopt,
 	.getsockopt =	rawv6_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt =	compat_rawv6_setsockopt,
+	.compat_getsockopt =	compat_rawv6_getsockopt,
+#endif
 	.sendmsg =	rawv6_sendmsg,
 	.recvmsg =	rawv6_recvmsg,
 	.bind =		rawv6_bind,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index af6a0c60f903..2f8975e0150a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1308,6 +1308,10 @@ static struct inet_connection_sock_af_ops ipv6_specific = {
 
 	.setsockopt	=	ipv6_setsockopt,
 	.getsockopt	=	ipv6_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_ipv6_setsockopt,
+	.compat_getsockopt	= compat_ipv6_getsockopt,
+#endif
 	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in6)
 };
@@ -1327,6 +1331,10 @@ static struct inet_connection_sock_af_ops ipv6_mapped = {
 
 	.setsockopt	=	ipv6_setsockopt,
 	.getsockopt	=	ipv6_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_ipv6_setsockopt,
+	.compat_getsockopt	= compat_ipv6_getsockopt,
+#endif
 	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in6)
 };
@@ -1566,6 +1574,10 @@ struct proto tcpv6_prot = {
 	.shutdown		= tcp_shutdown,
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_tcp_setsockopt,
+	.compat_getsockopt	= compat_tcp_getsockopt,
+#endif
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
 	.backlog_rcv		= tcp_v6_do_rcv,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c47648892c04..538ada00646a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -880,16 +880,13 @@ static int udpv6_destroy_sock(struct sock *sk)
 /*
  *	Socket option code for UDP
  */
-static int udpv6_setsockopt(struct sock *sk, int level, int optname, 
+static int do_udpv6_setsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, int optlen)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int val;
 	int err = 0;
 
-	if (level != SOL_UDP)
-		return ipv6_setsockopt(sk, level, optname, optval, optlen);
-
 	if(optlen<sizeof(int))
 		return -EINVAL;
 
@@ -927,15 +924,31 @@ static int udpv6_setsockopt(struct sock *sk, int level, int optname,
 	return err;
 }
 
-static int udpv6_getsockopt(struct sock *sk, int level, int optname, 
+static int udpv6_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int optlen)
+{
+	if (level != SOL_UDP)
+		return ipv6_setsockopt(sk, level, optname, optval, optlen);
+	return do_udpv6_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
+		char __user *optval, int optlen)
+{
+	if (level != SOL_UDP)
+		return compat_ipv6_setsockopt(sk, level,
+				optname, optval, optlen);
+	return do_udpv6_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+static int do_udpv6_getsockopt(struct sock *sk, int level, int optname,
 			  char __user *optval, int __user *optlen)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int val, len;
 
-	if (level != SOL_UDP)
-		return ipv6_getsockopt(sk, level, optname, optval, optlen);
-
 	if(get_user(len,optlen))
 		return -EFAULT;
 
@@ -964,6 +977,25 @@ static int udpv6_getsockopt(struct sock *sk, int level, int optname,
   	return 0;
 }
 
+static int udpv6_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP)
+		return ipv6_getsockopt(sk, level, optname, optval, optlen);
+	return do_udpv6_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP)
+		return compat_ipv6_getsockopt(sk, level,
+				optname, optval, optlen);
+	return do_udpv6_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
 static struct inet6_protocol udpv6_protocol = {
 	.handler	=	udpv6_rcv,
 	.err_handler	=	udpv6_err,
@@ -1046,6 +1078,10 @@ struct proto udpv6_prot = {
 	.destroy =	udpv6_destroy_sock,
 	.setsockopt =	udpv6_setsockopt,
 	.getsockopt =	udpv6_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt =	compat_udpv6_setsockopt,
+	.compat_getsockopt =	compat_udpv6_getsockopt,
+#endif
 	.sendmsg =	udpv6_sendmsg,
 	.recvmsg =	udpv6_recvmsg,
 	.backlog_rcv =	udpv6_queue_rcv_skb,
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index 0e5c5e204799..da1cd48de216 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -131,3 +131,72 @@ int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
 }
 EXPORT_SYMBOL(nf_getsockopt);
 
+#ifdef CONFIG_COMPAT
+static int compat_nf_sockopt(struct sock *sk, int pf, int val,
+		      char __user *opt, int *len, int get)
+{
+	struct list_head *i;
+	struct nf_sockopt_ops *ops;
+	int ret;
+
+	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
+		return -EINTR;
+
+	list_for_each(i, &nf_sockopts) {
+		ops = (struct nf_sockopt_ops *)i;
+		if (ops->pf == pf) {
+			if (get) {
+				if (val >= ops->get_optmin
+				    && val < ops->get_optmax) {
+					ops->use++;
+					mutex_unlock(&nf_sockopt_mutex);
+					if (ops->compat_get)
+						ret = ops->compat_get(sk,
+							val, opt, len);
+					else
+						ret = ops->get(sk,
+							val, opt, len);
+					goto out;
+				}
+			} else {
+				if (val >= ops->set_optmin
+				    && val < ops->set_optmax) {
+					ops->use++;
+					mutex_unlock(&nf_sockopt_mutex);
+					if (ops->compat_set)
+						ret = ops->compat_set(sk,
+							val, opt, *len);
+					else
+						ret = ops->set(sk,
+							val, opt, *len);
+					goto out;
+				}
+			}
+		}
+	}
+	mutex_unlock(&nf_sockopt_mutex);
+	return -ENOPROTOOPT;
+
+ out:
+	mutex_lock(&nf_sockopt_mutex);
+	ops->use--;
+	if (ops->cleanup_task)
+		wake_up_process(ops->cleanup_task);
+	mutex_unlock(&nf_sockopt_mutex);
+	return ret;
+}
+
+int compat_nf_setsockopt(struct sock *sk, int pf,
+		int val, char __user *opt, int len)
+{
+	return compat_nf_sockopt(sk, pf, val, opt, &len, 0);
+}
+EXPORT_SYMBOL(compat_nf_setsockopt);
+
+int compat_nf_getsockopt(struct sock *sk, int pf,
+		int val, char __user *opt, int *len)
+{
+	return compat_nf_sockopt(sk, pf, val, opt, len, 1);
+}
+EXPORT_SYMBOL(compat_nf_getsockopt);
+#endif
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 2e266129a764..bbee14d01c9b 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -875,6 +875,10 @@ static const struct proto_ops inet6_seqpacket_ops = {
 	.shutdown   = inet_shutdown,
 	.setsockopt = sock_common_setsockopt,
 	.getsockopt = sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_sock_common_setsockopt,
+	.compat_getsockopt	= compat_sock_common_getsockopt,
+#endif
 	.sendmsg    = inet_sendmsg,
 	.recvmsg    = sock_common_recvmsg,
 	.mmap       = sock_no_mmap,
@@ -914,6 +918,10 @@ static struct sctp_af sctp_ipv6_specific = {
 	.sctp_xmit       = sctp_v6_xmit,
 	.setsockopt      = ipv6_setsockopt,
 	.getsockopt      = ipv6_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_ipv6_setsockopt,
+	.compat_getsockopt	= compat_ipv6_getsockopt,
+#endif
 	.get_dst	 = sctp_v6_get_dst,
 	.get_saddr	 = sctp_v6_get_saddr,
 	.copy_addrlist   = sctp_v6_copy_addrlist,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index de693b43c8ea..d90f5491870f 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -845,6 +845,10 @@ static const struct proto_ops inet_seqpacket_ops = {
 	.shutdown    = inet_shutdown,     /* Looks harmless.  */
 	.setsockopt  = sock_common_setsockopt,   /* IP_SOL IP_OPTION is a problem. */
 	.getsockopt  = sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt  = compat_sock_common_setsockopt,
+	.compat_getsockopt  = compat_sock_common_getsockopt,
+#endif
 	.sendmsg     = inet_sendmsg,
 	.recvmsg     = sock_common_recvmsg,
 	.mmap        = sock_no_mmap,
@@ -883,6 +887,10 @@ static struct sctp_af sctp_ipv4_specific = {
 	.sctp_xmit      = sctp_v4_xmit,
 	.setsockopt     = ip_setsockopt,
 	.getsockopt     = ip_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt     = compat_ip_setsockopt,
+	.compat_getsockopt     = compat_ip_getsockopt,
+#endif
 	.get_dst	= sctp_v4_get_dst,
 	.get_saddr	= sctp_v4_get_saddr,
 	.copy_addrlist  = sctp_v4_copy_addrlist,
-- 
cgit v1.2.3


From a4bf3902427a128455b8de299ff0918072b2e974 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Mon, 20 Mar 2006 22:50:58 -0800
Subject: [DCCP] minisock: Rename struct dccp_options to struct dccp_minisock

This will later be included in struct dccp_request_sock so that we can
have per connection feature negotiation state while in the 3way
handshake, when we clone the DCCP_ROLE_LISTEN socket (in
dccp_create_openreq_child) we'll just copy this state from
dreq_minisock to dccps_minisock.

Also the feature negotiation and option parsing code will mostly touch
dccps_minisock, which will simplify some stuff.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h | 43 +++++++++++++++++-----------
 net/dccp/dccp.h      | 11 ++++---
 net/dccp/diag.c      |  2 +-
 net/dccp/feat.c      | 81 ++++++++++++++++++++++++----------------------------
 net/dccp/input.c     |  8 +++---
 net/dccp/minisocks.c |  9 +++---
 net/dccp/options.c   | 30 +++++++++----------
 net/dccp/proto.c     | 23 +++++++--------
 8 files changed, 104 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index e35f680f909b..676333b9fad0 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -328,21 +328,24 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
 #define DCCP_NDP_LIMIT 0xFFFFFF
 
 /**
-  * struct dccp_options - option values for a DCCP connection
-  *	@dccpo_sequence_window - Sequence Window Feature (section 7.5.2)
-  *	@dccpo_ccid - Congestion Control Id (CCID) (section 10)
-  *	@dccpo_send_ack_vector - Send Ack Vector Feature (section 11.5)
-  *	@dccpo_send_ndp_count - Send NDP Count Feature (7.7.2)
+  * struct dccp_minisock - Minimal DCCP connection representation
+  *
+  * Will be used to pass the state from dccp_request_sock to dccp_sock.
+  *
+  * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2)
+  * @dccpms_ccid - Congestion Control Id (CCID) (section 10)
+  * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
+  * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2)
   */
-struct dccp_options {
-	__u64	dccpo_sequence_window;
-	__u8	dccpo_rx_ccid;
-	__u8	dccpo_tx_ccid;
-	__u8	dccpo_send_ack_vector;
-	__u8	dccpo_send_ndp_count;
-	__u8			dccpo_ack_ratio;
-	struct list_head	dccpo_pending;
-	struct list_head	dccpo_conf;
+struct dccp_minisock {
+	__u64			dccpms_sequence_window;
+	__u8			dccpms_rx_ccid;
+	__u8			dccpms_tx_ccid;
+	__u8			dccpms_send_ack_vector;
+	__u8			dccpms_send_ndp_count;
+	__u8			dccpms_ack_ratio;
+	struct list_head	dccpms_pending;
+	struct list_head	dccpms_conf;
 };
 
 struct dccp_opt_conf {
@@ -360,8 +363,9 @@ struct dccp_opt_pend {
 	struct dccp_opt_conf    *dccpop_sc;
 };
 
-extern void __dccp_options_init(struct dccp_options *dccpo);
-extern void dccp_options_init(struct dccp_options *dccpo);
+extern void __dccp_minisock_init(struct dccp_minisock *dmsk);
+extern void dccp_minisock_init(struct dccp_minisock *dmsk);
+
 extern int dccp_parse_options(struct sock *sk, struct sk_buff *skb);
 
 struct dccp_request_sock {
@@ -457,7 +461,7 @@ struct dccp_sock {
 	__u16				dccps_r_ack_ratio;
 	unsigned long			dccps_ndp_count;
 	__u32				dccps_mss_cache;
-	struct dccp_options		dccps_options;
+	struct dccp_minisock		dccps_minisock;
 	struct dccp_ackvec		*dccps_hc_rx_ackvec;
 	struct ccid			*dccps_hc_rx_ccid;
 	struct ccid			*dccps_hc_tx_ccid;
@@ -473,6 +477,11 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk)
 	return (struct dccp_sock *)sk;
 }
 
+static inline struct dccp_minisock *dccp_msk(const struct sock *sk)
+{
+	return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock;
+}
+
 static inline int dccp_service_not_initialized(const struct sock *sk)
 {
 	return dccp_sk(sk)->dccps_service == DCCP_SERVICE_INVALID_VALUE;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 47de17208d7a..1fe509148689 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -303,14 +303,13 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
 static inline void dccp_update_gsr(struct sock *sk, u64 seq)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	const struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	dp->dccps_gsr = seq;
 	dccp_set_seqno(&dp->dccps_swl,
-		       (dp->dccps_gsr + 1 -
-		        (dp->dccps_options.dccpo_sequence_window / 4)));
+		       dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4));
 	dccp_set_seqno(&dp->dccps_swh,
-		       (dp->dccps_gsr +
-			(3 * dp->dccps_options.dccpo_sequence_window) / 4));
+		       dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4);
 }
 
 static inline void dccp_update_gss(struct sock *sk, u64 seq)
@@ -320,7 +319,7 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
 	dp->dccps_awh = dp->dccps_gss = seq;
 	dccp_set_seqno(&dp->dccps_awl,
 		       (dp->dccps_gss -
-			dp->dccps_options.dccpo_sequence_window + 1));
+			dccp_msk(sk)->dccpms_sequence_window + 1));
 }
 				
 static inline int dccp_ack_pending(const struct sock *sk)
@@ -328,7 +327,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
 	const struct dccp_sock *dp = dccp_sk(sk);
 	return dp->dccps_timestamp_echo != 0 ||
 #ifdef CONFIG_IP_DCCP_ACKVEC
-	       (dp->dccps_options.dccpo_send_ack_vector &&
+	       (dccp_msk(sk)->dccpms_send_ack_vector &&
 		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
 #endif
 	       inet_csk_ack_scheduled(sk);
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 3f78c00e3822..0f25dc395967 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -30,7 +30,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_backoff	= icsk->icsk_backoff;
 	info->tcpi_pmtu		= icsk->icsk_pmtu_cookie;
 
-	if (dp->dccps_options.dccpo_send_ack_vector)
+	if (dccp_msk(sk)->dccpms_send_ack_vector)
 		info->tcpi_options |= TCPI_OPT_SACK;
 
 	ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index ed0851fa3cb3..b09064c4643d 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -22,7 +22,7 @@
 int dccp_feat_change(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len,
 		     gfp_t gfp)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct dccp_opt_pend *opt;
 
 	dccp_pr_debug("feat change type=%d feat=%d\n", type, feature);
@@ -30,8 +30,7 @@ int dccp_feat_change(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len,
 	/* XXX sanity check feat change request */
 
 	/* check if that feature is already being negotiated */
-	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
-			    dccpop_node) {
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
 		/* ok we found a negotiation for this option already */
 		if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
 			dccp_pr_debug("Replacing old\n");
@@ -59,7 +58,7 @@ int dccp_feat_change(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len,
 
 	BUG_ON(opt->dccpop_val == NULL);
 
-	list_add_tail(&opt->dccpop_node, &dp->dccps_options.dccpo_pending);
+	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
 	return 0;
 }
 
@@ -68,10 +67,10 @@ EXPORT_SYMBOL_GPL(dccp_feat_change);
 static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	/* figure out if we are changing our CCID or the peer's */
 	const int rx = type == DCCPO_CHANGE_R;
-	const u8 ccid_nr = rx ? dp->dccps_options.dccpo_rx_ccid :
-				dp->dccps_options.dccpo_tx_ccid;
+	const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid;
 	struct ccid *new_ccid;
 
 	/* Check if nothing is being changed. */
@@ -85,11 +84,11 @@ static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr)
 	if (rx) {
 		ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
 		dp->dccps_hc_rx_ccid = new_ccid;
-		dp->dccps_options.dccpo_rx_ccid = new_ccid_nr;
+		dmsk->dccpms_rx_ccid = new_ccid_nr;
 	} else {
 		ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
 		dp->dccps_hc_tx_ccid = new_ccid;
-		dp->dccps_options.dccpo_tx_ccid = new_ccid_nr;
+		dmsk->dccpms_tx_ccid = new_ccid_nr;
 	}
 
 	return 0;
@@ -159,9 +158,9 @@ static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
 		case DCCPF_CCID:
 			/* XXX did i get this right? =P */
 			if (opt->dccpop_type == DCCPO_CHANGE_L)
-				res = &dp->dccps_options.dccpo_tx_ccid;
+				res = &dccp_msk(sk)->dccpms_tx_ccid;
 			else
-				res = &dp->dccps_options.dccpo_rx_ccid;
+				res = &dccp_msk(sk)->dccpms_rx_ccid;
 			break;
 
 		default:
@@ -226,7 +225,7 @@ static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt,
 
 static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct dccp_opt_pend *opt;
 	int rc = 1;
 	u8 t;
@@ -242,8 +241,7 @@ static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 		t = DCCPO_CHANGE_L;
 
 	/* find our preference list for this feature */
-	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
-			    dccpop_node) {
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
 		if (opt->dccpop_type != t || opt->dccpop_feat != feature)
 			continue;
 
@@ -265,7 +263,7 @@ static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 {
 	struct dccp_opt_pend *opt;
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	u8 *copy;
 	int rc;
 
@@ -304,14 +302,14 @@ static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len)
 	}
 
 	dccp_pr_debug("Confirming NN feature %d (val=%d)\n", feature, *copy);
-	list_add_tail(&opt->dccpop_node, &dp->dccps_options.dccpo_conf);
+	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
 
 	return 0;
 }
 
 static void dccp_feat_empty_confirm(struct sock *sk, u8 type, u8 feature)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	/* XXX check if other confirms for that are queued and recycle slot */
 	struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC);
 
@@ -330,20 +328,19 @@ static void dccp_feat_empty_confirm(struct sock *sk, u8 type, u8 feature)
 
 	/* change feature */
 	dccp_pr_debug("Empty confirm feature %d type %d\n", feature, type);
-	list_add_tail(&opt->dccpop_node, &dp->dccps_options.dccpo_conf);
+	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf);
 }
 
 static void dccp_feat_flush_confirm(struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	/* Check if there is anything to confirm in the first place */
-	int yes = !list_empty(&dp->dccps_options.dccpo_conf);
+	int yes = !list_empty(&dmsk->dccpms_conf);
 
 	if (!yes) {
 		struct dccp_opt_pend *opt;
 
-		list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
-				    dccpop_node) {
+		list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
 			if (opt->dccpop_conf) {
 				yes = 1;
 				break;
@@ -407,7 +404,7 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 {
 	u8 t;
 	struct dccp_opt_pend *opt;
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	int rc = 1;
 	int all_confirmed = 1;
 
@@ -418,8 +415,7 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 	/* locate our change request */
 	t = type == DCCPO_CONFIRM_L ? DCCPO_CHANGE_R : DCCPO_CHANGE_L;
 
-	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
-			    dccpop_node) {
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
 		if (!opt->dccpop_conf && opt->dccpop_type == t &&
 		    opt->dccpop_feat == feature) {
 			/* we found it */
@@ -462,10 +458,10 @@ EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv);
 
 void dccp_feat_clean(struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct dccp_opt_pend *opt, *next;
 
-	list_for_each_entry_safe(opt, next, &dp->dccps_options.dccpo_pending,
+	list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending,
 				 dccpop_node) {
                 BUG_ON(opt->dccpop_val == NULL);
                 kfree(opt->dccpop_val);
@@ -478,16 +474,15 @@ void dccp_feat_clean(struct sock *sk)
 
                 kfree(opt);
         }
-	INIT_LIST_HEAD(&dp->dccps_options.dccpo_pending);
+	INIT_LIST_HEAD(&dmsk->dccpms_pending);
 
-	list_for_each_entry_safe(opt, next, &dp->dccps_options.dccpo_conf,
-				 dccpop_node) {
+	list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
 		BUG_ON(opt == NULL);
 		if (opt->dccpop_val != NULL)
 			kfree(opt->dccpop_val);
 		kfree(opt);
 	}
-	INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
+	INIT_LIST_HEAD(&dmsk->dccpms_conf);
 }
 
 EXPORT_SYMBOL_GPL(dccp_feat_clean);
@@ -498,16 +493,15 @@ EXPORT_SYMBOL_GPL(dccp_feat_clean);
  */
 int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
 {
-	struct dccp_sock *olddp = dccp_sk(oldsk);
-	struct dccp_sock *newdp = dccp_sk(newsk);
+	struct dccp_minisock *olddmsk = dccp_msk(oldsk);
+	struct dccp_minisock *newdmsk = dccp_msk(newsk);
 	struct dccp_opt_pend *opt;
 	int rc = 0;
 
-	INIT_LIST_HEAD(&newdp->dccps_options.dccpo_pending);
-	INIT_LIST_HEAD(&newdp->dccps_options.dccpo_conf);
+	INIT_LIST_HEAD(&newdmsk->dccpms_pending);
+	INIT_LIST_HEAD(&newdmsk->dccpms_conf);
 
-	list_for_each_entry(opt, &olddp->dccps_options.dccpo_pending,
-			    dccpop_node) {
+	list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) {
 		struct dccp_opt_pend *newopt;
 		/* copy the value of the option */
 		u8 *val = kmalloc(opt->dccpop_len, GFP_ATOMIC);
@@ -525,8 +519,7 @@ int dccp_feat_clone(struct sock *oldsk, struct sock *newsk)
 		/* insert the option */
 		memcpy(newopt, opt, sizeof(*newopt));
 		newopt->dccpop_val = val;
-		list_add_tail(&newopt->dccpop_node,
-			      &newdp->dccps_options.dccpo_pending);
+		list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending);
 
 		/* XXX what happens with backlogs and multiple connections at
 		 * once...
@@ -567,27 +560,27 @@ static int __dccp_feat_init(struct sock *sk, u8 type, u8 feat, u8 *val, u8 len)
 
 int dccp_feat_init(struct sock *sk)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	int rc;
 
-	INIT_LIST_HEAD(&dp->dccps_options.dccpo_pending);
-	INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
+	INIT_LIST_HEAD(&dmsk->dccpms_pending);
+	INIT_LIST_HEAD(&dmsk->dccpms_conf);
 
 	/* CCID L */
 	rc = __dccp_feat_init(sk, DCCPO_CHANGE_L, DCCPF_CCID,
-			      &dp->dccps_options.dccpo_tx_ccid, 1);
+			      &dmsk->dccpms_tx_ccid, 1);
 	if (rc)
 		goto out;
 
 	/* CCID R */
 	rc = __dccp_feat_init(sk, DCCPO_CHANGE_R, DCCPF_CCID,
-			      &dp->dccps_options.dccpo_rx_ccid, 1);
+			      &dmsk->dccpms_rx_ccid, 1);
 	if (rc)
 		goto out;
 
 	/* Ack ratio */
 	rc = __dccp_feat_init(sk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO,
-			      &dp->dccps_options.dccpo_ack_ratio, 1);
+			      &dmsk->dccpms_ack_ratio, 1);
 out:
 	return rc;
 }
diff --git a/net/dccp/input.c b/net/dccp/input.c
index f53e3fc77262..bfc53665516b 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -60,7 +60,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	if (dp->dccps_options.dccpo_send_ack_vector)
+	if (dccp_msk(sk)->dccpms_send_ack_vector)
 		dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
 					    DCCP_SKB_CB(skb)->dccpd_ack_seq);
 }
@@ -246,7 +246,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 		dccp_event_ack_recv(sk, skb);
 
-	if (dp->dccps_options.dccpo_send_ack_vector &&
+	if (dccp_msk(sk)->dccpms_send_ack_vector &&
 	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
 			    DCCP_SKB_CB(skb)->dccpd_seq,
 			    DCCP_ACKVEC_STATE_RECEIVED))
@@ -302,7 +302,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		if (dccp_parse_options(sk, skb))
 			goto out_invalid_packet;
 
-                if (dp->dccps_options.dccpo_send_ack_vector &&
+                if (dccp_msk(sk)->dccpms_send_ack_vector &&
                     dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
                                     DCCP_SKB_CB(skb)->dccpd_seq,
                                     DCCP_ACKVEC_STATE_RECEIVED))
@@ -486,7 +486,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 			dccp_event_ack_recv(sk, skb);
 
- 		if (dp->dccps_options.dccpo_send_ack_vector &&
+ 		if (dccp_msk(sk)->dccpms_send_ack_vector &&
 		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
  				    DCCP_SKB_CB(skb)->dccpd_seq,
  				    DCCP_ACKVEC_STATE_RECEIVED))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 5324fcacb34d..c0349e5b0551 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -107,6 +107,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		const struct dccp_request_sock *dreq = dccp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(sk);
 		struct dccp_sock *newdp = dccp_sk(newsk);
+		struct dccp_minisock *newdmsk = dccp_msk(newsk);
 
 		newdp->dccps_role	   = DCCP_ROLE_SERVER;
 		newdp->dccps_hc_rx_ackvec  = NULL;
@@ -118,7 +119,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		if (dccp_feat_clone(sk, newsk))
 			goto out_free;
 
-		if (newdp->dccps_options.dccpo_send_ack_vector) {
+		if (newdmsk->dccpms_send_ack_vector) {
 			newdp->dccps_hc_rx_ackvec =
 						dccp_ackvec_alloc(GFP_ATOMIC);
 			if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
@@ -126,10 +127,10 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		}
 
 		newdp->dccps_hc_rx_ccid =
-			    ccid_hc_rx_new(newdp->dccps_options.dccpo_rx_ccid,
+			    ccid_hc_rx_new(newdmsk->dccpms_rx_ccid,
 					   newsk, GFP_ATOMIC);
 		newdp->dccps_hc_tx_ccid =
-			    ccid_hc_tx_new(newdp->dccps_options.dccpo_tx_ccid,
+			    ccid_hc_tx_new(newdmsk->dccpms_tx_ccid,
 					   newsk, GFP_ATOMIC);
 		if (unlikely(newdp->dccps_hc_rx_ccid == NULL ||
 			     newdp->dccps_hc_tx_ccid == NULL)) {
@@ -153,7 +154,7 @@ out_free:
 		 */
 
 		/* See dccp_v4_conn_request */
-		newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd;
+		newdmsk->dccpms_sequence_window = req->rcv_wnd;
 
 		newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
 		dccp_update_gsr(newsk, dreq->dreq_isr);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 6e85550cc6f0..e9feb2a0c770 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -30,14 +30,14 @@ int dccp_feat_default_ack_ratio	      = DCCPF_INITIAL_ACK_RATIO;
 int dccp_feat_default_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
 int dccp_feat_default_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
 
-void dccp_options_init(struct dccp_options *dccpo)
+void dccp_minisock_init(struct dccp_minisock *dmsk)
 {
-	dccpo->dccpo_sequence_window = dccp_feat_default_sequence_window;
-	dccpo->dccpo_rx_ccid	     = dccp_feat_default_rx_ccid;
-	dccpo->dccpo_tx_ccid	     = dccp_feat_default_tx_ccid;
-	dccpo->dccpo_ack_ratio	     = dccp_feat_default_ack_ratio;
-	dccpo->dccpo_send_ack_vector = dccp_feat_default_send_ack_vector;
-	dccpo->dccpo_send_ndp_count  = dccp_feat_default_send_ndp_count;
+	dmsk->dccpms_sequence_window = dccp_feat_default_sequence_window;
+	dmsk->dccpms_rx_ccid	     = dccp_feat_default_rx_ccid;
+	dmsk->dccpms_tx_ccid	     = dccp_feat_default_tx_ccid;
+	dmsk->dccpms_ack_ratio	     = dccp_feat_default_ack_ratio;
+	dmsk->dccpms_send_ack_vector = dccp_feat_default_send_ack_vector;
+	dmsk->dccpms_send_ndp_count  = dccp_feat_default_send_ndp_count;
 }
 
 static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
@@ -151,7 +151,7 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
 			if (pkt_type == DCCP_PKT_DATA)
 				break;
 
-			if (dp->dccps_options.dccpo_send_ack_vector &&
+			if (dccp_msk(sk)->dccpms_send_ack_vector &&
 			    dccp_ackvec_parse(sk, skb, opt, value, len))
 				goto out_invalid_option;
 			break;
@@ -472,12 +472,12 @@ static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat,
 static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct dccp_opt_pend *opt, *next;
 	int change = 0;
 
 	/* confirm any options [NN opts] */
-	list_for_each_entry_safe(opt, next, &dp->dccps_options.dccpo_conf,
-				 dccpop_node) {
+	list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) {
 		dccp_insert_feat_opt(skb, opt->dccpop_type,
 				     opt->dccpop_feat, opt->dccpop_val,
 				     opt->dccpop_len);
@@ -486,11 +486,10 @@ static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
 			kfree(opt->dccpop_val);
 		kfree(opt);
 	}
-	INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
+	INIT_LIST_HEAD(&dmsk->dccpms_conf);
 
 	/* see which features we need to send */
-	list_for_each_entry(opt, &dp->dccps_options.dccpo_pending,
-			    dccpop_node) {
+	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
 		/* see if we need to send any confirm */
 		if (opt->dccpop_sc) {
 			dccp_insert_feat_opt(skb, opt->dccpop_type + 1,
@@ -536,15 +535,16 @@ static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb)
 int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
-	if (dp->dccps_options.dccpo_send_ndp_count &&
+	if (dmsk->dccpms_send_ndp_count &&
 	    dccp_insert_option_ndp(sk, skb))
 		return -1;
 
 	if (!dccp_packet_without_ack(skb)) {
-		if (dp->dccps_options.dccpo_send_ack_vector &&
+		if (dmsk->dccpms_send_ack_vector &&
 		    dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
 		    dccp_insert_option_ackvec(sk, skb))
 			return -1;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8a6d0a83047c..ede969074967 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -166,9 +166,10 @@ EXPORT_SYMBOL_GPL(dccp_unhash);
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_minisock *dmsk = dccp_msk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
-	dccp_options_init(&dp->dccps_options);
+	dccp_minisock_init(&dp->dccps_minisock);
 	do_gettimeofday(&dp->dccps_epoch);
 
 	/*
@@ -184,22 +185,20 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 		if (rc)
 			return rc;
 
-		if (dp->dccps_options.dccpo_send_ack_vector) {
+		if (dmsk->dccpms_send_ack_vector) {
 			dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
 			if (dp->dccps_hc_rx_ackvec == NULL)
 				return -ENOMEM;
 		}
-		dp->dccps_hc_rx_ccid =
-				ccid_hc_rx_new(dp->dccps_options.dccpo_rx_ccid,
-					       sk, GFP_KERNEL);
-		dp->dccps_hc_tx_ccid =
-				ccid_hc_tx_new(dp->dccps_options.dccpo_tx_ccid,
-					       sk, GFP_KERNEL);
+		dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
+						      sk, GFP_KERNEL);
+		dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
+						      sk, GFP_KERNEL);
 	    	if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
 			     dp->dccps_hc_tx_ccid == NULL)) {
 			ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
 			ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-			if (dp->dccps_options.dccpo_send_ack_vector) {
+			if (dmsk->dccpms_send_ack_vector) {
 				dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
 				dp->dccps_hc_rx_ackvec = NULL;
 			}
@@ -208,8 +207,8 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 		}
 	} else {
 		/* control socket doesn't need feat nego */
-		INIT_LIST_HEAD(&dp->dccps_options.dccpo_pending);
-		INIT_LIST_HEAD(&dp->dccps_options.dccpo_conf);
+		INIT_LIST_HEAD(&dmsk->dccpms_pending);
+		INIT_LIST_HEAD(&dmsk->dccpms_conf);
 	}
 
 	dccp_init_xmit_timers(sk);
@@ -247,7 +246,7 @@ int dccp_destroy_sock(struct sock *sk)
 	kfree(dp->dccps_service_list);
 	dp->dccps_service_list = NULL;
 
-	if (dp->dccps_options.dccpo_send_ack_vector) {
+	if (dccp_msk(sk)->dccpms_send_ack_vector) {
 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
-- 
cgit v1.2.3


From fdeabdefb227be9aa932f59a23ddb47e003e643e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Mon, 20 Mar 2006 22:58:21 -0800
Subject: [BRIDGE]: netfilter inline cleanup

Move nf_bridge_alloc from header file to the one place it is
used and optimize it.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_bridge.h | 27 ---------------------------
 net/bridge/br_netfilter.c        | 25 +++++++++++++++++++++----
 2 files changed, 21 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index de4d397865ce..a75b84bb9a88 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -47,22 +47,6 @@ enum nf_br_hook_priorities {
 #define BRNF_BRIDGED			0x08
 #define BRNF_NF_BRIDGE_PREROUTING	0x10
 
-static inline
-struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
-{
-	struct nf_bridge_info **nf_bridge = &(skb->nf_bridge);
-
-	if ((*nf_bridge = kmalloc(sizeof(**nf_bridge), GFP_ATOMIC)) != NULL) {
-		atomic_set(&(*nf_bridge)->use, 1);
-		(*nf_bridge)->mask = 0;
-		(*nf_bridge)->physindev = (*nf_bridge)->physoutdev = NULL;
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-		(*nf_bridge)->netoutdev = NULL;
-#endif
-	}
-
-	return *nf_bridge;
-}
 
 /* Only used in br_forward.c */
 static inline
@@ -77,17 +61,6 @@ void nf_bridge_maybe_copy_header(struct sk_buff *skb)
 	}
 }
 
-static inline
-void nf_bridge_save_header(struct sk_buff *skb)
-{
-        int header_size = 16;
-
-	if (skb->protocol == __constant_htons(ETH_P_8021Q))
-		header_size = 18;
-
-	memcpy(skb->nf_bridge->data, skb->data - header_size, header_size);
-}
-
 /* This is called by the IP fragmenting code and it ensures there is
  * enough room for the encapsulating header (if there is one). */
 static inline
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 6fc9ecc4eb39..f29450b788be 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -113,6 +113,25 @@ static inline struct net_device *bridge_parent(const struct net_device *dev)
 	return port ? port->br->dev : NULL;
 }
 
+static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
+{
+	skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
+	if (likely(skb->nf_bridge))
+		atomic_set(&(skb->nf_bridge->use), 1);
+
+	return skb->nf_bridge;
+}
+
+static inline void nf_bridge_save_header(struct sk_buff *skb)
+{
+        int header_size = 16;
+
+	if (skb->protocol == htons(ETH_P_8021Q))
+		header_size = 18;
+
+	memcpy(skb->nf_bridge->data, skb->data - header_size, header_size);
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -371,7 +390,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 {
 	struct ipv6hdr *hdr;
 	u32 pkt_len;
-	struct nf_bridge_info *nf_bridge;
 
 	if (skb->len < sizeof(struct ipv6hdr))
 		goto inhdr_error;
@@ -400,7 +418,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 		goto inhdr_error;
 
 	nf_bridge_put(skb->nf_bridge);
-	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
+	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
 	if (!setup_pre_routing(skb))
 		return NF_DROP;
@@ -428,7 +446,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 	struct iphdr *iph;
 	__u32 len;
 	struct sk_buff *skb = *pskb;
-	struct nf_bridge_info *nf_bridge;
 
 	if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb)) {
 #ifdef CONFIG_SYSCTL
@@ -485,7 +502,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 	}
 
 	nf_bridge_put(skb->nf_bridge);
-	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
+	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
 	if (!setup_pre_routing(skb))
 		return NF_DROP;
-- 
cgit v1.2.3


From 5e35941d990123f155b02d5663e51a24f816b6f3 Mon Sep 17 00:00:00 2001
From: Jing Min Zhao <zhaojignmin@hotmail.com>
Date: Mon, 20 Mar 2006 23:41:17 -0800
Subject: [NETFILTER]: Add H.323 conntrack/NAT helper

Signed-off-by: Jing Min Zhao <zhaojignmin@hotmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h        |    2 +
 include/linux/netfilter_ipv4/ip_conntrack_h323.h   |   30 +
 net/ipv4/netfilter/Kconfig                         |   26 +
 net/ipv4/netfilter/Makefile                        |    5 +
 net/ipv4/netfilter/ip_conntrack_helper_h323.c      | 1731 ++++++++++++++++++
 net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c |  870 +++++++++
 net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.h |   98 +
 .../netfilter/ip_conntrack_helper_h323_types.c     | 1926 ++++++++++++++++++++
 .../netfilter/ip_conntrack_helper_h323_types.h     |  938 ++++++++++
 net/ipv4/netfilter/ip_nat_helper_h323.c            |  605 ++++++
 10 files changed, 6231 insertions(+)
 create mode 100644 include/linux/netfilter_ipv4/ip_conntrack_h323.h
 create mode 100644 net/ipv4/netfilter/ip_conntrack_helper_h323.c
 create mode 100644 net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c
 create mode 100644 net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.h
 create mode 100644 net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
 create mode 100644 net/ipv4/netfilter/ip_conntrack_helper_h323_types.h
 create mode 100644 net/ipv4/netfilter/ip_nat_helper_h323.c

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 215765f043e6..f32d75c4f4cf 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -29,6 +29,7 @@ union ip_conntrack_expect_proto {
 };
 
 /* Add protocol helper include file here */
+#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
 #include <linux/netfilter_ipv4/ip_conntrack_pptp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
 #include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
@@ -37,6 +38,7 @@ union ip_conntrack_expect_proto {
 /* per conntrack: application helper private data */
 union ip_conntrack_help {
 	/* insert conntrack helper private data (master) here */
+	struct ip_ct_h323_master ct_h323_info;
 	struct ip_ct_pptp_master ct_pptp_info;
 	struct ip_ct_ftp_master ct_ftp_info;
 	struct ip_ct_irc_master ct_irc_info;
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_h323.h b/include/linux/netfilter_ipv4/ip_conntrack_h323.h
new file mode 100644
index 000000000000..0987cea53840
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ip_conntrack_h323.h
@@ -0,0 +1,30 @@
+#ifndef _IP_CONNTRACK_H323_H
+#define _IP_CONNTRACK_H323_H
+
+#ifdef __KERNEL__
+
+#define RAS_PORT 1719
+#define Q931_PORT 1720
+#define H323_RTP_CHANNEL_MAX 4	/* Audio, video, FAX and other */
+
+/* This structure exists only once per master */
+struct ip_ct_h323_master {
+
+	/* Original and NATed Q.931 or H.245 signal ports */
+	u_int16_t sig_port[IP_CT_DIR_MAX];
+
+	/* Original and NATed RTP ports */
+	u_int16_t rtp_port[H323_RTP_CHANNEL_MAX][IP_CT_DIR_MAX];
+
+	union {
+		/* RAS connection timeout */
+		u_int32_t timeout;
+
+		/* Next TPKT length (for separate TPKT header and data) */
+		u_int16_t tpkt_len[IP_CT_DIR_MAX];
+	};
+};
+
+#endif
+
+#endif
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 933ee7a9efdf..882b842c25d4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -168,6 +168,26 @@ config IP_NF_PPTP
 	  If you want to compile it as a module, say M here and read
 	  Documentation/modules.txt.  If unsure, say `N'.
 
+config IP_NF_H323
+	tristate  'H.323 protocol support'
+	depends on IP_NF_CONNTRACK
+	help
+	  H.323 is a VoIP signalling protocol from ITU-T. As one of the most
+	  important VoIP protocols, it is widely used by voice hardware and
+	  software including voice gateways, IP phones, Netmeeting, OpenPhone,
+	  Gnomemeeting, etc.
+
+	  With this module you can support H.323 on a connection tracking/NAT
+	  firewall.
+
+	  This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP
+	  and T.120 based data and applications including audio, video, FAX,
+	  chat, whiteboard, file transfer, etc. For more information, please
+	  see http://nath323.sourceforge.net/.
+
+	  If you want to compile it as a module, say 'M' here and read
+	  Documentation/modules.txt.  If unsure, say 'N'.
+
 config IP_NF_QUEUE
 	tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
 	help
@@ -484,6 +504,12 @@ config IP_NF_NAT_PPTP
 	default IP_NF_NAT if IP_NF_PPTP=y
 	default m if IP_NF_PPTP=m
 
+config IP_NF_NAT_H323
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_H323=y
+	default m if IP_NF_H323=m
+
 # mangle + specific targets
 config IP_NF_MANGLE
 	tristate "Packet mangling"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3fe80924ac54..f2cd9a6c5b91 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,6 +10,9 @@ iptable_nat-objs	:= ip_nat_rule.o ip_nat_standalone.o
 ip_conntrack_pptp-objs	:= ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o
 ip_nat_pptp-objs	:= ip_nat_helper_pptp.o ip_nat_proto_gre.o
 
+ip_conntrack_h323-objs := ip_conntrack_helper_h323.o ip_conntrack_helper_h323_asn1.o
+ip_nat_h323-objs := ip_nat_helper_h323.o
+
 # connection tracking
 obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
 obj-$(CONFIG_IP_NF_NAT) += ip_nat.o
@@ -22,6 +25,7 @@ obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
 obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
 
 # connection tracking helpers
+obj-$(CONFIG_IP_NF_H323) += ip_conntrack_h323.o
 obj-$(CONFIG_IP_NF_PPTP) += ip_conntrack_pptp.o
 obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
@@ -30,6 +34,7 @@ obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
 obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
 
 # NAT helpers 
+obj-$(CONFIG_IP_NF_NAT_H323) += ip_nat_h323.o
 obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o
 obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
 obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
new file mode 100644
index 000000000000..20da6730b860
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c
@@ -0,0 +1,1731 @@
+/*
+ * H.323 connection tracking helper
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 connection tracking module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * For more information, please see http://nath323.sourceforge.net/
+ *
+ * Changes:
+ * 	2006-02-01 - initial version 0.1
+ *
+ *	2006-02-20 - version 0.2
+ *	  1. Changed source format to follow kernel conventions
+ *	  2. Deleted some unnecessary structures
+ *	  3. Minor fixes
+ *
+ * 	2006-03-10 - version 0.3
+ * 	  1. Added support for multiple TPKTs in one packet (suggested by
+ * 	     Patrick McHardy)
+ * 	  2. Avoid excessive stack usage (based on Patrick McHardy's patch)
+ * 	  3. Added support for non-linear skb (based on Patrick McHardy's patch)
+ * 	  4. Fixed missing H.245 module owner (Patrick McHardy)
+ * 	  5. Avoid long RAS expectation chains (Patrick McHardy)
+ * 	  6. Fixed incorrect __exit attribute (Patrick McHardy)
+ * 	  7. Eliminated unnecessary return code
+ * 	  8. Fixed incorrect use of NAT data from conntrack code (suggested by
+ * 	     Patrick McHardy)
+ * 	  9. Fixed TTL calculation error in RCF
+ * 	  10. Added TTL support in RRQ
+ * 	  11. Better support for separate TPKT header and data
+ *
+ * 	2006-03-15 - version 0.4
+ * 	  1. Added support for T.120 channels
+ * 	  2. Added parameter gkrouted_only (suggested by Patrick McHardy)
+ * 	  3. Splitted ASN.1 code and data (suggested by Patrick McHardy)
+ * 	  4. Sort ASN.1 data to avoid forwarding declarations (suggested by
+ * 	     Patrick McHardy)
+ * 	  5. Reset next TPKT data length in get_tpkt_data()
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
+#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
+#include <linux/moduleparam.h>
+
+#include "ip_conntrack_helper_h323_asn1.h"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Parameters */
+static int gkrouted_only = 1;
+module_param(gkrouted_only, int, 0600);
+MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
+
+/* Hooks for NAT */
+int (*set_h245_addr_hook) (struct sk_buff ** pskb,
+			   unsigned char **data, int dataoff,
+			   H245_TransportAddress * addr,
+			   u_int32_t ip, u_int16_t port);
+int (*set_h225_addr_hook) (struct sk_buff ** pskb,
+			   unsigned char **data, int dataoff,
+			   TransportAddress * addr,
+			   u_int32_t ip, u_int16_t port);
+int (*set_sig_addr_hook) (struct sk_buff ** pskb,
+			  struct ip_conntrack * ct,
+			  enum ip_conntrack_info ctinfo,
+			  unsigned char **data,
+			  TransportAddress * addr, int count);
+int (*set_ras_addr_hook) (struct sk_buff ** pskb,
+			  struct ip_conntrack * ct,
+			  enum ip_conntrack_info ctinfo,
+			  unsigned char **data,
+			  TransportAddress * addr, int count);
+int (*nat_rtp_rtcp_hook) (struct sk_buff ** pskb,
+			  struct ip_conntrack * ct,
+			  enum ip_conntrack_info ctinfo,
+			  unsigned char **data, int dataoff,
+			  H245_TransportAddress * addr,
+			  u_int16_t port, u_int16_t rtp_port,
+			  struct ip_conntrack_expect * rtp_exp,
+			  struct ip_conntrack_expect * rtcp_exp);
+int (*nat_t120_hook) (struct sk_buff ** pskb,
+		      struct ip_conntrack * ct,
+		      enum ip_conntrack_info ctinfo,
+		      unsigned char **data, int dataoff,
+		      H245_TransportAddress * addr, u_int16_t port,
+		      struct ip_conntrack_expect * exp);
+int (*nat_h245_hook) (struct sk_buff ** pskb,
+		      struct ip_conntrack * ct,
+		      enum ip_conntrack_info ctinfo,
+		      unsigned char **data, int dataoff,
+		      TransportAddress * addr, u_int16_t port,
+		      struct ip_conntrack_expect * exp);
+int (*nat_q931_hook) (struct sk_buff ** pskb,
+		      struct ip_conntrack * ct,
+		      enum ip_conntrack_info ctinfo,
+		      unsigned char **data, TransportAddress * addr, int idx,
+		      u_int16_t port, struct ip_conntrack_expect * exp);
+
+
+static DEFINE_SPINLOCK(ip_h323_lock);
+static char *h323_buffer;
+
+/****************************************************************************/
+static int get_tpkt_data(struct sk_buff **pskb, struct ip_conntrack *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned char **data, int *datalen, int *dataoff)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	struct tcphdr _tcph, *th;
+	int tcpdatalen;
+	int tcpdataoff;
+	unsigned char *tpkt;
+	int tpktlen;
+	int tpktoff;
+
+	/* Get TCP header */
+	th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+
+	/* Get TCP data offset */
+	tcpdataoff = (*pskb)->nh.iph->ihl * 4 + th->doff * 4;
+
+	/* Get TCP data length */
+	tcpdatalen = (*pskb)->len - tcpdataoff;
+	if (tcpdatalen <= 0)	/* No TCP data */
+		goto clear_out;
+
+	if (*data == NULL) {	/* first TPKT */
+		/* Get first TPKT pointer */
+		tpkt = skb_header_pointer(*pskb, tcpdataoff, tcpdatalen,
+					  h323_buffer);
+		BUG_ON(tpkt == NULL);
+
+		/* Validate TPKT identifier */
+		if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
+			/* Netmeeting sends TPKT header and data separately */
+			if (info->tpkt_len[dir] > 0) {
+				DEBUGP("ip_ct_h323: previous packet "
+				       "indicated separate TPKT data of %hu "
+				       "bytes\n", info->tpkt_len[dir]);
+				if (info->tpkt_len[dir] <= tcpdatalen) {
+					/* Yes, there was a TPKT header
+					 * received */
+					*data = tpkt;
+					*datalen = info->tpkt_len[dir];
+					*dataoff = 0;
+					goto out;
+				}
+
+				/* Fragmented TPKT */
+				if (net_ratelimit())
+					printk("ip_ct_h323: "
+					       "fragmented TPKT\n");
+				goto clear_out;
+			}
+
+			/* It is not even a TPKT */
+			return 0;
+		}
+		tpktoff = 0;
+	} else {		/* Next TPKT */
+		tpktoff = *dataoff + *datalen;
+		tcpdatalen -= tpktoff;
+		if (tcpdatalen <= 4)	/* No more TPKT */
+			goto clear_out;
+		tpkt = *data + *datalen;
+
+		/* Validate TPKT identifier */
+		if (tpkt[0] != 0x03 || tpkt[1] != 0)
+			goto clear_out;
+	}
+
+	/* Validate TPKT length */
+	tpktlen = tpkt[2] * 256 + tpkt[3];
+	if (tpktlen > tcpdatalen) {
+		if (tcpdatalen == 4) {	/* Separate TPKT header */
+			/* Netmeeting sends TPKT header and data separately */
+			DEBUGP("ip_ct_h323: separate TPKT header indicates "
+			       "there will be TPKT data of %hu bytes\n",
+			       tpktlen - 4);
+			info->tpkt_len[dir] = tpktlen - 4;
+			return 0;
+		}
+
+		if (net_ratelimit())
+			printk("ip_ct_h323: incomplete TPKT (fragmented?)\n");
+		goto clear_out;
+	}
+
+	/* This is the encapsulated data */
+	*data = tpkt + 4;
+	*datalen = tpktlen - 4;
+	*dataoff = tpktoff + 4;
+
+      out:
+	/* Clear TPKT length */
+	info->tpkt_len[dir] = 0;
+	return 1;
+
+      clear_out:
+	info->tpkt_len[dir] = 0;
+	return 0;
+}
+
+/****************************************************************************/
+int get_h245_addr(unsigned char *data, H245_TransportAddress * addr,
+		  u_int32_t * ip, u_int16_t * port)
+{
+	unsigned char *p;
+
+	if (addr->choice != eH245_TransportAddress_unicastAddress ||
+	    addr->unicastAddress.choice != eUnicastAddress_iPAddress)
+		return 0;
+
+	p = data + addr->unicastAddress.iPAddress.network;
+	*ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3]));
+	*port = (p[4] << 8) | (p[5]);
+
+	return 1;
+}
+
+/****************************************************************************/
+static int expect_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned char **data, int dataoff,
+			   H245_TransportAddress * addr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	u_int16_t rtp_port;
+	struct ip_conntrack_expect *rtp_exp;
+	struct ip_conntrack_expect *rtcp_exp;
+
+	/* Read RTP or RTCP address */
+	if (!get_h245_addr(*data, addr, &ip, &port) ||
+	    ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
+		return 0;
+
+	/* RTP port is even */
+	rtp_port = port & (~1);
+
+	/* Create expect for RTP */
+	if ((rtp_exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	rtp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	rtp_exp->tuple.src.u.udp.port = 0;
+	rtp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	rtp_exp->tuple.dst.u.udp.port = htons(rtp_port);
+	rtp_exp->tuple.dst.protonum = IPPROTO_UDP;
+	rtp_exp->mask.src.ip = 0xFFFFFFFF;
+	rtp_exp->mask.src.u.udp.port = 0;
+	rtp_exp->mask.dst.ip = 0xFFFFFFFF;
+	rtp_exp->mask.dst.u.udp.port = 0xFFFF;
+	rtp_exp->mask.dst.protonum = 0xFF;
+	rtp_exp->flags = 0;
+
+	/* Create expect for RTCP */
+	if ((rtcp_exp = ip_conntrack_expect_alloc(ct)) == NULL) {
+		ip_conntrack_expect_put(rtp_exp);
+		return -1;
+	}
+	rtcp_exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	rtcp_exp->tuple.src.u.udp.port = 0;
+	rtcp_exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	rtcp_exp->tuple.dst.u.udp.port = htons(rtp_port + 1);
+	rtcp_exp->tuple.dst.protonum = IPPROTO_UDP;
+	rtcp_exp->mask.src.ip = 0xFFFFFFFF;
+	rtcp_exp->mask.src.u.udp.port = 0;
+	rtcp_exp->mask.dst.ip = 0xFFFFFFFF;
+	rtcp_exp->mask.dst.u.udp.port = 0xFFFF;
+	rtcp_exp->mask.dst.protonum = 0xFF;
+	rtcp_exp->flags = 0;
+
+	if (ct->tuplehash[dir].tuple.src.ip !=
+	    ct->tuplehash[!dir].tuple.dst.ip && nat_rtp_rtcp_hook) {
+		/* NAT needed */
+		ret = nat_rtp_rtcp_hook(pskb, ct, ctinfo, data, dataoff,
+					addr, port, rtp_port, rtp_exp,
+					rtcp_exp);
+	} else {		/* Conntrack only */
+		rtp_exp->expectfn = NULL;
+		rtcp_exp->expectfn = NULL;
+
+		if (ip_conntrack_expect_related(rtp_exp) == 0) {
+			if (ip_conntrack_expect_related(rtcp_exp) == 0) {
+				DEBUGP("ip_ct_h323: expect RTP "
+				       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+				       NIPQUAD(rtp_exp->tuple.src.ip),
+				       ntohs(rtp_exp->tuple.src.u.udp.port),
+				       NIPQUAD(rtp_exp->tuple.dst.ip),
+				       ntohs(rtp_exp->tuple.dst.u.udp.port));
+				DEBUGP("ip_ct_h323: expect RTCP "
+				       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+				       NIPQUAD(rtcp_exp->tuple.src.ip),
+				       ntohs(rtcp_exp->tuple.src.u.udp.port),
+				       NIPQUAD(rtcp_exp->tuple.dst.ip),
+				       ntohs(rtcp_exp->tuple.dst.u.udp.port));
+			} else {
+				ip_conntrack_unexpect_related(rtp_exp);
+				ret = -1;
+			}
+		} else
+			ret = -1;
+	}
+
+	ip_conntrack_expect_put(rtp_exp);
+	ip_conntrack_expect_put(rtcp_exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+static int expect_t120(struct sk_buff **pskb,
+		       struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, int dataoff,
+		       H245_TransportAddress * addr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp = NULL;
+
+	/* Read T.120 address */
+	if (!get_h245_addr(*data, addr, &ip, &port) ||
+	    ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
+		return 0;
+
+	/* Create expect for T.120 connections */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = IP_CT_EXPECT_PERMANENT;	/* Accept multiple channels */
+
+	if (ct->tuplehash[dir].tuple.src.ip !=
+	    ct->tuplehash[!dir].tuple.dst.ip && nat_t120_hook) {
+		/* NAT needed */
+		ret = nat_t120_hook(pskb, ct, ctinfo, data, dataoff, addr,
+				    port, exp);
+	} else {		/* Conntrack only */
+		exp->expectfn = NULL;
+		if (ip_conntrack_expect_related(exp) == 0) {
+			DEBUGP("ip_ct_h323: expect T.120 "
+			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+			       NIPQUAD(exp->tuple.src.ip),
+			       ntohs(exp->tuple.src.u.tcp.port),
+			       NIPQUAD(exp->tuple.dst.ip),
+			       ntohs(exp->tuple.dst.u.tcp.port));
+		} else
+			ret = -1;
+	}
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+static int process_h245_channel(struct sk_buff **pskb,
+				struct ip_conntrack *ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned char **data, int dataoff,
+				H2250LogicalChannelParameters * channel)
+{
+	int ret;
+
+	if (channel->options & eH2250LogicalChannelParameters_mediaChannel) {
+		/* RTP */
+		ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
+				      &channel->mediaChannel);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (channel->
+	    options & eH2250LogicalChannelParameters_mediaControlChannel) {
+		/* RTCP */
+		ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
+				      &channel->mediaControlChannel);
+		if (ret < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_olc(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, int dataoff,
+		       OpenLogicalChannel * olc)
+{
+	int ret;
+
+	DEBUGP("ip_ct_h323: OpenLogicalChannel\n");
+
+	if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
+	    eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
+	{
+		ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff,
+					   &olc->
+					   forwardLogicalChannelParameters.
+					   multiplexParameters.
+					   h2250LogicalChannelParameters);
+		if (ret < 0)
+			return -1;
+	}
+
+	if ((olc->options &
+	     eOpenLogicalChannel_reverseLogicalChannelParameters) &&
+	    (olc->reverseLogicalChannelParameters.options &
+	     eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters)
+	    && (olc->reverseLogicalChannelParameters.multiplexParameters.
+		choice ==
+		eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
+	{
+		ret =
+		    process_h245_channel(pskb, ct, ctinfo, data, dataoff,
+					 &olc->
+					 reverseLogicalChannelParameters.
+					 multiplexParameters.
+					 h2250LogicalChannelParameters);
+		if (ret < 0)
+			return -1;
+	}
+
+	if ((olc->options & eOpenLogicalChannel_separateStack) &&
+	    olc->forwardLogicalChannelParameters.dataType.choice ==
+	    eDataType_data &&
+	    olc->forwardLogicalChannelParameters.dataType.data.application.
+	    choice == eDataApplicationCapability_application_t120 &&
+	    olc->forwardLogicalChannelParameters.dataType.data.application.
+	    t120.choice == eDataProtocolCapability_separateLANStack &&
+	    olc->separateStack.networkAddress.choice ==
+	    eNetworkAccessParameters_networkAddress_localAreaAddress) {
+		ret = expect_t120(pskb, ct, ctinfo, data, dataoff,
+				  &olc->separateStack.networkAddress.
+				  localAreaAddress);
+		if (ret < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_olca(struct sk_buff **pskb, struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data, int dataoff,
+			OpenLogicalChannelAck * olca)
+{
+	H2250LogicalChannelAckParameters *ack;
+	int ret;
+
+	DEBUGP("ip_ct_h323: OpenLogicalChannelAck\n");
+
+	if ((olca->options &
+	     eOpenLogicalChannelAck_reverseLogicalChannelParameters) &&
+	    (olca->reverseLogicalChannelParameters.options &
+	     eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters)
+	    && (olca->reverseLogicalChannelParameters.multiplexParameters.
+		choice ==
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters))
+	{
+		ret = process_h245_channel(pskb, ct, ctinfo, data, dataoff,
+					   &olca->
+					   reverseLogicalChannelParameters.
+					   multiplexParameters.
+					   h2250LogicalChannelParameters);
+		if (ret < 0)
+			return -1;
+	}
+
+	if ((olca->options &
+	     eOpenLogicalChannelAck_forwardMultiplexAckParameters) &&
+	    (olca->forwardMultiplexAckParameters.choice ==
+	     eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters))
+	{
+		ack = &olca->forwardMultiplexAckParameters.
+		    h2250LogicalChannelAckParameters;
+		if (ack->options &
+		    eH2250LogicalChannelAckParameters_mediaChannel) {
+			/* RTP */
+			ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
+					      &ack->mediaChannel);
+			if (ret < 0)
+				return -1;
+		}
+
+		if (ack->options &
+		    eH2250LogicalChannelAckParameters_mediaControlChannel) {
+			/* RTCP */
+			ret = expect_rtp_rtcp(pskb, ct, ctinfo, data, dataoff,
+					      &ack->mediaControlChannel);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data, int dataoff,
+			MultimediaSystemControlMessage * mscm)
+{
+	switch (mscm->choice) {
+	case eMultimediaSystemControlMessage_request:
+		if (mscm->request.choice ==
+		    eRequestMessage_openLogicalChannel) {
+			return process_olc(pskb, ct, ctinfo, data, dataoff,
+					   &mscm->request.openLogicalChannel);
+		}
+		DEBUGP("ip_ct_h323: H.245 Request %d\n",
+		       mscm->request.choice);
+		break;
+	case eMultimediaSystemControlMessage_response:
+		if (mscm->response.choice ==
+		    eResponseMessage_openLogicalChannelAck) {
+			return process_olca(pskb, ct, ctinfo, data, dataoff,
+					    &mscm->response.
+					    openLogicalChannelAck);
+		}
+		DEBUGP("ip_ct_h323: H.245 Response %d\n",
+		       mscm->response.choice);
+		break;
+	default:
+		DEBUGP("ip_ct_h323: H.245 signal %d\n", mscm->choice);
+		break;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int h245_help(struct sk_buff **pskb, struct ip_conntrack *ct,
+		     enum ip_conntrack_info ctinfo)
+{
+	static MultimediaSystemControlMessage mscm;
+	unsigned char *data = NULL;
+	int datalen;
+	int dataoff;
+	int ret;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED
+	    && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+		return NF_ACCEPT;
+	}
+	DEBUGP("ip_ct_h245: skblen = %u\n", (*pskb)->len);
+
+	spin_lock_bh(&ip_h323_lock);
+
+	/* Process each TPKT */
+	while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) {
+		DEBUGP("ip_ct_h245: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
+		       NIPQUAD((*pskb)->nh.iph->saddr),
+		       NIPQUAD((*pskb)->nh.iph->daddr), datalen);
+
+		/* Decode H.245 signal */
+		ret = DecodeMultimediaSystemControlMessage(data, datalen,
+							   &mscm);
+		if (ret < 0) {
+			if (net_ratelimit())
+				printk("ip_ct_h245: decoding error: %s\n",
+				       ret == H323_ERROR_BOUND ?
+				       "out of bound" : "out of range");
+			/* We don't drop when decoding error */
+			break;
+		}
+
+		/* Process H.245 signal */
+		if (process_h245(pskb, ct, ctinfo, &data, dataoff, &mscm) < 0)
+			goto drop;
+	}
+
+	spin_unlock_bh(&ip_h323_lock);
+	return NF_ACCEPT;
+
+      drop:
+	spin_unlock_bh(&ip_h323_lock);
+	if (net_ratelimit())
+		printk("ip_ct_h245: packet dropped\n");
+	return NF_DROP;
+}
+
+/****************************************************************************/
+static struct ip_conntrack_helper ip_conntrack_helper_h245 = {
+	.name = "H.245",
+	.me = THIS_MODULE,
+	.max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */ ,
+	.timeout = 240,
+	.tuple = {.dst = {.protonum = IPPROTO_TCP}},
+	.mask = {.src = {.u = {0xFFFF}},
+		 .dst = {.protonum = 0xFF}},
+	.help = h245_help
+};
+
+/****************************************************************************/
+void ip_conntrack_h245_expect(struct ip_conntrack *new,
+			      struct ip_conntrack_expect *this)
+{
+	write_lock_bh(&ip_conntrack_lock);
+	new->helper = &ip_conntrack_helper_h245;
+	write_unlock_bh(&ip_conntrack_lock);
+}
+
+/****************************************************************************/
+static int get_h225_addr(unsigned char *data, TransportAddress * addr,
+			 u_int32_t * ip, u_int16_t * port)
+{
+	unsigned char *p;
+
+	if (addr->choice != eTransportAddress_ipAddress)
+		return 0;
+
+	p = data + addr->ipAddress.ip;
+	*ip = htonl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | (p[3]));
+	*port = (p[4] << 8) | (p[5]);
+
+	return 1;
+}
+
+/****************************************************************************/
+static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, int dataoff,
+		       TransportAddress * addr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp = NULL;
+
+	/* Read h245Address */
+	if (!get_h225_addr(*data, addr, &ip, &port) ||
+	    ip != ct->tuplehash[dir].tuple.src.ip || port == 0)
+		return 0;
+
+	/* Create expect for h245 connection */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = 0;
+
+	if (ct->tuplehash[dir].tuple.src.ip !=
+	    ct->tuplehash[!dir].tuple.dst.ip && nat_h245_hook) {
+		/* NAT needed */
+		ret = nat_h245_hook(pskb, ct, ctinfo, data, dataoff, addr,
+				    port, exp);
+	} else {		/* Conntrack only */
+		exp->expectfn = ip_conntrack_h245_expect;
+
+		if (ip_conntrack_expect_related(exp) == 0) {
+			DEBUGP("ip_ct_q931: expect H.245 "
+			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+			       NIPQUAD(exp->tuple.src.ip),
+			       ntohs(exp->tuple.src.u.tcp.port),
+			       NIPQUAD(exp->tuple.dst.ip),
+			       ntohs(exp->tuple.dst.u.tcp.port));
+		} else
+			ret = -1;
+	}
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned char **data, int dataoff,
+			 Setup_UUIE * setup)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret;
+	int i;
+	u_int32_t ip;
+	u_int16_t port;
+
+	DEBUGP("ip_ct_q931: Setup\n");
+
+	if (setup->options & eSetup_UUIE_h245Address) {
+		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
+				  &setup->h245Address);
+		if (ret < 0)
+			return -1;
+	}
+
+	if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
+	    (set_h225_addr_hook) &&
+	    get_h225_addr(*data, &setup->destCallSignalAddress, &ip, &port) &&
+	    ip != ct->tuplehash[!dir].tuple.src.ip) {
+		DEBUGP("ip_ct_q931: set destCallSignalAddress "
+		       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		       NIPQUAD(ip), port,
+		       NIPQUAD(ct->tuplehash[!dir].tuple.src.ip),
+		       ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
+		ret = set_h225_addr_hook(pskb, data, dataoff,
+					 &setup->destCallSignalAddress,
+					 ct->tuplehash[!dir].tuple.src.ip,
+					 ntohs(ct->tuplehash[!dir].tuple.src.
+					       u.tcp.port));
+		if (ret < 0)
+			return -1;
+	}
+
+	if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
+	    (set_h225_addr_hook) &&
+	    get_h225_addr(*data, &setup->sourceCallSignalAddress, &ip, &port)
+	    && ip != ct->tuplehash[!dir].tuple.dst.ip) {
+		DEBUGP("ip_ct_q931: set sourceCallSignalAddress "
+		       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		       NIPQUAD(ip), port,
+		       NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
+		       ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
+		ret = set_h225_addr_hook(pskb, data, dataoff,
+					 &setup->sourceCallSignalAddress,
+					 ct->tuplehash[!dir].tuple.dst.ip,
+					 ntohs(ct->tuplehash[!dir].tuple.dst.
+					       u.tcp.port));
+		if (ret < 0)
+			return -1;
+	}
+
+	if (setup->options & eSetup_UUIE_fastStart) {
+		for (i = 0; i < setup->fastStart.count; i++) {
+			ret = process_olc(pskb, ct, ctinfo, data, dataoff,
+					  &setup->fastStart.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_callproceeding(struct sk_buff **pskb,
+				  struct ip_conntrack *ct,
+				  enum ip_conntrack_info ctinfo,
+				  unsigned char **data, int dataoff,
+				  CallProceeding_UUIE * callproc)
+{
+	int ret;
+	int i;
+
+	DEBUGP("ip_ct_q931: CallProceeding\n");
+
+	if (callproc->options & eCallProceeding_UUIE_h245Address) {
+		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
+				  &callproc->h245Address);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (callproc->options & eCallProceeding_UUIE_fastStart) {
+		for (i = 0; i < callproc->fastStart.count; i++) {
+			ret = process_olc(pskb, ct, ctinfo, data, dataoff,
+					  &callproc->fastStart.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_connect(struct sk_buff **pskb, struct ip_conntrack *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned char **data, int dataoff,
+			   Connect_UUIE * connect)
+{
+	int ret;
+	int i;
+
+	DEBUGP("ip_ct_q931: Connect\n");
+
+	if (connect->options & eConnect_UUIE_h245Address) {
+		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
+				  &connect->h245Address);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (connect->options & eConnect_UUIE_fastStart) {
+		for (i = 0; i < connect->fastStart.count; i++) {
+			ret = process_olc(pskb, ct, ctinfo, data, dataoff,
+					  &connect->fastStart.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_alerting(struct sk_buff **pskb, struct ip_conntrack *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned char **data, int dataoff,
+			    Alerting_UUIE * alert)
+{
+	int ret;
+	int i;
+
+	DEBUGP("ip_ct_q931: Alerting\n");
+
+	if (alert->options & eAlerting_UUIE_h245Address) {
+		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
+				  &alert->h245Address);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (alert->options & eAlerting_UUIE_fastStart) {
+		for (i = 0; i < alert->fastStart.count; i++) {
+			ret = process_olc(pskb, ct, ctinfo, data, dataoff,
+					  &alert->fastStart.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_information(struct sk_buff **pskb,
+			       struct ip_conntrack *ct,
+			       enum ip_conntrack_info ctinfo,
+			       unsigned char **data, int dataoff,
+			       Information_UUIE * info)
+{
+	int ret;
+	int i;
+
+	DEBUGP("ip_ct_q931: Information\n");
+
+	if (info->options & eInformation_UUIE_fastStart) {
+		for (i = 0; i < info->fastStart.count; i++) {
+			ret = process_olc(pskb, ct, ctinfo, data, dataoff,
+					  &info->fastStart.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned char **data, int dataoff,
+			    Facility_UUIE * facility)
+{
+	int ret;
+	int i;
+
+	DEBUGP("ip_ct_q931: Facility\n");
+
+	if (facility->options & eFacility_UUIE_h245Address) {
+		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
+				  &facility->h245Address);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (facility->options & eFacility_UUIE_fastStart) {
+		for (i = 0; i < facility->fastStart.count; i++) {
+			ret = process_olc(pskb, ct, ctinfo, data, dataoff,
+					  &facility->fastStart.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_progress(struct sk_buff **pskb, struct ip_conntrack *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned char **data, int dataoff,
+			    Progress_UUIE * progress)
+{
+	int ret;
+	int i;
+
+	DEBUGP("ip_ct_q931: Progress\n");
+
+	if (progress->options & eProgress_UUIE_h245Address) {
+		ret = expect_h245(pskb, ct, ctinfo, data, dataoff,
+				  &progress->h245Address);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (progress->options & eProgress_UUIE_fastStart) {
+		for (i = 0; i < progress->fastStart.count; i++) {
+			ret = process_olc(pskb, ct, ctinfo, data, dataoff,
+					  &progress->fastStart.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data, int dataoff, Q931 * q931)
+{
+	H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;
+	int i;
+	int ret = 0;
+
+	switch (pdu->h323_message_body.choice) {
+	case eH323_UU_PDU_h323_message_body_setup:
+		ret = process_setup(pskb, ct, ctinfo, data, dataoff,
+				    &pdu->h323_message_body.setup);
+		break;
+	case eH323_UU_PDU_h323_message_body_callProceeding:
+		ret = process_callproceeding(pskb, ct, ctinfo, data, dataoff,
+					     &pdu->h323_message_body.
+					     callProceeding);
+		break;
+	case eH323_UU_PDU_h323_message_body_connect:
+		ret = process_connect(pskb, ct, ctinfo, data, dataoff,
+				      &pdu->h323_message_body.connect);
+		break;
+	case eH323_UU_PDU_h323_message_body_alerting:
+		ret = process_alerting(pskb, ct, ctinfo, data, dataoff,
+				       &pdu->h323_message_body.alerting);
+		break;
+	case eH323_UU_PDU_h323_message_body_information:
+		ret = process_information(pskb, ct, ctinfo, data, dataoff,
+					  &pdu->h323_message_body.
+					  information);
+		break;
+	case eH323_UU_PDU_h323_message_body_facility:
+		ret = process_facility(pskb, ct, ctinfo, data, dataoff,
+				       &pdu->h323_message_body.facility);
+		break;
+	case eH323_UU_PDU_h323_message_body_progress:
+		ret = process_progress(pskb, ct, ctinfo, data, dataoff,
+				       &pdu->h323_message_body.progress);
+		break;
+	default:
+		DEBUGP("ip_ct_q931: Q.931 signal %d\n",
+		       pdu->h323_message_body.choice);
+		break;
+	}
+
+	if (ret < 0)
+		return -1;
+
+	if (pdu->options & eH323_UU_PDU_h245Control) {
+		for (i = 0; i < pdu->h245Control.count; i++) {
+			ret = process_h245(pskb, ct, ctinfo, data, dataoff,
+					   &pdu->h245Control.item[i]);
+			if (ret < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int q931_help(struct sk_buff **pskb, struct ip_conntrack *ct,
+		     enum ip_conntrack_info ctinfo)
+{
+	static Q931 q931;
+	unsigned char *data = NULL;
+	int datalen;
+	int dataoff;
+	int ret;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED
+	    && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+		return NF_ACCEPT;
+	}
+	DEBUGP("ip_ct_q931: skblen = %u\n", (*pskb)->len);
+
+	spin_lock_bh(&ip_h323_lock);
+
+	/* Process each TPKT */
+	while (get_tpkt_data(pskb, ct, ctinfo, &data, &datalen, &dataoff)) {
+		DEBUGP("ip_ct_q931: TPKT %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
+		       NIPQUAD((*pskb)->nh.iph->saddr),
+		       NIPQUAD((*pskb)->nh.iph->daddr), datalen);
+
+		/* Decode Q.931 signal */
+		ret = DecodeQ931(data, datalen, &q931);
+		if (ret < 0) {
+			if (net_ratelimit())
+				printk("ip_ct_q931: decoding error: %s\n",
+				       ret == H323_ERROR_BOUND ?
+				       "out of bound" : "out of range");
+			/* We don't drop when decoding error */
+			break;
+		}
+
+		/* Process Q.931 signal */
+		if (process_q931(pskb, ct, ctinfo, &data, dataoff, &q931) < 0)
+			goto drop;
+	}
+
+	spin_unlock_bh(&ip_h323_lock);
+	return NF_ACCEPT;
+
+      drop:
+	spin_unlock_bh(&ip_h323_lock);
+	if (net_ratelimit())
+		printk("ip_ct_q931: packet dropped\n");
+	return NF_DROP;
+}
+
+/****************************************************************************/
+static struct ip_conntrack_helper ip_conntrack_helper_q931 = {
+	.name = "Q.931",
+	.me = THIS_MODULE,
+	.max_expected = H323_RTP_CHANNEL_MAX * 4 + 4 /* T.120 and H.245 */ ,
+	.timeout = 240,
+	.tuple = {.src = {.u = {__constant_htons(Q931_PORT)}},
+		  .dst = {.protonum = IPPROTO_TCP}},
+	.mask = {.src = {.u = {0xFFFF}},
+		 .dst = {.protonum = 0xFF}},
+	.help = q931_help
+};
+
+/****************************************************************************/
+void ip_conntrack_q931_expect(struct ip_conntrack *new,
+			      struct ip_conntrack_expect *this)
+{
+	write_lock_bh(&ip_conntrack_lock);
+	new->helper = &ip_conntrack_helper_q931;
+	write_unlock_bh(&ip_conntrack_lock);
+}
+
+/****************************************************************************/
+static unsigned char *get_udp_data(struct sk_buff **pskb, int *datalen)
+{
+	struct udphdr _uh, *uh;
+	int dataoff;
+
+	uh = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4, sizeof(_uh),
+				&_uh);
+	if (uh == NULL)
+		return NULL;
+	dataoff = (*pskb)->nh.iph->ihl * 4 + sizeof(_uh);
+	if (dataoff >= (*pskb)->len)
+		return NULL;
+	*datalen = (*pskb)->len - dataoff;
+	return skb_header_pointer(*pskb, dataoff, *datalen, h323_buffer);
+}
+
+/****************************************************************************/
+static struct ip_conntrack_expect *find_expect(struct ip_conntrack *ct,
+					       u_int32_t ip, u_int16_t port)
+{
+	struct ip_conntrack_expect *exp;
+	struct ip_conntrack_tuple tuple;
+
+	tuple.src.ip = 0;
+	tuple.src.u.tcp.port = 0;
+	tuple.dst.ip = ip;
+	tuple.dst.u.tcp.port = htons(port);
+	tuple.dst.protonum = IPPROTO_TCP;
+
+	exp = __ip_conntrack_expect_find(&tuple);
+	if (exp->master == ct)
+		return exp;
+	return NULL;
+}
+
+/****************************************************************************/
+static int set_expect_timeout(struct ip_conntrack_expect *exp,
+			      unsigned timeout)
+{
+	if (!exp || !del_timer(&exp->timeout))
+		return 0;
+
+	exp->timeout.expires = jiffies + timeout * HZ;
+	add_timer(&exp->timeout);
+
+	return 1;
+}
+
+/****************************************************************************/
+static int expect_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data,
+		       TransportAddress * addr, int count)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	int i;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp;
+
+	/* Look for the first related address */
+	for (i = 0; i < count; i++) {
+		if (get_h225_addr(*data, &addr[i], &ip, &port) &&
+		    ip == ct->tuplehash[dir].tuple.src.ip && port != 0)
+			break;
+	}
+
+	if (i >= count)		/* Not found */
+		return 0;
+
+	/* Create expect for Q.931 */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = gkrouted_only ?	/* only accept calls from GK? */
+	    ct->tuplehash[!dir].tuple.src.ip : 0;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = gkrouted_only ? 0xFFFFFFFF : 0;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = IP_CT_EXPECT_PERMANENT;	/* Accept multiple calls */
+
+	if (nat_q931_hook) {	/* Need NAT */
+		ret = nat_q931_hook(pskb, ct, ctinfo, data, addr, i,
+				    port, exp);
+	} else {		/* Conntrack only */
+		exp->expectfn = ip_conntrack_q931_expect;
+
+		if (ip_conntrack_expect_related(exp) == 0) {
+			DEBUGP("ip_ct_ras: expect Q.931 "
+			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+			       NIPQUAD(exp->tuple.src.ip),
+			       ntohs(exp->tuple.src.u.tcp.port),
+			       NIPQUAD(exp->tuple.dst.ip),
+			       ntohs(exp->tuple.dst.u.tcp.port));
+
+			/* Save port for looking up expect in processing RCF */
+			info->sig_port[dir] = port;
+		} else
+			ret = -1;
+	}
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+static int process_grq(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, GatekeeperRequest * grq)
+{
+	DEBUGP("ip_ct_ras: GRQ\n");
+
+	if (set_ras_addr_hook)	/* NATed */
+		return set_ras_addr_hook(pskb, ct, ctinfo, data,
+					 &grq->rasAddress, 1);
+	return 0;
+}
+
+/* Declare before using */
+static void ip_conntrack_ras_expect(struct ip_conntrack *new,
+				    struct ip_conntrack_expect *this);
+
+/****************************************************************************/
+static int process_gcf(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, GatekeeperConfirm * gcf)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp;
+
+	DEBUGP("ip_ct_ras: GCF\n");
+
+	if (!get_h225_addr(*data, &gcf->rasAddress, &ip, &port))
+		return 0;
+
+	/* Registration port is the same as discovery port */
+	if (ip == ct->tuplehash[dir].tuple.src.ip &&
+	    port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port))
+		return 0;
+
+	/* Avoid RAS expectation loops. A GCF is never expected. */
+	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+		return 0;
+
+	/* Need new expect */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_UDP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = 0;
+	exp->expectfn = ip_conntrack_ras_expect;
+	if (ip_conntrack_expect_related(exp) == 0) {
+		DEBUGP("ip_ct_ras: expect RAS "
+		       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		       NIPQUAD(exp->tuple.src.ip),
+		       ntohs(exp->tuple.src.u.tcp.port),
+		       NIPQUAD(exp->tuple.dst.ip),
+		       ntohs(exp->tuple.dst.u.tcp.port));
+	} else
+		ret = -1;
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+static int process_rrq(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, RegistrationRequest * rrq)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int ret;
+
+	DEBUGP("ip_ct_ras: RRQ\n");
+
+	ret = expect_q931(pskb, ct, ctinfo, data,
+			  rrq->callSignalAddress.item,
+			  rrq->callSignalAddress.count);
+	if (ret < 0)
+		return -1;
+
+	if (set_ras_addr_hook) {
+		ret = set_ras_addr_hook(pskb, ct, ctinfo, data,
+					rrq->rasAddress.item,
+					rrq->rasAddress.count);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (rrq->options & eRegistrationRequest_timeToLive) {
+		DEBUGP("ip_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
+		info->timeout = rrq->timeToLive;
+	} else
+		info->timeout = 0;
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_rcf(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, RegistrationConfirm * rcf)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int ret;
+	struct ip_conntrack_expect *exp;
+
+	DEBUGP("ip_ct_ras: RCF\n");
+
+	if (set_sig_addr_hook) {
+		ret = set_sig_addr_hook(pskb, ct, ctinfo, data,
+					rcf->callSignalAddress.item,
+					rcf->callSignalAddress.count);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (rcf->options & eRegistrationConfirm_timeToLive) {
+		DEBUGP("ip_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
+		info->timeout = rcf->timeToLive;
+	}
+
+	if (info->timeout > 0) {
+		DEBUGP
+		    ("ip_ct_ras: set RAS connection timeout to %u seconds\n",
+		     info->timeout);
+		ip_ct_refresh_acct(ct, ctinfo, NULL, info->timeout * HZ);
+
+		/* Set expect timeout */
+		read_lock_bh(&ip_conntrack_lock);
+		exp = find_expect(ct, ct->tuplehash[dir].tuple.dst.ip,
+				  info->sig_port[!dir]);
+		if (exp) {
+			DEBUGP("ip_ct_ras: set Q.931 expect "
+			       "(%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu) "
+			       "timeout to %u seconds\n",
+			       NIPQUAD(exp->tuple.src.ip),
+			       ntohs(exp->tuple.src.u.tcp.port),
+			       NIPQUAD(exp->tuple.dst.ip),
+			       ntohs(exp->tuple.dst.u.tcp.port),
+			       info->timeout);
+			set_expect_timeout(exp, info->timeout);
+		}
+		read_unlock_bh(&ip_conntrack_lock);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_urq(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, UnregistrationRequest * urq)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int ret;
+
+	DEBUGP("ip_ct_ras: URQ\n");
+
+	if (set_sig_addr_hook) {
+		ret = set_sig_addr_hook(pskb, ct, ctinfo, data,
+					urq->callSignalAddress.item,
+					urq->callSignalAddress.count);
+		if (ret < 0)
+			return -1;
+	}
+
+	/* Clear old expect */
+	ip_ct_remove_expectations(ct);
+	info->sig_port[dir] = 0;
+	info->sig_port[!dir] = 0;
+
+	/* Give it 30 seconds for UCF or URJ */
+	ip_ct_refresh_acct(ct, ctinfo, NULL, 30 * HZ);
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_arq(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, AdmissionRequest * arq)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	u_int32_t ip;
+	u_int16_t port;
+
+	DEBUGP("ip_ct_ras: ARQ\n");
+
+	if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
+	    get_h225_addr(*data, &arq->destCallSignalAddress, &ip, &port) &&
+	    ip == ct->tuplehash[dir].tuple.src.ip &&
+	    port == info->sig_port[dir] && set_h225_addr_hook) {
+		/* Answering ARQ */
+		return set_h225_addr_hook(pskb, data, 0,
+					  &arq->destCallSignalAddress,
+					  ct->tuplehash[!dir].tuple.dst.ip,
+					  info->sig_port[!dir]);
+	}
+
+	if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
+	    get_h225_addr(*data, &arq->srcCallSignalAddress, &ip, &port) &&
+	    ip == ct->tuplehash[dir].tuple.src.ip && set_h225_addr_hook) {
+		/* Calling ARQ */
+		return set_h225_addr_hook(pskb, data, 0,
+					  &arq->srcCallSignalAddress,
+					  ct->tuplehash[!dir].tuple.dst.ip,
+					  port);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_acf(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, AdmissionConfirm * acf)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp;
+
+	DEBUGP("ip_ct_ras: ACF\n");
+
+	if (!get_h225_addr(*data, &acf->destCallSignalAddress, &ip, &port))
+		return 0;
+
+	if (ip == ct->tuplehash[dir].tuple.dst.ip) {	/* Answering ACF */
+		if (set_sig_addr_hook)
+			return set_sig_addr_hook(pskb, ct, ctinfo, data,
+						 &acf->destCallSignalAddress,
+						 1);
+		return 0;
+	}
+
+	/* Need new expect */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = IP_CT_EXPECT_PERMANENT;
+	exp->expectfn = ip_conntrack_q931_expect;
+
+	if (ip_conntrack_expect_related(exp) == 0) {
+		DEBUGP("ip_ct_ras: expect Q.931 "
+		       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		       NIPQUAD(exp->tuple.src.ip),
+		       ntohs(exp->tuple.src.u.tcp.port),
+		       NIPQUAD(exp->tuple.dst.ip),
+		       ntohs(exp->tuple.dst.u.tcp.port));
+	} else
+		ret = -1;
+
+	ip_conntrack_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+static int process_lrq(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, LocationRequest * lrq)
+{
+	DEBUGP("ip_ct_ras: LRQ\n");
+
+	if (set_ras_addr_hook)
+		return set_ras_addr_hook(pskb, ct, ctinfo, data,
+					 &lrq->replyAddress, 1);
+	return 0;
+}
+
+/****************************************************************************/
+static int process_lcf(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, LocationConfirm * lcf)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	u_int32_t ip;
+	u_int16_t port;
+	struct ip_conntrack_expect *exp = NULL;
+
+	DEBUGP("ip_ct_ras: LCF\n");
+
+	if (!get_h225_addr(*data, &lcf->callSignalAddress, &ip, &port))
+		return 0;
+
+	/* Need new expect for call signal */
+	if ((exp = ip_conntrack_expect_alloc(ct)) == NULL)
+		return -1;
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.src.u.tcp.port = 0;
+	exp->tuple.dst.ip = ip;
+	exp->tuple.dst.u.tcp.port = htons(port);
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask.src.ip = 0xFFFFFFFF;
+	exp->mask.src.u.tcp.port = 0;
+	exp->mask.dst.ip = 0xFFFFFFFF;
+	exp->mask.dst.u.tcp.port = 0xFFFF;
+	exp->mask.dst.protonum = 0xFF;
+	exp->flags = IP_CT_EXPECT_PERMANENT;
+	exp->expectfn = ip_conntrack_q931_expect;
+
+	if (ip_conntrack_expect_related(exp) == 0) {
+		DEBUGP("ip_ct_ras: expect Q.931 "
+		       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+		       NIPQUAD(exp->tuple.src.ip),
+		       ntohs(exp->tuple.src.u.tcp.port),
+		       NIPQUAD(exp->tuple.dst.ip),
+		       ntohs(exp->tuple.dst.u.tcp.port));
+	} else
+		ret = -1;
+
+	ip_conntrack_expect_put(exp);
+
+	/* Ignore rasAddress */
+
+	return ret;
+}
+
+/****************************************************************************/
+static int process_irr(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, InfoRequestResponse * irr)
+{
+	int ret;
+
+	DEBUGP("ip_ct_ras: IRR\n");
+
+	if (set_ras_addr_hook) {
+		ret = set_ras_addr_hook(pskb, ct, ctinfo, data,
+					&irr->rasAddress, 1);
+		if (ret < 0)
+			return -1;
+	}
+
+	if (set_sig_addr_hook) {
+		ret = set_sig_addr_hook(pskb, ct, ctinfo, data,
+					irr->callSignalAddress.item,
+					irr->callSignalAddress.count);
+		if (ret < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int process_ras(struct sk_buff **pskb, struct ip_conntrack *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, RasMessage * ras)
+{
+	switch (ras->choice) {
+	case eRasMessage_gatekeeperRequest:
+		return process_grq(pskb, ct, ctinfo, data,
+				   &ras->gatekeeperRequest);
+	case eRasMessage_gatekeeperConfirm:
+		return process_gcf(pskb, ct, ctinfo, data,
+				   &ras->gatekeeperConfirm);
+	case eRasMessage_registrationRequest:
+		return process_rrq(pskb, ct, ctinfo, data,
+				   &ras->registrationRequest);
+	case eRasMessage_registrationConfirm:
+		return process_rcf(pskb, ct, ctinfo, data,
+				   &ras->registrationConfirm);
+	case eRasMessage_unregistrationRequest:
+		return process_urq(pskb, ct, ctinfo, data,
+				   &ras->unregistrationRequest);
+	case eRasMessage_admissionRequest:
+		return process_arq(pskb, ct, ctinfo, data,
+				   &ras->admissionRequest);
+	case eRasMessage_admissionConfirm:
+		return process_acf(pskb, ct, ctinfo, data,
+				   &ras->admissionConfirm);
+	case eRasMessage_locationRequest:
+		return process_lrq(pskb, ct, ctinfo, data,
+				   &ras->locationRequest);
+	case eRasMessage_locationConfirm:
+		return process_lcf(pskb, ct, ctinfo, data,
+				   &ras->locationConfirm);
+	case eRasMessage_infoRequestResponse:
+		return process_irr(pskb, ct, ctinfo, data,
+				   &ras->infoRequestResponse);
+	default:
+		DEBUGP("ip_ct_ras: RAS message %d\n", ras->choice);
+		break;
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int ras_help(struct sk_buff **pskb, struct ip_conntrack *ct,
+		    enum ip_conntrack_info ctinfo)
+{
+	static RasMessage ras;
+	unsigned char *data;
+	int datalen = 0;
+	int ret;
+
+	DEBUGP("ip_ct_ras: skblen = %u\n", (*pskb)->len);
+
+	spin_lock_bh(&ip_h323_lock);
+
+	/* Get UDP data */
+	data = get_udp_data(pskb, &datalen);
+	if (data == NULL)
+		goto accept;
+	DEBUGP("ip_ct_ras: RAS message %u.%u.%u.%u->%u.%u.%u.%u, len=%d\n",
+	       NIPQUAD((*pskb)->nh.iph->saddr),
+	       NIPQUAD((*pskb)->nh.iph->daddr), datalen);
+
+	/* Decode RAS message */
+	ret = DecodeRasMessage(data, datalen, &ras);
+	if (ret < 0) {
+		if (net_ratelimit())
+			printk("ip_ct_ras: decoding error: %s\n",
+			       ret == H323_ERROR_BOUND ?
+			       "out of bound" : "out of range");
+		goto accept;
+	}
+
+	/* Process RAS message */
+	if (process_ras(pskb, ct, ctinfo, &data, &ras) < 0)
+		goto drop;
+
+      accept:
+	spin_unlock_bh(&ip_h323_lock);
+	return NF_ACCEPT;
+
+      drop:
+	spin_unlock_bh(&ip_h323_lock);
+	if (net_ratelimit())
+		printk("ip_ct_ras: packet dropped\n");
+	return NF_DROP;
+}
+
+/****************************************************************************/
+static struct ip_conntrack_helper ip_conntrack_helper_ras = {
+	.name = "RAS",
+	.me = THIS_MODULE,
+	.max_expected = 32,
+	.timeout = 240,
+	.tuple = {.src = {.u = {__constant_htons(RAS_PORT)}},
+		  .dst = {.protonum = IPPROTO_UDP}},
+	.mask = {.src = {.u = {0xFFFE}},
+		 .dst = {.protonum = 0xFF}},
+	.help = ras_help,
+};
+
+/****************************************************************************/
+static void ip_conntrack_ras_expect(struct ip_conntrack *new,
+				    struct ip_conntrack_expect *this)
+{
+	write_lock_bh(&ip_conntrack_lock);
+	new->helper = &ip_conntrack_helper_ras;
+	write_unlock_bh(&ip_conntrack_lock);
+}
+
+/****************************************************************************/
+/* Not __exit - called from init() */
+static void fini(void)
+{
+	ip_conntrack_helper_unregister(&ip_conntrack_helper_ras);
+	ip_conntrack_helper_unregister(&ip_conntrack_helper_q931);
+	kfree(h323_buffer);
+	DEBUGP("ip_ct_h323: fini\n");
+}
+
+/****************************************************************************/
+static int __init init(void)
+{
+	int ret;
+
+	h323_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!h323_buffer)
+		return -ENOMEM;
+	if ((ret = ip_conntrack_helper_register(&ip_conntrack_helper_q931)) ||
+	    (ret = ip_conntrack_helper_register(&ip_conntrack_helper_ras))) {
+		fini();
+		return ret;
+	}
+
+	DEBUGP("ip_ct_h323: init success\n");
+	return 0;
+}
+
+/****************************************************************************/
+module_init(init);
+module_exit(fini);
+
+EXPORT_SYMBOL(get_h245_addr);
+EXPORT_SYMBOL(get_h225_addr);
+EXPORT_SYMBOL(ip_conntrack_h245_expect);
+EXPORT_SYMBOL(ip_conntrack_q931_expect);
+EXPORT_SYMBOL(set_h245_addr_hook);
+EXPORT_SYMBOL(set_h225_addr_hook);
+EXPORT_SYMBOL(set_sig_addr_hook);
+EXPORT_SYMBOL(set_ras_addr_hook);
+EXPORT_SYMBOL(nat_rtp_rtcp_hook);
+EXPORT_SYMBOL(nat_t120_hook);
+EXPORT_SYMBOL(nat_h245_hook);
+EXPORT_SYMBOL(nat_q931_hook);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 connection tracking helper");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c
new file mode 100644
index 000000000000..afa525129b51
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.c
@@ -0,0 +1,870 @@
+/****************************************************************************
+ * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323
+ * 			      	     conntrack/NAT module.
+ *
+ * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@hotmail.com>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * See ip_conntrack_helper_h323_asn1.h for details.
+ *
+ ****************************************************************************/
+
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#else
+#include <stdio.h>
+#endif
+#include "ip_conntrack_helper_h323_asn1.h"
+
+/* Trace Flag */
+#ifndef H323_TRACE
+#define H323_TRACE 0
+#endif
+
+#if H323_TRACE
+#define TAB_SIZE 4
+#define IFTHEN(cond, act) if(cond){act;}
+#ifdef __KERNEL__
+#define PRINT printk
+#else
+#define PRINT printf
+#endif
+#define FNAME(name) name,
+#else
+#define IFTHEN(cond, act)
+#define PRINT(fmt, args...)
+#define FNAME(name)
+#endif
+
+/* ASN.1 Types */
+#define NUL 0
+#define BOOL 1
+#define OID 2
+#define INT 3
+#define ENUM 4
+#define BITSTR 5
+#define NUMSTR 6
+#define NUMDGT 6
+#define TBCDSTR 6
+#define OCTSTR 7
+#define PRTSTR 7
+#define IA5STR 7
+#define GENSTR 7
+#define BMPSTR 8
+#define SEQ 9
+#define SET 9
+#define SEQOF 10
+#define SETOF 10
+#define CHOICE 11
+
+/* Constraint Types */
+#define FIXD 0
+/* #define BITS 1-8 */
+#define BYTE 9
+#define WORD 10
+#define CONS 11
+#define SEMI 12
+#define UNCO 13
+
+/* ASN.1 Type Attributes */
+#define SKIP 0
+#define STOP 1
+#define DECODE 2
+#define EXT 4
+#define OPEN 8
+#define OPT 16
+
+
+/* ASN.1 Field Structure */
+typedef struct field_t {
+#if H323_TRACE
+	char *name;
+#endif
+	unsigned char type;
+	unsigned char sz;
+	unsigned char lb;
+	unsigned char ub;
+	unsigned short attr;
+	unsigned short offset;
+	struct field_t *fields;
+} field_t;
+
+/* Bit Stream */
+typedef struct {
+	unsigned char *buf;
+	unsigned char *beg;
+	unsigned char *end;
+	unsigned char *cur;
+	unsigned bit;
+} bitstr_t;
+
+/* Tool Functions */
+#define INC_BIT(bs) if((++bs->bit)>7){bs->cur++;bs->bit=0;}
+#define INC_BITS(bs,b) if((bs->bit+=b)>7){bs->cur+=bs->bit>>3;bs->bit&=7;}
+#define BYTE_ALIGN(bs) if(bs->bit){bs->cur++;bs->bit=0;}
+#define CHECK_BOUND(bs,n) if(bs->cur+(n)>bs->end)return(H323_ERROR_BOUND)
+static unsigned get_len(bitstr_t * bs);
+static unsigned get_bit(bitstr_t * bs);
+static unsigned get_bits(bitstr_t * bs, unsigned b);
+static unsigned get_bitmap(bitstr_t * bs, unsigned b);
+static unsigned get_uint(bitstr_t * bs, int b);
+
+/* Decoder Functions */
+static int decode_nul(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_bool(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_oid(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_int(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_enum(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_bitstr(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_numstr(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_octstr(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_bmpstr(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_seq(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_seqof(bitstr_t * bs, field_t * f, char *base, int level);
+static int decode_choice(bitstr_t * bs, field_t * f, char *base, int level);
+
+/* Decoder Functions Vector */
+typedef int (*decoder_t) (bitstr_t *, field_t *, char *, int);
+static decoder_t Decoders[] = {
+	decode_nul,
+	decode_bool,
+	decode_oid,
+	decode_int,
+	decode_enum,
+	decode_bitstr,
+	decode_numstr,
+	decode_octstr,
+	decode_bmpstr,
+	decode_seq,
+	decode_seqof,
+	decode_choice,
+};
+
+/****************************************************************************
+ * H.323 Types
+ ****************************************************************************/
+#include "ip_conntrack_helper_h323_types.c"
+
+/****************************************************************************
+ * Functions
+ ****************************************************************************/
+/* Assume bs is aligned && v < 16384 */
+unsigned get_len(bitstr_t * bs)
+{
+	unsigned v;
+
+	v = *bs->cur++;
+
+	if (v & 0x80) {
+		v &= 0x3f;
+		v <<= 8;
+		v += *bs->cur++;
+	}
+
+	return v;
+}
+
+/****************************************************************************/
+unsigned get_bit(bitstr_t * bs)
+{
+	unsigned b = (*bs->cur) & (0x80 >> bs->bit);
+
+	INC_BIT(bs);
+
+	return b;
+}
+
+/****************************************************************************/
+/* Assume b <= 8 */
+unsigned get_bits(bitstr_t * bs, unsigned b)
+{
+	unsigned v, l;
+
+	v = (*bs->cur) & (0xffU >> bs->bit);
+	l = b + bs->bit;
+
+	if (l < 8) {
+		v >>= 8 - l;
+		bs->bit = l;
+	} else if (l == 8) {
+		bs->cur++;
+		bs->bit = 0;
+	} else {		/* l > 8 */
+
+		v <<= 8;
+		v += *(++bs->cur);
+		v >>= 16 - l;
+		bs->bit = l - 8;
+	}
+
+	return v;
+}
+
+/****************************************************************************/
+/* Assume b <= 32 */
+unsigned get_bitmap(bitstr_t * bs, unsigned b)
+{
+	unsigned v, l, shift, bytes;
+
+	if (!b)
+		return 0;
+
+	l = bs->bit + b;
+
+	if (l < 8) {
+		v = (unsigned) (*bs->cur) << (bs->bit + 24);
+		bs->bit = l;
+	} else if (l == 8) {
+		v = (unsigned) (*bs->cur++) << (bs->bit + 24);
+		bs->bit = 0;
+	} else {
+		for (bytes = l >> 3, shift = 24, v = 0; bytes;
+		     bytes--, shift -= 8)
+			v |= (unsigned) (*bs->cur++) << shift;
+
+		if (l < 32) {
+			v |= (unsigned) (*bs->cur) << shift;
+			v <<= bs->bit;
+		} else if (l > 32) {
+			v <<= bs->bit;
+			v |= (*bs->cur) >> (8 - bs->bit);
+		}
+
+		bs->bit = l & 0x7;
+	}
+
+	v &= 0xffffffff << (32 - b);
+
+	return v;
+}
+
+/****************************************************************************
+ * Assume bs is aligned and sizeof(unsigned int) == 4
+ ****************************************************************************/
+unsigned get_uint(bitstr_t * bs, int b)
+{
+	unsigned v = 0;
+
+	switch (b) {
+	case 4:
+		v |= *bs->cur++;
+		v <<= 8;
+	case 3:
+		v |= *bs->cur++;
+		v <<= 8;
+	case 2:
+		v |= *bs->cur++;
+		v <<= 8;
+	case 1:
+		v |= *bs->cur++;
+		break;
+	}
+	return v;
+}
+
+/****************************************************************************/
+int decode_nul(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_bool(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	INC_BIT(bs);
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_oid(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	int len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	BYTE_ALIGN(bs);
+	CHECK_BOUND(bs, 1);
+	len = *bs->cur++;
+	bs->cur += len;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_int(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned len;
+
+	PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+	switch (f->sz) {
+	case BYTE:		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		bs->cur++;
+		break;
+	case WORD:		/* 257 <= Range <= 64K */
+		BYTE_ALIGN(bs);
+		bs->cur += 2;
+		break;
+	case CONS:		/* 64K < Range < 4G */
+		len = get_bits(bs, 2) + 1;
+		BYTE_ALIGN(bs);
+		if (base && (f->attr & DECODE)) {	/* timeToLive */
+			unsigned v = get_uint(bs, len) + f->lb;
+			PRINT(" = %u", v);
+			*((unsigned *) (base + f->offset)) = v;
+		}
+		bs->cur += len;
+		break;
+	case UNCO:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		bs->cur += len;
+		break;
+	default:		/* 2 <= Range <= 255 */
+		INC_BITS(bs, f->sz);
+		break;
+	}
+
+	PRINT("\n");
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_enum(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	if ((f->attr & EXT) && get_bit(bs)) {
+		INC_BITS(bs, 7);
+	} else {
+		INC_BITS(bs, f->sz);
+	}
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_bitstr(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	BYTE_ALIGN(bs);
+	switch (f->sz) {
+	case FIXD:		/* fixed length > 16 */
+		len = f->lb;
+		break;
+	case WORD:		/* 2-byte length */
+		CHECK_BOUND(bs, 2);
+		len = (*bs->cur++) << 8;
+		len += (*bs->cur++) + f->lb;
+		break;
+	case SEMI:
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		break;
+	default:
+		len = 0;
+		break;
+	}
+
+	bs->cur += len >> 3;
+	bs->bit = len & 7;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_numstr(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* 2 <= Range <= 255 */
+	len = get_bits(bs, f->sz) + f->lb;
+
+	BYTE_ALIGN(bs);
+	INC_BITS(bs, (len << 2));
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_octstr(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned len;
+
+	PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+	switch (f->sz) {
+	case FIXD:		/* Range == 1 */
+		if (f->lb > 2) {
+			BYTE_ALIGN(bs);
+			if (base && (f->attr & DECODE)) {
+				/* The IP Address */
+				IFTHEN(f->lb == 4,
+				       PRINT(" = %d.%d.%d.%d:%d",
+					     bs->cur[0], bs->cur[1],
+					     bs->cur[2], bs->cur[3],
+					     bs->cur[4] * 256 + bs->cur[5]));
+				*((unsigned *) (base + f->offset)) =
+				    bs->cur - bs->buf;
+			}
+		}
+		len = f->lb;
+		break;
+	case BYTE:		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		len = (*bs->cur++) + f->lb;
+		break;
+	case SEMI:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs) + f->lb;
+		break;
+	default:		/* 2 <= Range <= 255 */
+		len = get_bits(bs, f->sz) + f->lb;
+		BYTE_ALIGN(bs);
+		break;
+	}
+
+	bs->cur += len;
+
+	PRINT("\n");
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_bmpstr(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	switch (f->sz) {
+	case BYTE:		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		len = (*bs->cur++) + f->lb;
+		break;
+	default:		/* 2 <= Range <= 255 */
+		len = get_bits(bs, f->sz) + f->lb;
+		BYTE_ALIGN(bs);
+		break;
+	}
+
+	bs->cur += len << 1;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_seq(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned ext, bmp, i, opt, len = 0, bmp2, bmp2_len;
+	int err;
+	field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Extensible? */
+	ext = (f->attr & EXT) ? get_bit(bs) : 0;
+
+	/* Get fields bitmap */
+	bmp = get_bitmap(bs, f->sz);
+	if (base)
+		*(unsigned *) base = bmp;
+
+	/* Decode the root components */
+	for (i = opt = 0, son = f->fields; i < f->lb; i++, son++) {
+		if (son->attr & STOP) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			return H323_ERROR_STOP;
+		}
+
+		if (son->attr & OPT) {	/* Optional component */
+			if (!((0x80000000U >> (opt++)) & bmp))	/* Not exist */
+				continue;
+		}
+
+		/* Decode */
+		if (son->attr & OPEN) {	/* Open field */
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			if (!base) {
+				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+				      " ", son->name);
+				bs->cur += len;
+				continue;
+			}
+			beg = bs->cur;
+
+			/* Decode */
+			if ((err = (Decoders[son->type]) (bs, son, base,
+							  level + 1)) >
+			    H323_ERROR_STOP)
+				return err;
+
+			bs->cur = beg + len;
+			bs->bit = 0;
+		} else if ((err = (Decoders[son->type]) (bs, son, base,
+							 level + 1)))
+			return err;
+	}
+
+	/* No extension? */
+	if (!ext)
+		return H323_ERROR_NONE;
+
+	/* Get the extension bitmap */
+	bmp2_len = get_bits(bs, 7) + 1;
+	CHECK_BOUND(bs, (bmp2_len + 7) >> 3);
+	bmp2 = get_bitmap(bs, bmp2_len);
+	bmp |= bmp2 >> f->sz;
+	if (base)
+		*(unsigned *) base = bmp;
+	BYTE_ALIGN(bs);
+
+	/* Decode the extension components */
+	for (opt = 0; opt < bmp2_len; opt++, i++, son++) {
+		if (son->attr & STOP) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			return H323_ERROR_STOP;
+		}
+
+		if (!((0x80000000 >> opt) & bmp2))	/* Not present */
+			continue;
+
+		/* Check Range */
+		if (i >= f->ub) {	/* Newer Version? */
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			bs->cur += len;
+			continue;
+		}
+
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		if (!base || !(son->attr & DECODE)) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			bs->cur += len;
+			continue;
+		}
+		beg = bs->cur;
+
+		if ((err = (Decoders[son->type]) (bs, son, base,
+						  level + 1)) >
+		    H323_ERROR_STOP)
+			return err;
+
+		bs->cur = beg + len;
+		bs->bit = 0;
+	}
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int decode_seqof(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned count, effective_count = 0, i, len = 0;
+	int err;
+	field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Decode item count */
+	switch (f->sz) {
+	case BYTE:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		count = *bs->cur++;
+		break;
+	case WORD:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		count = *bs->cur++;
+		count <<= 8;
+		count = *bs->cur++;
+		break;
+	case SEMI:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		count = get_len(bs);
+		break;
+	default:
+		count = get_bits(bs, f->sz);
+		break;
+	}
+	count += f->lb;
+
+	/* Write Count */
+	if (base) {
+		effective_count = count > f->ub ? f->ub : count;
+		*(unsigned *) base = effective_count;
+		base += sizeof(unsigned);
+	}
+
+	/* Decode nested field */
+	son = f->fields;
+	if (base)
+		base -= son->offset;
+	for (i = 0; i < count; i++) {
+		if (son->attr & OPEN) {
+			BYTE_ALIGN(bs);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			if (!base || !(son->attr & DECODE)) {
+				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+				      " ", son->name);
+				bs->cur += len;
+				continue;
+			}
+			beg = bs->cur;
+
+			if ((err = (Decoders[son->type]) (bs, son,
+							  i <
+							  effective_count ?
+							  base : NULL,
+							  level + 1)) >
+			    H323_ERROR_STOP)
+				return err;
+
+			bs->cur = beg + len;
+			bs->bit = 0;
+		} else
+		    if ((err = (Decoders[son->type]) (bs, son,
+						      i < effective_count ?
+						      base : NULL,
+						      level + 1)))
+			return err;
+
+		if (base)
+			base += son->offset;
+	}
+
+	return H323_ERROR_NONE;
+}
+
+
+/****************************************************************************/
+int decode_choice(bitstr_t * bs, field_t * f, char *base, int level)
+{
+	unsigned type, ext, len = 0;
+	int err;
+	field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Decode the choice index number */
+	if ((f->attr & EXT) && get_bit(bs)) {
+		ext = 1;
+		type = get_bits(bs, 7) + f->lb;
+	} else {
+		ext = 0;
+		type = get_bits(bs, f->sz);
+	}
+
+	/* Check Range */
+	if (type >= f->ub) {	/* Newer version? */
+		BYTE_ALIGN(bs);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		bs->cur += len;
+		return H323_ERROR_NONE;
+	}
+
+	/* Write Type */
+	if (base)
+		*(unsigned *) base = type;
+
+	/* Transfer to son level */
+	son = &f->fields[type];
+	if (son->attr & STOP) {
+		PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name);
+		return H323_ERROR_STOP;
+	}
+
+	if (ext || (son->attr & OPEN)) {
+		BYTE_ALIGN(bs);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		if (!base || !(son->attr & DECODE)) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			bs->cur += len;
+			return H323_ERROR_NONE;
+		}
+		beg = bs->cur;
+
+		if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) >
+		    H323_ERROR_STOP)
+			return err;
+
+		bs->cur = beg + len;
+		bs->bit = 0;
+	} else if ((err = (Decoders[son->type]) (bs, son, base, level + 1)))
+		return err;
+
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage * ras)
+{
+	static field_t ras_message = {
+		FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT,
+		0, _RasMessage
+	};
+	bitstr_t bs;
+
+	bs.buf = bs.beg = bs.cur = buf;
+	bs.end = buf + sz;
+	bs.bit = 0;
+
+	return decode_choice(&bs, &ras_message, (char *) ras, 0);
+}
+
+/****************************************************************************/
+static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg,
+				      size_t sz, H323_UserInformation * uuie)
+{
+	static field_t h323_userinformation = {
+		FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT,
+		0, _H323_UserInformation
+	};
+	bitstr_t bs;
+
+	bs.buf = buf;
+	bs.beg = bs.cur = beg;
+	bs.end = beg + sz;
+	bs.bit = 0;
+
+	return decode_seq(&bs, &h323_userinformation, (char *) uuie, 0);
+}
+
+/****************************************************************************/
+int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz,
+					 MultimediaSystemControlMessage *
+					 mscm)
+{
+	static field_t multimediasystemcontrolmessage = {
+		FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4,
+		DECODE | EXT, 0, _MultimediaSystemControlMessage
+	};
+	bitstr_t bs;
+
+	bs.buf = bs.beg = bs.cur = buf;
+	bs.end = buf + sz;
+	bs.bit = 0;
+
+	return decode_choice(&bs, &multimediasystemcontrolmessage,
+			     (char *) mscm, 0);
+}
+
+/****************************************************************************/
+int DecodeQ931(unsigned char *buf, size_t sz, Q931 * q931)
+{
+	unsigned char *p = buf;
+	int len;
+
+	if (!p || sz < 1)
+		return H323_ERROR_BOUND;
+
+	/* Protocol Discriminator */
+	if (*p != 0x08) {
+		PRINT("Unknown Protocol Discriminator\n");
+		return H323_ERROR_RANGE;
+	}
+	p++;
+	sz--;
+
+	/* CallReferenceValue */
+	if (sz < 1)
+		return H323_ERROR_BOUND;
+	len = *p++;
+	sz--;
+	if (sz < len)
+		return H323_ERROR_BOUND;
+	p += len;
+	sz -= len;
+
+	/* Message Type */
+	if (sz < 1)
+		return H323_ERROR_BOUND;
+	q931->MessageType = *p++;
+	PRINT("MessageType = %02X\n", q931->MessageType);
+	if (*p & 0x80) {
+		p++;
+		sz--;
+	}
+
+	/* Decode Information Elements */
+	while (sz > 0) {
+		if (*p == 0x7e) {	/* UserUserIE */
+			if (sz < 3)
+				break;
+			p++;
+			len = *p++ << 8;
+			len |= *p++;
+			sz -= 3;
+			if (sz < len)
+				break;
+			p++;
+			len--;
+			return DecodeH323_UserInformation(buf, p, len,
+							  &q931->UUIE);
+		}
+		p++;
+		sz--;
+		if (sz < 1)
+			break;
+		len = *p++;
+		if (sz < len)
+			break;
+		p += len;
+		sz -= len;
+	}
+
+	PRINT("Q.931 UUIE not found\n");
+
+	return H323_ERROR_BOUND;
+}
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.h b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.h
new file mode 100644
index 000000000000..0bd828081c0c
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_asn1.h
@@ -0,0 +1,98 @@
+/****************************************************************************
+ * ip_conntrack_helper_h323_asn1.h - BER and PER decoding library for H.323
+ * 			      	     conntrack/NAT module.
+ *
+ * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@hotmail.com>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ *
+ * This library is based on H.225 version 4, H.235 version 2 and H.245
+ * version 7. It is extremely optimized to decode only the absolutely
+ * necessary objects in a signal for Linux kernel NAT module use, so don't
+ * expect it to be a full ASN.1 library.
+ *
+ * Features:
+ *
+ * 1. Small. The total size of code plus data is less than 20 KB (IA32).
+ * 2. Fast. Decoding Netmeeting's Setup signal 1 million times on a PIII 866
+ *    takes only 3.9 seconds.
+ * 3. No memory allocation. It uses a static object. No need to initialize or
+ *    cleanup.
+ * 4. Thread safe.
+ * 5. Support embedded architectures that has no misaligned memory access
+ *    support.
+ *
+ * Limitations:
+ *
+ * 1. At most 30 faststart entries. Actually this is limited by ethernet's MTU.
+ *    If a Setup signal contains more than 30 faststart, the packet size will
+ *    very likely exceed the MTU size, then the TPKT will be fragmented. I
+ *    don't know how to handle this in a Netfilter module. Anybody can help?
+ *    Although I think 30 is enough for most of the cases.
+ * 2. IPv4 addresses only.
+ *
+ ****************************************************************************/
+
+#ifndef _IP_CONNTRACK_HELPER_H323_ASN1_H_
+#define _IP_CONNTRACK_HELPER_H323_ASN1_H_
+
+/*****************************************************************************
+ * H.323 Types
+ ****************************************************************************/
+#include "ip_conntrack_helper_h323_types.h"
+
+typedef struct {
+	enum {
+		Q931_NationalEscape = 0x00,
+		Q931_Alerting = 0x01,
+		Q931_CallProceeding = 0x02,
+		Q931_Connect = 0x07,
+		Q931_ConnectAck = 0x0F,
+		Q931_Progress = 0x03,
+		Q931_Setup = 0x05,
+		Q931_SetupAck = 0x0D,
+		Q931_Resume = 0x26,
+		Q931_ResumeAck = 0x2E,
+		Q931_ResumeReject = 0x22,
+		Q931_Suspend = 0x25,
+		Q931_SuspendAck = 0x2D,
+		Q931_SuspendReject = 0x21,
+		Q931_UserInformation = 0x20,
+		Q931_Disconnect = 0x45,
+		Q931_Release = 0x4D,
+		Q931_ReleaseComplete = 0x5A,
+		Q931_Restart = 0x46,
+		Q931_RestartAck = 0x4E,
+		Q931_Segment = 0x60,
+		Q931_CongestionCtrl = 0x79,
+		Q931_Information = 0x7B,
+		Q931_Notify = 0x6E,
+		Q931_Status = 0x7D,
+		Q931_StatusEnquiry = 0x75,
+		Q931_Facility = 0x62
+	} MessageType;
+	H323_UserInformation UUIE;
+} Q931;
+
+/*****************************************************************************
+ * Decode Functions Return Codes
+ ****************************************************************************/
+
+#define H323_ERROR_NONE 0	/* Decoded successfully */
+#define H323_ERROR_STOP 1	/* Decoding stopped, not really an error */
+#define H323_ERROR_BOUND -1
+#define H323_ERROR_RANGE -2
+
+
+/*****************************************************************************
+ * Decode Functions
+ ****************************************************************************/
+
+int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage * ras);
+int DecodeQ931(unsigned char *buf, size_t sz, Q931 * q931);
+int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz,
+					 MultimediaSystemControlMessage *
+					 mscm);
+
+#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
new file mode 100644
index 000000000000..022c47b9f6c9
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
@@ -0,0 +1,1926 @@
+/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ */
+
+static field_t _TransportAddress_ipAddress[] = {	/* SEQUENCE */
+	{FNAME("ip") OCTSTR, FIXD, 4, 0, DECODE,
+	 offsetof(TransportAddress_ipAddress, ip), NULL},
+	{FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _TransportAddress_ipSourceRoute_route[] = {	/* SEQUENCE OF */
+	{FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+static field_t _TransportAddress_ipSourceRoute_routing[] = {	/* CHOICE */
+	{FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _TransportAddress_ipSourceRoute[] = {	/* SEQUENCE */
+	{FNAME("ip") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _TransportAddress_ipSourceRoute_route},
+	{FNAME("routing") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _TransportAddress_ipSourceRoute_routing},
+};
+
+static field_t _TransportAddress_ipxAddress[] = {	/* SEQUENCE */
+	{FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+	{FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("port") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+static field_t _TransportAddress_ip6Address[] = {	/* SEQUENCE */
+	{FNAME("ip") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H221NonStandard[] = {	/* SEQUENCE */
+	{FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _NonStandardIdentifier[] = {	/* CHOICE */
+	{FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP | EXT, 0,
+	 _H221NonStandard},
+};
+
+static field_t _NonStandardParameter[] = {	/* SEQUENCE */
+	{FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _NonStandardIdentifier},
+	{FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _TransportAddress[] = {	/* CHOICE */
+	{FNAME("ipAddress") SEQ, 0, 2, 2, DECODE,
+	 offsetof(TransportAddress, ipAddress), _TransportAddress_ipAddress},
+	{FNAME("ipSourceRoute") SEQ, 0, 4, 4, SKIP | EXT, 0,
+	 _TransportAddress_ipSourceRoute},
+	{FNAME("ipxAddress") SEQ, 0, 3, 3, SKIP, 0,
+	 _TransportAddress_ipxAddress},
+	{FNAME("ip6Address") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _TransportAddress_ip6Address},
+	{FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0,
+	 _NonStandardParameter},
+};
+
+static field_t _AliasAddress[] = {	/* CHOICE */
+	{FNAME("dialedDigits") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("h323-ID") BMPSTR, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("url-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("transportID") CHOICE, 3, 7, 7, SKIP | EXT, 0, NULL},
+	{FNAME("email-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("partyNumber") CHOICE, 3, 5, 5, SKIP | EXT, 0, NULL},
+	{FNAME("mobileUIM") CHOICE, 1, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+static field_t _Setup_UUIE_sourceAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _VendorIdentifier[] = {	/* SEQUENCE */
+	{FNAME("vendor") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard},
+	{FNAME("productId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("versionId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _GatekeeperInfo[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+};
+
+static field_t _H310Caps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H320Caps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H321Caps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H322Caps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H323Caps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H324Caps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _VoiceCaps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _T120OnlyCaps[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _SupportedProtocols[] = {	/* CHOICE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP, 0,
+	 _NonStandardParameter},
+	{FNAME("h310") SEQ, 1, 1, 3, SKIP | EXT, 0, _H310Caps},
+	{FNAME("h320") SEQ, 1, 1, 3, SKIP | EXT, 0, _H320Caps},
+	{FNAME("h321") SEQ, 1, 1, 3, SKIP | EXT, 0, _H321Caps},
+	{FNAME("h322") SEQ, 1, 1, 3, SKIP | EXT, 0, _H322Caps},
+	{FNAME("h323") SEQ, 1, 1, 3, SKIP | EXT, 0, _H323Caps},
+	{FNAME("h324") SEQ, 1, 1, 3, SKIP | EXT, 0, _H324Caps},
+	{FNAME("voice") SEQ, 1, 1, 3, SKIP | EXT, 0, _VoiceCaps},
+	{FNAME("t120-only") SEQ, 1, 1, 3, SKIP | EXT, 0, _T120OnlyCaps},
+	{FNAME("nonStandardProtocol") SEQ, 2, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("t38FaxAnnexbOnly") SEQ, 2, 5, 5, SKIP | EXT, 0, NULL},
+};
+
+static field_t _GatewayInfo_protocol[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 4, 9, 11, SKIP | EXT, 0, _SupportedProtocols},
+};
+
+static field_t _GatewayInfo[] = {	/* SEQUENCE */
+	{FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _GatewayInfo_protocol},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+};
+
+static field_t _McuInfo[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _TerminalInfo[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+};
+
+static field_t _EndpointType[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("vendor") SEQ, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 _VendorIdentifier},
+	{FNAME("gatekeeper") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0,
+	 _GatekeeperInfo},
+	{FNAME("gateway") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, _GatewayInfo},
+	{FNAME("mcu") SEQ, 1, 1, 2, SKIP | EXT | OPT, 0, _McuInfo},
+	{FNAME("terminal") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, _TerminalInfo},
+	{FNAME("mc") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("undefinedNode") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("set") BITSTR, FIXD, 32, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedTunnelledProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT,
+	 0, NULL},
+};
+
+static field_t _Setup_UUIE_destinationAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _Setup_UUIE_destExtraCallInfo[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _Setup_UUIE_destExtraCRV[] = {	/* SEQUENCE OF */
+	{FNAME("item") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _Setup_UUIE_conferenceGoal[] = {	/* CHOICE */
+	{FNAME("create") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("join") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("invite") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("capability-negotiation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callIndependentSupplementaryService") NUL, FIXD, 0, 0, SKIP,
+	 0, NULL},
+};
+
+static field_t _Q954Details[] = {	/* SEQUENCE */
+	{FNAME("conferenceCalling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("threePartyService") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _QseriesOptions[] = {	/* SEQUENCE */
+	{FNAME("q932Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q951Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q952Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q953Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q955Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q956Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q957Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q954Info") SEQ, 0, 2, 2, SKIP | EXT, 0, _Q954Details},
+};
+
+static field_t _CallType[] = {	/* CHOICE */
+	{FNAME("pointToPoint") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("oneToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("nToOne") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("nToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H245_NonStandardIdentifier_h221NonStandard[] = {	/* SEQUENCE */
+	{FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H245_NonStandardIdentifier[] = {	/* CHOICE */
+	{FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP, 0,
+	 _H245_NonStandardIdentifier_h221NonStandard},
+};
+
+static field_t _H245_NonStandardParameter[] = {	/* SEQUENCE */
+	{FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP, 0,
+	 _H245_NonStandardIdentifier},
+	{FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H261VideoCapability[] = {	/* SEQUENCE */
+	{FNAME("qcifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+	 NULL},
+	{FNAME("maxBitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("stillImageTransmission") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H262VideoCapability[] = {	/* SEQUENCE */
+	{FNAME("profileAndLevel-SPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-SNRatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-SNRatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-SpatialatH-14") BOOL, FIXD, 0, 0, SKIP, 0,
+	 NULL},
+	{FNAME("profileAndLevel-HPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-HPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-HPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("framesPerSecond") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H263VideoCapability[] = {	/* SEQUENCE */
+	{FNAME("sqcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("qcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cif4MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cif16MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("maxBitRate") INT, CONS, 1, 0, SKIP, 0, NULL},
+	{FNAME("unrestrictedVector") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("arithmeticCoding") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("advancedPrediction") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("pbFrames") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+	 NULL},
+	{FNAME("hrd-B") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("bppMaxKb") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowSqcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowQcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowCifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowCif4MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowCif16MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("errorCompensation") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("enhancementLayerInfo") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("h263Options") SEQ, 5, 29, 31, SKIP | EXT | OPT, 0, NULL},
+};
+
+static field_t _IS11172VideoCapability[] = {	/* SEQUENCE */
+	{FNAME("constrainedBitstream") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("pictureRate") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _VideoCapability[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("h261VideoCapability") SEQ, 2, 5, 6, SKIP | EXT, 0,
+	 _H261VideoCapability},
+	{FNAME("h262VideoCapability") SEQ, 6, 17, 18, SKIP | EXT, 0,
+	 _H262VideoCapability},
+	{FNAME("h263VideoCapability") SEQ, 7, 13, 21, SKIP | EXT, 0,
+	 _H263VideoCapability},
+	{FNAME("is11172VideoCapability") SEQ, 6, 7, 8, SKIP | EXT, 0,
+	 _IS11172VideoCapability},
+	{FNAME("genericVideoCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+static field_t _AudioCapability_g7231[] = {	/* SEQUENCE */
+	{FNAME("maxAl-sduAudioFrames") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("silenceSuppression") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _IS11172AudioCapability[] = {	/* SEQUENCE */
+	{FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+static field_t _IS13818AudioCapability[] = {	/* SEQUENCE */
+	{FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling16k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling22k05") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling24k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("threeChannels2-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("threeChannels3-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fourChannels2-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fourChannels2-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fourChannels3-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fiveChannels3-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fiveChannels3-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("lowFrequencyEnhancement") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("multilingual") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+static field_t _AudioCapability[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("g711Alaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g711Alaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g711Ulaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g711Ulaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g722-64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g722-56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g722-48k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g7231") SEQ, 0, 2, 2, SKIP, 0, _AudioCapability_g7231},
+	{FNAME("g728") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g729") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g729AnnexA") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("is11172AudioCapability") SEQ, 0, 9, 9, SKIP | EXT, 0,
+	 _IS11172AudioCapability},
+	{FNAME("is13818AudioCapability") SEQ, 0, 21, 21, SKIP | EXT, 0,
+	 _IS13818AudioCapability},
+	{FNAME("g729wAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g729AnnexAwAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g7231AnnexCCapability") SEQ, 1, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("gsmFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("gsmHalfRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("gsmEnhancedFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("genericAudioCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+	{FNAME("g729Extensions") SEQ, 1, 8, 8, SKIP | EXT, 0, NULL},
+};
+
+static field_t _DataProtocolCapability[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("v14buffered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("v42lapm") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdlcFrameTunnelling") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h310SeparateVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h310SingleVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("transparent") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("segmentationAndReassembly") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdlcFrameTunnelingwSAR") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("v120") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("separateLANStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("v76wCompression") CHOICE, 2, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("tcp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("udp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _T84Profile_t84Restricted[] = {	/* SEQUENCE */
+	{FNAME("qcif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("cif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("ccir601Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("ccir601Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdtvSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdtvProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g3FacsMH200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g3FacsMH200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g4FacsMMR200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g4FacsMMR200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig200x200Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig200x200Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig300x300Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig300x300Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoLow") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoMedSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoMedProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoHighSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoHighProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _T84Profile[] = {	/* CHOICE */
+	{FNAME("t84Unrestricted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("t84Restricted") SEQ, 0, 19, 19, SKIP | EXT, 0,
+	 _T84Profile_t84Restricted},
+};
+
+static field_t _DataApplicationCapability_application_t84[] = {	/* SEQUENCE */
+	{FNAME("t84Protocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("t84Profile") CHOICE, 1, 2, 2, SKIP, 0, _T84Profile},
+};
+
+static field_t _DataApplicationCapability_application_nlpid[] = {	/* SEQUENCE */
+	{FNAME("nlpidProtocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("nlpidData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _DataApplicationCapability_application[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("t120") CHOICE, 3, 7, 14, DECODE | EXT,
+	 offsetof(DataApplicationCapability_application, t120),
+	 _DataProtocolCapability},
+	{FNAME("dsm-cc") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("userData") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("t84") SEQ, 0, 2, 2, SKIP, 0,
+	 _DataApplicationCapability_application_t84},
+	{FNAME("t434") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("h224") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("nlpid") SEQ, 0, 2, 2, SKIP, 0,
+	 _DataApplicationCapability_application_nlpid},
+	{FNAME("dsvdControl") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h222DataPartitioning") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("t30fax") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL},
+	{FNAME("t140") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL},
+	{FNAME("t38fax") SEQ, 0, 2, 2, SKIP, 0, NULL},
+	{FNAME("genericDataCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+static field_t _DataApplicationCapability[] = {	/* SEQUENCE */
+	{FNAME("application") CHOICE, 4, 10, 14, DECODE | EXT,
+	 offsetof(DataApplicationCapability, application),
+	 _DataApplicationCapability_application},
+	{FNAME("maxBitRate") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _EncryptionMode[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("h233Encryption") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _DataType[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("nullData") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoData") CHOICE, 3, 5, 6, SKIP | EXT, 0, _VideoCapability},
+	{FNAME("audioData") CHOICE, 4, 14, 22, SKIP | EXT, 0,
+	 _AudioCapability},
+	{FNAME("data") SEQ, 0, 2, 2, DECODE | EXT, offsetof(DataType, data),
+	 _DataApplicationCapability},
+	{FNAME("encryptionData") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _EncryptionMode},
+	{FNAME("h235Control") SEQ, 0, 2, 2, SKIP, 0, NULL},
+	{FNAME("h235Media") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+	{FNAME("multiplexedStream") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+static field_t _H222LogicalChannelParameters[] = {	/* SEQUENCE */
+	{FNAME("resourceID") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("subChannelID") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("pcr-pid") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("programDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("streamDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = {	/* SEQUENCE */
+	{FNAME("controlFieldOctets") INT, 2, 0, 0, SKIP, 0, NULL},
+	{FNAME("sendBufferSize") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H223LogicalChannelParameters_adaptationLayerType[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("al1Framed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al1NotFramed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al2WithoutSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al2WithSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al3") SEQ, 0, 2, 2, SKIP, 0,
+	 _H223LogicalChannelParameters_adaptationLayerType_al3},
+	{FNAME("al1M") SEQ, 0, 7, 8, SKIP | EXT, 0, NULL},
+	{FNAME("al2M") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+	{FNAME("al3M") SEQ, 0, 5, 6, SKIP | EXT, 0, NULL},
+};
+
+static field_t _H223LogicalChannelParameters[] = {	/* SEQUENCE */
+	{FNAME("adaptationLayerType") CHOICE, 3, 6, 9, SKIP | EXT, 0,
+	 _H223LogicalChannelParameters_adaptationLayerType},
+	{FNAME("segmentableFlag") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CRCLength[] = {	/* CHOICE */
+	{FNAME("crc8bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("crc16bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("crc32bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _V76HDLCParameters[] = {	/* SEQUENCE */
+	{FNAME("crcLength") CHOICE, 2, 3, 3, SKIP | EXT, 0, _CRCLength},
+	{FNAME("n401") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("loopbackTestProcedure") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _V76LogicalChannelParameters_suspendResume[] = {	/* CHOICE */
+	{FNAME("noSuspendResume") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("suspendResumewAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("suspendResumewoAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = {	/* CHOICE */
+	{FNAME("rej") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("sREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("mSREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _V76LogicalChannelParameters_mode_eRM[] = {	/* SEQUENCE */
+	{FNAME("windowSize") INT, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("recovery") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_mode_eRM_recovery},
+};
+
+static field_t _V76LogicalChannelParameters_mode[] = {	/* CHOICE */
+	{FNAME("eRM") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_mode_eRM},
+	{FNAME("uNERM") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _V75Parameters[] = {	/* SEQUENCE */
+	{FNAME("audioHeaderPresent") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _V76LogicalChannelParameters[] = {	/* SEQUENCE */
+	{FNAME("hdlcParameters") SEQ, 0, 3, 3, SKIP | EXT, 0,
+	 _V76HDLCParameters},
+	{FNAME("suspendResume") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_suspendResume},
+	{FNAME("uIH") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("mode") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_mode},
+	{FNAME("v75Parameters") SEQ, 0, 1, 1, SKIP | EXT, 0, _V75Parameters},
+};
+
+static field_t _H2250LogicalChannelParameters_nonStandard[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static field_t _UnicastAddress_iPAddress[] = {	/* SEQUENCE */
+	{FNAME("network") OCTSTR, FIXD, 4, 0, DECODE,
+	 offsetof(UnicastAddress_iPAddress, network), NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _UnicastAddress_iPXAddress[] = {	/* SEQUENCE */
+	{FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+	{FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+static field_t _UnicastAddress_iP6Address[] = {	/* SEQUENCE */
+	{FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _UnicastAddress_iPSourceRouteAddress_routing[] = {	/* CHOICE */
+	{FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _UnicastAddress_iPSourceRouteAddress_route[] = {	/* SEQUENCE OF */
+	{FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+static field_t _UnicastAddress_iPSourceRouteAddress[] = {	/* SEQUENCE */
+	{FNAME("routing") CHOICE, 1, 2, 2, SKIP, 0,
+	 _UnicastAddress_iPSourceRouteAddress_routing},
+	{FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _UnicastAddress_iPSourceRouteAddress_route},
+};
+
+static field_t _UnicastAddress[] = {	/* CHOICE */
+	{FNAME("iPAddress") SEQ, 0, 2, 2, DECODE | EXT,
+	 offsetof(UnicastAddress, iPAddress), _UnicastAddress_iPAddress},
+	{FNAME("iPXAddress") SEQ, 0, 3, 3, SKIP | EXT, 0,
+	 _UnicastAddress_iPXAddress},
+	{FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _UnicastAddress_iP6Address},
+	{FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("iPSourceRouteAddress") SEQ, 0, 4, 4, SKIP | EXT, 0,
+	 _UnicastAddress_iPSourceRouteAddress},
+	{FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static field_t _MulticastAddress_iPAddress[] = {	/* SEQUENCE */
+	{FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _MulticastAddress_iP6Address[] = {	/* SEQUENCE */
+	{FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _MulticastAddress[] = {	/* CHOICE */
+	{FNAME("iPAddress") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _MulticastAddress_iPAddress},
+	{FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _MulticastAddress_iP6Address},
+	{FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static field_t _H245_TransportAddress[] = {	/* CHOICE */
+	{FNAME("unicastAddress") CHOICE, 3, 5, 7, DECODE | EXT,
+	 offsetof(H245_TransportAddress, unicastAddress), _UnicastAddress},
+	{FNAME("multicastAddress") CHOICE, 1, 2, 4, SKIP | EXT, 0,
+	 _MulticastAddress},
+};
+
+static field_t _H2250LogicalChannelParameters[] = {	/* SEQUENCE */
+	{FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _H2250LogicalChannelParameters_nonStandard},
+	{FNAME("sessionID") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("associatedSessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelParameters, mediaChannel),
+	 _H245_TransportAddress},
+	{FNAME("mediaGuaranteedDelivery") BOOL, FIXD, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelParameters, mediaControlChannel),
+	 _H245_TransportAddress},
+	{FNAME("mediaControlGuaranteedDelivery") BOOL, FIXD, 0, 0, STOP | OPT,
+	 0, NULL},
+	{FNAME("silenceSuppression") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destination") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, STOP | OPT, 0, NULL},
+	{FNAME("mediaPacketization") CHOICE, 0, 1, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("transportCapability") SEQ, 3, 3, 3, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("redundancyEncoding") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("source") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, NULL},
+};
+
+static field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = {	/* CHOICE */
+	{FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+	 _H222LogicalChannelParameters},
+	{FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _H223LogicalChannelParameters},
+	{FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters},
+	{FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+	 offsetof
+	 (OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters,
+	  h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+	{FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = {	/* SEQUENCE */
+	{FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("dataType") CHOICE, 3, 6, 9, DECODE | EXT,
+	 offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+		  dataType), _DataType},
+	{FNAME("multiplexParameters") CHOICE, 2, 3, 5, DECODE | EXT,
+	 offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+		  multiplexParameters),
+	 _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters},
+	{FNAME("forwardLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+	 0, NULL},
+	{FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = {	/* CHOICE */
+	{FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _H223LogicalChannelParameters},
+	{FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters},
+	{FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+	 offsetof
+	 (OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters,
+	  h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = {	/* SEQUENCE */
+	{FNAME("dataType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _DataType},
+	{FNAME("multiplexParameters") CHOICE, 1, 2, 3, DECODE | EXT | OPT,
+	 offsetof(OpenLogicalChannel_reverseLogicalChannelParameters,
+		  multiplexParameters),
+	 _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters},
+	{FNAME("reverseLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+	 0, NULL},
+	{FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _NetworkAccessParameters_distribution[] = {	/* CHOICE */
+	{FNAME("unicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("multicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _Q2931Address_address[] = {	/* CHOICE */
+	{FNAME("internationalNumber") NUMSTR, 4, 1, 0, SKIP, 0, NULL},
+	{FNAME("nsapAddress") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+};
+
+static field_t _Q2931Address[] = {	/* SEQUENCE */
+	{FNAME("address") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _Q2931Address_address},
+	{FNAME("subaddress") OCTSTR, 5, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _NetworkAccessParameters_networkAddress[] = {	/* CHOICE */
+	{FNAME("q2931Address") SEQ, 1, 2, 2, SKIP | EXT, 0, _Q2931Address},
+	{FNAME("e164Address") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("localAreaAddress") CHOICE, 1, 2, 2, DECODE | EXT,
+	 offsetof(NetworkAccessParameters_networkAddress, localAreaAddress),
+	 _H245_TransportAddress},
+};
+
+static field_t _NetworkAccessParameters[] = {	/* SEQUENCE */
+	{FNAME("distribution") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0,
+	 _NetworkAccessParameters_distribution},
+	{FNAME("networkAddress") CHOICE, 2, 3, 3, DECODE | EXT,
+	 offsetof(NetworkAccessParameters, networkAddress),
+	 _NetworkAccessParameters_networkAddress},
+	{FNAME("associateConference") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("externalReference") OCTSTR, 8, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("t120SetupProcedure") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+};
+
+static field_t _OpenLogicalChannel[] = {	/* SEQUENCE */
+	{FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("forwardLogicalChannelParameters") SEQ, 1, 3, 5, DECODE | EXT,
+	 offsetof(OpenLogicalChannel, forwardLogicalChannelParameters),
+	 _OpenLogicalChannel_forwardLogicalChannelParameters},
+	{FNAME("reverseLogicalChannelParameters") SEQ, 1, 2, 4,
+	 DECODE | EXT | OPT, offsetof(OpenLogicalChannel,
+				      reverseLogicalChannelParameters),
+	 _OpenLogicalChannel_reverseLogicalChannelParameters},
+	{FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT,
+	 offsetof(OpenLogicalChannel, separateStack),
+	 _NetworkAccessParameters},
+	{FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static field_t _Setup_UUIE_fastStart[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static field_t _Setup_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Setup_UUIE, h245Address), _TransportAddress},
+	{FNAME("sourceAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_sourceAddress},
+	{FNAME("sourceInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+	{FNAME("destinationAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_destinationAddress},
+	{FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Setup_UUIE, destCallSignalAddress), _TransportAddress},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_destExtraCallInfo},
+	{FNAME("destExtraCRV") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_destExtraCRV},
+	{FNAME("activeMC") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("conferenceGoal") CHOICE, 2, 3, 5, SKIP | EXT, 0,
+	 _Setup_UUIE_conferenceGoal},
+	{FNAME("callServices") SEQ, 0, 8, 8, SKIP | EXT | OPT, 0,
+	 _QseriesOptions},
+	{FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+	{FNAME("sourceCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Setup_UUIE, sourceCallSignalAddress), _TransportAddress},
+	{FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityCapability") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Setup_UUIE, fastStart), _Setup_UUIE_fastStart},
+	{FNAME("mediaWaitForConnect") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("canOverlapSend") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("connectionParameters") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("symmetricOperationRequired") NUL, FIXD, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("neededFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("desiredFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("parallelH245Control") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("additionalSourceAddresses") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 NULL},
+};
+
+static field_t _CallProceeding_UUIE_fastStart[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static field_t _CallProceeding_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(CallProceeding_UUIE, h245Address), _TransportAddress},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(CallProceeding_UUIE, fastStart),
+	 _CallProceeding_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static field_t _Connect_UUIE_fastStart[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static field_t _Connect_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Connect_UUIE, h245Address), _TransportAddress},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Connect_UUIE, fastStart), _Connect_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("connectedAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static field_t _Alerting_UUIE_fastStart[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static field_t _Alerting_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Alerting_UUIE, h245Address), _TransportAddress},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Alerting_UUIE, fastStart), _Alerting_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("alertingAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static field_t _Information_UUIE_fastStart[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static field_t _Information_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Information_UUIE, fastStart), _Information_UUIE_fastStart},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+};
+
+static field_t _ReleaseCompleteReason[] = {	/* CHOICE */
+	{FNAME("noBandwidth") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("gatekeeperResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("unreachableDestination") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationRejection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("invalidRevision") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("noPermission") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("unreachableGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("gatewayResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("badFormatAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("adaptiveBusy") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("inConf") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("facilityCallDeflection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("securityDenied") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("calledPartyNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callerNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("newConnectionNeeded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardReason") SEQ, 0, 2, 2, SKIP, 0, NULL},
+	{FNAME("replaceWithConferenceInvite") OCTSTR, FIXD, 16, 0, SKIP, 0,
+	 NULL},
+	{FNAME("genericDataReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("neededFeatureNotSupported") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("tunnelledSignallingRejected") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _ReleaseComplete_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("reason") CHOICE, 4, 12, 22, SKIP | EXT | OPT, 0,
+	 _ReleaseCompleteReason},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("busyAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static field_t _Facility_UUIE_alternativeAliasAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _FacilityReason[] = {	/* CHOICE */
+	{FNAME("routeCallToGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callForwarded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("routeCallToMC") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("conferenceListChoice") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("startH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("noH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("newTokens") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("featureSetUpdate") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("forwardedElements") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("transportedInformation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _Facility_UUIE_fastStart[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static field_t _Facility_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0,
+	 _TransportAddress},
+	{FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Facility_UUIE_alternativeAliasAddress},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+	{FNAME("reason") CHOICE, 2, 4, 11, DECODE | EXT,
+	 offsetof(Facility_UUIE, reason), _FacilityReason},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("conferences") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Facility_UUIE, h245Address), _TransportAddress},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Facility_UUIE, fastStart), _Facility_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+};
+
+static field_t _CallIdentifier[] = {	/* SEQUENCE */
+	{FNAME("guid") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+};
+
+static field_t _SecurityServiceMode[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+	{FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("default") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _SecurityCapabilities[] = {	/* SEQUENCE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("encryption") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _SecurityServiceMode},
+	{FNAME("authenticaton") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _SecurityServiceMode},
+	{FNAME("integrity") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _SecurityServiceMode},
+};
+
+static field_t _H245Security[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+	{FNAME("noSecurity") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("tls") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+	{FNAME("ipsec") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+};
+
+static field_t _DHset[] = {	/* SEQUENCE */
+	{FNAME("halfkey") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("modSize") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("generator") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _TypedCertificate[] = {	/* SEQUENCE */
+	{FNAME("type") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("certificate") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _H235_NonStandardParameter[] = {	/* SEQUENCE */
+	{FNAME("nonStandardIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _ClearToken[] = {	/* SEQUENCE */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("timeStamp") INT, CONS, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("password") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("dhkey") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, _DHset},
+	{FNAME("challenge") OCTSTR, 7, 8, 0, SKIP | OPT, 0, NULL},
+	{FNAME("random") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("certificate") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0,
+	 _TypedCertificate},
+	{FNAME("generalID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _H235_NonStandardParameter},
+	{FNAME("eckasdhkey") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("sendersID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _Progress_UUIE_tokens[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+};
+
+static field_t _Params[] = {	/* SEQUENCE */
+	{FNAME("ranInt") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("iv8") OCTSTR, FIXD, 8, 0, SKIP | OPT, 0, NULL},
+	{FNAME("iv16") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _CryptoH323Token_cryptoEPPwdHash_token[] = {	/* SEQUENCE */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoH323Token_cryptoEPPwdHash[] = {	/* SEQUENCE */
+	{FNAME("alias") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+	{FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoEPPwdHash_token},
+};
+
+static field_t _CryptoH323Token_cryptoGKPwdHash_token[] = {	/* SEQUENCE */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoH323Token_cryptoGKPwdHash[] = {	/* SEQUENCE */
+	{FNAME("gatekeeperId") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoGKPwdHash_token},
+};
+
+static field_t _CryptoH323Token_cryptoEPPwdEncr[] = {	/* SEQUENCE */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoH323Token_cryptoGKPwdEncr[] = {	/* SEQUENCE */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoH323Token_cryptoEPCert[] = {	/* SEQUENCE */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoH323Token_cryptoGKCert[] = {	/* SEQUENCE */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoH323Token_cryptoFastStart[] = {	/* SEQUENCE */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoToken_cryptoEncryptedToken_token[] = {	/* SEQUENCE */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoToken_cryptoEncryptedToken[] = {	/* SEQUENCE */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoEncryptedToken_token},
+};
+
+static field_t _CryptoToken_cryptoSignedToken_token[] = {	/* SEQUENCE */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoToken_cryptoSignedToken[] = {	/* SEQUENCE */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoToken_cryptoSignedToken_token},
+};
+
+static field_t _CryptoToken_cryptoHashedToken_token[] = {	/* SEQUENCE */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoToken_cryptoHashedToken[] = {	/* SEQUENCE */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("hashedVals") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoHashedToken_token},
+};
+
+static field_t _CryptoToken_cryptoPwdEncr[] = {	/* SEQUENCE */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _CryptoToken[] = {	/* CHOICE */
+	{FNAME("cryptoEncryptedToken") SEQ, 0, 2, 2, SKIP, 0,
+	 _CryptoToken_cryptoEncryptedToken},
+	{FNAME("cryptoSignedToken") SEQ, 0, 2, 2, SKIP, 0,
+	 _CryptoToken_cryptoSignedToken},
+	{FNAME("cryptoHashedToken") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoHashedToken},
+	{FNAME("cryptoPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoPwdEncr},
+};
+
+static field_t _CryptoH323Token[] = {	/* CHOICE */
+	{FNAME("cryptoEPPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoEPPwdHash},
+	{FNAME("cryptoGKPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoGKPwdHash},
+	{FNAME("cryptoEPPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoEPPwdEncr},
+	{FNAME("cryptoGKPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoGKPwdEncr},
+	{FNAME("cryptoEPCert") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoH323Token_cryptoEPCert},
+	{FNAME("cryptoGKCert") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoH323Token_cryptoGKCert},
+	{FNAME("cryptoFastStart") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoH323Token_cryptoFastStart},
+	{FNAME("nestedcryptoToken") CHOICE, 2, 4, 4, SKIP | EXT, 0,
+	 _CryptoToken},
+};
+
+static field_t _Progress_UUIE_cryptoTokens[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 3, 8, 8, SKIP | EXT, 0, _CryptoH323Token},
+};
+
+static field_t _Progress_UUIE_fastStart[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static field_t _Progress_UUIE[] = {	/* SEQUENCE */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Progress_UUIE, h245Address), _TransportAddress},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0,
+	 _CallIdentifier},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 _H245Security},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Progress_UUIE_tokens},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Progress_UUIE_cryptoTokens},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Progress_UUIE, fastStart), _Progress_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _H323_UU_PDU_h323_message_body[] = {	/* CHOICE */
+	{FNAME("setup") SEQ, 7, 13, 39, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, setup), _Setup_UUIE},
+	{FNAME("callProceeding") SEQ, 1, 3, 12, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, callProceeding),
+	 _CallProceeding_UUIE},
+	{FNAME("connect") SEQ, 1, 4, 19, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, connect), _Connect_UUIE},
+	{FNAME("alerting") SEQ, 1, 3, 17, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, alerting), _Alerting_UUIE},
+	{FNAME("information") SEQ, 0, 1, 7, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, information),
+	 _Information_UUIE},
+	{FNAME("releaseComplete") SEQ, 1, 2, 11, SKIP | EXT, 0,
+	 _ReleaseComplete_UUIE},
+	{FNAME("facility") SEQ, 3, 5, 21, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, facility), _Facility_UUIE},
+	{FNAME("progress") SEQ, 5, 8, 11, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, progress), _Progress_UUIE},
+	{FNAME("empty") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("status") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+	{FNAME("statusInquiry") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+	{FNAME("setupAcknowledge") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+	{FNAME("notify") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+};
+
+static field_t _RequestMessage[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("masterSlaveDetermination") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("terminalCapabilitySet") SEQ, 3, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("openLogicalChannel") SEQ, 1, 3, 5, DECODE | EXT,
+	 offsetof(RequestMessage, openLogicalChannel), _OpenLogicalChannel},
+	{FNAME("closeLogicalChannel") SEQ, 0, 2, 3, STOP | EXT, 0, NULL},
+	{FNAME("requestChannelClose") SEQ, 0, 1, 3, STOP | EXT, 0, NULL},
+	{FNAME("multiplexEntrySend") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("requestMultiplexEntry") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestMode") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("roundTripDelayRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("maintenanceLoopRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("communicationModeRequest") SEQ, 0, 0, 0, STOP | EXT, 0, NULL},
+	{FNAME("conferenceRequest") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+	{FNAME("multilinkRequest") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("logicalChannelRateRequest") SEQ, 0, 3, 3, STOP | EXT, 0,
+	 NULL},
+};
+
+static field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = {	/* CHOICE */
+	{FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+	 _H222LogicalChannelParameters},
+	{FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+	 offsetof
+	 (OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters,
+	  h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = {	/* SEQUENCE */
+	{FNAME("reverseLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("multiplexParameters") CHOICE, 0, 1, 2, DECODE | EXT | OPT,
+	 offsetof(OpenLogicalChannelAck_reverseLogicalChannelParameters,
+		  multiplexParameters),
+	 _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters},
+	{FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _H2250LogicalChannelAckParameters_nonStandard[] = {	/* SEQUENCE OF */
+	{FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static field_t _H2250LogicalChannelAckParameters[] = {	/* SEQUENCE */
+	{FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _H2250LogicalChannelAckParameters_nonStandard},
+	{FNAME("sessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelAckParameters, mediaChannel),
+	 _H245_TransportAddress},
+	{FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelAckParameters, mediaControlChannel),
+	 _H245_TransportAddress},
+	{FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, SKIP | OPT, 0, NULL},
+	{FNAME("flowControlToZero") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = {	/* CHOICE */
+	{FNAME("h2250LogicalChannelAckParameters") SEQ, 5, 5, 7, DECODE | EXT,
+	 offsetof(OpenLogicalChannelAck_forwardMultiplexAckParameters,
+		  h2250LogicalChannelAckParameters),
+	 _H2250LogicalChannelAckParameters},
+};
+
+static field_t _OpenLogicalChannelAck[] = {	/* SEQUENCE */
+	{FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("reverseLogicalChannelParameters") SEQ, 2, 3, 4,
+	 DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+				      reverseLogicalChannelParameters),
+	 _OpenLogicalChannelAck_reverseLogicalChannelParameters},
+	{FNAME("separateStack") SEQ, 2, 4, 5, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("forwardMultiplexAckParameters") CHOICE, 0, 1, 1,
+	 DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+				      forwardMultiplexAckParameters),
+	 _OpenLogicalChannelAck_forwardMultiplexAckParameters},
+	{FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static field_t _ResponseMessage[] = {	/* CHOICE */
+	{FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("masterSlaveDeterminationAck") SEQ, 0, 1, 1, STOP | EXT, 0,
+	 NULL},
+	{FNAME("masterSlaveDeterminationReject") SEQ, 0, 1, 1, STOP | EXT, 0,
+	 NULL},
+	{FNAME("terminalCapabilitySetAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("terminalCapabilitySetReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+	 NULL},
+	{FNAME("openLogicalChannelAck") SEQ, 1, 2, 5, DECODE | EXT,
+	 offsetof(ResponseMessage, openLogicalChannelAck),
+	 _OpenLogicalChannelAck},
+	{FNAME("openLogicalChannelReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("closeLogicalChannelAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestChannelCloseAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestChannelCloseReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+	 NULL},
+	{FNAME("multiplexEntrySendAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("multiplexEntrySendReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("requestMultiplexEntryAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestMultiplexEntryReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+	 NULL},
+	{FNAME("requestModeAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("requestModeReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("roundTripDelayResponse") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("maintenanceLoopAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("maintenanceLoopReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("communicationModeResponse") CHOICE, 0, 1, 1, STOP | EXT, 0,
+	 NULL},
+	{FNAME("conferenceResponse") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+	{FNAME("multilinkResponse") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("logicalChannelRateAcknowledge") SEQ, 0, 3, 3, STOP | EXT, 0,
+	 NULL},
+	{FNAME("logicalChannelRateReject") SEQ, 1, 4, 4, STOP | EXT, 0, NULL},
+};
+
+static field_t _MultimediaSystemControlMessage[] = {	/* CHOICE */
+	{FNAME("request") CHOICE, 4, 11, 15, DECODE | EXT,
+	 offsetof(MultimediaSystemControlMessage, request), _RequestMessage},
+	{FNAME("response") CHOICE, 5, 19, 24, DECODE | EXT,
+	 offsetof(MultimediaSystemControlMessage, response),
+	 _ResponseMessage},
+	{FNAME("command") CHOICE, 3, 7, 12, STOP | EXT, 0, NULL},
+	{FNAME("indication") CHOICE, 4, 14, 23, STOP | EXT, 0, NULL},
+};
+
+static field_t _H323_UU_PDU_h245Control[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 2, 4, 4, DECODE | OPEN | EXT,
+	 sizeof(MultimediaSystemControlMessage),
+	 _MultimediaSystemControlMessage}
+	,
+};
+
+static field_t _H323_UU_PDU[] = {	/* SEQUENCE */
+	{FNAME("h323-message-body") CHOICE, 3, 7, 13, DECODE | EXT,
+	 offsetof(H323_UU_PDU, h323_message_body),
+	 _H323_UU_PDU_h323_message_body},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("h4501SupplementaryService") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("h245Tunneling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h245Control") SEQOF, SEMI, 0, 4, DECODE | OPT,
+	 offsetof(H323_UU_PDU, h245Control), _H323_UU_PDU_h245Control},
+	{FNAME("nonStandardControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("tunnelledSignallingMessage") SEQ, 2, 4, 4, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("provisionalRespToH245Tunneling") NUL, FIXD, 0, 0, STOP | OPT,
+	 0, NULL},
+	{FNAME("stimulusControl") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _H323_UserInformation[] = {	/* SEQUENCE */
+	{FNAME("h323-uu-pdu") SEQ, 1, 2, 11, DECODE | EXT,
+	 offsetof(H323_UserInformation, h323_uu_pdu), _H323_UU_PDU},
+	{FNAME("user-data") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+};
+
+static field_t _GatekeeperRequest[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(GatekeeperRequest, rasAddress), _TransportAddress},
+	{FNAME("endpointType") SEQ, 6, 8, 10, STOP | EXT, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+	{FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("authenticationCapability") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("algorithmOIDs") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _GatekeeperConfirm[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(GatekeeperConfirm, rasAddress), _TransportAddress},
+	{FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("authenticationMode") CHOICE, 3, 7, 8, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _RegistrationRequest_callSignalAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static field_t _RegistrationRequest_rasAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static field_t _RegistrationRequest_terminalAlias[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _RegistrationRequest[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("discoveryComplete") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(RegistrationRequest, callSignalAddress),
+	 _RegistrationRequest_callSignalAddress},
+	{FNAME("rasAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(RegistrationRequest, rasAddress),
+	 _RegistrationRequest_rasAddress},
+	{FNAME("terminalType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+	{FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _RegistrationRequest_terminalAlias},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("endpointVendor") SEQ, 2, 3, 3, SKIP | EXT, 0,
+	 _VendorIdentifier},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+	 offsetof(RegistrationRequest, timeToLive), NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("keepAlive") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("additiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("usageReportingCapability") SEQ, 3, 4, 4, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("supportedH248Packages") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("callCreditCapability") SEQ, 2, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("capacityReportingCapability") SEQ, 0, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _RegistrationConfirm_callSignalAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static field_t _RegistrationConfirm_terminalAlias[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _RegistrationConfirm[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(RegistrationConfirm, callSignalAddress),
+	 _RegistrationConfirm_callSignalAddress},
+	{FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _RegistrationConfirm_terminalAlias},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+	 offsetof(RegistrationConfirm, timeToLive), NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("preGrantedARQ") SEQ, 0, 4, 8, STOP | EXT | OPT, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("supportsAdditiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureServerAlias") CHOICE, 1, 2, 7, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("capacityReportingSpec") SEQ, 0, 1, 1, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _UnregistrationRequest_callSignalAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static field_t _UnregistrationRequest[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(UnregistrationRequest, callSignalAddress),
+	 _UnregistrationRequest_callSignalAddress},
+	{FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("reason") CHOICE, 2, 4, 5, STOP | EXT | OPT, 0, NULL},
+	{FNAME("endpointAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _CallModel[] = {	/* CHOICE */
+	{FNAME("direct") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("gatekeeperRouted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static field_t _AdmissionRequest_destinationInfo[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _AdmissionRequest_destExtraCallInfo[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _AdmissionRequest_srcInfo[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _AdmissionRequest[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+	{FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _CallModel},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _AdmissionRequest_destinationInfo},
+	{FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(AdmissionRequest, destCallSignalAddress),
+	 _TransportAddress},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _AdmissionRequest_destExtraCallInfo},
+	{FNAME("srcInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _AdmissionRequest_srcInfo},
+	{FNAME("srcCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(AdmissionRequest, srcCallSignalAddress), _TransportAddress},
+	{FNAME("bandWidth") INT, CONS, 0, 0, STOP, 0, NULL},
+	{FNAME("callReferenceValue") INT, WORD, 0, 0, STOP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, STOP, 0, NULL},
+	{FNAME("activeMC") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("answerCall") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("srcAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("gatewayDataRate") SEQ, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _AdmissionConfirm[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("bandWidth") INT, CONS, 0, 0, SKIP, 0, NULL},
+	{FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT, 0, _CallModel},
+	{FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(AdmissionConfirm, destCallSignalAddress),
+	 _TransportAddress},
+	{FNAME("irrFrequency") INT, WORD, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+	{FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("uuiesRequested") SEQ, 0, 9, 13, STOP | EXT, 0, NULL},
+	{FNAME("language") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("useSpecifiedTransport") CHOICE, 1, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _LocationRequest_destinationInfo[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static field_t _LocationRequest[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _LocationRequest_destinationInfo},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("replyAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(LocationRequest, replyAddress), _TransportAddress},
+	{FNAME("sourceInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("hopCount") INT, 8, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+};
+
+static field_t _LocationConfirm[] = {	/* SEQUENCE */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("callSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(LocationConfirm, callSignalAddress), _TransportAddress},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(LocationConfirm, rasAddress), _TransportAddress},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+	{FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _InfoRequestResponse_callSignalAddress[] = {	/* SEQUENCE OF */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static field_t _InfoRequestResponse[] = {	/* SEQUENCE */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("endpointType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(InfoRequestResponse, rasAddress), _TransportAddress},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(InfoRequestResponse, callSignalAddress),
+	 _InfoRequestResponse_callSignalAddress},
+	{FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("perCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("needResponse") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("irrStatus") CHOICE, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("unsolicited") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static field_t _RasMessage[] = {	/* CHOICE */
+	{FNAME("gatekeeperRequest") SEQ, 4, 8, 18, DECODE | EXT,
+	 offsetof(RasMessage, gatekeeperRequest), _GatekeeperRequest},
+	{FNAME("gatekeeperConfirm") SEQ, 2, 5, 14, DECODE | EXT,
+	 offsetof(RasMessage, gatekeeperConfirm), _GatekeeperConfirm},
+	{FNAME("gatekeeperReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+	{FNAME("registrationRequest") SEQ, 3, 10, 31, DECODE | EXT,
+	 offsetof(RasMessage, registrationRequest), _RegistrationRequest},
+	{FNAME("registrationConfirm") SEQ, 3, 7, 24, DECODE | EXT,
+	 offsetof(RasMessage, registrationConfirm), _RegistrationConfirm},
+	{FNAME("registrationReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+	{FNAME("unregistrationRequest") SEQ, 3, 5, 15, DECODE | EXT,
+	 offsetof(RasMessage, unregistrationRequest), _UnregistrationRequest},
+	{FNAME("unregistrationConfirm") SEQ, 1, 2, 6, STOP | EXT, 0, NULL},
+	{FNAME("unregistrationReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+	{FNAME("admissionRequest") SEQ, 7, 16, 34, DECODE | EXT,
+	 offsetof(RasMessage, admissionRequest), _AdmissionRequest},
+	{FNAME("admissionConfirm") SEQ, 2, 6, 27, DECODE | EXT,
+	 offsetof(RasMessage, admissionConfirm), _AdmissionConfirm},
+	{FNAME("admissionReject") SEQ, 1, 3, 11, STOP | EXT, 0, NULL},
+	{FNAME("bandwidthRequest") SEQ, 2, 7, 18, STOP | EXT, 0, NULL},
+	{FNAME("bandwidthConfirm") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+	{FNAME("bandwidthReject") SEQ, 1, 4, 9, STOP | EXT, 0, NULL},
+	{FNAME("disengageRequest") SEQ, 1, 6, 19, STOP | EXT, 0, NULL},
+	{FNAME("disengageConfirm") SEQ, 1, 2, 9, STOP | EXT, 0, NULL},
+	{FNAME("disengageReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+	{FNAME("locationRequest") SEQ, 2, 5, 17, DECODE | EXT,
+	 offsetof(RasMessage, locationRequest), _LocationRequest},
+	{FNAME("locationConfirm") SEQ, 1, 4, 19, DECODE | EXT,
+	 offsetof(RasMessage, locationConfirm), _LocationConfirm},
+	{FNAME("locationReject") SEQ, 1, 3, 10, STOP | EXT, 0, NULL},
+	{FNAME("infoRequest") SEQ, 2, 4, 15, STOP | EXT, 0, NULL},
+	{FNAME("infoRequestResponse") SEQ, 3, 8, 16, DECODE | EXT,
+	 offsetof(RasMessage, infoRequestResponse), _InfoRequestResponse},
+	{FNAME("nonStandardMessage") SEQ, 0, 2, 7, STOP | EXT, 0, NULL},
+	{FNAME("unknownMessageResponse") SEQ, 0, 1, 5, STOP | EXT, 0, NULL},
+	{FNAME("requestInProgress") SEQ, 4, 6, 6, STOP | EXT, 0, NULL},
+	{FNAME("resourcesAvailableIndicate") SEQ, 4, 9, 11, STOP | EXT, 0,
+	 NULL},
+	{FNAME("resourcesAvailableConfirm") SEQ, 4, 6, 7, STOP | EXT, 0,
+	 NULL},
+	{FNAME("infoRequestAck") SEQ, 4, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("infoRequestNak") SEQ, 5, 7, 7, STOP | EXT, 0, NULL},
+	{FNAME("serviceControlIndication") SEQ, 8, 10, 10, STOP | EXT, 0,
+	 NULL},
+	{FNAME("serviceControlResponse") SEQ, 7, 8, 8, STOP | EXT, 0, NULL},
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.h b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.h
new file mode 100644
index 000000000000..cc98f7aa5abe
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.h
@@ -0,0 +1,938 @@
+/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ */
+
+typedef struct TransportAddress_ipAddress {	/* SEQUENCE */
+	int options;		/* No use */
+	unsigned ip;
+} TransportAddress_ipAddress;
+
+typedef struct TransportAddress {	/* CHOICE */
+	enum {
+		eTransportAddress_ipAddress,
+		eTransportAddress_ipSourceRoute,
+		eTransportAddress_ipxAddress,
+		eTransportAddress_ip6Address,
+		eTransportAddress_netBios,
+		eTransportAddress_nsap,
+		eTransportAddress_nonStandardAddress,
+	} choice;
+	union {
+		TransportAddress_ipAddress ipAddress;
+	};
+} TransportAddress;
+
+typedef struct DataProtocolCapability {	/* CHOICE */
+	enum {
+		eDataProtocolCapability_nonStandard,
+		eDataProtocolCapability_v14buffered,
+		eDataProtocolCapability_v42lapm,
+		eDataProtocolCapability_hdlcFrameTunnelling,
+		eDataProtocolCapability_h310SeparateVCStack,
+		eDataProtocolCapability_h310SingleVCStack,
+		eDataProtocolCapability_transparent,
+		eDataProtocolCapability_segmentationAndReassembly,
+		eDataProtocolCapability_hdlcFrameTunnelingwSAR,
+		eDataProtocolCapability_v120,
+		eDataProtocolCapability_separateLANStack,
+		eDataProtocolCapability_v76wCompression,
+		eDataProtocolCapability_tcp,
+		eDataProtocolCapability_udp,
+	} choice;
+} DataProtocolCapability;
+
+typedef struct DataApplicationCapability_application {	/* CHOICE */
+	enum {
+		eDataApplicationCapability_application_nonStandard,
+		eDataApplicationCapability_application_t120,
+		eDataApplicationCapability_application_dsm_cc,
+		eDataApplicationCapability_application_userData,
+		eDataApplicationCapability_application_t84,
+		eDataApplicationCapability_application_t434,
+		eDataApplicationCapability_application_h224,
+		eDataApplicationCapability_application_nlpid,
+		eDataApplicationCapability_application_dsvdControl,
+		eDataApplicationCapability_application_h222DataPartitioning,
+		eDataApplicationCapability_application_t30fax,
+		eDataApplicationCapability_application_t140,
+		eDataApplicationCapability_application_t38fax,
+		eDataApplicationCapability_application_genericDataCapability,
+	} choice;
+	union {
+		DataProtocolCapability t120;
+	};
+} DataApplicationCapability_application;
+
+typedef struct DataApplicationCapability {	/* SEQUENCE */
+	int options;		/* No use */
+	DataApplicationCapability_application application;
+} DataApplicationCapability;
+
+typedef struct DataType {	/* CHOICE */
+	enum {
+		eDataType_nonStandard,
+		eDataType_nullData,
+		eDataType_videoData,
+		eDataType_audioData,
+		eDataType_data,
+		eDataType_encryptionData,
+		eDataType_h235Control,
+		eDataType_h235Media,
+		eDataType_multiplexedStream,
+	} choice;
+	union {
+		DataApplicationCapability data;
+	};
+} DataType;
+
+typedef struct UnicastAddress_iPAddress {	/* SEQUENCE */
+	int options;		/* No use */
+	unsigned network;
+} UnicastAddress_iPAddress;
+
+typedef struct UnicastAddress {	/* CHOICE */
+	enum {
+		eUnicastAddress_iPAddress,
+		eUnicastAddress_iPXAddress,
+		eUnicastAddress_iP6Address,
+		eUnicastAddress_netBios,
+		eUnicastAddress_iPSourceRouteAddress,
+		eUnicastAddress_nsap,
+		eUnicastAddress_nonStandardAddress,
+	} choice;
+	union {
+		UnicastAddress_iPAddress iPAddress;
+	};
+} UnicastAddress;
+
+typedef struct H245_TransportAddress {	/* CHOICE */
+	enum {
+		eH245_TransportAddress_unicastAddress,
+		eH245_TransportAddress_multicastAddress,
+	} choice;
+	union {
+		UnicastAddress unicastAddress;
+	};
+} H245_TransportAddress;
+
+typedef struct H2250LogicalChannelParameters {	/* SEQUENCE */
+	enum {
+		eH2250LogicalChannelParameters_nonStandard = (1 << 31),
+		eH2250LogicalChannelParameters_associatedSessionID =
+		    (1 << 30),
+		eH2250LogicalChannelParameters_mediaChannel = (1 << 29),
+		eH2250LogicalChannelParameters_mediaGuaranteedDelivery =
+		    (1 << 28),
+		eH2250LogicalChannelParameters_mediaControlChannel =
+		    (1 << 27),
+		eH2250LogicalChannelParameters_mediaControlGuaranteedDelivery
+		    = (1 << 26),
+		eH2250LogicalChannelParameters_silenceSuppression = (1 << 25),
+		eH2250LogicalChannelParameters_destination = (1 << 24),
+		eH2250LogicalChannelParameters_dynamicRTPPayloadType =
+		    (1 << 23),
+		eH2250LogicalChannelParameters_mediaPacketization = (1 << 22),
+		eH2250LogicalChannelParameters_transportCapability =
+		    (1 << 21),
+		eH2250LogicalChannelParameters_redundancyEncoding = (1 << 20),
+		eH2250LogicalChannelParameters_source = (1 << 19),
+	} options;
+	H245_TransportAddress mediaChannel;
+	H245_TransportAddress mediaControlChannel;
+} H2250LogicalChannelParameters;
+
+typedef struct OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters {	/* CHOICE */
+	enum {
+		eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h222LogicalChannelParameters,
+		eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h223LogicalChannelParameters,
+		eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_v76LogicalChannelParameters,
+		eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters,
+		eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_none,
+	} choice;
+	union {
+		H2250LogicalChannelParameters h2250LogicalChannelParameters;
+	};
+} OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters;
+
+typedef struct OpenLogicalChannel_forwardLogicalChannelParameters {	/* SEQUENCE */
+	enum {
+		eOpenLogicalChannel_forwardLogicalChannelParameters_portNumber
+		    = (1 << 31),
+		eOpenLogicalChannel_forwardLogicalChannelParameters_forwardLogicalChannelDependency
+		    = (1 << 30),
+		eOpenLogicalChannel_forwardLogicalChannelParameters_replacementFor
+		    = (1 << 29),
+	} options;
+	DataType dataType;
+	OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters
+	    multiplexParameters;
+} OpenLogicalChannel_forwardLogicalChannelParameters;
+
+typedef struct OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters {	/* CHOICE */
+	enum {
+		eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h223LogicalChannelParameters,
+		eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_v76LogicalChannelParameters,
+		eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters,
+	} choice;
+	union {
+		H2250LogicalChannelParameters h2250LogicalChannelParameters;
+	};
+} OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters;
+
+typedef struct OpenLogicalChannel_reverseLogicalChannelParameters {	/* SEQUENCE */
+	enum {
+		eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters
+		    = (1 << 31),
+		eOpenLogicalChannel_reverseLogicalChannelParameters_reverseLogicalChannelDependency
+		    = (1 << 30),
+		eOpenLogicalChannel_reverseLogicalChannelParameters_replacementFor
+		    = (1 << 29),
+	} options;
+	OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters
+	    multiplexParameters;
+} OpenLogicalChannel_reverseLogicalChannelParameters;
+
+typedef struct NetworkAccessParameters_networkAddress {	/* CHOICE */
+	enum {
+		eNetworkAccessParameters_networkAddress_q2931Address,
+		eNetworkAccessParameters_networkAddress_e164Address,
+		eNetworkAccessParameters_networkAddress_localAreaAddress,
+	} choice;
+	union {
+		H245_TransportAddress localAreaAddress;
+	};
+} NetworkAccessParameters_networkAddress;
+
+typedef struct NetworkAccessParameters {	/* SEQUENCE */
+	enum {
+		eNetworkAccessParameters_distribution = (1 << 31),
+		eNetworkAccessParameters_externalReference = (1 << 30),
+		eNetworkAccessParameters_t120SetupProcedure = (1 << 29),
+	} options;
+	NetworkAccessParameters_networkAddress networkAddress;
+} NetworkAccessParameters;
+
+typedef struct OpenLogicalChannel {	/* SEQUENCE */
+	enum {
+		eOpenLogicalChannel_reverseLogicalChannelParameters =
+		    (1 << 31),
+		eOpenLogicalChannel_separateStack = (1 << 30),
+		eOpenLogicalChannel_encryptionSync = (1 << 29),
+	} options;
+	OpenLogicalChannel_forwardLogicalChannelParameters
+	    forwardLogicalChannelParameters;
+	OpenLogicalChannel_reverseLogicalChannelParameters
+	    reverseLogicalChannelParameters;
+	NetworkAccessParameters separateStack;
+} OpenLogicalChannel;
+
+typedef struct Setup_UUIE_fastStart {	/* SEQUENCE OF */
+	int count;
+	OpenLogicalChannel item[30];
+} Setup_UUIE_fastStart;
+
+typedef struct Setup_UUIE {	/* SEQUENCE */
+	enum {
+		eSetup_UUIE_h245Address = (1 << 31),
+		eSetup_UUIE_sourceAddress = (1 << 30),
+		eSetup_UUIE_destinationAddress = (1 << 29),
+		eSetup_UUIE_destCallSignalAddress = (1 << 28),
+		eSetup_UUIE_destExtraCallInfo = (1 << 27),
+		eSetup_UUIE_destExtraCRV = (1 << 26),
+		eSetup_UUIE_callServices = (1 << 25),
+		eSetup_UUIE_sourceCallSignalAddress = (1 << 24),
+		eSetup_UUIE_remoteExtensionAddress = (1 << 23),
+		eSetup_UUIE_callIdentifier = (1 << 22),
+		eSetup_UUIE_h245SecurityCapability = (1 << 21),
+		eSetup_UUIE_tokens = (1 << 20),
+		eSetup_UUIE_cryptoTokens = (1 << 19),
+		eSetup_UUIE_fastStart = (1 << 18),
+		eSetup_UUIE_mediaWaitForConnect = (1 << 17),
+		eSetup_UUIE_canOverlapSend = (1 << 16),
+		eSetup_UUIE_endpointIdentifier = (1 << 15),
+		eSetup_UUIE_multipleCalls = (1 << 14),
+		eSetup_UUIE_maintainConnection = (1 << 13),
+		eSetup_UUIE_connectionParameters = (1 << 12),
+		eSetup_UUIE_language = (1 << 11),
+		eSetup_UUIE_presentationIndicator = (1 << 10),
+		eSetup_UUIE_screeningIndicator = (1 << 9),
+		eSetup_UUIE_serviceControl = (1 << 8),
+		eSetup_UUIE_symmetricOperationRequired = (1 << 7),
+		eSetup_UUIE_capacity = (1 << 6),
+		eSetup_UUIE_circuitInfo = (1 << 5),
+		eSetup_UUIE_desiredProtocols = (1 << 4),
+		eSetup_UUIE_neededFeatures = (1 << 3),
+		eSetup_UUIE_desiredFeatures = (1 << 2),
+		eSetup_UUIE_supportedFeatures = (1 << 1),
+		eSetup_UUIE_parallelH245Control = (1 << 0),
+	} options;
+	TransportAddress h245Address;
+	TransportAddress destCallSignalAddress;
+	TransportAddress sourceCallSignalAddress;
+	Setup_UUIE_fastStart fastStart;
+} Setup_UUIE;
+
+typedef struct CallProceeding_UUIE_fastStart {	/* SEQUENCE OF */
+	int count;
+	OpenLogicalChannel item[30];
+} CallProceeding_UUIE_fastStart;
+
+typedef struct CallProceeding_UUIE {	/* SEQUENCE */
+	enum {
+		eCallProceeding_UUIE_h245Address = (1 << 31),
+		eCallProceeding_UUIE_callIdentifier = (1 << 30),
+		eCallProceeding_UUIE_h245SecurityMode = (1 << 29),
+		eCallProceeding_UUIE_tokens = (1 << 28),
+		eCallProceeding_UUIE_cryptoTokens = (1 << 27),
+		eCallProceeding_UUIE_fastStart = (1 << 26),
+		eCallProceeding_UUIE_multipleCalls = (1 << 25),
+		eCallProceeding_UUIE_maintainConnection = (1 << 24),
+		eCallProceeding_UUIE_fastConnectRefused = (1 << 23),
+		eCallProceeding_UUIE_featureSet = (1 << 22),
+	} options;
+	TransportAddress h245Address;
+	CallProceeding_UUIE_fastStart fastStart;
+} CallProceeding_UUIE;
+
+typedef struct Connect_UUIE_fastStart {	/* SEQUENCE OF */
+	int count;
+	OpenLogicalChannel item[30];
+} Connect_UUIE_fastStart;
+
+typedef struct Connect_UUIE {	/* SEQUENCE */
+	enum {
+		eConnect_UUIE_h245Address = (1 << 31),
+		eConnect_UUIE_callIdentifier = (1 << 30),
+		eConnect_UUIE_h245SecurityMode = (1 << 29),
+		eConnect_UUIE_tokens = (1 << 28),
+		eConnect_UUIE_cryptoTokens = (1 << 27),
+		eConnect_UUIE_fastStart = (1 << 26),
+		eConnect_UUIE_multipleCalls = (1 << 25),
+		eConnect_UUIE_maintainConnection = (1 << 24),
+		eConnect_UUIE_language = (1 << 23),
+		eConnect_UUIE_connectedAddress = (1 << 22),
+		eConnect_UUIE_presentationIndicator = (1 << 21),
+		eConnect_UUIE_screeningIndicator = (1 << 20),
+		eConnect_UUIE_fastConnectRefused = (1 << 19),
+		eConnect_UUIE_serviceControl = (1 << 18),
+		eConnect_UUIE_capacity = (1 << 17),
+		eConnect_UUIE_featureSet = (1 << 16),
+	} options;
+	TransportAddress h245Address;
+	Connect_UUIE_fastStart fastStart;
+} Connect_UUIE;
+
+typedef struct Alerting_UUIE_fastStart {	/* SEQUENCE OF */
+	int count;
+	OpenLogicalChannel item[30];
+} Alerting_UUIE_fastStart;
+
+typedef struct Alerting_UUIE {	/* SEQUENCE */
+	enum {
+		eAlerting_UUIE_h245Address = (1 << 31),
+		eAlerting_UUIE_callIdentifier = (1 << 30),
+		eAlerting_UUIE_h245SecurityMode = (1 << 29),
+		eAlerting_UUIE_tokens = (1 << 28),
+		eAlerting_UUIE_cryptoTokens = (1 << 27),
+		eAlerting_UUIE_fastStart = (1 << 26),
+		eAlerting_UUIE_multipleCalls = (1 << 25),
+		eAlerting_UUIE_maintainConnection = (1 << 24),
+		eAlerting_UUIE_alertingAddress = (1 << 23),
+		eAlerting_UUIE_presentationIndicator = (1 << 22),
+		eAlerting_UUIE_screeningIndicator = (1 << 21),
+		eAlerting_UUIE_fastConnectRefused = (1 << 20),
+		eAlerting_UUIE_serviceControl = (1 << 19),
+		eAlerting_UUIE_capacity = (1 << 18),
+		eAlerting_UUIE_featureSet = (1 << 17),
+	} options;
+	TransportAddress h245Address;
+	Alerting_UUIE_fastStart fastStart;
+} Alerting_UUIE;
+
+typedef struct Information_UUIE_fastStart {	/* SEQUENCE OF */
+	int count;
+	OpenLogicalChannel item[30];
+} Information_UUIE_fastStart;
+
+typedef struct Information_UUIE {	/* SEQUENCE */
+	enum {
+		eInformation_UUIE_callIdentifier = (1 << 31),
+		eInformation_UUIE_tokens = (1 << 30),
+		eInformation_UUIE_cryptoTokens = (1 << 29),
+		eInformation_UUIE_fastStart = (1 << 28),
+		eInformation_UUIE_fastConnectRefused = (1 << 27),
+		eInformation_UUIE_circuitInfo = (1 << 26),
+	} options;
+	Information_UUIE_fastStart fastStart;
+} Information_UUIE;
+
+typedef struct FacilityReason {	/* CHOICE */
+	enum {
+		eFacilityReason_routeCallToGatekeeper,
+		eFacilityReason_callForwarded,
+		eFacilityReason_routeCallToMC,
+		eFacilityReason_undefinedReason,
+		eFacilityReason_conferenceListChoice,
+		eFacilityReason_startH245,
+		eFacilityReason_noH245,
+		eFacilityReason_newTokens,
+		eFacilityReason_featureSetUpdate,
+		eFacilityReason_forwardedElements,
+		eFacilityReason_transportedInformation,
+	} choice;
+} FacilityReason;
+
+typedef struct Facility_UUIE_fastStart {	/* SEQUENCE OF */
+	int count;
+	OpenLogicalChannel item[30];
+} Facility_UUIE_fastStart;
+
+typedef struct Facility_UUIE {	/* SEQUENCE */
+	enum {
+		eFacility_UUIE_alternativeAddress = (1 << 31),
+		eFacility_UUIE_alternativeAliasAddress = (1 << 30),
+		eFacility_UUIE_conferenceID = (1 << 29),
+		eFacility_UUIE_callIdentifier = (1 << 28),
+		eFacility_UUIE_destExtraCallInfo = (1 << 27),
+		eFacility_UUIE_remoteExtensionAddress = (1 << 26),
+		eFacility_UUIE_tokens = (1 << 25),
+		eFacility_UUIE_cryptoTokens = (1 << 24),
+		eFacility_UUIE_conferences = (1 << 23),
+		eFacility_UUIE_h245Address = (1 << 22),
+		eFacility_UUIE_fastStart = (1 << 21),
+		eFacility_UUIE_multipleCalls = (1 << 20),
+		eFacility_UUIE_maintainConnection = (1 << 19),
+		eFacility_UUIE_fastConnectRefused = (1 << 18),
+		eFacility_UUIE_serviceControl = (1 << 17),
+		eFacility_UUIE_circuitInfo = (1 << 16),
+		eFacility_UUIE_featureSet = (1 << 15),
+		eFacility_UUIE_destinationInfo = (1 << 14),
+		eFacility_UUIE_h245SecurityMode = (1 << 13),
+	} options;
+	FacilityReason reason;
+	TransportAddress h245Address;
+	Facility_UUIE_fastStart fastStart;
+} Facility_UUIE;
+
+typedef struct Progress_UUIE_fastStart {	/* SEQUENCE OF */
+	int count;
+	OpenLogicalChannel item[30];
+} Progress_UUIE_fastStart;
+
+typedef struct Progress_UUIE {	/* SEQUENCE */
+	enum {
+		eProgress_UUIE_h245Address = (1 << 31),
+		eProgress_UUIE_h245SecurityMode = (1 << 30),
+		eProgress_UUIE_tokens = (1 << 29),
+		eProgress_UUIE_cryptoTokens = (1 << 28),
+		eProgress_UUIE_fastStart = (1 << 27),
+		eProgress_UUIE_multipleCalls = (1 << 26),
+		eProgress_UUIE_maintainConnection = (1 << 25),
+		eProgress_UUIE_fastConnectRefused = (1 << 24),
+	} options;
+	TransportAddress h245Address;
+	Progress_UUIE_fastStart fastStart;
+} Progress_UUIE;
+
+typedef struct H323_UU_PDU_h323_message_body {	/* CHOICE */
+	enum {
+		eH323_UU_PDU_h323_message_body_setup,
+		eH323_UU_PDU_h323_message_body_callProceeding,
+		eH323_UU_PDU_h323_message_body_connect,
+		eH323_UU_PDU_h323_message_body_alerting,
+		eH323_UU_PDU_h323_message_body_information,
+		eH323_UU_PDU_h323_message_body_releaseComplete,
+		eH323_UU_PDU_h323_message_body_facility,
+		eH323_UU_PDU_h323_message_body_progress,
+		eH323_UU_PDU_h323_message_body_empty,
+		eH323_UU_PDU_h323_message_body_status,
+		eH323_UU_PDU_h323_message_body_statusInquiry,
+		eH323_UU_PDU_h323_message_body_setupAcknowledge,
+		eH323_UU_PDU_h323_message_body_notify,
+	} choice;
+	union {
+		Setup_UUIE setup;
+		CallProceeding_UUIE callProceeding;
+		Connect_UUIE connect;
+		Alerting_UUIE alerting;
+		Information_UUIE information;
+		Facility_UUIE facility;
+		Progress_UUIE progress;
+	};
+} H323_UU_PDU_h323_message_body;
+
+typedef struct RequestMessage {	/* CHOICE */
+	enum {
+		eRequestMessage_nonStandard,
+		eRequestMessage_masterSlaveDetermination,
+		eRequestMessage_terminalCapabilitySet,
+		eRequestMessage_openLogicalChannel,
+		eRequestMessage_closeLogicalChannel,
+		eRequestMessage_requestChannelClose,
+		eRequestMessage_multiplexEntrySend,
+		eRequestMessage_requestMultiplexEntry,
+		eRequestMessage_requestMode,
+		eRequestMessage_roundTripDelayRequest,
+		eRequestMessage_maintenanceLoopRequest,
+		eRequestMessage_communicationModeRequest,
+		eRequestMessage_conferenceRequest,
+		eRequestMessage_multilinkRequest,
+		eRequestMessage_logicalChannelRateRequest,
+	} choice;
+	union {
+		OpenLogicalChannel openLogicalChannel;
+	};
+} RequestMessage;
+
+typedef struct OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters {	/* CHOICE */
+	enum {
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h222LogicalChannelParameters,
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters,
+	} choice;
+	union {
+		H2250LogicalChannelParameters h2250LogicalChannelParameters;
+	};
+} OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters;
+
+typedef struct OpenLogicalChannelAck_reverseLogicalChannelParameters {	/* SEQUENCE */
+	enum {
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters_portNumber
+		    = (1 << 31),
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters
+		    = (1 << 30),
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters_replacementFor
+		    = (1 << 29),
+	} options;
+	OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters
+	    multiplexParameters;
+} OpenLogicalChannelAck_reverseLogicalChannelParameters;
+
+typedef struct H2250LogicalChannelAckParameters {	/* SEQUENCE */
+	enum {
+		eH2250LogicalChannelAckParameters_nonStandard = (1 << 31),
+		eH2250LogicalChannelAckParameters_sessionID = (1 << 30),
+		eH2250LogicalChannelAckParameters_mediaChannel = (1 << 29),
+		eH2250LogicalChannelAckParameters_mediaControlChannel =
+		    (1 << 28),
+		eH2250LogicalChannelAckParameters_dynamicRTPPayloadType =
+		    (1 << 27),
+		eH2250LogicalChannelAckParameters_flowControlToZero =
+		    (1 << 26),
+		eH2250LogicalChannelAckParameters_portNumber = (1 << 25),
+	} options;
+	H245_TransportAddress mediaChannel;
+	H245_TransportAddress mediaControlChannel;
+} H2250LogicalChannelAckParameters;
+
+typedef struct OpenLogicalChannelAck_forwardMultiplexAckParameters {	/* CHOICE */
+	enum {
+		eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters,
+	} choice;
+	union {
+		H2250LogicalChannelAckParameters
+		    h2250LogicalChannelAckParameters;
+	};
+} OpenLogicalChannelAck_forwardMultiplexAckParameters;
+
+typedef struct OpenLogicalChannelAck {	/* SEQUENCE */
+	enum {
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters =
+		    (1 << 31),
+		eOpenLogicalChannelAck_separateStack = (1 << 30),
+		eOpenLogicalChannelAck_forwardMultiplexAckParameters =
+		    (1 << 29),
+		eOpenLogicalChannelAck_encryptionSync = (1 << 28),
+	} options;
+	OpenLogicalChannelAck_reverseLogicalChannelParameters
+	    reverseLogicalChannelParameters;
+	OpenLogicalChannelAck_forwardMultiplexAckParameters
+	    forwardMultiplexAckParameters;
+} OpenLogicalChannelAck;
+
+typedef struct ResponseMessage {	/* CHOICE */
+	enum {
+		eResponseMessage_nonStandard,
+		eResponseMessage_masterSlaveDeterminationAck,
+		eResponseMessage_masterSlaveDeterminationReject,
+		eResponseMessage_terminalCapabilitySetAck,
+		eResponseMessage_terminalCapabilitySetReject,
+		eResponseMessage_openLogicalChannelAck,
+		eResponseMessage_openLogicalChannelReject,
+		eResponseMessage_closeLogicalChannelAck,
+		eResponseMessage_requestChannelCloseAck,
+		eResponseMessage_requestChannelCloseReject,
+		eResponseMessage_multiplexEntrySendAck,
+		eResponseMessage_multiplexEntrySendReject,
+		eResponseMessage_requestMultiplexEntryAck,
+		eResponseMessage_requestMultiplexEntryReject,
+		eResponseMessage_requestModeAck,
+		eResponseMessage_requestModeReject,
+		eResponseMessage_roundTripDelayResponse,
+		eResponseMessage_maintenanceLoopAck,
+		eResponseMessage_maintenanceLoopReject,
+		eResponseMessage_communicationModeResponse,
+		eResponseMessage_conferenceResponse,
+		eResponseMessage_multilinkResponse,
+		eResponseMessage_logicalChannelRateAcknowledge,
+		eResponseMessage_logicalChannelRateReject,
+	} choice;
+	union {
+		OpenLogicalChannelAck openLogicalChannelAck;
+	};
+} ResponseMessage;
+
+typedef struct MultimediaSystemControlMessage {	/* CHOICE */
+	enum {
+		eMultimediaSystemControlMessage_request,
+		eMultimediaSystemControlMessage_response,
+		eMultimediaSystemControlMessage_command,
+		eMultimediaSystemControlMessage_indication,
+	} choice;
+	union {
+		RequestMessage request;
+		ResponseMessage response;
+	};
+} MultimediaSystemControlMessage;
+
+typedef struct H323_UU_PDU_h245Control {	/* SEQUENCE OF */
+	int count;
+	MultimediaSystemControlMessage item[4];
+} H323_UU_PDU_h245Control;
+
+typedef struct H323_UU_PDU {	/* SEQUENCE */
+	enum {
+		eH323_UU_PDU_nonStandardData = (1 << 31),
+		eH323_UU_PDU_h4501SupplementaryService = (1 << 30),
+		eH323_UU_PDU_h245Tunneling = (1 << 29),
+		eH323_UU_PDU_h245Control = (1 << 28),
+		eH323_UU_PDU_nonStandardControl = (1 << 27),
+		eH323_UU_PDU_callLinkage = (1 << 26),
+		eH323_UU_PDU_tunnelledSignallingMessage = (1 << 25),
+		eH323_UU_PDU_provisionalRespToH245Tunneling = (1 << 24),
+		eH323_UU_PDU_stimulusControl = (1 << 23),
+		eH323_UU_PDU_genericData = (1 << 22),
+	} options;
+	H323_UU_PDU_h323_message_body h323_message_body;
+	H323_UU_PDU_h245Control h245Control;
+} H323_UU_PDU;
+
+typedef struct H323_UserInformation {	/* SEQUENCE */
+	enum {
+		eH323_UserInformation_user_data = (1 << 31),
+	} options;
+	H323_UU_PDU h323_uu_pdu;
+} H323_UserInformation;
+
+typedef struct GatekeeperRequest {	/* SEQUENCE */
+	enum {
+		eGatekeeperRequest_nonStandardData = (1 << 31),
+		eGatekeeperRequest_gatekeeperIdentifier = (1 << 30),
+		eGatekeeperRequest_callServices = (1 << 29),
+		eGatekeeperRequest_endpointAlias = (1 << 28),
+		eGatekeeperRequest_alternateEndpoints = (1 << 27),
+		eGatekeeperRequest_tokens = (1 << 26),
+		eGatekeeperRequest_cryptoTokens = (1 << 25),
+		eGatekeeperRequest_authenticationCapability = (1 << 24),
+		eGatekeeperRequest_algorithmOIDs = (1 << 23),
+		eGatekeeperRequest_integrity = (1 << 22),
+		eGatekeeperRequest_integrityCheckValue = (1 << 21),
+		eGatekeeperRequest_supportsAltGK = (1 << 20),
+		eGatekeeperRequest_featureSet = (1 << 19),
+		eGatekeeperRequest_genericData = (1 << 18),
+	} options;
+	TransportAddress rasAddress;
+} GatekeeperRequest;
+
+typedef struct GatekeeperConfirm {	/* SEQUENCE */
+	enum {
+		eGatekeeperConfirm_nonStandardData = (1 << 31),
+		eGatekeeperConfirm_gatekeeperIdentifier = (1 << 30),
+		eGatekeeperConfirm_alternateGatekeeper = (1 << 29),
+		eGatekeeperConfirm_authenticationMode = (1 << 28),
+		eGatekeeperConfirm_tokens = (1 << 27),
+		eGatekeeperConfirm_cryptoTokens = (1 << 26),
+		eGatekeeperConfirm_algorithmOID = (1 << 25),
+		eGatekeeperConfirm_integrity = (1 << 24),
+		eGatekeeperConfirm_integrityCheckValue = (1 << 23),
+		eGatekeeperConfirm_featureSet = (1 << 22),
+		eGatekeeperConfirm_genericData = (1 << 21),
+	} options;
+	TransportAddress rasAddress;
+} GatekeeperConfirm;
+
+typedef struct RegistrationRequest_callSignalAddress {	/* SEQUENCE OF */
+	int count;
+	TransportAddress item[10];
+} RegistrationRequest_callSignalAddress;
+
+typedef struct RegistrationRequest_rasAddress {	/* SEQUENCE OF */
+	int count;
+	TransportAddress item[10];
+} RegistrationRequest_rasAddress;
+
+typedef struct RegistrationRequest {	/* SEQUENCE */
+	enum {
+		eRegistrationRequest_nonStandardData = (1 << 31),
+		eRegistrationRequest_terminalAlias = (1 << 30),
+		eRegistrationRequest_gatekeeperIdentifier = (1 << 29),
+		eRegistrationRequest_alternateEndpoints = (1 << 28),
+		eRegistrationRequest_timeToLive = (1 << 27),
+		eRegistrationRequest_tokens = (1 << 26),
+		eRegistrationRequest_cryptoTokens = (1 << 25),
+		eRegistrationRequest_integrityCheckValue = (1 << 24),
+		eRegistrationRequest_keepAlive = (1 << 23),
+		eRegistrationRequest_endpointIdentifier = (1 << 22),
+		eRegistrationRequest_willSupplyUUIEs = (1 << 21),
+		eRegistrationRequest_maintainConnection = (1 << 20),
+		eRegistrationRequest_alternateTransportAddresses = (1 << 19),
+		eRegistrationRequest_additiveRegistration = (1 << 18),
+		eRegistrationRequest_terminalAliasPattern = (1 << 17),
+		eRegistrationRequest_supportsAltGK = (1 << 16),
+		eRegistrationRequest_usageReportingCapability = (1 << 15),
+		eRegistrationRequest_multipleCalls = (1 << 14),
+		eRegistrationRequest_supportedH248Packages = (1 << 13),
+		eRegistrationRequest_callCreditCapability = (1 << 12),
+		eRegistrationRequest_capacityReportingCapability = (1 << 11),
+		eRegistrationRequest_capacity = (1 << 10),
+		eRegistrationRequest_featureSet = (1 << 9),
+		eRegistrationRequest_genericData = (1 << 8),
+	} options;
+	RegistrationRequest_callSignalAddress callSignalAddress;
+	RegistrationRequest_rasAddress rasAddress;
+	unsigned timeToLive;
+} RegistrationRequest;
+
+typedef struct RegistrationConfirm_callSignalAddress {	/* SEQUENCE OF */
+	int count;
+	TransportAddress item[10];
+} RegistrationConfirm_callSignalAddress;
+
+typedef struct RegistrationConfirm {	/* SEQUENCE */
+	enum {
+		eRegistrationConfirm_nonStandardData = (1 << 31),
+		eRegistrationConfirm_terminalAlias = (1 << 30),
+		eRegistrationConfirm_gatekeeperIdentifier = (1 << 29),
+		eRegistrationConfirm_alternateGatekeeper = (1 << 28),
+		eRegistrationConfirm_timeToLive = (1 << 27),
+		eRegistrationConfirm_tokens = (1 << 26),
+		eRegistrationConfirm_cryptoTokens = (1 << 25),
+		eRegistrationConfirm_integrityCheckValue = (1 << 24),
+		eRegistrationConfirm_willRespondToIRR = (1 << 23),
+		eRegistrationConfirm_preGrantedARQ = (1 << 22),
+		eRegistrationConfirm_maintainConnection = (1 << 21),
+		eRegistrationConfirm_serviceControl = (1 << 20),
+		eRegistrationConfirm_supportsAdditiveRegistration = (1 << 19),
+		eRegistrationConfirm_terminalAliasPattern = (1 << 18),
+		eRegistrationConfirm_supportedPrefixes = (1 << 17),
+		eRegistrationConfirm_usageSpec = (1 << 16),
+		eRegistrationConfirm_featureServerAlias = (1 << 15),
+		eRegistrationConfirm_capacityReportingSpec = (1 << 14),
+		eRegistrationConfirm_featureSet = (1 << 13),
+		eRegistrationConfirm_genericData = (1 << 12),
+	} options;
+	RegistrationConfirm_callSignalAddress callSignalAddress;
+	unsigned timeToLive;
+} RegistrationConfirm;
+
+typedef struct UnregistrationRequest_callSignalAddress {	/* SEQUENCE OF */
+	int count;
+	TransportAddress item[10];
+} UnregistrationRequest_callSignalAddress;
+
+typedef struct UnregistrationRequest {	/* SEQUENCE */
+	enum {
+		eUnregistrationRequest_endpointAlias = (1 << 31),
+		eUnregistrationRequest_nonStandardData = (1 << 30),
+		eUnregistrationRequest_endpointIdentifier = (1 << 29),
+		eUnregistrationRequest_alternateEndpoints = (1 << 28),
+		eUnregistrationRequest_gatekeeperIdentifier = (1 << 27),
+		eUnregistrationRequest_tokens = (1 << 26),
+		eUnregistrationRequest_cryptoTokens = (1 << 25),
+		eUnregistrationRequest_integrityCheckValue = (1 << 24),
+		eUnregistrationRequest_reason = (1 << 23),
+		eUnregistrationRequest_endpointAliasPattern = (1 << 22),
+		eUnregistrationRequest_supportedPrefixes = (1 << 21),
+		eUnregistrationRequest_alternateGatekeeper = (1 << 20),
+		eUnregistrationRequest_genericData = (1 << 19),
+	} options;
+	UnregistrationRequest_callSignalAddress callSignalAddress;
+} UnregistrationRequest;
+
+typedef struct AdmissionRequest {	/* SEQUENCE */
+	enum {
+		eAdmissionRequest_callModel = (1 << 31),
+		eAdmissionRequest_destinationInfo = (1 << 30),
+		eAdmissionRequest_destCallSignalAddress = (1 << 29),
+		eAdmissionRequest_destExtraCallInfo = (1 << 28),
+		eAdmissionRequest_srcCallSignalAddress = (1 << 27),
+		eAdmissionRequest_nonStandardData = (1 << 26),
+		eAdmissionRequest_callServices = (1 << 25),
+		eAdmissionRequest_canMapAlias = (1 << 24),
+		eAdmissionRequest_callIdentifier = (1 << 23),
+		eAdmissionRequest_srcAlternatives = (1 << 22),
+		eAdmissionRequest_destAlternatives = (1 << 21),
+		eAdmissionRequest_gatekeeperIdentifier = (1 << 20),
+		eAdmissionRequest_tokens = (1 << 19),
+		eAdmissionRequest_cryptoTokens = (1 << 18),
+		eAdmissionRequest_integrityCheckValue = (1 << 17),
+		eAdmissionRequest_transportQOS = (1 << 16),
+		eAdmissionRequest_willSupplyUUIEs = (1 << 15),
+		eAdmissionRequest_callLinkage = (1 << 14),
+		eAdmissionRequest_gatewayDataRate = (1 << 13),
+		eAdmissionRequest_capacity = (1 << 12),
+		eAdmissionRequest_circuitInfo = (1 << 11),
+		eAdmissionRequest_desiredProtocols = (1 << 10),
+		eAdmissionRequest_desiredTunnelledProtocol = (1 << 9),
+		eAdmissionRequest_featureSet = (1 << 8),
+		eAdmissionRequest_genericData = (1 << 7),
+	} options;
+	TransportAddress destCallSignalAddress;
+	TransportAddress srcCallSignalAddress;
+} AdmissionRequest;
+
+typedef struct AdmissionConfirm {	/* SEQUENCE */
+	enum {
+		eAdmissionConfirm_irrFrequency = (1 << 31),
+		eAdmissionConfirm_nonStandardData = (1 << 30),
+		eAdmissionConfirm_destinationInfo = (1 << 29),
+		eAdmissionConfirm_destExtraCallInfo = (1 << 28),
+		eAdmissionConfirm_destinationType = (1 << 27),
+		eAdmissionConfirm_remoteExtensionAddress = (1 << 26),
+		eAdmissionConfirm_alternateEndpoints = (1 << 25),
+		eAdmissionConfirm_tokens = (1 << 24),
+		eAdmissionConfirm_cryptoTokens = (1 << 23),
+		eAdmissionConfirm_integrityCheckValue = (1 << 22),
+		eAdmissionConfirm_transportQOS = (1 << 21),
+		eAdmissionConfirm_willRespondToIRR = (1 << 20),
+		eAdmissionConfirm_uuiesRequested = (1 << 19),
+		eAdmissionConfirm_language = (1 << 18),
+		eAdmissionConfirm_alternateTransportAddresses = (1 << 17),
+		eAdmissionConfirm_useSpecifiedTransport = (1 << 16),
+		eAdmissionConfirm_circuitInfo = (1 << 15),
+		eAdmissionConfirm_usageSpec = (1 << 14),
+		eAdmissionConfirm_supportedProtocols = (1 << 13),
+		eAdmissionConfirm_serviceControl = (1 << 12),
+		eAdmissionConfirm_multipleCalls = (1 << 11),
+		eAdmissionConfirm_featureSet = (1 << 10),
+		eAdmissionConfirm_genericData = (1 << 9),
+	} options;
+	TransportAddress destCallSignalAddress;
+} AdmissionConfirm;
+
+typedef struct LocationRequest {	/* SEQUENCE */
+	enum {
+		eLocationRequest_endpointIdentifier = (1 << 31),
+		eLocationRequest_nonStandardData = (1 << 30),
+		eLocationRequest_sourceInfo = (1 << 29),
+		eLocationRequest_canMapAlias = (1 << 28),
+		eLocationRequest_gatekeeperIdentifier = (1 << 27),
+		eLocationRequest_tokens = (1 << 26),
+		eLocationRequest_cryptoTokens = (1 << 25),
+		eLocationRequest_integrityCheckValue = (1 << 24),
+		eLocationRequest_desiredProtocols = (1 << 23),
+		eLocationRequest_desiredTunnelledProtocol = (1 << 22),
+		eLocationRequest_featureSet = (1 << 21),
+		eLocationRequest_genericData = (1 << 20),
+		eLocationRequest_hopCount = (1 << 19),
+		eLocationRequest_circuitInfo = (1 << 18),
+	} options;
+	TransportAddress replyAddress;
+} LocationRequest;
+
+typedef struct LocationConfirm {	/* SEQUENCE */
+	enum {
+		eLocationConfirm_nonStandardData = (1 << 31),
+		eLocationConfirm_destinationInfo = (1 << 30),
+		eLocationConfirm_destExtraCallInfo = (1 << 29),
+		eLocationConfirm_destinationType = (1 << 28),
+		eLocationConfirm_remoteExtensionAddress = (1 << 27),
+		eLocationConfirm_alternateEndpoints = (1 << 26),
+		eLocationConfirm_tokens = (1 << 25),
+		eLocationConfirm_cryptoTokens = (1 << 24),
+		eLocationConfirm_integrityCheckValue = (1 << 23),
+		eLocationConfirm_alternateTransportAddresses = (1 << 22),
+		eLocationConfirm_supportedProtocols = (1 << 21),
+		eLocationConfirm_multipleCalls = (1 << 20),
+		eLocationConfirm_featureSet = (1 << 19),
+		eLocationConfirm_genericData = (1 << 18),
+		eLocationConfirm_circuitInfo = (1 << 17),
+		eLocationConfirm_serviceControl = (1 << 16),
+	} options;
+	TransportAddress callSignalAddress;
+	TransportAddress rasAddress;
+} LocationConfirm;
+
+typedef struct InfoRequestResponse_callSignalAddress {	/* SEQUENCE OF */
+	int count;
+	TransportAddress item[10];
+} InfoRequestResponse_callSignalAddress;
+
+typedef struct InfoRequestResponse {	/* SEQUENCE */
+	enum {
+		eInfoRequestResponse_nonStandardData = (1 << 31),
+		eInfoRequestResponse_endpointAlias = (1 << 30),
+		eInfoRequestResponse_perCallInfo = (1 << 29),
+		eInfoRequestResponse_tokens = (1 << 28),
+		eInfoRequestResponse_cryptoTokens = (1 << 27),
+		eInfoRequestResponse_integrityCheckValue = (1 << 26),
+		eInfoRequestResponse_needResponse = (1 << 25),
+		eInfoRequestResponse_capacity = (1 << 24),
+		eInfoRequestResponse_irrStatus = (1 << 23),
+		eInfoRequestResponse_unsolicited = (1 << 22),
+		eInfoRequestResponse_genericData = (1 << 21),
+	} options;
+	TransportAddress rasAddress;
+	InfoRequestResponse_callSignalAddress callSignalAddress;
+} InfoRequestResponse;
+
+typedef struct RasMessage {	/* CHOICE */
+	enum {
+		eRasMessage_gatekeeperRequest,
+		eRasMessage_gatekeeperConfirm,
+		eRasMessage_gatekeeperReject,
+		eRasMessage_registrationRequest,
+		eRasMessage_registrationConfirm,
+		eRasMessage_registrationReject,
+		eRasMessage_unregistrationRequest,
+		eRasMessage_unregistrationConfirm,
+		eRasMessage_unregistrationReject,
+		eRasMessage_admissionRequest,
+		eRasMessage_admissionConfirm,
+		eRasMessage_admissionReject,
+		eRasMessage_bandwidthRequest,
+		eRasMessage_bandwidthConfirm,
+		eRasMessage_bandwidthReject,
+		eRasMessage_disengageRequest,
+		eRasMessage_disengageConfirm,
+		eRasMessage_disengageReject,
+		eRasMessage_locationRequest,
+		eRasMessage_locationConfirm,
+		eRasMessage_locationReject,
+		eRasMessage_infoRequest,
+		eRasMessage_infoRequestResponse,
+		eRasMessage_nonStandardMessage,
+		eRasMessage_unknownMessageResponse,
+		eRasMessage_requestInProgress,
+		eRasMessage_resourcesAvailableIndicate,
+		eRasMessage_resourcesAvailableConfirm,
+		eRasMessage_infoRequestAck,
+		eRasMessage_infoRequestNak,
+		eRasMessage_serviceControlIndication,
+		eRasMessage_serviceControlResponse,
+	} choice;
+	union {
+		GatekeeperRequest gatekeeperRequest;
+		GatekeeperConfirm gatekeeperConfirm;
+		RegistrationRequest registrationRequest;
+		RegistrationConfirm registrationConfirm;
+		UnregistrationRequest unregistrationRequest;
+		AdmissionRequest admissionRequest;
+		AdmissionConfirm admissionConfirm;
+		LocationRequest locationRequest;
+		LocationConfirm locationConfirm;
+		InfoRequestResponse infoRequestResponse;
+	};
+} RasMessage;
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c
new file mode 100644
index 000000000000..a0bc883928c0
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper_h323.c
@@ -0,0 +1,605 @@
+/*
+ * H.323 extension for NAT alteration.
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 NAT module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * Changes:
+ *	2006-02-01 - initial version 0.1
+ *
+ *	2006-02-20 - version 0.2
+ *	  1. Changed source format to follow kernel conventions
+ *	  2. Deleted some unnecessary structures
+ *	  3. Minor fixes
+ *
+ * 	2006-03-10 - version 0.3
+ * 	  1. Added support for multiple TPKTs in one packet (suggested by
+ * 	     Patrick McHardy)
+ * 	  2. Added support for non-linear skb (based on Patrick McHardy's patch)
+ * 	  3. Eliminated unnecessary return code
+ *
+ * 	2006-03-15 - version 0.4
+ * 	  1. Added support for T.120 channels
+ * 	  2. Added parameter gkrouted_only (suggested by Patrick McHardy)
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/moduleparam.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
+#include <linux/netfilter_ipv4/ip_conntrack_h323.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+#include "ip_conntrack_helper_h323_asn1.h"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+extern int get_h245_addr(unsigned char *data, H245_TransportAddress * addr,
+			 u_int32_t * ip, u_int16_t * port);
+extern int get_h225_addr(unsigned char *data, TransportAddress * addr,
+			 u_int32_t * ip, u_int16_t * port);
+extern void ip_conntrack_h245_expect(struct ip_conntrack *new,
+				     struct ip_conntrack_expect *this);
+extern void ip_conntrack_q931_expect(struct ip_conntrack *new,
+				     struct ip_conntrack_expect *this);
+extern int (*set_h245_addr_hook) (struct sk_buff ** pskb,
+				  unsigned char **data, int dataoff,
+				  H245_TransportAddress * addr,
+				  u_int32_t ip, u_int16_t port);
+extern int (*set_h225_addr_hook) (struct sk_buff ** pskb,
+				  unsigned char **data, int dataoff,
+				  TransportAddress * addr,
+				  u_int32_t ip, u_int16_t port);
+extern int (*set_sig_addr_hook) (struct sk_buff ** pskb,
+				 struct ip_conntrack * ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned char **data,
+				 TransportAddress * addr, int count);
+extern int (*set_ras_addr_hook) (struct sk_buff ** pskb,
+				 struct ip_conntrack * ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned char **data,
+				 TransportAddress * addr, int count);
+extern int (*nat_rtp_rtcp_hook) (struct sk_buff ** pskb,
+				 struct ip_conntrack * ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned char **data, int dataoff,
+				 H245_TransportAddress * addr,
+				 u_int16_t port, u_int16_t rtp_port,
+				 struct ip_conntrack_expect * rtp_exp,
+				 struct ip_conntrack_expect * rtcp_exp);
+extern int (*nat_t120_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct,
+			     enum ip_conntrack_info ctinfo,
+			     unsigned char **data, int dataoff,
+			     H245_TransportAddress * addr, u_int16_t port,
+			     struct ip_conntrack_expect * exp);
+extern int (*nat_h245_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct,
+			     enum ip_conntrack_info ctinfo,
+			     unsigned char **data, int dataoff,
+			     TransportAddress * addr, u_int16_t port,
+			     struct ip_conntrack_expect * exp);
+extern int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct,
+			     enum ip_conntrack_info ctinfo,
+			     unsigned char **data, TransportAddress * addr,
+			     int idx, u_int16_t port,
+			     struct ip_conntrack_expect * exp);
+
+
+/****************************************************************************/
+static int set_addr(struct sk_buff **pskb,
+		    unsigned char **data, int dataoff,
+		    unsigned int addroff, u_int32_t ip, u_int16_t port)
+{
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get(*pskb, &ctinfo);
+	struct {
+		u_int32_t ip;
+		u_int16_t port;
+	} __attribute__ ((__packed__)) buf;
+	struct tcphdr _tcph, *th;
+
+	buf.ip = ip;
+	buf.port = htons(port);
+	addroff += dataoff;
+
+	if ((*pskb)->nh.iph->protocol == IPPROTO_TCP) {
+		if (!ip_nat_mangle_tcp_packet(pskb, ct, ctinfo,
+					      addroff, sizeof(buf),
+					      (char *) &buf, sizeof(buf))) {
+			if (net_ratelimit())
+				printk("ip_nat_h323: ip_nat_mangle_tcp_packet"
+				       " error\n");
+			return -1;
+		}
+
+		/* Relocate data pointer */
+		th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl * 4,
+					sizeof(_tcph), &_tcph);
+		if (th == NULL)
+			return -1;
+		*data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 +
+		    th->doff * 4 + dataoff;
+	} else {
+		if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo,
+					      addroff, sizeof(buf),
+					      (char *) &buf, sizeof(buf))) {
+			if (net_ratelimit())
+				printk("ip_nat_h323: ip_nat_mangle_udp_packet"
+				       " error\n");
+			return -1;
+		}
+		/* ip_nat_mangle_udp_packet uses skb_make_writable() to copy
+		 * or pull everything in a linear buffer, so we can safely
+		 * use the skb pointers now */
+		*data = (*pskb)->data + (*pskb)->nh.iph->ihl * 4 +
+		    sizeof(struct udphdr);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int set_h225_addr(struct sk_buff **pskb,
+			 unsigned char **data, int dataoff,
+			 TransportAddress * addr,
+			 u_int32_t ip, u_int16_t port)
+{
+	return set_addr(pskb, data, dataoff, addr->ipAddress.ip, ip, port);
+}
+
+/****************************************************************************/
+static int set_h245_addr(struct sk_buff **pskb,
+			 unsigned char **data, int dataoff,
+			 H245_TransportAddress * addr,
+			 u_int32_t ip, u_int16_t port)
+{
+	return set_addr(pskb, data, dataoff,
+			addr->unicastAddress.iPAddress.network, ip, port);
+}
+
+/****************************************************************************/
+static int set_sig_addr(struct sk_buff **pskb, struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data,
+			TransportAddress * addr, int count)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int i;
+	u_int32_t ip;
+	u_int16_t port;
+
+	for (i = 0; i < count; i++) {
+		if (get_h225_addr(*data, &addr[i], &ip, &port)) {
+			if (ip == ct->tuplehash[dir].tuple.src.ip &&
+			    port == info->sig_port[dir]) {
+				/* GW->GK */
+
+				/* Fix for Gnomemeeting */
+				if (i > 0 &&
+				    get_h225_addr(*data, &addr[0],
+						  &ip, &port) &&
+				    (ntohl(ip) & 0xff000000) == 0x7f000000)
+					i = 0;
+
+				DEBUGP
+				    ("ip_nat_ras: set signal address "
+				     "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+				     NIPQUAD(ip), port,
+				     NIPQUAD(ct->tuplehash[!dir].tuple.dst.
+					     ip), info->sig_port[!dir]);
+				return set_h225_addr(pskb, data, 0, &addr[i],
+						     ct->tuplehash[!dir].
+						     tuple.dst.ip,
+						     info->sig_port[!dir]);
+			} else if (ip == ct->tuplehash[dir].tuple.dst.ip &&
+				   port == info->sig_port[dir]) {
+				/* GK->GW */
+				DEBUGP
+				    ("ip_nat_ras: set signal address "
+				     "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+				     NIPQUAD(ip), port,
+				     NIPQUAD(ct->tuplehash[!dir].tuple.src.
+					     ip), info->sig_port[!dir]);
+				return set_h225_addr(pskb, data, 0, &addr[i],
+						     ct->tuplehash[!dir].
+						     tuple.src.ip,
+						     info->sig_port[!dir]);
+			}
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int set_ras_addr(struct sk_buff **pskb, struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data,
+			TransportAddress * addr, int count)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int i;
+	u_int32_t ip;
+	u_int16_t port;
+
+	for (i = 0; i < count; i++) {
+		if (get_h225_addr(*data, &addr[i], &ip, &port) &&
+		    ip == ct->tuplehash[dir].tuple.src.ip &&
+		    port == ntohs(ct->tuplehash[dir].tuple.src.u.udp.port)) {
+			DEBUGP("ip_nat_ras: set rasAddress "
+			       "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+			       NIPQUAD(ip), port,
+			       NIPQUAD(ct->tuplehash[!dir].tuple.dst.ip),
+			       ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.
+				     port));
+			return set_h225_addr(pskb, data, 0, &addr[i],
+					     ct->tuplehash[!dir].tuple.dst.ip,
+					     ntohs(ct->tuplehash[!dir].tuple.
+						   dst.u.udp.port));
+		}
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+static int nat_rtp_rtcp(struct sk_buff **pskb, struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data, int dataoff,
+			H245_TransportAddress * addr,
+			u_int16_t port, u_int16_t rtp_port,
+			struct ip_conntrack_expect *rtp_exp,
+			struct ip_conntrack_expect *rtcp_exp)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int i;
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT */
+	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+	rtp_exp->expectfn = ip_nat_follow_master;
+	rtp_exp->dir = !dir;
+	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+	rtcp_exp->expectfn = ip_nat_follow_master;
+	rtcp_exp->dir = !dir;
+
+	/* Lookup existing expects */
+	for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
+		if (info->rtp_port[i][dir] == rtp_port) {
+			/* Expected */
+
+			/* Use allocated ports first. This will refresh
+			 * the expects */
+			rtp_exp->tuple.dst.u.udp.port =
+			    htons(info->rtp_port[i][dir]);
+			rtcp_exp->tuple.dst.u.udp.port =
+			    htons(info->rtp_port[i][dir] + 1);
+			break;
+		} else if (info->rtp_port[i][dir] == 0) {
+			/* Not expected */
+			break;
+		}
+	}
+
+	/* Run out of expectations */
+	if (i >= H323_RTP_CHANNEL_MAX) {
+		if (net_ratelimit())
+			printk("ip_nat_h323: out of expectations\n");
+		return 0;
+	}
+
+	/* Try to get a pair of ports. */
+	for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+	     nated_port != 0; nated_port += 2) {
+		rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
+		if (ip_conntrack_expect_related(rtp_exp) == 0) {
+			rtcp_exp->tuple.dst.u.udp.port =
+			    htons(nated_port + 1);
+			if (ip_conntrack_expect_related(rtcp_exp) == 0)
+				break;
+			ip_conntrack_unexpect_related(rtp_exp);
+		}
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			printk("ip_nat_h323: out of RTP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h245_addr(pskb, data, dataoff, addr,
+			  ct->tuplehash[!dir].tuple.dst.ip,
+			  (port & 1) ? nated_port + 1 : nated_port) == 0) {
+		/* Save ports */
+		info->rtp_port[i][dir] = rtp_port;
+		info->rtp_port[i][!dir] = nated_port;
+	} else {
+		ip_conntrack_unexpect_related(rtp_exp);
+		ip_conntrack_unexpect_related(rtcp_exp);
+		return -1;
+	}
+
+	/* Success */
+	DEBUGP("ip_nat_h323: expect RTP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(rtp_exp->tuple.src.ip),
+	       ntohs(rtp_exp->tuple.src.u.udp.port),
+	       NIPQUAD(rtp_exp->tuple.dst.ip),
+	       ntohs(rtp_exp->tuple.dst.u.udp.port));
+	DEBUGP("ip_nat_h323: expect RTCP %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(rtcp_exp->tuple.src.ip),
+	       ntohs(rtcp_exp->tuple.src.u.udp.port),
+	       NIPQUAD(rtcp_exp->tuple.dst.ip),
+	       ntohs(rtcp_exp->tuple.dst.u.udp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+static int nat_t120(struct sk_buff **pskb, struct ip_conntrack *ct,
+		    enum ip_conntrack_info ctinfo,
+		    unsigned char **data, int dataoff,
+		    H245_TransportAddress * addr, u_int16_t port,
+		    struct ip_conntrack_expect *exp)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = port;
+
+	/* Set expectations for NAT */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_follow_master;
+	exp->dir = !dir;
+
+	/* Try to get same port: if not, try to change it. */
+	for (; nated_port != 0; nated_port++) {
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			printk("ip_nat_h323: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h245_addr(pskb, data, dataoff, addr,
+			  ct->tuplehash[!dir].tuple.dst.ip, nated_port) < 0) {
+		ip_conntrack_unexpect_related(exp);
+		return -1;
+	}
+
+	DEBUGP("ip_nat_h323: expect T.120 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
+	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************
+ * This conntrack expect function replaces ip_conntrack_h245_expect()
+ * which was set by ip_conntrack_helper_h323.c. It calls both
+ * ip_nat_follow_master() and ip_conntrack_h245_expect()
+ ****************************************************************************/
+static void ip_nat_h245_expect(struct ip_conntrack *new,
+			       struct ip_conntrack_expect *this)
+{
+	ip_nat_follow_master(new, this);
+	ip_conntrack_h245_expect(new, this);
+}
+
+/****************************************************************************/
+static int nat_h245(struct sk_buff **pskb, struct ip_conntrack *ct,
+		    enum ip_conntrack_info ctinfo,
+		    unsigned char **data, int dataoff,
+		    TransportAddress * addr, u_int16_t port,
+		    struct ip_conntrack_expect *exp)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = port;
+
+	/* Set expectations for NAT */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_h245_expect;
+	exp->dir = !dir;
+
+	/* Check existing expects */
+	if (info->sig_port[dir] == port)
+		nated_port = info->sig_port[!dir];
+
+	/* Try to get same port: if not, try to change it. */
+	for (; nated_port != 0; nated_port++) {
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			printk("ip_nat_q931: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h225_addr(pskb, data, dataoff, addr,
+			  ct->tuplehash[!dir].tuple.dst.ip,
+			  nated_port) == 0) {
+		/* Save ports */
+		info->sig_port[dir] = port;
+		info->sig_port[!dir] = nated_port;
+	} else {
+		ip_conntrack_unexpect_related(exp);
+		return -1;
+	}
+
+	DEBUGP("ip_nat_q931: expect H.245 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
+	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************
+ * This conntrack expect function replaces ip_conntrack_q931_expect()
+ * which was set by ip_conntrack_helper_h323.c.
+ ****************************************************************************/
+static void ip_nat_q931_expect(struct ip_conntrack *new,
+			       struct ip_conntrack_expect *this)
+{
+	struct ip_nat_range range;
+
+	if (this->tuple.src.ip != 0) {	/* Only accept calls from GK */
+		ip_nat_follow_master(new, this);
+		goto out;
+	}
+
+	/* This must be a fresh one. */
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip;
+
+	/* hook doesn't matter, but it has to do source manip */
+	ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+	range.min = range.max = this->saved_proto;
+	range.min_ip = range.max_ip =
+	    new->master->tuplehash[!this->dir].tuple.src.ip;
+
+	/* hook doesn't matter, but it has to do destination manip */
+	ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING);
+
+      out:
+	ip_conntrack_q931_expect(new, this);
+}
+
+/****************************************************************************/
+static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct,
+		    enum ip_conntrack_info ctinfo,
+		    unsigned char **data, TransportAddress * addr, int idx,
+		    u_int16_t port, struct ip_conntrack_expect *exp)
+{
+	struct ip_ct_h323_master *info = &ct->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = port;
+	u_int32_t ip;
+
+	/* Set expectations for NAT */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_q931_expect;
+	exp->dir = !dir;
+
+	/* Check existing expects */
+	if (info->sig_port[dir] == port)
+		nated_port = info->sig_port[!dir];
+
+	/* Try to get same port: if not, try to change it. */
+	for (; nated_port != 0; nated_port++) {
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			printk("ip_nat_ras: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h225_addr(pskb, data, 0, &addr[idx],
+			  ct->tuplehash[!dir].tuple.dst.ip,
+			  nated_port) == 0) {
+		/* Save ports */
+		info->sig_port[dir] = port;
+		info->sig_port[!dir] = nated_port;
+
+		/* Fix for Gnomemeeting */
+		if (idx > 0 &&
+		    get_h225_addr(*data, &addr[0], &ip, &port) &&
+		    (ntohl(ip) & 0xff000000) == 0x7f000000) {
+			set_h225_addr_hook(pskb, data, 0, &addr[0],
+					   ct->tuplehash[!dir].tuple.dst.ip,
+					   info->sig_port[!dir]);
+		}
+	} else {
+		ip_conntrack_unexpect_related(exp);
+		return -1;
+	}
+
+	/* Success */
+	DEBUGP("ip_nat_ras: expect Q.931 %u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n",
+	       NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port),
+	       NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+static int __init init(void)
+{
+	BUG_ON(set_h245_addr_hook != NULL);
+	BUG_ON(set_h225_addr_hook != NULL);
+	BUG_ON(set_sig_addr_hook != NULL);
+	BUG_ON(set_ras_addr_hook != NULL);
+	BUG_ON(nat_rtp_rtcp_hook != NULL);
+	BUG_ON(nat_t120_hook != NULL);
+	BUG_ON(nat_h245_hook != NULL);
+	BUG_ON(nat_q931_hook != NULL);
+
+	set_h245_addr_hook = set_h245_addr;
+	set_h225_addr_hook = set_h225_addr;
+	set_sig_addr_hook = set_sig_addr;
+	set_ras_addr_hook = set_ras_addr;
+	nat_rtp_rtcp_hook = nat_rtp_rtcp;
+	nat_t120_hook = nat_t120;
+	nat_h245_hook = nat_h245;
+	nat_q931_hook = nat_q931;
+
+	DEBUGP("ip_nat_h323: init success\n");
+	return 0;
+}
+
+/****************************************************************************/
+static void __exit fini(void)
+{
+	set_h245_addr_hook = NULL;
+	set_h225_addr_hook = NULL;
+	set_sig_addr_hook = NULL;
+	set_ras_addr_hook = NULL;
+	nat_rtp_rtcp_hook = NULL;
+	nat_t120_hook = NULL;
+	nat_h245_hook = NULL;
+	nat_q931_hook = NULL;
+	synchronize_net();
+}
+
+/****************************************************************************/
+module_init(init);
+module_exit(fini);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 NAT helper");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From f10b7897ee29649fa7f0ccdc8d859ccd6ce7dbfd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 25 Jan 2006 22:34:01 +1100
Subject: [CRYPTO] api: Align tfm context as wide as possible

Since tfm contexts can contain arbitrary types we should provide at least
natural alignment (__attribute__ ((__aligned__))) for them.  In particular,
this is needed on the Xscale which is a 32-bit architecture with a u64 type
that requires 64-bit alignment.  This problem was reported by Ronen Shitrit.

The crypto_tfm structure's size was 44 bytes on 32-bit architectures and
80 bytes on 64-bit architectures.  So adding this requirement only means
that we have to add an extra 4 bytes on 32-bit architectures.

On i386 the natural alignment is 16 bytes which also benefits the VIA
Padlock as it no longer has to manually align its context structure to
128 bits.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c                 |  2 +-
 drivers/crypto/padlock-aes.c |  6 +++++-
 include/linux/crypto.h       | 10 +++++++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index e26156f71839..34e02caffc2a 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -165,7 +165,7 @@ static unsigned int crypto_ctxsize(struct crypto_alg *alg, int flags)
 		break;
 	}
 
-	return len + alg->cra_alignmask;
+	return len + (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1));
 }
 
 struct crypto_tfm *crypto_alloc_tfm(const char *name, u32 flags)
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 0c08c58252be..5158a9db4bc5 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -284,7 +284,11 @@ aes_hw_extkey_available(uint8_t key_len)
 
 static inline struct aes_ctx *aes_ctx(void *ctx)
 {
-	return (struct aes_ctx *)ALIGN((unsigned long)ctx, PADLOCK_ALIGNMENT);
+	unsigned long align = PADLOCK_ALIGNMENT;
+
+	if (align <= crypto_tfm_ctx_alignment())
+		align = 1;
+	return (struct aes_ctx *)ALIGN((unsigned long)ctx, align);
 }
 
 static int
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index d88bf8aa8b47..0ab1bc1152ca 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -229,6 +229,8 @@ struct crypto_tfm {
 	} crt_u;
 	
 	struct crypto_alg *__crt_alg;
+
+	char __crt_ctx[] __attribute__ ((__aligned__));
 };
 
 /* 
@@ -301,7 +303,13 @@ static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm)
 
 static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
 {
-	return (void *)&tfm[1];
+	return tfm->__crt_ctx;
+}
+
+static inline unsigned int crypto_tfm_ctx_alignment(void)
+{
+	struct crypto_tfm *tfm;
+	return __alignof__(tfm->__crt_ctx);
 }
 
 /*
-- 
cgit v1.2.3


From 30afc84cf7325e88fb9746340eba3c161080ff49 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sat, 18 Mar 2006 18:40:14 +0900
Subject: [SCSI] libata: implement minimal transport template for
 ->eh_timed_out

SCSI midlayer has moved hostt->eh_timed_out to transport template.  As
libata doesn't need full-blown transport support yet, implement
minimal transport for libata.  No transport class or whatsoever, just
empty transport template with ->eh_timed_out hook.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/ahci.c         |  1 -
 drivers/scsi/ata_piix.c     |  1 -
 drivers/scsi/libata-core.c  |  3 ++-
 drivers/scsi/libata-scsi.c  | 10 ++++++++++
 drivers/scsi/libata.h       |  2 ++
 drivers/scsi/pdc_adma.c     |  1 -
 drivers/scsi/sata_mv.c      |  1 -
 drivers/scsi/sata_nv.c      |  1 -
 drivers/scsi/sata_promise.c |  1 -
 drivers/scsi/sata_qstor.c   |  1 -
 drivers/scsi/sata_sil.c     |  1 -
 drivers/scsi/sata_sil24.c   |  1 -
 drivers/scsi/sata_sis.c     |  1 -
 drivers/scsi/sata_svw.c     |  1 -
 drivers/scsi/sata_sx4.c     |  1 -
 drivers/scsi/sata_uli.c     |  1 -
 drivers/scsi/sata_via.c     |  1 -
 drivers/scsi/sata_vsc.c     |  1 -
 include/linux/libata.h      |  1 -
 19 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index e97ab3e6de4d..a1ddbba2cbdf 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -207,7 +207,6 @@ static struct scsi_host_template ahci_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c
index 9327b62f97de..a74e23d39ba9 100644
--- a/drivers/scsi/ata_piix.c
+++ b/drivers/scsi/ata_piix.c
@@ -209,7 +209,6 @@ static struct scsi_host_template piix_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 714b42bad935..64dce00e9c46 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4653,6 +4653,8 @@ static struct ata_port * ata_host_add(const struct ata_probe_ent *ent,
 	if (!host)
 		return NULL;
 
+	host->transportt = &ata_scsi_transport_template;
+
 	ap = (struct ata_port *) &host->hostdata[0];
 
 	ata_host_init(ap, host, host_set, ent, port_no);
@@ -5084,7 +5086,6 @@ EXPORT_SYMBOL_GPL(ata_busy_sleep);
 EXPORT_SYMBOL_GPL(ata_port_queue_task);
 EXPORT_SYMBOL_GPL(ata_scsi_ioctl);
 EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
-EXPORT_SYMBOL_GPL(ata_scsi_timed_out);
 EXPORT_SYMBOL_GPL(ata_scsi_error);
 EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
 EXPORT_SYMBOL_GPL(ata_scsi_release);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index ccedb4536977..bd9f2176f79a 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -41,6 +41,7 @@
 #include <scsi/scsi_eh.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_request.h>
+#include <scsi/scsi_transport.h>
 #include <linux/libata.h>
 #include <linux/hdreg.h>
 #include <asm/uaccess.h>
@@ -52,6 +53,7 @@
 typedef unsigned int (*ata_xlat_func_t)(struct ata_queued_cmd *qc, const u8 *scsicmd);
 static struct ata_device *
 ata_scsi_find_dev(struct ata_port *ap, const struct scsi_device *scsidev);
+enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 
 #define RW_RECOVERY_MPAGE 0x1
 #define RW_RECOVERY_MPAGE_LEN 12
@@ -92,6 +94,14 @@ static const u8 def_control_mpage[CONTROL_MPAGE_LEN] = {
 	0, 30	/* extended self test time, see 05-359r1 */
 };
 
+/*
+ * libata transport template.  libata doesn't do real transport stuff.
+ * It just needs the eh_timed_out hook.
+ */
+struct scsi_transport_template ata_scsi_transport_template = {
+	.eh_timed_out		= ata_scsi_timed_out,
+};
+
 
 static void ata_scsi_invalid_field(struct scsi_cmnd *cmd,
 				   void (*done)(struct scsi_cmnd *))
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index f4c48c91b63d..65f52beea884 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -57,6 +57,8 @@ extern int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg);
 
 
 /* libata-scsi.c */
+extern struct scsi_transport_template ata_scsi_transport_template;
+
 extern void ata_scsi_scan_host(struct ata_port *ap);
 extern int ata_scsi_error(struct Scsi_Host *host);
 extern unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf,
diff --git a/drivers/scsi/pdc_adma.c b/drivers/scsi/pdc_adma.c
index 5f33cc932e70..b3dc5f85ae0b 100644
--- a/drivers/scsi/pdc_adma.c
+++ b/drivers/scsi/pdc_adma.c
@@ -143,7 +143,6 @@ static struct scsi_host_template adma_ata_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index e561281967dd..874c5be0843c 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -378,7 +378,6 @@ static struct scsi_host_template mv_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= MV_USE_Q_DEPTH,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index caffadc2e0ae..e5b20c6afc18 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -229,7 +229,6 @@ static struct scsi_host_template nv_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index 84cb3940ad88..cc928c68a479 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -111,7 +111,6 @@ static struct scsi_host_template pdc_ata_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c
index 9602f43a298e..9ffe1ef0d205 100644
--- a/drivers/scsi/sata_qstor.c
+++ b/drivers/scsi/sata_qstor.c
@@ -132,7 +132,6 @@ static struct scsi_host_template qs_ata_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index 4f2a67ed39d8..3e75d6733239 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -146,7 +146,6 @@ static struct scsi_host_template sil_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index 9a53a5ed38c5..5d01e5ce5ac5 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -281,7 +281,6 @@ static struct scsi_host_template sil24_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c
index 7fd45f86de99..acc8439dea23 100644
--- a/drivers/scsi/sata_sis.c
+++ b/drivers/scsi/sata_sis.c
@@ -87,7 +87,6 @@ static struct scsi_host_template sis_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c
index 4aaccd53e736..051e47d975ca 100644
--- a/drivers/scsi/sata_svw.c
+++ b/drivers/scsi/sata_svw.c
@@ -288,7 +288,6 @@ static struct scsi_host_template k2_sata_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 9f8a76815402..ae70f60c7c0d 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -182,7 +182,6 @@ static struct scsi_host_template pdc_sata_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c
index 37a487b7d655..8f5025733def 100644
--- a/drivers/scsi/sata_uli.c
+++ b/drivers/scsi/sata_uli.c
@@ -75,7 +75,6 @@ static struct scsi_host_template uli_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c
index ff65a0b0457f..791bf652ba63 100644
--- a/drivers/scsi/sata_via.c
+++ b/drivers/scsi/sata_via.c
@@ -94,7 +94,6 @@ static struct scsi_host_template svia_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index b574379a7a82..ee75b9b38ae8 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -251,7 +251,6 @@ static struct scsi_host_template vsc_sata_sht = {
 	.name			= DRV_NAME,
 	.ioctl			= ata_scsi_ioctl,
 	.queuecommand		= ata_scsi_queuecmd,
-	.eh_timed_out		= ata_scsi_timed_out,
 	.eh_strategy_handler	= ata_scsi_error,
 	.can_queue		= ATA_DEF_QUEUE,
 	.this_id		= ATA_SHT_THIS_ID,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 239408ecfddf..204c37a55f06 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -508,7 +508,6 @@ extern void ata_host_set_remove(struct ata_host_set *host_set);
 extern int ata_scsi_detect(struct scsi_host_template *sht);
 extern int ata_scsi_ioctl(struct scsi_device *dev, int cmd, void __user *arg);
 extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *));
-extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern int ata_scsi_error(struct Scsi_Host *host);
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 4de151d8cd2553e7e89044ab5d72fcad4eb04afb Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 22 Mar 2006 00:13:35 +0100
Subject: It's UTF-8

Fix some comments to "UTF-8".

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/filesystems/isofs.txt | 4 ++--
 Documentation/filesystems/jfs.txt   | 2 +-
 Documentation/filesystems/vfat.txt  | 6 +++---
 fs/befs/linuxvfs.c                  | 2 +-
 fs/cifs/CHANGES                     | 2 +-
 fs/fat/dir.c                        | 2 +-
 fs/fat/inode.c                      | 2 +-
 fs/isofs/joliet.c                   | 2 +-
 fs/nls/Kconfig                      | 2 +-
 include/asm-mips/termbits.h         | 2 +-
 include/linux/msdos_fs.h            | 2 +-
 11 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt
index 424585ff6ea1..758e50401c16 100644
--- a/Documentation/filesystems/isofs.txt
+++ b/Documentation/filesystems/isofs.txt
@@ -9,9 +9,9 @@ when using discs encoded using Microsoft's Joliet extensions.
   iocharset=name Character set to use for converting from Unicode to
 		ASCII.  Joliet filenames are stored in Unicode format, but
 		Unix for the most part doesn't know how to deal with Unicode.
-		There is also an option of doing UTF8 translations with the
+		There is also an option of doing UTF-8 translations with the
 		utf8 option.
-  utf8          Encode Unicode names in UTF8 format. Default is no.
+  utf8          Encode Unicode names in UTF-8 format. Default is no.
 
 Mount options unique to the isofs filesystem.
   block=512     Set the block size for the disk to 512 bytes
diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
index 3e992daf99ad..bae128663748 100644
--- a/Documentation/filesystems/jfs.txt
+++ b/Documentation/filesystems/jfs.txt
@@ -6,7 +6,7 @@ The following mount options are supported:
 
 iocharset=name	Character set to use for converting from Unicode to
 		ASCII.  The default is to do no conversion.  Use
-		iocharset=utf8 for UTF8 translations.  This requires
+		iocharset=utf8 for UTF-8 translations.  This requires
 		CONFIG_NLS_UTF8 to be set in the kernel .config file.
 		iocharset=none specifies the default behavior explicitly.
 
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 5ead20c6c744..2001abbc60e6 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -28,16 +28,16 @@ iocharset=name -- Character set to use for converting between the
 		 know how to deal with Unicode.
 		 By default, FAT_DEFAULT_IOCHARSET setting is used.
 
-		 There is also an option of doing UTF8 translations
+		 There is also an option of doing UTF-8 translations
 		 with the utf8 option.
 
 		 NOTE: "iocharset=utf8" is not recommended. If unsure,
 		 you should consider the following option instead.
 
-utf8=<bool>   -- UTF8 is the filesystem safe version of Unicode that
+utf8=<bool>   -- UTF-8 is the filesystem safe version of Unicode that
 		 is used by the console.  It can be be enabled for the
 		 filesystem with this option. If 'uni_xlate' gets set,
-		 UTF8 gets disabled.
+		 UTF-8 gets disabled.
 
 uni_xlate=<bool> -- Translate unhandled Unicode characters to special
 		 escaped sequences.  This would let you backup and
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 2d365cb8eec6..dd6048ce0532 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -561,7 +561,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
  * @sb: Superblock
  * @src: Input string buffer in NLS format
  * @srclen: Length of input string in bytes
- * @dest: The output string in UTF8 format
+ * @dest: The output string in UTF-8 format
  * @destlen: Length of the output buffer
  * 
  * Converts input string @src, which is in the format of the loaded NLS map,
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index d335015473a5..cb68efba35db 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -160,7 +160,7 @@ improperly zeroed buffer in CIFS Unix extensions set times call.
 Version 1.25
 ------------
 Fix internationalization problem in cifs readdir with filenames that map to 
-longer UTF8 strings than the string on the wire was in Unicode.  Add workaround
+longer UTF-8 strings than the string on the wire was in Unicode.  Add workaround
 for readdir to netapp servers. Fix search rewind (seek into readdir to return 
 non-consecutive entries).  Do not do readdir when server negotiates 
 buffer size to small to fit filename. Add support for reading POSIX ACLs from
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index db0de5c621c7..4095bc149eb1 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -114,7 +114,7 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
 }
 
 /*
- * Convert Unicode 16 to UTF8, translated Unicode, or ASCII.
+ * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII.
  * If uni_xlate is enabled and we can't get a 1:1 conversion, use a
  * colon as an escape character since it is normally invalid on the vfat
  * filesystem. The following four characters are the hexadecimal digits
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index e7f4aa7fc686..e78d7b4842cc 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1101,7 +1101,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 			return -EINVAL;
 		}
 	}
-	/* UTF8 doesn't provide FAT semantics */
+	/* UTF-8 doesn't provide FAT semantics */
 	if (!strcmp(opts->iocharset, "utf8")) {
 		printk(KERN_ERR "FAT: utf8 is not a recommended IO charset"
 		       " for FAT filesystems, filesystem will be case sensitive!\n");
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 2931de7f1a6a..81a90e170ac3 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -11,7 +11,7 @@
 #include "isofs.h"
 
 /*
- * Convert Unicode 16 to UTF8 or ASCII.
+ * Convert Unicode 16 to UTF-8 or ASCII.
  */
 static int
 uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig
index 0ab8f00bdbb2..976ecccd6f56 100644
--- a/fs/nls/Kconfig
+++ b/fs/nls/Kconfig
@@ -491,7 +491,7 @@ config NLS_KOI8_U
 	  (koi8-u) and Belarusian (koi8-ru) character sets.
 
 config NLS_UTF8
-	tristate "NLS UTF8"
+	tristate "NLS UTF-8"
 	depends on NLS
 	help
 	  If you want to display filenames with native language characters
diff --git a/include/asm-mips/termbits.h b/include/asm-mips/termbits.h
index c29c65b7818e..fa6d04dac56b 100644
--- a/include/asm-mips/termbits.h
+++ b/include/asm-mips/termbits.h
@@ -77,7 +77,7 @@ struct termios {
 #define IXANY	0004000		/* Any character will restart after stop.  */
 #define IXOFF	0010000		/* Enable start/stop input control.  */
 #define IMAXBEL	0020000		/* Ring bell when input queue is full.  */
-#define IUTF8	0040000		/* Input is UTF8 */
+#define IUTF8	0040000		/* Input is UTF-8 */
 
 /* c_oflag bits */
 #define OPOST	0000001		/* Perform output processing.  */
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index e933e2a355ad..8bcd9450d926 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -199,7 +199,7 @@ struct fat_mount_options {
 		 sys_immutable:1, /* set = system files are immutable */
 		 dotsOK:1,        /* set = hidden and system files are named '.filename' */
 		 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
-		 utf8:1,	  /* Use of UTF8 character set (Default) */
+		 utf8:1,	  /* Use of UTF-8 character set (Default) */
 		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
 		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
 		 atari:1,         /* Use Atari GEMDOS variation of MS-DOS fs */
-- 
cgit v1.2.3


From 116f232b3794a8b6ebde21aef5004b18cc1cfa86 Mon Sep 17 00:00:00 2001
From: Rytchkov Alexey <lilo0@nm.ru>
Date: Wed, 22 Mar 2006 00:58:53 +0100
Subject: fixed path to moved file in include/linux/device.h

Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 include/linux/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 5b595fdfb672..10c1693a2529 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -399,7 +399,7 @@ extern struct device * get_device(struct device * dev);
 extern void put_device(struct device * dev);
 
 
-/* drivers/base/power.c */
+/* drivers/base/power/shutdown.c */
 extern void device_shutdown(void);
 
 
-- 
cgit v1.2.3


From f59b0cf8a3a39b75e580066c6a9aeabd97ec2743 Mon Sep 17 00:00:00 2001
From: Albert Lee <albertcc@tw.ibm.com>
Date: Thu, 16 Mar 2006 17:59:22 +0800
Subject: [PATCH] libata-dev: Remove ATA_PROT_PIO_MULT

Remove the ATA_PROT_PIO_MULT protocol.

Signed-off-by: Albert Lee <albertcc@tw.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 1 -
 drivers/scsi/libata-scsi.c | 7 ++++---
 include/linux/ata.h        | 1 -
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index a28569d0081d..7a5392c8ec6f 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3909,7 +3909,6 @@ static inline int ata_should_dma_map(struct ata_queued_cmd *qc)
 
 	case ATA_PROT_ATAPI:
 	case ATA_PROT_PIO:
-	case ATA_PROT_PIO_MULT:
 		if (ap->flags & ATA_FLAG_PIO_DMA)
 			return 1;
 
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 3aaa74cbef1d..cebf9b31b516 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -2379,9 +2379,6 @@ ata_scsi_map_proto(u8 byte1)
 
 		case 4:		/* PIO Data-in */
 		case 5:		/* PIO Data-out */
-			if (byte1 & 0xe0) {
-				return ATA_PROT_PIO_MULT;
-			}
 			return ATA_PROT_PIO;
 
 		case 10:	/* Device Reset */
@@ -2420,6 +2417,10 @@ ata_scsi_pass_thru(struct ata_queued_cmd *qc, const u8 *scsicmd)
 	if ((tf->protocol = ata_scsi_map_proto(scsicmd[1])) == ATA_PROT_UNKNOWN)
 		goto invalid_fld;
 
+	if (scsicmd[1] & 0xe0)
+		/* PIO multi not supported yet */
+		goto invalid_fld;
+
 	/*
 	 * 12 and 16 byte CDBs use different offsets to
 	 * provide the various register values.
diff --git a/include/linux/ata.h b/include/linux/ata.h
index b02a16c435e7..6b188b3b61dd 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -204,7 +204,6 @@ enum ata_tf_protocols {
 	ATA_PROT_UNKNOWN,	/* unknown/invalid */
 	ATA_PROT_NODATA,	/* no data */
 	ATA_PROT_PIO,		/* PIO single sector */
-	ATA_PROT_PIO_MULT,	/* PIO multiple sector */
 	ATA_PROT_DMA,		/* DMA */
 	ATA_PROT_ATAPI,		/* packet command, PIO data xfer*/
 	ATA_PROT_ATAPI_NODATA,	/* packet command, no data */
-- 
cgit v1.2.3


From e46834cd2ddb1e2941806cb8fec60fb6bdd2ec29 Mon Sep 17 00:00:00 2001
From: Brian King <brking@us.ibm.com>
Date: Fri, 17 Mar 2006 17:04:03 -0600
Subject: [PATCH] libata: Add some dummy noop functions

Add some dummy noop functions for use by libata clients
that do not need to do anything. Future SAS patches will
utilize these functions.

Signed-off-by: Brian King <brking@us.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 3 +++
 include/linux/libata.h     | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 7a5392c8ec6f..8c86ae8d79b5 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2865,6 +2865,8 @@ void ata_qc_prep(struct ata_queued_cmd *qc)
 	ata_fill_sg(qc);
 }
 
+void ata_noop_qc_prep(struct ata_queued_cmd *qc) { }
+
 /**
  *	ata_sg_init_one - Associate command with memory buffer
  *	@qc: Command to be associated
@@ -5063,6 +5065,7 @@ EXPORT_SYMBOL_GPL(ata_port_stop);
 EXPORT_SYMBOL_GPL(ata_host_stop);
 EXPORT_SYMBOL_GPL(ata_interrupt);
 EXPORT_SYMBOL_GPL(ata_qc_prep);
+EXPORT_SYMBOL_GPL(ata_noop_qc_prep);
 EXPORT_SYMBOL_GPL(ata_bmdma_setup);
 EXPORT_SYMBOL_GPL(ata_bmdma_start);
 EXPORT_SYMBOL_GPL(ata_bmdma_irq_clear);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 239408ecfddf..17e5a719c72b 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -543,6 +543,7 @@ extern void ata_port_stop (struct ata_port *ap);
 extern void ata_host_stop (struct ata_host_set *host_set);
 extern irqreturn_t ata_interrupt (int irq, void *dev_instance, struct pt_regs *regs);
 extern void ata_qc_prep(struct ata_queued_cmd *qc);
+extern void ata_noop_qc_prep(struct ata_queued_cmd *qc);
 extern unsigned int ata_qc_issue_prot(struct ata_queued_cmd *qc);
 extern void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf,
 		unsigned int buflen);
-- 
cgit v1.2.3


From b6782728d703fa3f0e5478a8b89e49ea10b1fdd0 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 21 Mar 2006 15:52:49 +0000
Subject: [PATCH] libata: Add the useful macros/constants needed for merging
 PATA stuff

HPA presence/enabled
HPA commands

Also add ata_id_is_cfa() as that is needed to detect and handle CF cards
which currently we reject.

Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/ata.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index 6b188b3b61dd..312a2c0c64e6 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -146,6 +146,8 @@ enum {
  	ATA_CMD_STANDBYNOW1	= 0xE0,
  	ATA_CMD_IDLEIMMEDIATE	= 0xE1,
 	ATA_CMD_INIT_DEV_PARAMS	= 0x91,
+	ATA_CMD_READ_NATIVE_MAX	= 0xF8,
+	ATA_CMD_READ_NATIVE_MAX_EXT = 0x27,
 
 	/* SETFEATURES stuff */
 	SETFEATURES_XFER	= 0x03,
@@ -246,18 +248,22 @@ struct ata_taskfile {
 };
 
 #define ata_id_is_ata(id)	(((id)[0] & (1 << 15)) == 0)
+#define ata_id_is_cfa(id)	((id)[0] == 0x848A)
 #define ata_id_is_sata(id)	((id)[93] == 0)
 #define ata_id_rahead_enabled(id) ((id)[85] & (1 << 6))
 #define ata_id_wcache_enabled(id) ((id)[85] & (1 << 5))
+#define ata_id_hpa_enabled(id)	((id)[85] & (1 << 10))
 #define ata_id_has_fua(id)	((id)[84] & (1 << 6))
 #define ata_id_has_flush(id)	((id)[83] & (1 << 12))
 #define ata_id_has_flush_ext(id) ((id)[83] & (1 << 13))
 #define ata_id_has_lba48(id)	((id)[83] & (1 << 10))
+#define ata_id_has_hpa(id)	((id)[82] & (1 << 10))
 #define ata_id_has_wcache(id)	((id)[82] & (1 << 5))
 #define ata_id_has_pm(id)	((id)[82] & (1 << 3))
 #define ata_id_has_lba(id)	((id)[49] & (1 << 9))
 #define ata_id_has_dma(id)	((id)[49] & (1 << 8))
 #define ata_id_removeable(id)	((id)[0] & (1 << 7))
+#define ata_id_has_dword_io(id)	((id)[50] & (1 << 0))
 #define ata_id_u32(id,n)	\
 	(((u32) (id)[(n) + 1] << 16) | ((u32) (id)[(n)]))
 #define ata_id_u64(id,n)	\
-- 
cgit v1.2.3


From 17bb34a3c548c4fd2a7c859123a631f97c2af09f Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Tue, 21 Mar 2006 21:29:21 -0500
Subject: [libata] add prototypes for helpers

Add prototypes for stuff recently added by Alan.
---
 include/linux/libata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 17e5a719c72b..d81cecdda4f3 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -502,6 +502,7 @@ extern int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_i
 extern void ata_pci_remove_one (struct pci_dev *pdev);
 extern int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t state);
 extern int ata_pci_device_resume(struct pci_dev *pdev);
+extern int ata_pci_clear_simplex(struct pci_dev *pdev);
 #endif /* CONFIG_PCI */
 extern int ata_device_add(const struct ata_probe_ent *ent);
 extern void ata_host_set_remove(struct ata_host_set *host_set);
@@ -610,7 +611,7 @@ extern void ata_pci_host_stop (struct ata_host_set *host_set);
 extern struct ata_probe_ent *
 ata_pci_init_native_mode(struct pci_dev *pdev, struct ata_port_info **port, int portmask);
 extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits);
-
+extern unsigned long ata_pci_default_filter(const struct ata_port *, struct ata_device *, unsigned long);
 #endif /* CONFIG_PCI */
 
 
-- 
cgit v1.2.3


From 89bbfc95d65839d6ae23ddab8a3cc5af4ae88383 Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Tue, 21 Mar 2006 23:58:08 -0800
Subject: [NET]: allow 32 bit socket ioctl in 64 bit kernel

Since the register_ioctl32_conversion() patch in the kernel is now obsolete,
provide another method to allow 32 bit user space ioctls to reach the kernel.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h |  6 ++++++
 net/socket.c        | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 152fa6551fd8..84a490e5f0a1 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -143,6 +143,8 @@ struct proto_ops {
 				      struct poll_table_struct *wait);
 	int		(*ioctl)     (struct socket *sock, unsigned int cmd,
 				      unsigned long arg);
+	int	 	(*compat_ioctl) (struct socket *sock, unsigned int cmd,
+				      unsigned long arg);
 	int		(*listen)    (struct socket *sock, int len);
 	int		(*shutdown)  (struct socket *sock, int flags);
 	int		(*setsockopt)(struct socket *sock, int level,
@@ -251,6 +253,8 @@ SOCKCALL_UWRAP(name, poll, (struct file *file, struct socket *sock, struct poll_
 	      (file, sock, wait)) \
 SOCKCALL_WRAP(name, ioctl, (struct socket *sock, unsigned int cmd, \
 			 unsigned long arg), (sock, cmd, arg)) \
+SOCKCALL_WRAP(name, compat_ioctl, (struct socket *sock, unsigned int cmd, \
+			 unsigned long arg), (sock, cmd, arg)) \
 SOCKCALL_WRAP(name, listen, (struct socket *sock, int len), (sock, len)) \
 SOCKCALL_WRAP(name, shutdown, (struct socket *sock, int flags), (sock, flags)) \
 SOCKCALL_WRAP(name, setsockopt, (struct socket *sock, int level, int optname, \
@@ -275,6 +279,7 @@ static const struct proto_ops name##_ops = {			\
 	.getname	= __lock_##name##_getname,	\
 	.poll		= __lock_##name##_poll,		\
 	.ioctl		= __lock_##name##_ioctl,	\
+	.compat_ioctl	= __lock_##name##_compat_ioctl,	\
 	.listen		= __lock_##name##_listen,	\
 	.shutdown	= __lock_##name##_shutdown,	\
 	.setsockopt	= __lock_##name##_setsockopt,	\
@@ -283,6 +288,7 @@ static const struct proto_ops name##_ops = {			\
 	.recvmsg	= __lock_##name##_recvmsg,	\
 	.mmap		= __lock_##name##_mmap,		\
 };
+
 #endif
 
 #define MODULE_ALIAS_NETPROTO(proto) \
diff --git a/net/socket.c b/net/socket.c
index e3c21d5ec288..e2d5bae994de 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -107,6 +107,10 @@ static unsigned int sock_poll(struct file *file,
 			      struct poll_table_struct *wait);
 static long sock_ioctl(struct file *file,
 		      unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+static long compat_sock_ioctl(struct file *file,
+		      unsigned int cmd, unsigned long arg);
+#endif
 static int sock_fasync(int fd, struct file *filp, int on);
 static ssize_t sock_readv(struct file *file, const struct iovec *vector,
 			  unsigned long count, loff_t *ppos);
@@ -128,6 +132,9 @@ static struct file_operations socket_file_ops = {
 	.aio_write =	sock_aio_write,
 	.poll =		sock_poll,
 	.unlocked_ioctl = sock_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = compat_sock_ioctl,
+#endif
 	.mmap =		sock_mmap,
 	.open =		sock_no_open,	/* special open code to disallow open via /proc */
 	.release =	sock_close,
@@ -2136,6 +2143,20 @@ void socket_seq_show(struct seq_file *seq)
 }
 #endif /* CONFIG_PROC_FS */
 
+#ifdef CONFIG_COMPAT
+static long compat_sock_ioctl(struct file *file, unsigned cmd,
+				unsigned long arg)
+{
+	struct socket *sock = file->private_data;
+	int ret = -ENOIOCTLCMD;
+
+	if (sock->ops->compat_ioctl)
+		ret = sock->ops->compat_ioctl(sock, cmd, arg);
+
+	return ret;
+}
+#endif
+
 /* ABI emulation layers need these two */
 EXPORT_SYMBOL(move_addr_to_kernel);
 EXPORT_SYMBOL(move_addr_to_user);
-- 
cgit v1.2.3


From a64b7b936dcd926ace745c07c14f45ecfaddb034 Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Wed, 22 Mar 2006 00:01:31 -0800
Subject: [X25]: allow ITU-T DTE facilities for x25

Allows use of the optional user facility to insert ITU-T
(http://www.itu.int/ITU-T/) specified DTE facilities in call set-up x25
packets.  This feature is optional; no facilities will be added if the ioctl
is not used, and call setup packet remains the same as before.

If the ioctls provided by the patch are used, then a facility marker will be
added to the x25 packet header so that the called dte address extension
facility can be differentiated from other types of facilities (as described in
the ITU-T X.25 recommendation) that are also allowed in the x25 packet header.

Facility markers are made up of two octets, and may be present in the x25
packet headers of call-request, incoming call, call accepted, clear request,
and clear indication packets.  The first of the two octets represents the
facility code field and is set to zero by this patch.  The second octet of the
marker represents the facility parameter field and is set to 0x0F because the
marker will be inserted before ITU-T type DTE facilities.

Since according to ITU-T X.25 Recommendation X.25(10/96)- 7.1 "All networks
will support the facility markers with a facility parameter field set to all
ones or to 00001111", therefore this patch should work with all x.25 networks.

While there are many ITU-T DTE facilities, this patch implements only the
called and calling address extension, with placeholders in the
x25_dte_facilities structure for the rest of the facilities.

Testing:

This patch was tested using a cisco xot router connected on its serial ports
to an X.25 network, and on its lan ports to a host running an xotd daemon.

It is also possible to test this patch using an xotd daemon and an x25tap
patch, where the xotd daemons work back-to-back without actually using an x.25
network.  See www.fyonne.net for details on how to do this.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>
Acked-by: Andrew Hendry <ahendry@tusc.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/x25.h      | 26 +++++++++++++++
 include/net/x25.h        | 21 ++++++++++---
 net/x25/af_x25.c         | 45 +++++++++++++++++++++++++-
 net/x25/x25_facilities.c | 82 +++++++++++++++++++++++++++++++++++++++++-------
 net/x25/x25_in.c         |  3 +-
 net/x25/x25_subr.c       |  6 ++--
 6 files changed, 163 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/x25.h b/include/linux/x25.h
index 16d44931afa0..d035e4e87d07 100644
--- a/include/linux/x25.h
+++ b/include/linux/x25.h
@@ -11,6 +11,8 @@
 #ifndef	X25_KERNEL_H
 #define	X25_KERNEL_H
 
+#include <linux/types.h>
+
 #define	SIOCX25GSUBSCRIP	(SIOCPROTOPRIVATE + 0)
 #define	SIOCX25SSUBSCRIP	(SIOCPROTOPRIVATE + 1)
 #define	SIOCX25GFACILITIES	(SIOCPROTOPRIVATE + 2)
@@ -21,6 +23,8 @@
 #define SIOCX25SCUDMATCHLEN	(SIOCPROTOPRIVATE + 7)
 #define SIOCX25CALLACCPTAPPRV   (SIOCPROTOPRIVATE + 8)
 #define SIOCX25SENDCALLACCPT    (SIOCPROTOPRIVATE + 9)
+#define SIOCX25GDTEFACILITIES (SIOCPROTOPRIVATE + 10)
+#define SIOCX25SDTEFACILITIES (SIOCPROTOPRIVATE + 11)
 
 /*
  *	Values for {get,set}sockopt.
@@ -77,6 +81,8 @@ struct x25_subscrip_struct {
 #define	X25_MASK_PACKET_SIZE	0x04
 #define	X25_MASK_WINDOW_SIZE	0x08
 
+#define X25_MASK_CALLING_AE 0x10
+#define X25_MASK_CALLED_AE 0x20
 
 
 /*
@@ -98,6 +104,26 @@ struct x25_facilities {
 	unsigned int	reverse;
 };
 
+/*
+* ITU DTE facilities
+* Only the called and calling address
+* extension are currently implemented.
+* The rest are in place to avoid the struct
+* changing size if someone needs them later
+*/
+
+struct x25_dte_facilities {
+	__u16 delay_cumul;
+	__u16 delay_target;
+	__u16 delay_max;
+	__u8 min_throughput;
+	__u8 expedited;
+	__u8 calling_len;
+	__u8 called_len;
+	__u8 calling_ae[20];
+	__u8 called_ae[20];
+};
+
 /*
  *	Call User Data structure.
  */
diff --git a/include/net/x25.h b/include/net/x25.h
index fee62ff8c194..0ad90ebcf86e 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -101,9 +101,17 @@ enum {
 #define	X25_FAC_PACKET_SIZE	0x42
 #define	X25_FAC_WINDOW_SIZE	0x43
 
-#define	X25_MAX_FAC_LEN		20		/* Plenty to spare */
+#define X25_MAX_FAC_LEN 	60
 #define	X25_MAX_CUD_LEN		128
 
+#define X25_FAC_CALLING_AE 	0xCB
+#define X25_FAC_CALLED_AE 	0xC9
+
+#define X25_MARKER 		0x00
+#define X25_DTE_SERVICES 	0x0F
+#define X25_MAX_AE_LEN 		40			/* Max num of semi-octets in AE - OSI Nw */
+#define X25_MAX_DTE_FACIL_LEN	21			/* Max length of DTE facility params */
+
 /**
  *	struct x25_route - x25 routing entry
  *	@node - entry in x25_list_lock
@@ -148,6 +156,7 @@ struct x25_sock {
 	struct timer_list	timer;
 	struct x25_causediag	causediag;
 	struct x25_facilities	facilities;
+	struct x25_dte_facilities dte_facilities;
 	struct x25_calluserdata	calluserdata;
 	unsigned long 		vc_facil_mask;	/* inc_call facilities mask */
 };
@@ -180,9 +189,13 @@ extern void x25_establish_link(struct x25_neigh *);
 extern void x25_terminate_link(struct x25_neigh *);
 
 /* x25_facilities.c */
-extern int  x25_parse_facilities(struct sk_buff *, struct x25_facilities *, unsigned long *);
-extern int  x25_create_facilities(unsigned char *, struct x25_facilities *, unsigned long);
-extern int  x25_negotiate_facilities(struct sk_buff *, struct sock *, struct x25_facilities *);
+extern int x25_parse_facilities(struct sk_buff *, struct x25_facilities *,
+				struct x25_dte_facilities *, unsigned long *);
+extern int x25_create_facilities(unsigned char *, struct x25_facilities *,
+				struct x25_dte_facilities *, unsigned long);
+extern int x25_negotiate_facilities(struct sk_buff *, struct sock *,
+				struct x25_facilities *,
+				struct x25_dte_facilities *);
 extern void x25_limit_facilities(struct x25_facilities *, struct x25_neigh *);
 
 /* x25_in.c */
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 03725c051752..7bf93df7248b 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -525,6 +525,13 @@ static int x25_create(struct socket *sock, int protocol)
 	x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE;
 	x25->facilities.throughput  = X25_DEFAULT_THROUGHPUT;
 	x25->facilities.reverse     = X25_DEFAULT_REVERSE;
+ 	x25->dte_facilities.calling_len = 0;
+ 	x25->dte_facilities.called_len = 0;
+ 	memset(x25->dte_facilities.called_ae, '\0',
+		 	sizeof(x25->dte_facilities.called_ae));
+ 	memset(x25->dte_facilities.calling_ae, '\0',
+		 	sizeof(x25->dte_facilities.calling_ae));
+
 	rc = 0;
 out:
 	return rc;
@@ -561,6 +568,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	x25->t2         = ox25->t2;
 	x25->facilities = ox25->facilities;
 	x25->qbitincl   = ox25->qbitincl;
+	x25->dte_facilities = ox25->dte_facilities;
 	x25->cudmatchlength = ox25->cudmatchlength;
 	x25->accptapprv = ox25->accptapprv;
 
@@ -840,6 +848,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	struct x25_sock *makex25;
 	struct x25_address source_addr, dest_addr;
 	struct x25_facilities facilities;
+	struct x25_dte_facilities dte_facilities;
 	int len, rc;
 
 	/*
@@ -876,7 +885,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	/*
 	 *	Try to reach a compromise on the requested facilities.
 	 */
-	if ((len = x25_negotiate_facilities(skb, sk, &facilities)) == -1)
+	len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities);
+	if (len == -1)
 		goto out_sock_put;
 
 	/*
@@ -907,9 +917,12 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	makex25->source_addr   = source_addr;
 	makex25->neighbour     = nb;
 	makex25->facilities    = facilities;
+	makex25->dte_facilities= dte_facilities;
 	makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
 	/* ensure no reverse facil on accept */
 	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
+	/* ensure no calling address extension on accept */
+	makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE;
 	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
 
 	/* Normally all calls are accepted immediatly */
@@ -1316,6 +1329,36 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			break;
 		}
 
+		case SIOCX25GDTEFACILITIES: {
+ 			rc = copy_to_user(argp, &x25->dte_facilities,
+						sizeof(x25->dte_facilities));
+			if (rc)
+				rc = -EFAULT;
+ 			break;
+ 		}
+
+	 	case SIOCX25SDTEFACILITIES: {
+	 		struct x25_dte_facilities dtefacs;
+	 		rc = -EFAULT;
+		 	if (copy_from_user(&dtefacs, argp, sizeof(dtefacs)))
+				break;
+			rc = -EINVAL;
+			if (sk->sk_state != TCP_LISTEN &&
+					sk->sk_state != TCP_CLOSE)
+				break;
+			if (dtefacs.calling_len > X25_MAX_AE_LEN)
+				break;
+			if (dtefacs.calling_ae == NULL)
+				break;
+			if (dtefacs.called_len > X25_MAX_AE_LEN)
+				break;
+			if (dtefacs.called_ae == NULL)
+				break;
+			x25->dte_facilities = dtefacs;
+			rc = 0;
+			break;
+		}
+
 		case SIOCX25GCALLUSERDATA: {
 			struct x25_calluserdata cud = x25->calluserdata;
 			rc = copy_to_user(argp, &cud,
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index 54278b962f4c..9f42b9c9de37 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -28,18 +28,28 @@
 #include <net/x25.h>
 
 /*
- *	Parse a set of facilities into the facilities structure. Unrecognised
+ * Parse a set of facilities into the facilities structures. Unrecognised
  *	facilities are written to the debug log file.
  */
-int x25_parse_facilities(struct sk_buff *skb,
-			 struct x25_facilities *facilities,
-			 unsigned long *vc_fac_mask)
+int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
+		struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask)
 {
 	unsigned char *p = skb->data;
 	unsigned int len = *p++;
 
 	*vc_fac_mask = 0;
 
+	/*
+	 * The kernel knows which facilities were set on an incoming call but
+	 * currently this information is not available to userspace.  Here we
+	 * give userspace who read incoming call facilities 0 length to indicate
+	 * it wasn't set.
+	 */
+	dte_facs->calling_len = 0;
+	dte_facs->called_len = 0;
+	memset(dte_facs->called_ae, '\0', sizeof(dte_facs->called_ae));
+	memset(dte_facs->calling_ae, '\0', sizeof(dte_facs->calling_ae));
+
 	while (len > 0) {
 		switch (*p & X25_FAC_CLASS_MASK) {
 		case X25_FAC_CLASS_A:
@@ -74,6 +84,8 @@ int x25_parse_facilities(struct sk_buff *skb,
 				facilities->throughput = p[1];
 				*vc_fac_mask |= X25_MASK_THROUGHPUT;
 				break;
+			case X25_MARKER:
+				break;
 			default:
 				printk(KERN_DEBUG "X.25: unknown facility "
 				       "%02X, value %02X\n",
@@ -112,11 +124,30 @@ int x25_parse_facilities(struct sk_buff *skb,
 			len -= 4;
 			break;
 		case X25_FAC_CLASS_D:
-			printk(KERN_DEBUG "X.25: unknown facility %02X, "
-			       "length %d, values %02X, %02X, %02X, %02X\n",
-			       p[0], p[1], p[2], p[3], p[4], p[5]);
+			switch (*p) {
+	 		case X25_FAC_CALLING_AE:
+	 			if (p[1] > X25_MAX_DTE_FACIL_LEN)
+					break;
+				dte_facs->calling_len = p[2];
+				memcpy(dte_facs->calling_ae, &p[3], p[1] - 1);
+				*vc_fac_mask |= X25_MASK_CALLING_AE;
+				break;
+			case X25_FAC_CALLED_AE:
+				if (p[1] > X25_MAX_DTE_FACIL_LEN)
+					break;
+				dte_facs->called_len = p[2];
+				memcpy(dte_facs->called_ae, &p[3], p[1] - 1);
+				*vc_fac_mask |= X25_MASK_CALLED_AE;
+				break;
+			default:
+				printk(KERN_DEBUG "X.25: unknown facility %02X,"
+					"length %d, values %02X, %02X, "
+					"%02X, %02X\n",
+					p[0], p[1], p[2], p[3], p[4], p[5]);
+				break;
+			}
 			len -= p[1] + 2;
-			p   += p[1] + 2;
+			p += p[1] + 2;
 			break;
 		}
 	}
@@ -128,8 +159,8 @@ int x25_parse_facilities(struct sk_buff *skb,
  *	Create a set of facilities.
  */
 int x25_create_facilities(unsigned char *buffer,
-			  struct x25_facilities *facilities,
-			  unsigned long facil_mask)
+		struct x25_facilities *facilities,
+		struct x25_dte_facilities *dte_facs, unsigned long facil_mask)
 {
 	unsigned char *p = buffer + 1;
 	int len;
@@ -168,6 +199,33 @@ int x25_create_facilities(unsigned char *buffer,
 		*p++ = facilities->winsize_out ? : facilities->winsize_in;
 	}
 
+	if (facil_mask & (X25_MASK_CALLING_AE|X25_MASK_CALLED_AE)) {
+		*p++ = X25_MARKER;
+		*p++ = X25_DTE_SERVICES;
+	}
+
+	if (dte_facs->calling_len && (facil_mask & X25_MASK_CALLING_AE)) {
+		unsigned bytecount = (dte_facs->calling_len % 2) ?
+					dte_facs->calling_len / 2 + 1 :
+					dte_facs->calling_len / 2;
+		*p++ = X25_FAC_CALLING_AE;
+		*p++ = 1 + bytecount;
+		*p++ = dte_facs->calling_len;
+		memcpy(p, dte_facs->calling_ae, bytecount);
+		p += bytecount;
+	}
+
+	if (dte_facs->called_len && (facil_mask & X25_MASK_CALLED_AE)) {
+		unsigned bytecount = (dte_facs->called_len % 2) ?
+		dte_facs->called_len / 2 + 1 :
+		dte_facs->called_len / 2;
+		*p++ = X25_FAC_CALLED_AE;
+		*p++ = 1 + bytecount;
+		*p++ = dte_facs->called_len;
+		memcpy(p, dte_facs->called_ae, bytecount);
+		p+=bytecount;
+	}
+
 	len       = p - buffer;
 	buffer[0] = len - 1;
 
@@ -180,7 +238,7 @@ int x25_create_facilities(unsigned char *buffer,
  *	The only real problem is with reverse charging.
  */
 int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
-			     struct x25_facilities *new)
+		struct x25_facilities *new, struct x25_dte_facilities *dte)
 {
 	struct x25_sock *x25 = x25_sk(sk);
 	struct x25_facilities *ours = &x25->facilities;
@@ -190,7 +248,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
 	memset(&theirs, 0, sizeof(theirs));
 	memcpy(new, ours, sizeof(*new));
 
-	len = x25_parse_facilities(skb, &theirs, &x25->vc_facil_mask);
+	len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask);
 
 	/*
 	 *	They want reverse charging, we won't accept it.
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index 26146874b839..eed50e10f09b 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -106,7 +106,8 @@ static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 			skb_pull(skb, x25_addr_ntoa(skb->data, &source_addr, &dest_addr));
 			skb_pull(skb,
 				 x25_parse_facilities(skb, &x25->facilities,
-						      &x25->vc_facil_mask));
+						&x25->dte_facilities,
+						&x25->vc_facil_mask));
 			/*
 			 *	Copy any Call User Data.
 			 */
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 8be9b8fbc24d..8d6220aa5d0f 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -190,8 +190,9 @@ void x25_write_internal(struct sock *sk, int frametype)
 			dptr    = skb_put(skb, len);
 			memcpy(dptr, addresses, len);
 			len     = x25_create_facilities(facilities,
-							&x25->facilities,
-					     x25->neighbour->global_facil_mask);
+					&x25->facilities,
+					&x25->dte_facilities,
+					x25->neighbour->global_facil_mask);
 			dptr    = skb_put(skb, len);
 			memcpy(dptr, facilities, len);
 			dptr = skb_put(skb, x25->calluserdata.cudlength);
@@ -206,6 +207,7 @@ void x25_write_internal(struct sock *sk, int frametype)
 			*dptr++ = 0x00;		/* Address lengths */
 			len     = x25_create_facilities(facilities,
 							&x25->facilities,
+							&x25->dte_facilities,
 							x25->vc_facil_mask);
 			dptr    = skb_put(skb, len);
 			memcpy(dptr, facilities, len);
-- 
cgit v1.2.3


From 9d2f928ddf64ca0361562e30faf584cd33055c60 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@nuerscht.ch>
Date: Wed, 22 Mar 2006 10:53:19 +0100
Subject: [PATCH] Intruduce DMA_28BIT_MASK

This patch introduces the DMA_28BIT_MASK constant in dma-mapping.h
ALSA drivers using this mask are changed to use the new constant.

Signed-off-by: Tobias Klauser <tklauser@nuerscht.ch>
Acked-by: Takashi Iwai <tiwai@suse.de>
Acked-by: Jaroslav Kysela <perex@suse.cz>
---
 include/linux/dma-mapping.h  |  1 +
 sound/pci/ad1889.c           |  7 ++++---
 sound/pci/emu10k1/emu10k1x.c | 13 +++++++------
 sound/pci/es1968.c           |  5 +++--
 sound/pci/ice1712/ice1712.c  |  5 +++--
 sound/pci/maestro3.c         |  5 +++--
 sound/pci/mixart/mixart.c    |  3 ++-
 sound/pci/pcxhr/pcxhr.c      |  3 ++-
 8 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2d80cc761a15..a8731062a74c 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -20,6 +20,7 @@ enum dma_data_direction {
 #define DMA_31BIT_MASK	0x000000007fffffffULL
 #define DMA_30BIT_MASK	0x000000003fffffffULL
 #define DMA_29BIT_MASK	0x000000001fffffffULL
+#define DMA_28BIT_MASK	0x000000000fffffffULL
 
 #include <asm/dma-mapping.h>
 
diff --git a/sound/pci/ad1889.c b/sound/pci/ad1889.c
index a208075cdc1e..2aa5a7fdb6e0 100644
--- a/sound/pci/ad1889.c
+++ b/sound/pci/ad1889.c
@@ -34,6 +34,7 @@
 
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/compiler.h>
@@ -909,10 +910,10 @@ snd_ad1889_create(struct snd_card *card,
 
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
-	
+
 	/* check PCI availability (32bit DMA) */
-	if (pci_set_dma_mask(pci, 0xffffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0xffffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) < 0) {
 		printk(KERN_ERR PFX "error setting 32-bit DMA mask.\n");
 		pci_disable_device(pci);
 		return -ENXIO;
diff --git a/sound/pci/emu10k1/emu10k1x.c b/sound/pci/emu10k1/emu10k1x.c
index 1107c8ec7f78..2208dbd48be9 100644
--- a/sound/pci/emu10k1/emu10k1x.c
+++ b/sound/pci/emu10k1/emu10k1x.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/moduleparam.h>
 #include <sound/core.h>
@@ -893,24 +894,24 @@ static int __devinit snd_emu10k1x_create(struct snd_card *card,
 	static struct snd_device_ops ops = {
 		.dev_free = snd_emu10k1x_dev_free,
 	};
-  
+
 	*rchip = NULL;
-  
+
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
-	if (pci_set_dma_mask(pci, 0x0fffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "error to set 28bit mask DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
 	}
-  
+
 	chip = kzalloc(sizeof(*chip), GFP_KERNEL);
 	if (chip == NULL) {
 		pci_disable_device(pci);
 		return -ENOMEM;
 	}
-  
+
 	chip->card = card;
 	chip->pci = pci;
 	chip->irq = -1;
diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c
index 6a265ab3894e..dd465a186e11 100644
--- a/sound/pci/es1968.c
+++ b/sound/pci/es1968.c
@@ -100,6 +100,7 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/gameport.h>
 #include <linux/moduleparam.h>
@@ -2561,8 +2562,8 @@ static int __devinit snd_es1968_create(struct snd_card *card,
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
 	/* check, if we can restrict PCI DMA transfers to 28 bits */
-	if (pci_set_dma_mask(pci, 0x0fffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 28bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
diff --git a/sound/pci/ice1712/ice1712.c b/sound/pci/ice1712/ice1712.c
index b96b5d6efc5d..672e198317e1 100644
--- a/sound/pci/ice1712/ice1712.c
+++ b/sound/pci/ice1712/ice1712.c
@@ -53,6 +53,7 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/moduleparam.h>
 #include <linux/mutex.h>
@@ -2553,8 +2554,8 @@ static int __devinit snd_ice1712_create(struct snd_card *card,
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
 	/* check, if we can restrict PCI DMA transfers to 28 bits */
-	if (pci_set_dma_mask(pci, 0x0fffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 28bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c
index d3ef0cc6c4f9..8bc084956c28 100644
--- a/sound/pci/maestro3.c
+++ b/sound/pci/maestro3.c
@@ -37,6 +37,7 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/moduleparam.h>
@@ -2657,8 +2658,8 @@ snd_m3_create(struct snd_card *card, struct pci_dev *pci,
 		return -EIO;
 
 	/* check, if we can restrict PCI DMA transfers to 28 bits */
-	if (pci_set_dma_mask(pci, 0x0fffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x0fffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 28bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
diff --git a/sound/pci/mixart/mixart.c b/sound/pci/mixart/mixart.c
index e79fb264532b..43ee3b2b948f 100644
--- a/sound/pci/mixart/mixart.c
+++ b/sound/pci/mixart/mixart.c
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/moduleparam.h>
 #include <linux/mutex.h>
 #include <sound/core.h>
@@ -1289,7 +1290,7 @@ static int __devinit snd_mixart_probe(struct pci_dev *pci,
 	pci_set_master(pci);
 
 	/* check if we can restrict PCI DMA transfers to 32 bits */
-	if (pci_set_dma_mask(pci, 0xffffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 32bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
diff --git a/sound/pci/pcxhr/pcxhr.c b/sound/pci/pcxhr/pcxhr.c
index 31a3e8e1b234..f679779d96e3 100644
--- a/sound/pci/pcxhr/pcxhr.c
+++ b/sound/pci/pcxhr/pcxhr.c
@@ -26,6 +26,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/delay.h>
 #include <linux/moduleparam.h>
 #include <linux/mutex.h>
@@ -1217,7 +1218,7 @@ static int __devinit pcxhr_probe(struct pci_dev *pci, const struct pci_device_id
 	pci_set_master(pci);
 
 	/* check if we can restrict PCI DMA transfers to 32 bits */
-	if (pci_set_dma_mask(pci, 0xffffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_32BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 32bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
-- 
cgit v1.2.3


From 4024ce5e0f396447cc1e07fd65c2a1d056b066bb Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Wed, 22 Mar 2006 00:07:43 -0800
Subject: [PATCH] rtc.h broke strace(1) builds

Git patch 52dfa9a64cfb3dd01fa1ee1150d589481e54e28e

	[PATCH] move rtc_interrupt() prototype to rtc.h

broke strace(1) builds.  The below moves the kernel-only additions lower,
under the already provided #ifdef __KERNEL__ statement.

Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rtc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 0b2ba67ff13c..b739ac1f7ca0 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -11,8 +11,6 @@
 #ifndef _LINUX_RTC_H_
 #define _LINUX_RTC_H_
 
-#include <linux/interrupt.h>
-
 /*
  * The struct used to pass data via the following ioctl. Similar to the
  * struct tm in <time.h>, but it needs to be here so that the kernel 
@@ -95,6 +93,8 @@ struct rtc_pll_info {
 
 #ifdef __KERNEL__
 
+#include <linux/interrupt.h>
+
 typedef struct rtc_task {
 	void (*func)(void *private_data);
 	void *private_data;
-- 
cgit v1.2.3


From 8d438f96d2b8eade6cbcd8adfc22dae6f5cbd6c0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:07:59 -0800
Subject: [PATCH] mm: PageLRU no testset

PG_lru is protected by zone->lru_lock. It does not need TestSet/TestClear
operations.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h |  5 ++---
 mm/swap.c                  | 16 ++++++++--------
 mm/vmscan.c                | 20 +++++++++++---------
 3 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d52999c43336..58856c823f8b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -239,10 +239,9 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define __ClearPageDirty(page)	__clear_bit(PG_dirty, &(page)->flags)
 #define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags)
 
-#define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
 #define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
-#define TestSetPageLRU(page)	test_and_set_bit(PG_lru, &(page)->flags)
-#define TestClearPageLRU(page)	test_and_clear_bit(PG_lru, &(page)->flags)
+#define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
+#define ClearPageLRU(page)	clear_bit(PG_lru, &(page)->flags)
 
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
diff --git a/mm/swap.c b/mm/swap.c
index 3045a0f4c451..985324ee9368 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -214,8 +214,8 @@ void fastcall __page_cache_release(struct page *page)
 		struct zone *zone = page_zone(page);
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
-		if (!TestClearPageLRU(page))
-			BUG();
+		BUG_ON(!PageLRU(page));
+		ClearPageLRU(page);
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
@@ -265,8 +265,8 @@ void release_pages(struct page **pages, int nr, int cold)
 				zone = pagezone;
 				spin_lock_irq(&zone->lru_lock);
 			}
-			if (!TestClearPageLRU(page))
-				BUG();
+			BUG_ON(!PageLRU(page));
+			ClearPageLRU(page);
 			del_page_from_lru(zone, page);
 		}
 
@@ -345,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		add_page_to_inactive_list(zone, page);
 	}
 	if (zone)
@@ -372,8 +372,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		if (TestSetPageActive(page))
 			BUG();
 		add_page_to_active_list(zone, page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index acb7611cd525..40fb37828e8c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1042,9 +1042,10 @@ int isolate_lru_page(struct page *page)
 	if (PageLRU(page)) {
 		struct zone *zone = page_zone(page);
 		spin_lock_irq(&zone->lru_lock);
-		if (TestClearPageLRU(page)) {
+		if (PageLRU(page)) {
 			ret = 1;
 			get_page(page);
+			ClearPageLRU(page);
 			if (PageActive(page))
 				del_page_from_active_list(zone, page);
 			else
@@ -1085,6 +1086,8 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
+		BUG_ON(!PageLRU(page));
+
 		list_del(&page->lru);
 		if (unlikely(get_page_testone(page))) {
 			/*
@@ -1100,8 +1103,7 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		 * the page is not being freed elsewhere -- the page release
 		 * code relies on it.
 		 */
-		if (!TestClearPageLRU(page))
-			BUG();
+		ClearPageLRU(page);
 		list_add(&page->lru, dst);
 		nr_taken++;
 	}
@@ -1156,8 +1158,8 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 		 */
 		while (!list_empty(&page_list)) {
 			page = lru_to_page(&page_list);
-			if (TestSetPageLRU(page))
-				BUG();
+			BUG_ON(PageLRU(page));
+			SetPageLRU(page);
 			list_del(&page->lru);
 			if (PageActive(page))
 				add_page_to_active_list(zone, page);
@@ -1276,8 +1278,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		if (!TestClearPageActive(page))
 			BUG();
 		list_move(&page->lru, &zone->inactive_list);
@@ -1305,8 +1307,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
-- 
cgit v1.2.3


From 4c84cacfa424264f7ad5287298d3ea4a3e935278 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:00 -0800
Subject: [PATCH] mm: PageActive no testset

PG_active is protected by zone->lru_lock, it does not need TestSet/TestClear
operations.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 2 --
 mm/swap.c                  | 4 ++--
 mm/vmscan.c                | 5 +++--
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 58856c823f8b..5d1e7bd85107 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -246,8 +246,6 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
-#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
-#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
diff --git a/mm/swap.c b/mm/swap.c
index 985324ee9368..cf88226cf96d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -374,8 +374,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
 		}
 		BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		if (TestSetPageActive(page))
-			BUG();
+		BUG_ON(PageActive(page));
+		SetPageActive(page);
 		add_page_to_active_list(zone, page);
 	}
 	if (zone)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 40fb37828e8c..8e477b1a4838 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1280,8 +1280,9 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
 		BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		if (!TestClearPageActive(page))
-			BUG();
+		BUG_ON(!PageActive(page));
+		ClearPageActive(page);
+
 		list_move(&page->lru, &zone->inactive_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-- 
cgit v1.2.3


From 674539115cc88473f623581e1d53c0e2ecef2179 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:00 -0800
Subject: [PATCH] mm: less atomic ops

In the page release paths, we can be sure that nobody will mess with our
page->flags because the refcount has dropped to 0.  So no need for atomic
operations here.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm_inline.h  | 2 +-
 include/linux/page-flags.h | 2 ++
 mm/swap.c                  | 4 ++--
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8ac854f7f190..3b6723dfaff3 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -32,7 +32,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
 {
 	list_del(&page->lru);
 	if (PageActive(page)) {
-		ClearPageActive(page);
+		__ClearPageActive(page);
 		zone->nr_active--;
 	} else {
 		zone->nr_inactive--;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5d1e7bd85107..da71d63df465 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -242,10 +242,12 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
 #define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
 #define ClearPageLRU(page)	clear_bit(PG_lru, &(page)->flags)
+#define __ClearPageLRU(page)	__clear_bit(PG_lru, &(page)->flags)
 
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
+#define __ClearPageActive(page)	__clear_bit(PG_active, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
diff --git a/mm/swap.c b/mm/swap.c
index cf88226cf96d..91b7e2026f69 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -215,7 +215,7 @@ void fastcall __page_cache_release(struct page *page)
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		BUG_ON(!PageLRU(page));
-		ClearPageLRU(page);
+		__ClearPageLRU(page);
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
@@ -266,7 +266,7 @@ void release_pages(struct page **pages, int nr, int cold)
 				spin_lock_irq(&zone->lru_lock);
 			}
 			BUG_ON(!PageLRU(page));
-			ClearPageLRU(page);
+			__ClearPageLRU(page);
 			del_page_from_lru(zone, page);
 		}
 
-- 
cgit v1.2.3


From 5e9dace8d386def04219134d7160e8a778824764 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:01 -0800
Subject: [PATCH] mm: page_alloc less atomics

More atomic operation removal from page allocator

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 4 ++--
 mm/page_alloc.c            | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index da71d63df465..76c7ffdd0424 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -328,8 +328,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags)
 
 #define PageCompound(page)	test_bit(PG_compound, &(page)->flags)
-#define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
-#define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
+#define __SetPageCompound(page)	__set_bit(PG_compound, &(page)->flags)
+#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags)
 
 #ifdef CONFIG_SWAP
 #define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 61775866ea18..102919851353 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -190,7 +190,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
-		SetPageCompound(p);
+		__SetPageCompound(p);
 		set_page_private(p, (unsigned long)page);
 	}
 }
@@ -209,7 +209,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 		if (unlikely(!PageCompound(p) |
 				(page_private(p) != (unsigned long)page)))
 			bad_page(page);
-		ClearPageCompound(p);
+		__ClearPageCompound(p);
 	}
 }
 
-- 
cgit v1.2.3


From f205b2fe62d321403525065a4cb31b6bff1bbe53 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:02 -0800
Subject: [PATCH] mm: slab less atomics

Atomic operation removal from slab

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 6 ++----
 mm/slab.c                  | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 76c7ffdd0424..8cef69d462f2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -250,10 +250,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define __ClearPageActive(page)	__clear_bit(PG_active, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
-#define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
-#define ClearPageSlab(page)	clear_bit(PG_slab, &(page)->flags)
-#define TestClearPageSlab(page)	test_and_clear_bit(PG_slab, &(page)->flags)
-#define TestSetPageSlab(page)	test_and_set_bit(PG_slab, &(page)->flags)
+#define __SetPageSlab(page)	__set_bit(PG_slab, &(page)->flags)
+#define __ClearPageSlab(page)	__clear_bit(PG_slab, &(page)->flags)
 
 #ifdef CONFIG_HIGHMEM
 #define PageHighMem(page)	is_highmem(page_zone(page))
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab04..5988adf010c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1402,7 +1402,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		atomic_add(i, &slab_reclaim_pages);
 	add_page_state(nr_slab, i);
 	while (i--) {
-		SetPageSlab(page);
+		__SetPageSlab(page);
 		page++;
 	}
 	return addr;
@@ -1418,8 +1418,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 	const unsigned long nr_freed = i;
 
 	while (i--) {
-		if (!TestClearPageSlab(page))
-			BUG();
+		BUG_ON(!PageSlab(page));
+		__ClearPageSlab(page);
 		page++;
 	}
 	sub_page_state(nr_slab, nr_freed);
-- 
cgit v1.2.3


From 7c8ee9a86340db686cd4314e9944dc9b6111bda9 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:03 -0800
Subject: [PATCH] mm: simplify vmscan vs release refcounting

The VM has an interesting race where a page refcount can drop to zero, but it
is still on the LRU lists for a short time.  This was solved by testing a 0->1
refcount transition when picking up pages from the LRU, and dropping the
refcount in that case.

Instead, use atomic_add_unless to ensure we never pick up a 0 refcount page
from the LRU, thus a 0 refcount page will never have its refcount elevated
until it is allocated again.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 19 +++++++++++--------
 mm/vmscan.c        | 25 +++++++++++--------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 498ff8778fb6..b12d5c76420d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -301,17 +301,20 @@ struct page {
  * Drop a ref, return true if the logical refcount fell to zero (the page has
  * no users)
  */
-#define put_page_testzero(p)				\
-	({						\
-		BUG_ON(atomic_read(&(p)->_count) == -1);\
-		atomic_add_negative(-1, &(p)->_count);	\
-	})
+static inline int put_page_testzero(struct page *page)
+{
+	BUG_ON(atomic_read(&page->_count) == -1);
+	return atomic_add_negative(-1, &page->_count);
+}
 
 /*
- * Grab a ref, return true if the page previously had a logical refcount of
- * zero.  ie: returns true if we just grabbed an already-deemed-to-be-free page
+ * Try to grab a ref unless the page has a refcount of zero, return false if
+ * that is the case.
  */
-#define get_page_testone(p)	atomic_inc_and_test(&(p)->_count)
+static inline int get_page_unless_zero(struct page *page)
+{
+	return atomic_add_unless(&page->_count, 1, -1);
+}
 
 #define set_page_count(p,v) 	atomic_set(&(p)->_count, (v) - 1)
 #define __put_page(p)		atomic_dec(&(p)->_count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e477b1a4838..e21bab4deda6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1083,29 +1083,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 	int scan = 0;
 
 	while (scan++ < nr_to_scan && !list_empty(src)) {
+		struct list_head *target;
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
 		BUG_ON(!PageLRU(page));
 
 		list_del(&page->lru);
-		if (unlikely(get_page_testone(page))) {
+		target = src;
+		if (likely(get_page_unless_zero(page))) {
 			/*
-			 * It is being freed elsewhere
+			 * Be careful not to clear PageLRU until after we're
+			 * sure the page is not being freed elsewhere -- the
+			 * page release code relies on it.
 			 */
-			__put_page(page);
-			list_add(&page->lru, src);
-			continue;
-		}
+			ClearPageLRU(page);
+			target = dst;
+			nr_taken++;
+		} /* else it is being freed elsewhere */
 
-		/*
-		 * Be careful not to clear PageLRU until after we're sure
-		 * the page is not being freed elsewhere -- the page release
-		 * code relies on it.
-		 */
-		ClearPageLRU(page);
-		list_add(&page->lru, dst);
-		nr_taken++;
+		list_add(&page->lru, target);
 	}
 
 	*scanned = scan;
-- 
cgit v1.2.3


From 8dc04efbfb3c08a08fb7a3b97348d5d561b26ae2 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:03 -0800
Subject: [PATCH] mm: de-skew page refcounting

atomic_add_unless (atomic_inc_not_zero) no longer requires an offset refcount
to function correctly.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b12d5c76420d..9bbddf228cd9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -286,15 +286,6 @@ struct page {
  *
  * Also, many kernel routines increase the page count before a critical
  * routine so they can be sure the page doesn't go away from under them.
- *
- * Since 2.6.6 (approx), a free page has ->_count = -1.  This is so that we
- * can use atomic_add_negative(-1, page->_count) to detect when the page
- * becomes free and so that we can also use atomic_inc_and_test to atomically
- * detect when we just tried to grab a ref on a page which some other CPU has
- * already deemed to be freeable.
- *
- * NO code should make assumptions about this internal detail!  Use the provided
- * macros which retain the old rules: page_count(page) == 0 is a free page.
  */
 
 /*
@@ -303,8 +294,8 @@ struct page {
  */
 static inline int put_page_testzero(struct page *page)
 {
-	BUG_ON(atomic_read(&page->_count) == -1);
-	return atomic_add_negative(-1, &page->_count);
+	BUG_ON(atomic_read(&page->_count) == 0);
+	return atomic_dec_and_test(&page->_count);
 }
 
 /*
@@ -313,10 +304,10 @@ static inline int put_page_testzero(struct page *page)
  */
 static inline int get_page_unless_zero(struct page *page)
 {
-	return atomic_add_unless(&page->_count, 1, -1);
+	return atomic_inc_not_zero(&page->_count);
 }
 
-#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v) - 1)
+#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v))
 #define __put_page(p)		atomic_dec(&(p)->_count)
 
 extern void FASTCALL(__page_cache_release(struct page *));
@@ -325,7 +316,7 @@ static inline int page_count(struct page *page)
 {
 	if (PageCompound(page))
 		page = (struct page *)page_private(page);
-	return atomic_read(&page->_count) + 1;
+	return atomic_read(&page->_count);
 }
 
 static inline void get_page(struct page *page)
-- 
cgit v1.2.3


From 8dfcc9ba27e2ed257e5de9539f7f03e57c2c0e33 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:05 -0800
Subject: [PATCH] mm: split highorder pages

Have an explicit mm call to split higher order pages into individual pages.
 Should help to avoid bugs and be more explicit about the code's intention.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/consistent.c      |  4 ++--
 arch/frv/mm/dma-alloc.c       |  4 +---
 arch/mips/mm/init.c           |  5 +++--
 arch/ppc/kernel/dma-mapping.c |  4 ++--
 arch/sh/mm/consistent.c       |  3 +--
 arch/xtensa/mm/pgtable.c      | 10 +++-------
 include/linux/mm.h            |  6 ++++++
 mm/memory.c                   |  4 +---
 mm/page_alloc.c               | 22 ++++++++++++++++++++++
 9 files changed, 41 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c
index c2ee18d2075e..8a1bfcd50087 100644
--- a/arch/arm/mm/consistent.c
+++ b/arch/arm/mm/consistent.c
@@ -223,6 +223,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		pte = consistent_pte[idx] + off;
 		c->vm_pages = page;
 
+		split_page(page, order);
+
 		/*
 		 * Set the "dma handle"
 		 */
@@ -231,7 +233,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		do {
 			BUG_ON(!pte_none(*pte));
 
-			set_page_count(page, 1);
 			/*
 			 * x86 does not mark the pages reserved...
 			 */
@@ -250,7 +251,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		 * Free the otherwise unused pages.
 		 */
 		while (page < end) {
-			set_page_count(page, 1);
 			__free_page(page);
 			page++;
 		}
diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c
index 342823aad758..636b2f8b5d98 100644
--- a/arch/frv/mm/dma-alloc.c
+++ b/arch/frv/mm/dma-alloc.c
@@ -115,9 +115,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle)
 	 */
 	if (order > 0) {
 		struct page *rpage = virt_to_page(page);
-
-		for (i = 1; i < (1 << order); i++)
-			set_page_count(rpage + i, 1);
+		split_page(rpage, order);
 	}
 
 	err = 0;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 0ff9a348b843..a140da9732db 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -54,7 +54,8 @@ unsigned long empty_zero_page, zero_page_mask;
  */
 unsigned long setup_zero_pages(void)
 {
-	unsigned long order, size;
+	unsigned int order;
+	unsigned long size;
 	struct page *page;
 
 	if (cpu_has_vce)
@@ -67,9 +68,9 @@ unsigned long setup_zero_pages(void)
 		panic("Oh boy, that early out of memory?");
 
 	page = virt_to_page(empty_zero_page);
+	split_page(page, order);
 	while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) {
 		SetPageReserved(page);
-		set_page_count(page, 1);
 		page++;
 	}
 
diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c
index 685fd0defe23..61465ec88bc7 100644
--- a/arch/ppc/kernel/dma-mapping.c
+++ b/arch/ppc/kernel/dma-mapping.c
@@ -223,6 +223,8 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
 		struct page *end = page + (1 << order);
 
+		split_page(page, order);
+
 		/*
 		 * Set the "dma handle"
 		 */
@@ -231,7 +233,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		do {
 			BUG_ON(!pte_none(*pte));
 
-			set_page_count(page, 1);
 			SetPageReserved(page);
 			set_pte_at(&init_mm, vaddr,
 				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
@@ -244,7 +245,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		 * Free the otherwise unused pages.
 		 */
 		while (page < end) {
-			set_page_count(page, 1);
 			__free_page(page);
 			page++;
 		}
diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c
index df3a9e452cc5..ee73e30263af 100644
--- a/arch/sh/mm/consistent.c
+++ b/arch/sh/mm/consistent.c
@@ -23,6 +23,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle)
 	page = alloc_pages(gfp, order);
 	if (!page)
 		return NULL;
+	split_page(page, order);
 
 	ret = page_address(page);
 	*handle = virt_to_phys(ret);
@@ -37,8 +38,6 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle)
 	end  = page + (1 << order);
 
 	while (++page < end) {
-		set_page_count(page, 1);
-
 		/* Free any unused pages */
 		if (page >= free) {
 			__free_page(page);
diff --git a/arch/xtensa/mm/pgtable.c b/arch/xtensa/mm/pgtable.c
index cbc56aedf13e..7d28914d11cb 100644
--- a/arch/xtensa/mm/pgtable.c
+++ b/arch/xtensa/mm/pgtable.c
@@ -21,13 +21,9 @@ pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 	p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER);
 
 	if (likely(p)) {
-		struct page *page;
+		split_page(virt_to_page(p), COLOR_ORDER);
 
 		for (i = 0; i < COLOR_SIZE; i++) {
-			page = virt_to_page(p);
-
-			set_page_count(page, 1);
-
 			if (ADDR_COLOR(p) == color)
 				pte = p;
 			else
@@ -55,9 +51,9 @@ struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER);
 
 	if (likely(p)) {
-		for (i = 0; i < PAGE_ORDER; i++) {
-			set_page_count(p, 1);
+		split_page(p, COLOR_ORDER);
 
+		for (i = 0; i < PAGE_ORDER; i++) {
 			if (PADDR_COLOR(page_address(p)) == color)
 				page = p;
 			else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9bbddf228cd9..e67980654c49 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -328,6 +328,12 @@ static inline void get_page(struct page *page)
 
 void put_page(struct page *page);
 
+#ifdef CONFIG_MMU
+void split_page(struct page *page, unsigned int order);
+#else
+static inline void split_page(struct page *page, unsigned int order) {}
+#endif
+
 /*
  * Multiple processes may "see" the same page. E.g. for untouched
  * mappings of /dev/null, all processes see the same page full of
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db29..6af555c1c42a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1221,9 +1221,7 @@ out:
  * The page has to be a nice clean _individual_ kernel allocation.
  * If you allocate a compound page, you need to have marked it as
  * such (__GFP_COMP), or manually just split the page up yourself
- * (which is mainly an issue of doing "set_page_count(page, 1)" for
- * each sub-page, and then freeing them one by one when you free
- * them rather than freeing it as a compound page).
+ * (see split_page()).
  *
  * NOTE! Traditionally this was done with "remap_pfn_range()" which
  * took an arbitrary page protection parameter. This doesn't allow
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 102919851353..fc65e87368b3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -752,6 +752,28 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
+#ifdef CONFIG_MMU
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0..n]
+ * Each sub-page must be freed individually.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
+{
+	int i;
+
+	BUG_ON(PageCompound(page));
+	BUG_ON(!page_count(page));
+	for (i = 1; i < (1 << order); i++) {
+		BUG_ON(page_count(page + i));
+		set_page_count(page + i, 1);
+	}
+}
+#endif
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
-- 
cgit v1.2.3


From 9d41415221214ca4820b9464dfa548e2f20e7dd5 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:06 -0800
Subject: [PATCH] mm: page_state comment more

Clarify that preemption needs to be guarded against with the
__xxx_page_state functions.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8cef69d462f2..9ea629c02a4b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -86,8 +86,9 @@
  * - The __xxx_page_state variants can be used safely when interrupts are
  * disabled.
  * - The __xxx_page_state variants can be used if the field is only
- * modified from process context, or only modified from interrupt context.
- * In this case, the field should be commented here.
+ * modified from process context and protected from preemption, or only
+ * modified from interrupt context.  In this case, the field should be
+ * commented here.
  */
 struct page_state {
 	unsigned long nr_dirty;		/* Dirty writeable pages */
-- 
cgit v1.2.3


From b50ec7d8070ae7a39fe78e65a8812bbc3ca2f7ac Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 22 Mar 2006 00:08:09 -0800
Subject: [PATCH] kcalloc(): INT_MAX -> ULONG_MAX

Since size_t has the same size as a long on all architectures, it's enough
for overflow checks to check against ULONG_MAX.

This change could allow a compiler better optimization (especially in the
n=1 case).

The practical effect seems to be positive, but quite small:

    text           data     bss      dec            hex filename
21762380        5859870 1848928 29471178        1c1b1ca vmlinux-old
21762211        5859870 1848928 29471009        1c1b121 vmlinux-patched

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8cf52939d0ab..38bed95dda7a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -118,7 +118,7 @@ extern void *kzalloc(size_t, gfp_t);
  */
 static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
-	if (n != 0 && size > INT_MAX / n)
+	if (n != 0 && size > ULONG_MAX / n)
 		return NULL;
 	return kzalloc(n * size, flags);
 }
-- 
cgit v1.2.3


From ac2b898ca6fb06196a26869c23b66afe7944e52e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:15 -0800
Subject: [PATCH] slab: Remove SLAB_NO_REAP option

SLAB_NO_REAP is documented as an option that will cause this slab not to be
reaped under memory pressure.  However, that is not what happens.  The only
thing that SLAB_NO_REAP controls at the moment is the reclaim of the unused
slab elements that were allocated in batch in cache_reap().  Cache_reap()
is run every few seconds independently of memory pressure.

Could we remove the whole thing?  Its only used by three slabs anyways and
I cannot find a reason for having this option.

There is an additional problem with SLAB_NO_REAP.  If set then the recovery
of objects from alien caches is switched off.  Objects not freed on the
same node where they were initially allocated will only be reused if a
certain amount of objects accumulates from one alien node (not very likely)
or if the cache is explicitly shrunk.  (Strangely __cache_shrink does not
check for SLAB_NO_REAP)

Getting rid of SLAB_NO_REAP fixes the problems with alien cache freeing.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/scsi/iscsi_tcp.c |  2 +-
 fs/ocfs2/super.c         |  2 +-
 include/linux/slab.h     |  1 -
 mm/slab.c                | 13 ++-----------
 4 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index ff79e68b347c..7b82ff090d42 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -3639,7 +3639,7 @@ iscsi_tcp_init(void)
 
 	taskcache = kmem_cache_create("iscsi_taskcache",
 			sizeof(struct iscsi_data_task), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL);
+			SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if (!taskcache)
 		return -ENOMEM;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8dd3aafec499..09e1c57a86a0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -959,7 +959,7 @@ static int ocfs2_initialize_mem_caches(void)
 	ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
 					     sizeof(struct ocfs2_journal_lock),
 					     0,
-					     SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
+					     SLAB_HWCACHE_ALIGN,
 					     NULL, NULL);
 	if (!ocfs2_lock_cache)
 		return -ENOMEM;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 38bed95dda7a..2b28c849d75a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -38,7 +38,6 @@ typedef struct kmem_cache kmem_cache_t;
 #define	SLAB_DEBUG_INITIAL	0x00000200UL	/* Call constructor (as verifier) */
 #define	SLAB_RED_ZONE		0x00000400UL	/* Red zone objs in a cache */
 #define	SLAB_POISON		0x00000800UL	/* Poison objects */
-#define	SLAB_NO_REAP		0x00001000UL	/* never reap from the cache */
 #define	SLAB_HWCACHE_ALIGN	0x00002000UL	/* align objs on a h/w cache lines */
 #define SLAB_CACHE_DMA		0x00004000UL	/* use GFP_DMA memory */
 #define SLAB_MUST_HWCACHE_ALIGN	0x00008000UL	/* force alignment */
diff --git a/mm/slab.c b/mm/slab.c
index 5c2574989834..24235506b2a0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -170,12 +170,12 @@
 #if DEBUG
 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
-			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
+			 SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU)
 #else
-# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU)
@@ -662,7 +662,6 @@ static struct kmem_cache cache_cache = {
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
 	.buffer_size = sizeof(struct kmem_cache),
-	.flags = SLAB_NO_REAP,
 	.name = "kmem_cache",
 #if DEBUG
 	.obj_size = sizeof(struct kmem_cache),
@@ -1848,9 +1847,6 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
  * for buffer overruns.
  *
- * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
- * memory pressure.
- *
  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
  * cacheline.  This can be beneficial if you're counting cycles as closely
  * as davem.
@@ -3584,10 +3580,6 @@ static void cache_reap(void *unused)
 		struct slab *slabp;
 
 		searchp = list_entry(walk, struct kmem_cache, next);
-
-		if (searchp->flags & SLAB_NO_REAP)
-			goto next;
-
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
@@ -3635,7 +3627,6 @@ static void cache_reap(void *unused)
 		} while (--tofree > 0);
 next_unlock:
 		spin_unlock_irq(&l3->list_lock);
-next:
 		cond_resched();
 	}
 	check_irq_on();
-- 
cgit v1.2.3


From 78eef01b0fae087c5fadbd85dd4fe2918c3a015f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:16 -0800
Subject: [PATCH] on_each_cpu(): disable local interrupts

When on_each_cpu() runs the callback on other CPUs, it runs with local
interrupts disabled.  So we should run the function with local interrupts
disabled on this CPU, too.

And do the same for UP, so the callback is run in the same environment on both
UP and SMP.  (strictly it should do preempt_disable() too, but I think
local_irq_disable is sufficiently equivalent).

Also uninlines on_each_cpu().  softirq.c was the most appropriate file I could
find, but it doesn't seem to justify creating a new file.

Oh, and fix up that comment over (under?) x86's smp_call_function().  It
drives me nuts.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smp.c | 28 ++++++++++++----------------
 include/linux/smp.h    | 23 +++++++++--------------
 kernel/softirq.c       | 20 ++++++++++++++++++++
 3 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 218d725a5a1e..d134e9643a58 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -504,27 +504,23 @@ void unlock_ipi_call_lock(void)
 	spin_unlock_irq(&call_lock);
 }
 
-static struct call_data_struct * call_data;
-
-/*
- * this function sends a 'generic call function' IPI to all other CPUs
- * in the system.
- */
-
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-			int wait)
-/*
- * [SUMMARY] Run a function on all other CPUs.
- * <func> The function to run. This must be fast and non-blocking.
- * <info> An arbitrary pointer to pass to the function.
- * <nonatomic> currently unused.
- * <wait> If true, wait (atomically) until function has completed on other CPUs.
- * [RETURNS] 0 on success, else a negative status code. Does not return until
+static struct call_data_struct *call_data;
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: currently unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Does not return until
  * remote CPUs are nearly ready to execute <<func>> or are or have executed.
  *
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+			int wait)
 {
 	struct call_data_struct data;
 	int cpus;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 44153fdf73fc..d699a16b0cb2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -52,23 +52,12 @@ extern void smp_cpus_done(unsigned int max_cpus);
 /*
  * Call a function on all other processors
  */
-extern int smp_call_function (void (*func) (void *info), void *info,
-			      int retry, int wait);
+int smp_call_function(void(*func)(void *info), void *info, int retry, int wait);
 
 /*
  * Call a function on all processors
  */
-static inline int on_each_cpu(void (*func) (void *info), void *info,
-			      int retry, int wait)
-{
-	int ret = 0;
-
-	preempt_disable();
-	ret = smp_call_function(func, info, retry, wait);
-	func(info);
-	preempt_enable();
-	return ret;
-}
+int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait);
 
 #define MSG_ALL_BUT_SELF	0x8000	/* Assume <32768 CPU's */
 #define MSG_ALL			0x8001
@@ -94,7 +83,13 @@ void smp_prepare_boot_cpu(void);
 #define raw_smp_processor_id()			0
 #define hard_smp_processor_id()			0
 #define smp_call_function(func,info,retry,wait)	({ 0; })
-#define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
+#define on_each_cpu(func,info,retry,wait)	\
+	({					\
+		local_irq_disable();		\
+		func(info);			\
+		local_irq_enable();		\
+		0;				\
+	})
 static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdded5..ec8fed42a86f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
+#include <linux/smp.h>
 
 #include <asm/irq.h>
 /*
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+
+#ifdef CONFIG_SMP
+/*
+ * Call a function on all processors
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
+{
+	int ret = 0;
+
+	preempt_disable();
+	ret = smp_call_function(func, info, retry, wait);
+	local_irq_disable();
+	func(info);
+	local_irq_enable();
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
+#endif
-- 
cgit v1.2.3


From 69e05944af39fc6c97b09380c8721e38433bd828 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:19 -0800
Subject: [PATCH] vmscan: use unsigned longs

Turn basically everything in vmscan.c into `unsigned long'.  This is to avoid
the possibility that some piece of code in there might decide to operate upon
more than 4G (or even 2G) of pages in one hit.

This might be silly, but we'll need it one day.

Cc: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h   |   2 +-
 include/linux/swap.h |   8 ++--
 mm/vmscan.c          | 104 +++++++++++++++++++++++++++++----------------------
 3 files changed, 64 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e67980654c49..1850cf8bad64 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1046,7 +1046,7 @@ int in_gate_area_no_task(unsigned long addr);
 
 int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			unsigned long lru_pages);
 void drop_pagecache(void);
 void drop_slab(void);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d572b19afb7d..3dc6c89c49b8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,8 +172,8 @@ extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern int try_to_free_pages(struct zone **, gfp_t);
-extern int shrink_all_memory(int);
+extern unsigned long try_to_free_pages(struct zone **, gfp_t);
+extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 
 #ifdef CONFIG_NUMA
@@ -190,11 +190,11 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
-extern int putback_lru_pages(struct list_head *l);
+extern unsigned long putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct page *, struct page *);
 extern void migrate_page_copy(struct page *, struct page *);
 extern int migrate_page_remove_references(struct page *, struct page *, int);
-extern int migrate_pages(struct list_head *l, struct list_head *t,
+extern unsigned long migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
 extern int fail_migrate_page(struct page *, struct page *);
 #else
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5feef4d4650e..62cd7cd257e3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -177,10 +177,11 @@ EXPORT_SYMBOL(remove_shrinker);
  *
  * Returns the number of slab objects which we shrunk.
  */
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+			unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
-	int ret = 0;
+	unsigned long ret = 0;
 
 	if (scanned == 0)
 		scanned = SWAP_CLUSTER_MAX;
@@ -410,12 +411,13 @@ cannot_free:
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_list(struct list_head *page_list,
+				struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
-	int reclaimed = 0;
+	unsigned long reclaimed = 0;
 
 	cond_resched();
 
@@ -599,11 +601,11 @@ static inline void move_to_lru(struct page *page)
  *
  * returns the number of pages put back.
  */
-int putback_lru_pages(struct list_head *l)
+unsigned long putback_lru_pages(struct list_head *l)
 {
 	struct page *page;
 	struct page *page2;
-	int count = 0;
+	unsigned long count = 0;
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		move_to_lru(page);
@@ -848,11 +850,11 @@ EXPORT_SYMBOL(migrate_page);
  *
  * Return: Number of pages not migrated when "to" ran empty.
  */
-int migrate_pages(struct list_head *from, struct list_head *to,
+unsigned long migrate_pages(struct list_head *from, struct list_head *to,
 		  struct list_head *moved, struct list_head *failed)
 {
-	int retry;
-	int nr_failed = 0;
+	unsigned long retry;
+	unsigned long nr_failed = 0;
 	int pass = 0;
 	struct page *page;
 	struct page *page2;
@@ -1069,12 +1071,13 @@ int isolate_lru_page(struct page *page)
  *
  * returns how many pages were moved onto *@dst.
  */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
-			     struct list_head *dst, int *scanned)
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+		struct list_head *src, struct list_head *dst,
+		unsigned long *scanned)
 {
-	int nr_taken = 0;
+	unsigned long nr_taken = 0;
 	struct page *page;
-	int scan = 0;
+	unsigned long scan = 0;
 
 	while (scan++ < nr_to_scan && !list_empty(src)) {
 		struct list_head *target;
@@ -1106,20 +1109,22 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
-static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc)
+static void shrink_cache(unsigned long max_scan, struct zone *zone,
+			struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
+	unsigned long nr_scanned = 0;
 
 	pagevec_init(&pvec, 1);
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	while (max_scan > 0) {
+	do {
 		struct page *page;
-		int nr_taken;
-		int nr_scan;
-		int nr_freed;
+		unsigned long nr_taken;
+		unsigned long nr_scan;
+		unsigned long nr_freed;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
 					     &zone->inactive_list,
@@ -1131,7 +1136,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s
 		if (nr_taken == 0)
 			goto done;
 
-		max_scan -= nr_scan;
+		nr_scanned += nr_scan;
 		nr_freed = shrink_list(&page_list, sc);
 
 		local_irq_disable();
@@ -1161,7 +1166,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s
 				spin_lock_irq(&zone->lru_lock);
 			}
 		}
-  	}
+  	} while (nr_scanned < max_scan);
 	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
@@ -1185,11 +1190,12 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
+			struct scan_control *sc)
 {
-	int pgmoved;
+	unsigned long pgmoved;
 	int pgdeactivate = 0;
-	int pgscanned;
+	unsigned long pgscanned;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
@@ -1323,8 +1329,8 @@ refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void
-shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
+static void shrink_zone(int priority, struct zone *zone,
+			struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
@@ -1387,8 +1393,8 @@ shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void
-shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
+static void shrink_caches(int priority, struct zone **zones,
+				struct scan_control *sc)
 {
 	int i;
 
@@ -1425,11 +1431,12 @@ shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 {
 	int priority;
 	int ret = 0;
-	int total_scanned = 0, total_reclaimed = 0;
+	unsigned long total_scanned = 0;
+	unsigned long total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
@@ -1525,13 +1532,15 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+				int order)
 {
-	int to_free = nr_pages;
+	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
-	int total_scanned, total_reclaimed;
+	unsigned long total_scanned;
+	unsigned long total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1776,22 +1785,23 @@ void wakeup_kswapd(struct zone *zone, int order)
  * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
  * pages.
  */
-int shrink_all_memory(int nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages)
 {
 	pg_data_t *pgdat;
-	int nr_to_free = nr_pages;
-	int ret = 0;
+	unsigned long nr_to_free = nr_pages;
+	unsigned long ret = 0;
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
 	for_each_pgdat(pgdat) {
-		int freed;
+		unsigned long freed;
+
 		freed = balance_pgdat(pgdat, nr_to_free, 0);
 		ret += freed;
 		nr_to_free -= freed;
-		if (nr_to_free <= 0)
+		if ((long)nr_to_free <= 0)
 			break;
 	}
 	current->reclaim_state = NULL;
@@ -1805,8 +1815,7 @@ int shrink_all_memory(int nr_pages)
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
 static int __devinit cpu_callback(struct notifier_block *nfb,
-				  unsigned long action,
-				  void *hcpu)
+				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
 	cpumask_t mask;
@@ -1826,10 +1835,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
+
 	swap_setup();
-	for_each_pgdat(pgdat)
-		pgdat->kswapd
-		= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+	for_each_pgdat(pgdat) {
+		pid_t pid;
+
+		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+		BUG_ON(pid < 0);
+		pgdat->kswapd = find_task_by_pid(pid);
+	}
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
@@ -1873,7 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	const int nr_pages = 1 << order;
+	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
@@ -1881,7 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.nr_mapped = read_page_state(nr_mapped),
-		.swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX),
+		.swap_cluster_max = max_t(unsigned long, nr_pages,
+					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 	};
 
@@ -1966,4 +1981,3 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	return __zone_reclaim(zone, gfp_mask, order);
 }
 #endif
-
-- 
cgit v1.2.3


From 0f8053a509ceba4a077a50ea7b77039b5559b428 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:33 -0800
Subject: [PATCH] mm: make __put_page internal

Remove __put_page from outside the core mm/.  It is dangerous because it does
not handle compound pages nicely, and misses 1->0 transitions.  If a user
later appears that really needs the extra speed we can reevaluate.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  1 -
 mm/filemap.c       |  2 ++
 mm/internal.h      | 11 +++++++++++
 mm/vmscan.c        |  2 ++
 4 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1850cf8bad64..9b3cdfc8046d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -308,7 +308,6 @@ static inline int get_page_unless_zero(struct page *page)
 }
 
 #define set_page_count(p,v) 	atomic_set(&(p)->_count, (v))
-#define __put_page(p)		atomic_dec(&(p)->_count)
 
 extern void FASTCALL(__page_cache_release(struct page *));
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 44da3d476994..e8f58f7dd7a5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,6 +30,8 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include "filemap.h"
+#include "internal.h"
+
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4ef..e3042db2a2d6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,6 +8,10 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
+
+#include <linux/mm.h>
 
 static inline void set_page_refs(struct page *page, int order)
 {
@@ -26,5 +30,12 @@ static inline void set_page_refs(struct page *page, int order)
 #endif /* CONFIG_MMU */
 }
 
+static inline void __put_page(struct page *page)
+{
+	atomic_dec(&page->_count);
+}
+
 extern void fastcall __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
+
+#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 486184d2b50c..3914a94aa905 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,8 @@
 
 #include <linux/swapops.h>
 
+#include "internal.h"
+
 /* possible outcome of pageout() */
 typedef enum {
 	/* failed to write page out, page is locked */
-- 
cgit v1.2.3


From 84097518d1ecd2330f9488e4c2d09953a3340e74 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:34 -0800
Subject: [PATCH] mm: nommu use compound pages

Now that compound page handling is properly fixed in the VM, move nommu
over to using compound pages rather than rolling their own refcounting.

nommu vm page refcounting is broken anyway, but there is no need to have
divergent code in the core VM now, nor when it gets fixed.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: David Howells <dhowells@redhat.com>

(Needs testing, please).
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ramfs/file-nommu.c |  3 +--
 include/linux/mm.h    |  4 ----
 mm/internal.h         | 12 ------------
 mm/nommu.c            |  4 ++--
 mm/page_alloc.c       |  7 -------
 mm/slab.c             |  9 ++++++++-
 6 files changed, 11 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 3f810acd0bfa..b1ca234068f6 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -87,8 +87,7 @@ static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	xpages = 1UL << order;
 	npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-	for (loop = 0; loop < npages; loop++)
-		set_page_count(pages + loop, 1);
+	split_page(pages, order);
 
 	/* trim off any pages we don't actually require */
 	for (loop = npages; loop < xpages; loop++)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9b3cdfc8046d..3d84b7a35e0d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -327,11 +327,7 @@ static inline void get_page(struct page *page)
 
 void put_page(struct page *page);
 
-#ifdef CONFIG_MMU
 void split_page(struct page *page, unsigned int order);
-#else
-static inline void split_page(struct page *page, unsigned int order) {}
-#endif
 
 /*
  * Multiple processes may "see" the same page. E.g. for untouched
diff --git a/mm/internal.h b/mm/internal.h
index e3042db2a2d6..7bb339779818 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -15,19 +15,7 @@
 
 static inline void set_page_refs(struct page *page, int order)
 {
-#ifdef CONFIG_MMU
 	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
 }
 
 static inline void __put_page(struct page *page)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f28..db45efac17cc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 	/*
 	 * kmalloc doesn't like __GFP_HIGHMEM for some reason
 	 */
-	return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM);
+	return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
 }
 
 struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
 	 * - note that this may not return a page-aligned address if the object
 	 *   we're allocating is smaller than a page
 	 */
-	base = kmalloc(len, GFP_KERNEL);
+	base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
 	if (!base)
 		goto enomem;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7aa0181287e1..e197818a7cf6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -422,11 +422,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		mutex_debug_check_no_locks_freed(page_address(page),
 						 PAGE_SIZE<<order);
 
-#ifndef CONFIG_MMU
-	for (i = 1 ; i < (1 << order) ; ++i)
-		__put_page(page + i);
-#endif
-
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
 	if (reserved)
@@ -746,7 +741,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
-#ifdef CONFIG_MMU
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
@@ -766,7 +760,6 @@ void split_page(struct page *page, unsigned int order)
 		set_page_count(page + i, 1);
 	}
 }
-#endif
 
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
diff --git a/mm/slab.c b/mm/slab.c
index f477acfb732f..ff0ab772f49d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -590,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 
 static inline struct kmem_cache *page_get_cache(struct page *page)
 {
+	if (unlikely(PageCompound(page)))
+		page = (struct page *)page_private(page);
 	return (struct kmem_cache *)page->lru.next;
 }
 
@@ -600,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
 
 static inline struct slab *page_get_slab(struct page *page)
 {
+	if (unlikely(PageCompound(page)))
+		page = (struct page *)page_private(page);
 	return (struct slab *)page->lru.prev;
 }
 
@@ -2412,8 +2416,11 @@ static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
 	struct page *page;
 
 	/* Nasty!!!!!! I hope this is OK. */
-	i = 1 << cachep->gfporder;
 	page = virt_to_page(objp);
+
+	i = 1;
+	if (likely(!PageCompound(page)))
+		i <<= cachep->gfporder;
 	do {
 		page_set_cache(page, cachep);
 		page_set_slab(page, slabp);
-- 
cgit v1.2.3


From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:40 -0800
Subject: [PATCH] remove set_page_count() outside mm/

set_page_count usage outside mm/ is limited to setting the refcount to 1.
Remove set_page_count from outside mm/, and replace those users with
init_page_count() and set_page_refcounted().

This allows more debug checking, and tighter control on how code is allowed
to play around with page->_count.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/init.c                   |  2 +-
 arch/arm/mm/init.c                     |  2 +-
 arch/arm26/mm/init.c                   |  2 +-
 arch/cris/mm/init.c                    |  2 +-
 arch/frv/mm/init.c                     |  6 +++---
 arch/h8300/mm/init.c                   |  4 ++--
 arch/i386/mm/init.c                    |  6 +++---
 arch/ia64/mm/init.c                    |  6 +++---
 arch/m32r/mm/init.c                    |  4 ++--
 arch/m68k/mm/init.c                    |  2 +-
 arch/m68k/mm/memory.c                  |  2 +-
 arch/m68k/mm/motorola.c                |  2 +-
 arch/m68knommu/mm/init.c               |  4 ++--
 arch/mips/arc/memory.c                 |  2 +-
 arch/mips/dec/prom/memory.c            |  2 +-
 arch/mips/mips-boards/generic/memory.c |  2 +-
 arch/mips/mips-boards/sim/sim_mem.c    |  2 +-
 arch/mips/mm/init.c                    |  6 +++---
 arch/mips/sgi-ip27/ip27-memory.c       |  2 +-
 arch/parisc/mm/init.c                  |  4 ++--
 arch/powerpc/mm/init_32.c              |  4 ++--
 arch/powerpc/mm/init_64.c              |  4 ++--
 arch/powerpc/mm/mem.c                  |  4 ++--
 arch/powerpc/platforms/cell/setup.c    |  2 +-
 arch/ppc/mm/init.c                     |  6 +++---
 arch/s390/mm/init.c                    |  4 ++--
 arch/sh/mm/init.c                      |  4 ++--
 arch/sh64/mm/init.c                    |  4 ++--
 arch/sparc/kernel/sun4d_smp.c          |  6 +++---
 arch/sparc/kernel/sun4m_smp.c          |  6 +++---
 arch/sparc/mm/init.c                   |  6 +++---
 arch/sparc64/mm/init.c                 |  4 ++--
 arch/um/kernel/mem.c                   |  4 ++--
 arch/x86_64/mm/init.c                  |  6 +++---
 arch/xtensa/mm/init.c                  |  2 +-
 drivers/video/acornfb.c                |  2 +-
 include/linux/mm.h                     | 11 +++++++++--
 mm/hugetlb.c                           |  5 +++--
 mm/internal.h                          | 13 ++++++++++++-
 mm/page_alloc.c                        | 14 ++++++--------
 40 files changed, 96 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 486d7945583d..544ac5dc09eb 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end)
 	void *__start = start;
 	for (; __start < end; __start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(__start));
-		set_page_count(virt_to_page(__start), 1);
+		init_page_count(virt_to_page(__start));
 		free_page((long)__start);
 		totalram_pages++;
 	}
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 8b276ee38acf..b0321e943b76 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -531,7 +531,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s)
 	for (; addr < end; addr += PAGE_SIZE) {
 		struct page *page = virt_to_page(addr);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c
index 1f09a9d0fb83..e3ecaa453747 100644
--- a/arch/arm26/mm/init.c
+++ b/arch/arm26/mm/init.c
@@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s)
 	for (; addr < end; addr += PAGE_SIZE) {
 		struct page *page = virt_to_page(addr);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c
index 31a0018b525a..b7842ff213a6 100644
--- a/arch/cris/mm/init.c
+++ b/arch/cris/mm/init.c
@@ -216,7 +216,7 @@ free_initmem(void)
         addr = (unsigned long)(&__init_begin);
         for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                 ClearPageReserved(virt_to_page(addr));
-                set_page_count(virt_to_page(addr), 1);
+                init_page_count(virt_to_page(addr));
                 free_page(addr);
                 totalram_pages++;
         }
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index 765088ea8a50..8899aa1a4f06 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -169,7 +169,7 @@ void __init mem_init(void)
 		struct page *page = &mem_map[pfn];
 
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 		totalram_pages++;
 	}
@@ -210,7 +210,7 @@ void __init free_initmem(void)
 	/* next to check that the page we free is not a partial page */
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 1e0929ddc8c4..09efc4b1f038 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
@@ -219,7 +219,7 @@ free_initmem()
 	/* next to check that the page we free is not a partial page */
 	for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 2700f01994ba..7ba55a6e2dbc 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 
 static void __meminit free_new_highpage(struct page *page)
 {
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalhigh_pages++;
 }
@@ -727,7 +727,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		memset((void *)addr, 0xcc, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
@@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b38b6d213c15..08d94e6bfa18 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -197,7 +197,7 @@ free_initmem (void)
 	eaddr = (unsigned long) ia64_imva(__init_end);
 	while (addr < eaddr) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		++totalram_pages;
 		addr += PAGE_SIZE;
@@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end)
 			continue;
 		page = virt_to_page(start);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(start);
 		++totalram_pages;
 	}
@@ -640,7 +640,7 @@ mem_init (void)
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalram_pages++;
 	num_physpages++;
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 6facf15b04f3..c9e7dad860b7 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -226,7 +226,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	unsigned long p;
 	for (p = start; p < end; p += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(p));
-		set_page_count(virt_to_page(p), 1);
+		init_page_count(virt_to_page(p));
 		free_page(p);
 		totalram_pages++;
 	}
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index c45beb955943..a190e39c907a 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index 559942ce0e1e..d6d582a5abb0 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable)
 
 	/* unreserve the page so it's possible to free that page */
 	PD_PAGE(dp)->flags &= ~(1 << PG_reserved);
-	set_page_count(PD_PAGE(dp), 1);
+	init_page_count(PD_PAGE(dp));
 
 	return;
 }
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index d855fec26317..afb57eeafdcb 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -276,7 +276,7 @@ void free_initmem(void)
 	addr = (unsigned long)&__init_begin;
 	for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) {
 		virt_to_page(addr)->flags &= ~(1 << PG_reserved);
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c
index 89f0b554ffb7..d79503fe6e42 100644
--- a/arch/m68knommu/mm/init.c
+++ b/arch/m68knommu/mm/init.c
@@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
@@ -218,7 +218,7 @@ free_initmem()
 	/* next to check that the page we free is not a partial page */
 	for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c
index 958d2eb78862..8a9ef58cc399 100644
--- a/arch/mips/arc/memory.c
+++ b/arch/mips/arc/memory.c
@@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void)
 		while (addr < boot_mem_map.map[i].addr
 			      + boot_mem_map.map[i].size) {
 			ClearPageReserved(virt_to_page(__va(addr)));
-			set_page_count(virt_to_page(__va(addr)), 1);
+			init_page_count(virt_to_page(__va(addr)));
 			free_page((unsigned long)__va(addr));
 			addr += PAGE_SIZE;
 			freed += PAGE_SIZE;
diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c
index 81cb5a76cfb7..1edaf3074ee9 100644
--- a/arch/mips/dec/prom/memory.c
+++ b/arch/mips/dec/prom/memory.c
@@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void)
 	addr = PAGE_SIZE;
 	while (addr < end) {
 		ClearPageReserved(virt_to_page(__va(addr)));
-		set_page_count(virt_to_page(__va(addr)), 1);
+		init_page_count(virt_to_page(__va(addr)));
 		free_page((unsigned long)__va(addr));
 		addr += PAGE_SIZE;
 	}
diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c
index 2c8afd77a20b..ee5e70c95cf3 100644
--- a/arch/mips/mips-boards/generic/memory.c
+++ b/arch/mips/mips-boards/generic/memory.c
@@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void)
 		while (addr < boot_mem_map.map[i].addr
 			      + boot_mem_map.map[i].size) {
 			ClearPageReserved(virt_to_page(__va(addr)));
-			set_page_count(virt_to_page(__va(addr)), 1);
+			init_page_count(virt_to_page(__va(addr)));
 			free_page((unsigned long)__va(addr));
 			addr += PAGE_SIZE;
 			freed += PAGE_SIZE;
diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c
index 0dbd7435bb2a..1ec4e75656bd 100644
--- a/arch/mips/mips-boards/sim/sim_mem.c
+++ b/arch/mips/mips-boards/sim/sim_mem.c
@@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void)
 		while (addr < boot_mem_map.map[i].addr
 			      + boot_mem_map.map[i].size) {
 			ClearPageReserved(virt_to_page(__va(addr)));
-			set_page_count(virt_to_page(__va(addr)), 1);
+			init_page_count(virt_to_page(__va(addr)));
 			free_page((unsigned long)__va(addr));
 			addr += PAGE_SIZE;
 			freed += PAGE_SIZE;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index a140da9732db..52f7d59fe612 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -245,7 +245,7 @@ void __init mem_init(void)
 #ifdef CONFIG_LIMITED_DMA
 		set_page_address(page, lowmem_page_address(page));
 #endif
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 		totalhigh_pages++;
 	}
@@ -292,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
@@ -315,7 +315,7 @@ void free_initmem(void)
 		page = addr;
 #endif
 		ClearPageReserved(virt_to_page(page));
-		set_page_count(virt_to_page(page), 1);
+		init_page_count(virt_to_page(page));
 		free_page(page);
 		totalram_pages++;
 		freed += PAGE_SIZE;
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index ed93a9792959..e0d095daa5ed 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -559,7 +559,7 @@ void __init mem_init(void)
 				/* if (!page_is_ram(pgnr)) continue; */
 				/* commented out until page_is_ram works */
 				ClearPageReserved(p);
-				set_page_count(p, 1);
+				init_page_count(p);
 				__free_page(p);
 				totalram_pages++;
 			}
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 7847ca13d6c2..852eda3953dc 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -398,7 +398,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		num_physpages++;
 		totalram_pages++;
@@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		num_physpages++;
 		totalram_pages++;
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 7d0d75c11848..b57fb3a2b7bb 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name)
 
 	while (start < end) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		cnt++;
 		start += PAGE_SIZE;
@@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 81cfb0c2ec58..bacb71c89811 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -140,7 +140,7 @@ void free_initmem(void)
 	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
 		memset((void *)addr, 0xcc, PAGE_SIZE);
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 6ae5c130d0db..454cac01d8cc 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL(phys_mem_access_prot);
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalram_pages++;
 	num_physpages++;
@@ -376,7 +376,7 @@ void __init mem_init(void)
 			struct page *page = pfn_to_page(pfn);
 
 			ClearPageReserved(page);
-			set_page_count(page, 1);
+			init_page_count(page);
 			__free_page(page);
 			totalhigh_pages++;
 		}
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c
index b33a4443f5a9..fec8e65b36ea 100644
--- a/arch/powerpc/platforms/cell/setup.c
+++ b/arch/powerpc/platforms/cell/setup.c
@@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe,
 		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 			struct page *page = pfn_to_page(pfn);
 			set_page_links(page, ZONE_DMA, node_id, pfn);
-			set_page_count(page, 1);
+			init_page_count(page);
 			reset_page_mapcount(page);
 			SetPageReserved(page);
 			INIT_LIST_HEAD(&page->lru);
diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c
index 134db5c04203..cb1c294fb932 100644
--- a/arch/ppc/mm/init.c
+++ b/arch/ppc/mm/init.c
@@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name)
 
 	while (start < end) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		cnt++;
 		start += PAGE_SIZE;
@@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
@@ -441,7 +441,7 @@ void __init mem_init(void)
 			struct page *page = mem_map + pfn;
 
 			ClearPageReserved(page);
-			set_page_count(page, 1);
+			init_page_count(page);
 			__free_page(page);
 			totalhigh_pages++;
 		}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index df953383724d..a055894f3bd8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -292,7 +292,7 @@ void free_initmem(void)
         addr = (unsigned long)(&__init_begin);
         for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
         }
@@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
                 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
         for (; start < end; start += PAGE_SIZE) {
                 ClearPageReserved(virt_to_page(start));
-                set_page_count(virt_to_page(start), 1);
+                init_page_count(virt_to_page(start));
                 free_page(start);
                 totalram_pages++;
         }
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index e342565f75fb..77b4a838fe10 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -273,7 +273,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	unsigned long p;
 	for (p = start; p < end; p += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(p));
-		set_page_count(virt_to_page(p), 1);
+		init_page_count(virt_to_page(p));
 		free_page(p);
 		totalram_pages++;
 	}
diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c
index a65e8bb2c3cc..1169757fb38b 100644
--- a/arch/sh64/mm/init.c
+++ b/arch/sh64/mm/init.c
@@ -173,7 +173,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	unsigned long p;
 	for (p = start; p < end; p += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(p));
-		set_page_count(virt_to_page(p), 1);
+		init_page_count(virt_to_page(p));
 		free_page(p);
 		totalram_pages++;
 	}
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 40d426cce824..4219dd2ce3a2 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void)
 
 	/* Free unneeded trap tables */
 	ClearPageReserved(virt_to_page(trapbase_cpu1));
-	set_page_count(virt_to_page(trapbase_cpu1), 1);
+	init_page_count(virt_to_page(trapbase_cpu1));
 	free_page((unsigned long)trapbase_cpu1);
 	totalram_pages++;
 	num_physpages++;
 
 	ClearPageReserved(virt_to_page(trapbase_cpu2));
-	set_page_count(virt_to_page(trapbase_cpu2), 1);
+	init_page_count(virt_to_page(trapbase_cpu2));
 	free_page((unsigned long)trapbase_cpu2);
 	totalram_pages++;
 	num_physpages++;
 
 	ClearPageReserved(virt_to_page(trapbase_cpu3));
-	set_page_count(virt_to_page(trapbase_cpu3), 1);
+	init_page_count(virt_to_page(trapbase_cpu3));
 	free_page((unsigned long)trapbase_cpu3);
 	totalram_pages++;
 	num_physpages++;
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index a21f27d10e55..fbbd8a474c4c 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -233,21 +233,21 @@ void __init smp4m_boot_cpus(void)
 	/* Free unneeded trap tables */
 	if (!cpu_isset(i, cpu_present_map)) {
 		ClearPageReserved(virt_to_page(trapbase_cpu1));
-		set_page_count(virt_to_page(trapbase_cpu1), 1);
+		init_page_count(virt_to_page(trapbase_cpu1));
 		free_page((unsigned long)trapbase_cpu1);
 		totalram_pages++;
 		num_physpages++;
 	}
 	if (!cpu_isset(2, cpu_present_map)) {
 		ClearPageReserved(virt_to_page(trapbase_cpu2));
-		set_page_count(virt_to_page(trapbase_cpu2), 1);
+		init_page_count(virt_to_page(trapbase_cpu2));
 		free_page((unsigned long)trapbase_cpu2);
 		totalram_pages++;
 		num_physpages++;
 	}
 	if (!cpu_isset(3, cpu_present_map)) {
 		ClearPageReserved(virt_to_page(trapbase_cpu3));
-		set_page_count(virt_to_page(trapbase_cpu3), 1);
+		init_page_count(virt_to_page(trapbase_cpu3));
 		free_page((unsigned long)trapbase_cpu3);
 		totalram_pages++;
 		num_physpages++;
diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c
index c03babaa0498..898669732466 100644
--- a/arch/sparc/mm/init.c
+++ b/arch/sparc/mm/init.c
@@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn)
 		struct page *page = pfn_to_page(tmp);
 
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 		totalhigh_pages++;
 	}
@@ -480,7 +480,7 @@ void free_initmem (void)
 		p = virt_to_page(addr);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		totalram_pages++;
 		num_physpages++;
@@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		struct page *p = virt_to_page(start);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 	}
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index c2b556106fc1..2ae143ba50d8 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1461,7 +1461,7 @@ void free_initmem(void)
 		p = virt_to_page(page);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 		totalram_pages++;
@@ -1477,7 +1477,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		struct page *p = virt_to_page(start);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 		totalram_pages++;
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index fa4f915be5c5..92cce96b5e24 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start,
 	for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){
 		page = &mem_map[highmem_pfn + i];
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 	}
 }
@@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 			(end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 7af1742aa958..40ed13d263cd 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalram_pages++;
 	num_physpages++;
@@ -592,7 +592,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
 		free_page(addr);
 		totalram_pages++;
@@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 5a91d6c9e66d..e1be4235f367 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end)
 {
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page((unsigned long)start);
 		totalram_pages++;
 	}
diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c
index b058273527bb..76448d6ae896 100644
--- a/drivers/video/acornfb.c
+++ b/drivers/video/acornfb.c
@@ -1269,7 +1269,7 @@ free_unused_pages(unsigned int virtual_start, unsigned int virtual_end)
 		 */
 		page = virt_to_page(virtual_start);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(virtual_start);
 
 		virtual_start += PAGE_SIZE;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3d84b7a35e0d..7d8c127daad7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -307,8 +307,6 @@ static inline int get_page_unless_zero(struct page *page)
 	return atomic_inc_not_zero(&page->_count);
 }
 
-#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v))
-
 extern void FASTCALL(__page_cache_release(struct page *));
 
 static inline int page_count(struct page *page)
@@ -325,6 +323,15 @@ static inline void get_page(struct page *page)
 	atomic_inc(&page->_count);
 }
 
+/*
+ * Setup the page count before being freed into the page allocator for
+ * the first time (boot or memory hotplug)
+ */
+static inline void init_page_count(struct page *page)
+{
+	atomic_set(&page->_count, 1);
+}
+
 void put_page(struct page *page);
 
 void split_page(struct page *page, unsigned int order);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 39d49ecea8e8..20117a4b8ab6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 
 #include <linux/hugetlb.h>
+#include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages;
@@ -106,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 		return NULL;
 	}
 	spin_unlock(&hugetlb_lock);
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
@@ -152,7 +153,7 @@ static void update_and_free_page(struct page *page)
 				1 << PG_private | 1<< PG_writeback);
 	}
 	page[1].lru.next = NULL;
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 7bb339779818..d20e3cc4aef0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,8 +13,19 @@
 
 #include <linux/mm.h>
 
-static inline void set_page_refs(struct page *page, int order)
+static inline void set_page_count(struct page *page, int v)
 {
+	atomic_set(&page->_count, v);
+}
+
+/*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+ */
+static inline void set_page_refcounted(struct page *page)
+{
+	BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+	BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e197818a7cf6..7f65b5a63bb3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -442,7 +442,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 	if (order == 0) {
 		__ClearPageReserved(page);
 		set_page_count(page, 0);
-		set_page_refs(page, 0);
+		set_page_refcounted(page);
 		__free_page(page);
 	} else {
 		int loop;
@@ -457,7 +457,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 			set_page_count(p, 0);
 		}
 
-		set_page_refs(page, order);
+		set_page_refcounted(page);
 		__free_pages(page, order);
 	}
 }
@@ -525,7 +525,7 @@ static int prep_new_page(struct page *page, int order)
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
-	set_page_refs(page, order);
+	set_page_refcounted(page);
 	kernel_map_pages(page, 1 << order, 1);
 	return 0;
 }
@@ -755,10 +755,8 @@ void split_page(struct page *page, unsigned int order)
 
 	BUG_ON(PageCompound(page));
 	BUG_ON(!page_count(page));
-	for (i = 1; i < (1 << order); i++) {
-		BUG_ON(page_count(page + i));
-		set_page_count(page + i, 1);
-	}
+	for (i = 1; i < (1 << order); i++)
+		set_page_refcounted(page + i);
 }
 
 /*
@@ -1771,7 +1769,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
-		set_page_count(page, 1);
+		init_page_count(page);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
-- 
cgit v1.2.3


From 617d2214ee06c209e5c375c280d50abace8058e1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:43 -0800
Subject: [PATCH] mm: optimise page_count

Optimise page_count compound page test and make it consistent with similar
functions.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d8c127daad7..6aa016f1d3ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -311,7 +311,7 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 static inline int page_count(struct page *page)
 {
-	if (PageCompound(page))
+	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
 	return atomic_read(&page->_count);
 }
-- 
cgit v1.2.3


From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Wed, 22 Mar 2006 00:08:50 -0800
Subject: [PATCH] Enable mprotect on huge pages

2.6.16-rc3 uses hugetlb on-demand paging, but it doesn_t support hugetlb
mprotect.

From: David Gibson <david@gibson.dropbear.id.au>

  Remove a test from the mprotect() path which checks that the mprotect()ed
  range on a hugepage VMA is hugepage aligned (yes, really, the sense of
  is_aligned_hugepage_range() is the opposite of what you'd guess :-/).

  In fact, we don't need this test.  If the given addresses match the
  beginning/end of a hugepage VMA they must already be suitably aligned.  If
  they don't, then mprotect_fixup() will attempt to split the VMA.  The very
  first test in split_vma() will check for a badly aligned address on a
  hugepage VMA and return -EINVAL if necessary.

From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>

  On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE.  The
  identify of hugetlb pte is lost when changing page protection via mprotect.
  A page fault occurs later will trigger a bug check in huge_pte_alloc().

  The fix is to always make new pte a hugetlb pte and also to clean up
  legacy code where _PAGE_PRESENT is forced on in the pre-faulting day.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/pgtable.h   |  5 ++---
 include/asm-ia64/pgtable.h   |  2 +-
 include/asm-x86_64/pgtable.h |  4 ++--
 include/linux/hugetlb.h      |  4 ++++
 mm/hugetlb.c                 | 29 +++++++++++++++++++++++++++++
 mm/mprotect.c                | 12 +++++-------
 6 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 088a945bf26b..ee056c41a9fb 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -219,13 +219,12 @@ extern unsigned long pg0[];
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
  */
-#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT)
 static inline int pte_user(pte_t pte)		{ return (pte).pte_low & _PAGE_USER; }
 static inline int pte_read(pte_t pte)		{ return (pte).pte_low & _PAGE_USER; }
 static inline int pte_dirty(pte_t pte)		{ return (pte).pte_low & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return (pte).pte_low & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte)		{ return (pte).pte_low & _PAGE_RW; }
-static inline int pte_huge(pte_t pte)		{ return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; }
+static inline int pte_huge(pte_t pte)		{ return (pte).pte_low & _PAGE_PSE; }
 
 /*
  * The following only works if pte_present() is not true.
@@ -242,7 +241,7 @@ static inline pte_t pte_mkexec(pte_t pte)	{ (pte).pte_low |= _PAGE_USER; return
 static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= __LARGE_PTE; return pte; }
+static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return pte; }
 
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index e2560c58384b..5890972a69bf 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -314,7 +314,7 @@ ia64_phys_addr_valid (unsigned long addr)
 #define pte_mkyoung(pte)	(__pte(pte_val(pte) | _PAGE_A))
 #define pte_mkclean(pte)	(__pte(pte_val(pte) & ~_PAGE_D))
 #define pte_mkdirty(pte)	(__pte(pte_val(pte) | _PAGE_D))
-#define pte_mkhuge(pte)		(__pte(pte_val(pte) | _PAGE_P))
+#define pte_mkhuge(pte)		(__pte(pte_val(pte)))
 
 /*
  * Macro to a page protection value as "uncacheable".  Note that "protection" is really a
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 715fd94cf577..a617d364d08d 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte)		{ return pte_val(pte) & _PAGE_RW; }
 static inline int pte_file(pte_t pte)		{ return pte_val(pte) & _PAGE_FILE; }
-static inline int pte_huge(pte_t pte)		{ return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; }
+static inline int pte_huge(pte_t pte)		{ return pte_val(pte) & _PAGE_PSE; }
 
 static inline pte_t pte_rdprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
 static inline pte_t pte_exprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
@@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _
 static inline pte_t pte_mkdirty(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; }
+static inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
 
 struct vm_area_struct;
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 68d82ad6b17c..fa83836b63d2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -41,6 +41,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
 int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
 int pmd_huge(pmd_t pmd);
+void hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot);
 
 #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define is_hugepage_only_range(mm, addr, len)	0
@@ -101,6 +103,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define free_huge_page(p)			({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
+#define hugetlb_change_protection(vma, address, end, newprot)
+
 #ifndef HPAGE_MASK
 #define HPAGE_MASK	PAGE_MASK		/* Keep the compiler happy */
 #define HPAGE_SIZE	PAGE_SIZE
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 20117a4b8ab6..783098f6cf8e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -565,3 +565,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return i;
 }
+
+void hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = address;
+	pte_t *ptep;
+	pte_t pte;
+
+	BUG_ON(address >= end);
+	flush_cache_range(vma, address, end);
+
+	spin_lock(&mm->page_table_lock);
+	for (; address < end; address += HPAGE_SIZE) {
+		ptep = huge_pte_offset(mm, address);
+		if (!ptep)
+			continue;
+		if (!pte_none(*ptep)) {
+			pte = huge_ptep_get_and_clear(mm, address, ptep);
+			pte = pte_mkhuge(pte_modify(pte, newprot));
+			set_huge_pte_at(mm, address, ptep, pte);
+			lazy_mmu_prot_update(pte);
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	flush_tlb_range(vma, start, end);
+}
+
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1ed..4c14d4289b61 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
-		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
 				return -ENOMEM;
@@ -166,7 +166,10 @@ success:
 	 */
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = newprot;
-	change_protection(vma, start, end, newprot);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_change_protection(vma, start, end, newprot);
+	else
+		change_protection(vma, start, end, newprot);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
 
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
-		if (is_vm_hugetlb_page(vma)) {
-			error = -EACCES;
-			goto out;
-		}
-
 		newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
-- 
cgit v1.2.3


From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:55 -0800
Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes

These days, hugepages are demand-allocated at first fault time.  There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.

A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations.  In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available.  In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.

The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated.  MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL.  MAP_PRIVATE mappings can still
trigger an OOM.  (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)

This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.

This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c    |  74 ++++++++------------------
 include/linux/hugetlb.h |   8 ++-
 mm/hugetlb.c            | 136 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 154 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b35195289945..1a1c2fcb7823 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-/*
- * huge_pages_needed tries to determine the number of new huge pages that
- * will be required to fully populate this VMA.  This will be equal to
- * the size of the VMA in huge pages minus the number of huge pages
- * (covered by this VMA) that are found in the page cache.
- *
- * Result is in bytes to be compatible with is_hugepage_mem_enough()
- */
-static unsigned long
-huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	int i;
-	struct pagevec pvec;
-	unsigned long start = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
-	pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
-	pgoff_t endpg = next + hugepages;
-
-	pagevec_init(&pvec, 0);
-	while (next < endpg) {
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
-			break;
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			if (page->index > next)
-				next = page->index;
-			if (page->index >= endpg)
-				break;
-			next++;
-			hugepages--;
-		}
-		huge_pagevec_release(&pvec);
-	}
-	return hugepages << HPAGE_SHIFT;
-}
-
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long bytes;
+	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
-	bytes = huge_pages_needed(mapping, vma);
-	if (!is_hugepage_mem_enough(bytes))
-		return -ENOMEM;
-
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	mutex_lock(&inode->i_mutex);
@@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
+	if (vma->vm_flags & VM_MAYSHARE)
+		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
+			goto out;
+
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
@@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page)
 	put_page(page);
 }
 
-static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
+static void truncate_hugepages(struct inode *inode, loff_t lstart)
 {
+	struct address_space *mapping = &inode->i_data;
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
 	int i;
 
+	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
+				     lstart >> HPAGE_SHIFT);
+	if (!mapping->nrpages)
+		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 }
 
@@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
 	spin_unlock(&mapping->i_mmap_lock);
-	truncate_hugepages(mapping, offset);
+	truncate_hugepages(inode, offset);
 	return 0;
 }
 
@@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
+	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
-	if (!is_hugepage_mem_enough(size))
-		return ERR_PTR(-ENOMEM);
-
 	if (!user_shm_lock(size, current->user))
 		return ERR_PTR(-ENOMEM);
 
@@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!inode)
 		goto out_file;
 
+	error = -ENOMEM;
+	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
+				       size >> HPAGE_SHIFT) != 0)
+		goto out_inode;
+
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;
@@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size)
 	file->f_mode = FMODE_WRITE | FMODE_READ;
 	return file;
 
+out_inode:
+	iput(inode);
 out_file:
 	put_filp(file);
 out_dentry:
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fa83836b63d2..cafe73eecb05 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long)
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
-int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
 struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
 void free_huge_page(struct page *);
@@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
 #define unmap_hugepage_range(vma, start, end)	BUG()
-#define is_hugepage_mem_enough(size)		0
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
@@ -132,6 +130,8 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
 	struct shared_policy policy;
+	/* Protected by the (global) hugetlb_lock */
+	unsigned long prereserved_hpages;
 	struct inode vfs_inode;
 };
 
@@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast_hpages);
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost_hpages);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a87bbe5..27fad5d9bcf6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
 
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
+	int use_reserve = 0;
+	unsigned long idx;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
-	if (!page) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+
+	if (vma->vm_flags & VM_MAYSHARE) {
+
+		/* idx = radix tree index, i.e. offset into file in
+		 * HPAGE_SIZE units */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+		/* The hugetlbfs specific inode info stores the number
+		 * of "guaranteed available" (huge) pages.  That is,
+		 * the first 'prereserved_hpages' pages of the inode
+		 * are either already instantiated, or have been
+		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
+		 * we're in the process of instantiating the page, so
+		 * we use this to determine whether to draw from the
+		 * pre-reserved pool or the truly free pool. */
+		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+			use_reserve = 1;
+	}
+
+	if (!use_reserve) {
+		if (free_huge_pages <= reserved_huge_pages)
+			goto fail;
+	} else {
+		BUG_ON(reserved_huge_pages == 0);
+		reserved_huge_pages--;
 	}
+
+	page = dequeue_huge_page(vma, addr);
+	if (!page)
+		goto fail;
+
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
 	return page;
+
+ fail:
+	WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+	spin_unlock(&hugetlb_lock);
+	return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast)
+{
+	struct inode *inode = &info->vfs_inode;
+	unsigned long change_in_reserve = 0;
+	int ret = 0;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages >= atleast)
+		goto out;
+
+	/* Because we always call this on shared mappings, none of the
+	 * pages beyond info->prereserved_hpages can have been
+	 * instantiated, so we need to reserve all of them now. */
+	change_in_reserve = atleast - info->prereserved_hpages;
+
+	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	reserved_huge_pages += change_in_reserve;
+	info->prereserved_hpages = atleast;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
+
+	return ret;
+}
+
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost)
+{
+	struct inode *inode = &info->vfs_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long idx;
+	unsigned long change_in_reserve = 0;
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages <= atmost)
+		goto out;
+
+	/* Count pages which were reserved, but not instantiated, and
+	 * which we can now release. */
+	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+		page = radix_tree_lookup(&mapping->page_tree, idx);
+		if (!page)
+			/* Pages which are already instantiated can't
+			 * be unreserved (and in fact have already
+			 * been removed from the reserved pool) */
+			change_in_reserve++;
+	}
+
+	BUG_ON(reserved_huge_pages < change_in_reserve);
+	reserved_huge_pages -= change_in_reserve;
+	info->prereserved_hpages = atmost;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
 }
 
 static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
+		        "HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
+		        reserved_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-- 
cgit v1.2.3


From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:56 -0800
Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local

Originally, mm/hugetlb.c just handled the hugepage physical allocation path
and its {alloc,free}_huge_page() functions were used from the arch specific
hugepage code.  These days those functions are only used with mm/hugetlb.c
itself.  Therefore, this patch makes them static and removes their
prototypes from hugetlb.h.  This requires a small rearrangement of code in
mm/hugetlb.c to avoid a forward declaration.

This patch causes no regressions on the libhugetlbfs testsuite (ppc64,
POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hugetlb.h |  4 ----
 mm/hugetlb.c            | 25 +++++++++++++------------
 2 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index cafe73eecb05..5d84c368ffe4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -21,8 +21,6 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
-struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
-void free_huge_page(struct page *);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
 
@@ -97,8 +95,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
 						do { } while (0)
-#define alloc_huge_page(vma, addr)		({ NULL; })
-#define free_huge_page(p)			({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
 #define hugetlb_change_protection(vma, address, end, newprot)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27fad5d9bcf6..075877b1cbc0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
 static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
@@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void)
 	return 0;
 }
 
-void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
-	spin_unlock(&hugetlb_lock);
-}
-
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
-- 
cgit v1.2.3


From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:57 -0800
Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables()

free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs.  However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma.  is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned.  So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.

At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range().  On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.

Nonetheless this should be fixed.  We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().

This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64).  We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64.  Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range().  Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.

This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/page.h       | 1 +
 include/asm-powerpc/pgtable.h | 5 -----
 include/linux/hugetlb.h       | 9 +++++----
 mm/memory.c                   | 5 ++---
 4 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 5e6362a786b7..732cf3086741 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -57,6 +57,7 @@
 
 # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 # define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
 #endif /* CONFIG_HUGETLB_PAGE */
 
 #ifdef __ASSEMBLY__
diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h
index e38931379a72..185ee15963a1 100644
--- a/include/asm-powerpc/pgtable.h
+++ b/include/asm-powerpc/pgtable.h
@@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[];
 
 extern void paging_init(void);
 
-#ifdef CONFIG_HUGETLB_PAGE
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-	free_pgd_range(tlb, addr, end, floor, ceiling)
-#endif
-
 /*
  * This gets called at the end of handling a page fault, when
  * the kernel has put a new PTE into the page table for the process.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5d84c368ffe4..e465fbf1ef5f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,8 +43,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 
 #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define is_hugepage_only_range(mm, addr, len)	0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-						do { } while (0)
+#endif
+
+#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE
+#define hugetlb_free_pgd_range	free_pgd_range
 #endif
 
 #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
@@ -93,8 +95,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define prepare_hugepage_range(addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-						do { } while (0)
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
 #define hugetlb_change_protection(vma, address, end, newprot)
diff --git a/mm/memory.c b/mm/memory.c
index 71bc664efed5..f6e3be9cbf5a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
 
-		if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+		if (is_vm_hugetlb_page(vma)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
 		} else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 			 * Optimization: gather nearby vmas into one call down
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			  && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
-							HPAGE_SIZE)) {
+			       && !is_vm_hugetlb_page(vma)) {
 				vma = next;
 				next = vma->vm_next;
 				anon_vma_unlink(vma);
-- 
cgit v1.2.3


From 3915bcf38fe0b6d130b4bbde97804f29a0becf32 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:59 -0800
Subject: [PATCH] hugepage: Move hugetlb_free_pgd_range() prototype to
 hugetlb.h

The optional hugepage callback, hugetlb_free_pgd_range() is presently
implemented non-trivially only on ia64 (but I plan to add one for powerpc
shortly).  It has its own prototype for the function in asm-ia64/pgtable.h.
 However, since the function is called from generic code, it make sense for
its prototype to be in the generic hugetlb.h header file, as the protypes
other arch callbacks already are (prepare_hugepage_range(),
set_huge_pte_at(), etc.).  This patch makes it so.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/pgtable.h | 3 ---
 include/linux/hugetlb.h    | 4 ++++
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 5890972a69bf..c0f8144f2349 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -505,9 +505,6 @@ extern struct page *zero_page_memmap_ptr;
 #define HUGETLB_PGDIR_SHIFT	(HPAGE_SHIFT + 2*(PAGE_SHIFT-3))
 #define HUGETLB_PGDIR_SIZE	(__IA64_UL(1) << HUGETLB_PGDIR_SHIFT)
 #define HUGETLB_PGDIR_MASK	(~(HUGETLB_PGDIR_SIZE-1))
-struct mmu_gather;
-void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
-		unsigned long end, unsigned long floor, unsigned long ceiling);
 #endif
 
 /*
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e465fbf1ef5f..5db25ffdb3eb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -47,6 +47,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 
 #ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE
 #define hugetlb_free_pgd_range	free_pgd_range
+#else
+void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
+			    unsigned long end, unsigned long floor,
+			    unsigned long ceiling);
 #endif
 
 #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
-- 
cgit v1.2.3


From 42b88befd6e0dae1a5fe04c03925037fa890e1f3 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:09:01 -0800
Subject: [PATCH] hugepage: is_aligned_hugepage_range() cleanup

Quite a long time back, prepare_hugepage_range() replaced
is_aligned_hugepage_range() as the callback from mm/mmap.c to arch code to
verify if an address range is suitable for a hugepage mapping.
is_aligned_hugepage_range() stuck around, but only to implement
prepare_hugepage_range() on archs which didn't implement their own.

Most archs (everything except ia64 and powerpc) used the same
implementation of is_aligned_hugepage_range().  On powerpc, which
implements its own prepare_hugepage_range(), the custom version was never
used.

In addition, "is_aligned_hugepage_range()" was a bad name, because it
suggests it returns true iff the given range is a good hugepage range,
whereas in fact it returns 0-or-error (so the sense is reversed).

This patch cleans up by abolishing is_aligned_hugepage_range().  Instead
prepare_hugepage_range() is defined directly.  Most archs use the default
version, which simply checks the given region is aligned to the size of a
hugepage.  ia64 and powerpc define custom versions.  The ia64 one simply
checks that the range is in the correct address space region in addition to
being suitably aligned.  The powerpc version (just as previously) checks
for suitable addresses, and if necessary performs low-level MMU frobbing to
set up new areas for use by hugepages.

No libhugetlbfs testsuite regressions on ppc64 (POWER5 LPAR).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/hugetlbpage.c    | 12 ------------
 arch/ia64/mm/hugetlbpage.c    |  5 +++--
 arch/powerpc/mm/hugetlbpage.c | 15 ---------------
 arch/sh/mm/hugetlbpage.c      | 12 ------------
 arch/sh64/mm/hugetlbpage.c    | 12 ------------
 arch/sparc64/mm/hugetlbpage.c | 12 ------------
 include/asm-ia64/page.h       |  1 +
 include/linux/hugetlb.h       | 16 ++++++++++++----
 8 files changed, 16 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index d524127c9afc..a7d891585411 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	return (pte_t *) pmd;
 }
 
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	return 0;
-}
-
 #if 0	/* This is just for testing */
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 2d13889d0a99..9dbc7dadd165 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -68,9 +68,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
 #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
 
 /*
- * This function checks for proper alignment of input addr and len parameters.
+ * Don't actually need to do any preparation, but need to make sure
+ * the address is in the right region.
  */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index b51bb28c054b..7370f9f33e29 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -133,21 +133,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	return __pte(old);
 }
 
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	if (! (within_hugepage_low_range(addr, len)
-	       || within_hugepage_high_range(addr, len)) )
-		return -EINVAL;
-	return 0;
-}
-
 struct slb_flush_info {
 	struct mm_struct *mm;
 	u16 newareas;
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6b7a7688c98e..a3568fd51508 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	return entry;
 }
 
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	return 0;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index ed6a505b3ee2..3d89f2a6c785 100644
--- a/arch/sh64/mm/hugetlbpage.c
+++ b/arch/sh64/mm/hugetlbpage.c
@@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	return entry;
 }
 
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	return 0;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index a7a24869d045..280dc7958a13 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -263,18 +263,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	return entry;
 }
 
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	return 0;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 732cf3086741..3ab27333dae4 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -57,6 +57,7 @@
 
 # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 # define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+# define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 # define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5db25ffdb3eb..d6f1019625af 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -36,7 +36,6 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
 int pmd_huge(pmd_t pmd);
 void hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot);
@@ -54,8 +53,18 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
 #endif
 
 #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
-#define prepare_hugepage_range(addr, len)	\
-	is_aligned_hugepage_range(addr, len)
+/*
+ * If the arch doesn't supply something else, assume that hugepage
+ * size aligned regions are ok without further preparation.
+ */
+static inline int prepare_hugepage_range(unsigned long addr, unsigned long len)
+{
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (addr & ~HPAGE_MASK)
+		return -EINVAL;
+	return 0;
+}
 #else
 int prepare_hugepage_range(unsigned long addr, unsigned long len);
 #endif
@@ -95,7 +104,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
-#define is_aligned_hugepage_range(addr, len)	0
 #define prepare_hugepage_range(addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
-- 
cgit v1.2.3


From b20a35035f983f4ac7e29c4a68f30e43510007e0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 22 Mar 2006 00:09:12 -0800
Subject: [PATCH] page migration reorg

Centralize the page migration functions in anticipation of additional
tinkering.  Creates a new file mm/migrate.c

1. Extract buffer_migrate_page() from fs/buffer.c

2. Extract central migration code from vmscan.c

3. Extract some components from mempolicy.c

4. Export pageout() and remove_from_swap() from vmscan.c

5. Make it possible to configure NUMA systems without page migration
   and non-NUMA systems with page migration.

I had to so some #ifdeffing in mempolicy.c that may need a cleanup.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                |  62 -----
 fs/xfs/linux-2.6/xfs_buf.c |   1 +
 include/linux/migrate.h    |  36 +++
 include/linux/swap.h       |  34 ++-
 mm/Kconfig                 |   6 +
 mm/Makefile                |   2 +
 mm/mempolicy.c             | 113 ++------
 mm/migrate.c               | 655 +++++++++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c            |   1 +
 mm/vmscan.c                | 491 +--------------------------------
 10 files changed, 741 insertions(+), 660 deletions(-)
 create mode 100644 include/linux/migrate.h
 create mode 100644 mm/migrate.c

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index a9b399402007..1d3683d496f8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3050,68 +3050,6 @@ asmlinkage long sys_bdflush(int func, long data)
 	return 0;
 }
 
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist.
- */
-#ifdef CONFIG_MIGRATION
-int buffer_migrate_page(struct page *newpage, struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-	struct buffer_head *bh, *head;
-	int rc;
-
-	if (!mapping)
-		return -EAGAIN;
-
-	if (!page_has_buffers(page))
-		return migrate_page(newpage, page);
-
-	head = page_buffers(page);
-
-	rc = migrate_page_remove_references(newpage, page, 3);
-	if (rc)
-		return rc;
-
-	bh = head;
-	do {
-		get_bh(bh);
-		lock_buffer(bh);
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	ClearPagePrivate(page);
-	set_page_private(newpage, page_private(page));
-	set_page_private(page, 0);
-	put_page(page);
-	get_page(newpage);
-
-	bh = head;
-	do {
-		set_bh_page(bh, newpage, bh_offset(bh));
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	SetPagePrivate(newpage);
-
-	migrate_page_copy(newpage, page);
-
-	bh = head;
-	do {
-		unlock_buffer(bh);
- 		put_bh(bh);
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	return 0;
-}
-EXPORT_SYMBOL(buffer_migrate_page);
-#endif
-
 /*
  * Buffer-head allocation
  */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bfb4f2917bb6..8cdfa4151659 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -29,6 +29,7 @@
 #include <linux/blkdev.h>
 #include <linux/hash.h>
 #include <linux/kthread.h>
+#include <linux/migrate.h>
 #include "xfs_linux.h"
 
 STATIC kmem_zone_t *xfs_buf_zone;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
new file mode 100644
index 000000000000..7d09962c3c0b
--- /dev/null
+++ b/include/linux/migrate.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_MIGRATE_H
+#define _LINUX_MIGRATE_H
+
+#include <linux/config.h>
+#include <linux/mm.h>
+
+#ifdef CONFIG_MIGRATION
+extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
+extern int putback_lru_pages(struct list_head *l);
+extern int migrate_page(struct page *, struct page *);
+extern void migrate_page_copy(struct page *, struct page *);
+extern int migrate_page_remove_references(struct page *, struct page *, int);
+extern int migrate_pages(struct list_head *l, struct list_head *t,
+		struct list_head *moved, struct list_head *failed);
+int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest);
+extern int fail_migrate_page(struct page *, struct page *);
+
+extern int migrate_prep(void);
+
+#else
+
+static inline int isolate_lru_page(struct page *p, struct list_head *list)
+					{ return -ENOSYS; }
+static inline int putback_lru_pages(struct list_head *l) { return 0; }
+static inline int migrate_pages(struct list_head *l, struct list_head *t,
+	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
+
+static inline int migrate_prep(void) { return -ENOSYS; }
+
+/* Possible settings for the migrate_page() method in address_operations */
+#define migrate_page NULL
+#define fail_migrate_page NULL
+
+#endif /* CONFIG_MIGRATION */
+#endif /* _LINUX_MIGRATE_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3dc6c89c49b8..12415dd94451 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,6 +175,21 @@ extern void swap_setup(void);
 extern unsigned long try_to_free_pages(struct zone **, gfp_t);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
+extern int remove_mapping(struct address_space *mapping, struct page *page);
+
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
+extern pageout_t pageout(struct page *page, struct address_space *mapping);
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
@@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
-#ifdef CONFIG_MIGRATION
-extern int isolate_lru_page(struct page *p);
-extern unsigned long putback_lru_pages(struct list_head *l);
-extern int migrate_page(struct page *, struct page *);
-extern void migrate_page_copy(struct page *, struct page *);
-extern int migrate_page_remove_references(struct page *, struct page *, int);
-extern unsigned long migrate_pages(struct list_head *l, struct list_head *t,
-		struct list_head *moved, struct list_head *failed);
-extern int fail_migrate_page(struct page *, struct page *);
-#else
-static inline int isolate_lru_page(struct page *p) { return -ENOSYS; }
-static inline int putback_lru_pages(struct list_head *l) { return 0; }
-static inline int migrate_pages(struct list_head *l, struct list_head *t,
-	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
-/* Possible settings for the migrate_page() method in address_operations */
-#define migrate_page NULL
-#define fail_migrate_page NULL
-#endif
-
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae6409..bd80460360db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
 # support for page migration
 #
 config MIGRATION
+	bool "Page migration"
 	def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
 	depends on SWAP
+	help
+	  Allows the migration of the physical location of pages of processes
+	  while the virtual addresses are not changed. This is useful for
+	  example on NUMA systems to put pages nearer to the processors accessing
+	  the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc3..f10c753dce6d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_MIGRATION) += migrate.o
+
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 96195dcb62e1..e93cc740c22b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
 #include <linux/swap.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/migrate.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -95,9 +96,6 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
 
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
 
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	struct vm_area_struct *first, *vma, *prev;
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-		/* Must have swap device for migration */
-		if (nr_swap_pages <= 0)
-			return ERR_PTR(-ENODEV);
 
-		/*
-		 * Clear the LRU lists so pages can be isolated.
-		 * Note that pages may be moved off the LRU after we have
-		 * drained them. Those pages will fail to migrate like other
-		 * pages that may be busy.
-		 */
-		lru_add_drain_all();
+		err = migrate_prep();
+		if (err)
+			return ERR_PTR(err);
 	}
 
 	first = find_vma(mm, start);
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
+#ifdef CONFIG_MIGRATION
 /*
  * page migration
  */
-
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags)
 {
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
-		if (isolate_lru_page(page))
-			list_add_tail(&page->lru, pagelist);
-	}
-}
-
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
-			struct vm_area_struct *vma, int dest)
-{
-	LIST_HEAD(newlist);
-	LIST_HEAD(moved);
-	LIST_HEAD(failed);
-	int err = 0;
-	unsigned long offset = 0;
-	int nr_pages;
-	struct page *page;
-	struct list_head *p;
-
-redo:
-	nr_pages = 0;
-	list_for_each(p, pagelist) {
-		if (vma) {
-			/*
-			 * The address passed to alloc_page_vma is used to
-			 * generate the proper interleave behavior. We fake
-			 * the address here by an increasing offset in order
-			 * to get the proper distribution of pages.
-			 *
-			 * No decision has been made as to which page
-			 * a certain old page is moved to so we cannot
-			 * specify the correct address.
-			 */
-			page = alloc_page_vma(GFP_HIGHUSER, vma,
-					offset + vma->vm_start);
-			offset += PAGE_SIZE;
-		}
-		else
-			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
-		if (!page) {
-			err = -ENOMEM;
-			goto out;
-		}
-		list_add_tail(&page->lru, &newlist);
-		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE)
-			break;
-	}
-	err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
-	putback_lru_pages(&moved);	/* Call release pages instead ?? */
-
-	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-		goto redo;
-out:
-	/* Return leftover allocated pages */
-	while (!list_empty(&newlist)) {
-		page = list_entry(newlist.next, struct page, lru);
-		list_del(&page->lru);
-		__free_page(page);
-	}
-	list_splice(&failed, pagelist);
-	if (err < 0)
-		return err;
-
-	/* Calculate number of leftover pages */
-	nr_pages = 0;
-	list_for_each(p, pagelist)
-		nr_pages++;
-	return nr_pages;
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+		isolate_lru_page(page, pagelist);
 }
 
 /*
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm,
 	if (err < 0)
 		return err;
 	return busy;
+
 }
 
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+				unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	return -ENOSYS;
+}
+#endif
+
 long do_mbind(unsigned long start, unsigned long len,
 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len,
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
+
 	if (!list_empty(&pagelist))
 		putback_lru_pages(&pagelist);
 
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000000..09f6e4aa87fc
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
+/*
+ * Memory Migration functionality - linux/mm/migration.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+
+#include <linux/migrate.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>	/* for try_to_release_page(),
+					buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+#include "internal.h"
+
+/* The maximum number of pages to take off the LRU for migration */
+#define MIGRATE_CHUNK_SIZE 256
+
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+/*
+ * Isolate one page from the LRU lists. If successful put it onto
+ * the indicated list with elevated page count.
+ *
+ * Result:
+ *  -EBUSY: page not on LRU list
+ *  0: page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page, struct list_head *pagelist)
+{
+	int ret = -EBUSY;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+
+		spin_lock_irq(&zone->lru_lock);
+		if (PageLRU(page)) {
+			ret = 0;
+			get_page(page);
+			ClearPageLRU(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+			list_add_tail(&page->lru, pagelist);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+	return ret;
+}
+
+/*
+ * migrate_prep() needs to be called after we have compiled the list of pages
+ * to be migrated using isolate_lru_page() but before we begin a series of calls
+ * to migrate_pages().
+ */
+int migrate_prep(void)
+{
+	/* Must have swap device for migration */
+	if (nr_swap_pages <= 0)
+		return -ENODEV;
+
+	/*
+	 * Clear the LRU lists so pages can be isolated.
+	 * Note that pages may be moved off the LRU after we have
+	 * drained them. Those pages will fail to migrate like other
+	 * pages that may be busy.
+	 */
+	lru_add_drain_all();
+
+	return 0;
+}
+
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+				struct page *page, int nr_refs)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page **radix_pointer;
+
+	/*
+	 * Avoid doing any of the following work if the page count
+	 * indicates that the page is in use or truncate has removed
+	 * the page.
+	 */
+	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+		return -EAGAIN;
+
+	/*
+	 * Establish swap ptes for anonymous pages or destroy pte
+	 * maps for files.
+	 *
+	 * In order to reestablish file backed mappings the fault handlers
+	 * will take the radix tree_lock which may then be used to stop
+  	 * processses from accessing this page until the new page is ready.
+	 *
+	 * A process accessing via a swap pte (an anonymous page) will take a
+	 * page_lock on the old page which will block the process until the
+	 * migration attempt is complete. At that time the PageSwapCache bit
+	 * will be examined. If the page was migrated then the PageSwapCache
+	 * bit will be clear and the operation to retrieve the page will be
+	 * retried which will find the new page in the radix tree. Then a new
+	 * direct mapping may be generated based on the radix tree contents.
+	 *
+	 * If the page was not migrated then the PageSwapCache bit
+	 * is still set and the operation may continue.
+	 */
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> permanent failure */
+		return -EPERM;
+
+	/*
+	 * Give up if we were unable to remove all mappings.
+	 */
+	if (page_mapcount(page))
+		return -EAGAIN;
+
+	write_lock_irq(&mapping->tree_lock);
+
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+						&mapping->page_tree,
+						page_index(page));
+
+	if (!page_mapping(page) || page_count(page) != nr_refs ||
+			*radix_pointer != page) {
+		write_unlock_irq(&mapping->tree_lock);
+		return 1;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 *
+	 * Certain minimal information about a page must be available
+	 * in order for other subsystems to properly handle the page if they
+	 * find it through the radix tree update before we are finished
+	 * copying the page.
+	 */
+	get_page(newpage);
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	*radix_pointer = newpage;
+	__put_page(page);
+	write_unlock_irq(&mapping->tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (PageActive(page))
+		SetPageActive(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+ 	}
+
+	ClearPageSwapCache(page);
+	ClearPageActive(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page->mapping = NULL;
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+	int rc;
+
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
+
+	migrate_page_copy(newpage, page);
+
+	/*
+	 * Remove auxiliary swap entries and replace
+	 * them with real ptes.
+	 *
+	 * Note that a real pte entry will allow processes that are not
+	 * waiting on the page lock to use the new page via the page tables
+	 * before the new page is unlocked.
+	 */
+	remove_from_swap(newpage);
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because to has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
+{
+	int retry;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, from, lru) {
+		struct page *newpage = NULL;
+		struct address_space *mapping;
+
+		cond_resched();
+
+		rc = 0;
+		if (page_count(page) == 1)
+			/* page was freed from under us. So we are done. */
+			goto next;
+
+		if (to && list_empty(to))
+			break;
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later we
+		 * use lock_page() to have a higher chance of acquiring the
+		 * lock.
+		 */
+		rc = -EAGAIN;
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto next;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0) {
+			wait_on_page_writeback(page);
+		} else {
+			if (PageWriteback(page))
+				goto unlock_page;
+		}
+
+		/*
+		 * Anonymous pages must have swap cache references otherwise
+		 * the information contained in the page maps cannot be
+		 * preserved.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
+				rc = -ENOMEM;
+				goto unlock_page;
+			}
+		}
+
+		if (!to) {
+			rc = swap_page(page);
+			goto next;
+		}
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+
+		/*
+		 * Pages are properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		mapping = page_mapping(page);
+		if (!mapping)
+			goto unlock_both;
+
+		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+                }
+
+		/*
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
+		 */
+		if (PageDirty(page)) {
+			switch (pageout(page, mapping)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+				goto unlock_both;
+
+			case PAGE_SUCCESS:
+				unlock_page(newpage);
+				goto next;
+
+			case PAGE_CLEAN:
+				; /* try to migrate the page below */
+			}
+                }
+
+		/*
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
+		 */
+		if (!page_has_buffers(page) ||
+		    try_to_release_page(page, GFP_KERNEL)) {
+			rc = migrate_page(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * On early passes with mapped pages simply
+		 * retry. There may be a lock held for some
+		 * buffers that may go away. Later
+		 * swap them out.
+		 */
+		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers..... As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
+			unlock_page(newpage);
+			newpage = NULL;
+			rc = swap_page(page);
+			goto next;
+		}
+
+unlock_both:
+		unlock_page(newpage);
+
+unlock_page:
+		unlock_page(page);
+
+next:
+		if (rc == -EAGAIN) {
+			retry++;
+		} else if (rc) {
+			/* Permanent failure */
+			list_move(&page->lru, failed);
+			nr_failed++;
+		} else {
+			if (newpage) {
+				/* Successful migration. Return page to LRU */
+				move_to_lru(newpage);
+			}
+			list_move(&page->lru, moved);
+		}
+	}
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return nr_failed + retry;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct page *newpage, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct buffer_head *bh, *head;
+	int rc;
+
+	if (!mapping)
+		return -EAGAIN;
+
+	if (!page_has_buffers(page))
+		return migrate_page(newpage, page);
+
+	head = page_buffers(page);
+
+	rc = migrate_page_remove_references(newpage, page, 3);
+
+	if (rc)
+		return rc;
+
+	bh = head;
+	do {
+		get_bh(bh);
+		lock_buffer(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	ClearPagePrivate(page);
+	set_page_private(newpage, page_private(page));
+	set_page_private(page, 0);
+	put_page(page);
+	get_page(newpage);
+
+	bh = head;
+	do {
+		set_bh_page(bh, newpage, bh_offset(bh));
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	SetPagePrivate(newpage);
+
+	migrate_page_copy(newpage, page);
+
+	bh = head;
+	do {
+		unlock_buffer(bh);
+ 		put_bh(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
+{
+	LIST_HEAD(newlist);
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int err = 0;
+	unsigned long offset = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
+
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e1636..d7af296833fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/migrate.h>
 
 #include <asm/pgtable.h>
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 548e023c193b..fd572bbdc9f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,18 +42,6 @@
 
 #include "internal.h"
 
-/* possible outcome of pageout() */
-typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
-	/* move page to the active list, page is locked */
-	PAGE_ACTIVATE,
-	/* page has been sent to the disk successfully, page is unlocked */
-	PAGE_SUCCESS,
-	/* page is clean and locked */
-	PAGE_CLEAN,
-} pageout_t;
-
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -304,7 +292,7 @@ static void handle_write_error(struct address_space *mapping,
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -372,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
-static int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page)
 {
 	if (!mapping)
 		return 0;		/* truncate got there first */
@@ -570,481 +558,6 @@ keep:
 	return nr_reclaimed;
 }
 
-#ifdef CONFIG_MIGRATION
-static inline void move_to_lru(struct page *page)
-{
-	list_del(&page->lru);
-	if (PageActive(page)) {
-		/*
-		 * lru_cache_add_active checks that
-		 * the PG_active bit is off.
-		 */
-		ClearPageActive(page);
-		lru_cache_add_active(page);
-	} else {
-		lru_cache_add(page);
-	}
-	put_page(page);
-}
-
-/*
- * Add isolated pages on the list back to the LRU.
- *
- * returns the number of pages put back.
- */
-unsigned long putback_lru_pages(struct list_head *l)
-{
-	struct page *page;
-	struct page *page2;
-	unsigned long count = 0;
-
-	list_for_each_entry_safe(page, page2, l, lru) {
-		move_to_lru(page);
-		count++;
-	}
-	return count;
-}
-
-/*
- * Non migratable page
- */
-int fail_migrate_page(struct page *newpage, struct page *page)
-{
-	return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
-/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
- */
-static int swap_page(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-
-	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
-			goto unlock_retry;
-
-	if (PageDirty(page)) {
-		/* Page is dirty, try to write it out here */
-		switch(pageout(page, mapping)) {
-		case PAGE_KEEP:
-		case PAGE_ACTIVATE:
-			goto unlock_retry;
-
-		case PAGE_SUCCESS:
-			goto retry;
-
-		case PAGE_CLEAN:
-			; /* try to free the page below */
-		}
-	}
-
-	if (PagePrivate(page)) {
-		if (!try_to_release_page(page, GFP_KERNEL) ||
-		    (!mapping && page_count(page) == 1))
-			goto unlock_retry;
-	}
-
-	if (remove_mapping(mapping, page)) {
-		/* Success */
-		unlock_page(page);
-		return 0;
-	}
-
-unlock_retry:
-	unlock_page(page);
-
-retry:
-	return -EAGAIN;
-}
-EXPORT_SYMBOL(swap_page);
-
-/*
- * Page migration was first developed in the context of the memory hotplug
- * project. The main authors of the migration code are:
- *
- * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
- * Hirokazu Takahashi <taka@valinux.co.jp>
- * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
- */
-
-/*
- * Remove references for a page and establish the new page with the correct
- * basic settings to be able to stop accesses to the page.
- */
-int migrate_page_remove_references(struct page *newpage,
-				struct page *page, int nr_refs)
-{
-	struct address_space *mapping = page_mapping(page);
-	struct page **radix_pointer;
-
-	/*
-	 * Avoid doing any of the following work if the page count
-	 * indicates that the page is in use or truncate has removed
-	 * the page.
-	 */
-	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-		return -EAGAIN;
-
-	/*
-	 * Establish swap ptes for anonymous pages or destroy pte
-	 * maps for files.
-	 *
-	 * In order to reestablish file backed mappings the fault handlers
-	 * will take the radix tree_lock which may then be used to stop
-  	 * processses from accessing this page until the new page is ready.
-	 *
-	 * A process accessing via a swap pte (an anonymous page) will take a
-	 * page_lock on the old page which will block the process until the
-	 * migration attempt is complete. At that time the PageSwapCache bit
-	 * will be examined. If the page was migrated then the PageSwapCache
-	 * bit will be clear and the operation to retrieve the page will be
-	 * retried which will find the new page in the radix tree. Then a new
-	 * direct mapping may be generated based on the radix tree contents.
-	 *
-	 * If the page was not migrated then the PageSwapCache bit
-	 * is still set and the operation may continue.
-	 */
-	if (try_to_unmap(page, 1) == SWAP_FAIL)
-		/* A vma has VM_LOCKED set -> Permanent failure */
-		return -EPERM;
-
-	/*
-	 * Give up if we were unable to remove all mappings.
-	 */
-	if (page_mapcount(page))
-		return -EAGAIN;
-
-	write_lock_irq(&mapping->tree_lock);
-
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
-
-	if (!page_mapping(page) || page_count(page) != nr_refs ||
-			*radix_pointer != page) {
-		write_unlock_irq(&mapping->tree_lock);
-		return -EAGAIN;
-	}
-
-	/*
-	 * Now we know that no one else is looking at the page.
-	 *
-	 * Certain minimal information about a page must be available
-	 * in order for other subsystems to properly handle the page if they
-	 * find it through the radix tree update before we are finished
-	 * copying the page.
-	 */
-	get_page(newpage);
-	newpage->index = page->index;
-	newpage->mapping = page->mapping;
-	if (PageSwapCache(page)) {
-		SetPageSwapCache(newpage);
-		set_page_private(newpage, page_private(page));
-	}
-
-	*radix_pointer = newpage;
-	__put_page(page);
-	write_unlock_irq(&mapping->tree_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(migrate_page_remove_references);
-
-/*
- * Copy the page to its new location
- */
-void migrate_page_copy(struct page *newpage, struct page *page)
-{
-	copy_highpage(newpage, page);
-
-	if (PageError(page))
-		SetPageError(newpage);
-	if (PageReferenced(page))
-		SetPageReferenced(newpage);
-	if (PageUptodate(page))
-		SetPageUptodate(newpage);
-	if (PageActive(page))
-		SetPageActive(newpage);
-	if (PageChecked(page))
-		SetPageChecked(newpage);
-	if (PageMappedToDisk(page))
-		SetPageMappedToDisk(newpage);
-
-	if (PageDirty(page)) {
-		clear_page_dirty_for_io(page);
-		set_page_dirty(newpage);
- 	}
-
-	ClearPageSwapCache(page);
-	ClearPageActive(page);
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	page->mapping = NULL;
-
-	/*
-	 * If any waiters have accumulated on the new page then
-	 * wake them up.
-	 */
-	if (PageWriteback(newpage))
-		end_page_writeback(newpage);
-}
-EXPORT_SYMBOL(migrate_page_copy);
-
-/*
- * Common logic to directly migrate a single page suitable for
- * pages that do not use PagePrivate.
- *
- * Pages are locked upon entry and exit.
- */
-int migrate_page(struct page *newpage, struct page *page)
-{
-	int rc;
-
-	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
-
-	rc = migrate_page_remove_references(newpage, page, 2);
-
-	if (rc)
-		return rc;
-
-	migrate_page_copy(newpage, page);
-
-	/*
-	 * Remove auxiliary swap entries and replace
-	 * them with real ptes.
-	 *
-	 * Note that a real pte entry will allow processes that are not
-	 * waiting on the page lock to use the new page via the page tables
-	 * before the new page is unlocked.
-	 */
-	remove_from_swap(newpage);
-	return 0;
-}
-EXPORT_SYMBOL(migrate_page);
-
-/*
- * migrate_pages
- *
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
- *
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
- *
- * Return: Number of pages not migrated when "to" ran empty.
- */
-unsigned long migrate_pages(struct list_head *from, struct list_head *to,
-		  struct list_head *moved, struct list_head *failed)
-{
-	unsigned long retry;
-	unsigned long nr_failed = 0;
-	int pass = 0;
-	struct page *page;
-	struct page *page2;
-	int swapwrite = current->flags & PF_SWAPWRITE;
-	int rc;
-
-	if (!swapwrite)
-		current->flags |= PF_SWAPWRITE;
-
-redo:
-	retry = 0;
-
-	list_for_each_entry_safe(page, page2, from, lru) {
-		struct page *newpage = NULL;
-		struct address_space *mapping;
-
-		cond_resched();
-
-		rc = 0;
-		if (page_count(page) == 1)
-			/* page was freed from under us. So we are done. */
-			goto next;
-
-		if (to && list_empty(to))
-			break;
-
-		/*
-		 * Skip locked pages during the first two passes to give the
-		 * functions holding the lock time to release the page. Later we
-		 * use lock_page() to have a higher chance of acquiring the
-		 * lock.
-		 */
-		rc = -EAGAIN;
-		if (pass > 2)
-			lock_page(page);
-		else
-			if (TestSetPageLocked(page))
-				goto next;
-
-		/*
-		 * Only wait on writeback if we have already done a pass where
-		 * we we may have triggered writeouts for lots of pages.
-		 */
-		if (pass > 0) {
-			wait_on_page_writeback(page);
-		} else {
-			if (PageWriteback(page))
-				goto unlock_page;
-		}
-
-		/*
-		 * Anonymous pages must have swap cache references otherwise
-		 * the information contained in the page maps cannot be
-		 * preserved.
-		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page, GFP_KERNEL)) {
-				rc = -ENOMEM;
-				goto unlock_page;
-			}
-		}
-
-		if (!to) {
-			rc = swap_page(page);
-			goto next;
-		}
-
-		newpage = lru_to_page(to);
-		lock_page(newpage);
-
-		/*
-		 * Pages are properly locked and writeback is complete.
-		 * Try to migrate the page.
-		 */
-		mapping = page_mapping(page);
-		if (!mapping)
-			goto unlock_both;
-
-		if (mapping->a_ops->migratepage) {
-			/*
-			 * Most pages have a mapping and most filesystems
-			 * should provide a migration function. Anonymous
-			 * pages are part of swap space which also has its
-			 * own migration function. This is the most common
-			 * path for page migration.
-			 */
-			rc = mapping->a_ops->migratepage(newpage, page);
-			goto unlock_both;
-                }
-
-		/*
-		 * Default handling if a filesystem does not provide
-		 * a migration function. We can only migrate clean
-		 * pages so try to write out any dirty pages first.
-		 */
-		if (PageDirty(page)) {
-			switch (pageout(page, mapping)) {
-			case PAGE_KEEP:
-			case PAGE_ACTIVATE:
-				goto unlock_both;
-
-			case PAGE_SUCCESS:
-				unlock_page(newpage);
-				goto next;
-
-			case PAGE_CLEAN:
-				; /* try to migrate the page below */
-			}
-                }
-
-		/*
-		 * Buffers are managed in a filesystem specific way.
-		 * We must have no buffers or drop them.
-		 */
-		if (!page_has_buffers(page) ||
-		    try_to_release_page(page, GFP_KERNEL)) {
-			rc = migrate_page(newpage, page);
-			goto unlock_both;
-		}
-
-		/*
-		 * On early passes with mapped pages simply
-		 * retry. There may be a lock held for some
-		 * buffers that may go away. Later
-		 * swap them out.
-		 */
-		if (pass > 4) {
-			/*
-			 * Persistently unable to drop buffers..... As a
-			 * measure of last resort we fall back to
-			 * swap_page().
-			 */
-			unlock_page(newpage);
-			newpage = NULL;
-			rc = swap_page(page);
-			goto next;
-		}
-
-unlock_both:
-		unlock_page(newpage);
-
-unlock_page:
-		unlock_page(page);
-
-next:
-		if (rc == -EAGAIN) {
-			retry++;
-		} else if (rc) {
-			/* Permanent failure */
-			list_move(&page->lru, failed);
-			nr_failed++;
-		} else {
-			if (newpage) {
-				/* Successful migration. Return page to LRU */
-				move_to_lru(newpage);
-			}
-			list_move(&page->lru, moved);
-		}
-	}
-	if (retry && pass++ < 10)
-		goto redo;
-
-	if (!swapwrite)
-		current->flags &= ~PF_SWAPWRITE;
-
-	return nr_failed + retry;
-}
-
-/*
- * Isolate one page from the LRU lists and put it on the
- * indicated list with elevated refcount.
- *
- * Result:
- *  0 = page not on LRU list
- *  1 = page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page)
-{
-	int ret = 0;
-
-	if (PageLRU(page)) {
-		struct zone *zone = page_zone(page);
-		spin_lock_irq(&zone->lru_lock);
-		if (PageLRU(page)) {
-			ret = 1;
-			get_page(page);
-			ClearPageLRU(page);
-			if (PageActive(page))
-				del_page_from_active_list(zone, page);
-			else
-				del_page_from_inactive_list(zone, page);
-		}
-		spin_unlock_irq(&zone->lru_lock);
-	}
-
-	return ret;
-}
-#endif
-
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
-- 
cgit v1.2.3


From a45049c51ce6a3fecf2a909b591b28164c927112 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 22 Mar 2006 13:55:40 -0800
Subject: [NETFILTER]: x_tables: set the protocol family in x_tables
 targets/matches

Set the family field in xt_[matches|targets] registered.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h        |  8 ++++----
 include/linux/netfilter_arp/arp_tables.h  |  6 ++++--
 include/linux/netfilter_ipv4/ip_tables.h  | 14 +++++++++-----
 include/linux/netfilter_ipv6/ip6_tables.h | 14 +++++++++-----
 net/ipv4/netfilter/arp_tables.c           |  6 ++++--
 net/ipv4/netfilter/ip_tables.c            | 15 +++++++++------
 net/ipv6/netfilter/ip6_tables.c           | 15 +++++++++------
 net/netfilter/x_tables.c                  | 16 ++++++++++------
 net/netfilter/xt_CLASSIFY.c               | 12 +++++++-----
 net/netfilter/xt_CONNMARK.c               | 12 +++++++-----
 net/netfilter/xt_MARK.c                   | 21 ++++++++++++---------
 net/netfilter/xt_NFQUEUE.c                | 19 +++++++++++--------
 net/netfilter/xt_NOTRACK.c                | 12 +++++++-----
 net/netfilter/xt_comment.c                | 12 +++++++-----
 net/netfilter/xt_connbytes.c              | 12 +++++++-----
 net/netfilter/xt_connmark.c               | 12 +++++++-----
 net/netfilter/xt_conntrack.c              |  5 +++--
 net/netfilter/xt_dccp.c                   | 12 +++++++-----
 net/netfilter/xt_helper.c                 | 12 +++++++-----
 net/netfilter/xt_length.c                 | 12 +++++++-----
 net/netfilter/xt_limit.c                  | 12 +++++++-----
 net/netfilter/xt_mac.c                    | 12 +++++++-----
 net/netfilter/xt_mark.c                   | 12 +++++++-----
 net/netfilter/xt_physdev.c                | 12 +++++++-----
 net/netfilter/xt_pkttype.c                | 12 +++++++-----
 net/netfilter/xt_policy.c                 | 12 +++++++-----
 net/netfilter/xt_realm.c                  |  5 +++--
 net/netfilter/xt_sctp.c                   | 12 +++++++-----
 net/netfilter/xt_state.c                  | 12 +++++++-----
 net/netfilter/xt_string.c                 | 12 +++++++-----
 net/netfilter/xt_tcpmss.c                 | 12 +++++++-----
 net/netfilter/xt_tcpudp.c                 | 26 +++++++++++++++-----------
 32 files changed, 235 insertions(+), 163 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 46a0f974f87c..bf71efb63007 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -220,10 +220,10 @@ struct xt_table_info
 	char *entries[NR_CPUS];
 };
 
-extern int xt_register_target(int af, struct xt_target *target);
-extern void xt_unregister_target(int af, struct xt_target *target);
-extern int xt_register_match(int af, struct xt_match *target);
-extern void xt_unregister_match(int af, struct xt_match *target);
+extern int xt_register_target(struct xt_target *target);
+extern void xt_unregister_target(struct xt_target *target);
+extern int xt_register_match(struct xt_match *target);
+extern void xt_unregister_match(struct xt_match *target);
 
 extern int xt_check_match(const struct xt_match *match, unsigned short family,
 			  unsigned int size, const char *table, unsigned int hook,
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index fd21796e5131..a27be05f67f0 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -263,8 +263,10 @@ static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e
  */
 #ifdef __KERNEL__
 
-#define arpt_register_target(tgt) xt_register_target(NF_ARP, tgt)
-#define arpt_unregister_target(tgt) xt_unregister_target(NF_ARP, tgt)
+#define arpt_register_target(tgt) 	\
+({	(tgt)->family = NF_ARP;		\
+ 	xt_register_target(tgt); })
+#define arpt_unregister_target(tgt) xt_unregister_target(tgt)
 
 extern int arpt_register_table(struct arpt_table *table,
 			       const struct arpt_replace *repl);
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 76ba24b68515..ee262b5344e1 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -321,11 +321,15 @@ ipt_get_target(struct ipt_entry *e)
 #include <linux/init.h>
 extern void ipt_init(void) __init;
 
-#define ipt_register_target(tgt) xt_register_target(AF_INET, tgt)
-#define ipt_unregister_target(tgt) xt_unregister_target(AF_INET, tgt)
-
-#define ipt_register_match(mtch) xt_register_match(AF_INET, mtch)
-#define ipt_unregister_match(mtch) xt_unregister_match(AF_INET, mtch)
+#define ipt_register_target(tgt) 	\
+({	(tgt)->family = AF_INET;	\
+ 	xt_register_target(tgt); })
+#define ipt_unregister_target(tgt) xt_unregister_target(tgt)
+
+#define ipt_register_match(mtch) 	\
+({	(mtch)->family = AF_INET;	\
+	xt_register_match(mtch); })
+#define ipt_unregister_match(mtch) xt_unregister_match(mtch)
 
 //#define ipt_register_table(tbl, repl) xt_register_table(AF_INET, tbl, repl)
 //#define ipt_unregister_table(tbl) xt_unregister_table(AF_INET, tbl)
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index f249b574f0fa..7107f942eb05 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -334,11 +334,15 @@ ip6t_get_target(struct ip6t_entry *e)
 #include <linux/init.h>
 extern void ip6t_init(void) __init;
 
-#define ip6t_register_target(tgt) xt_register_target(AF_INET6, tgt)
-#define ip6t_unregister_target(tgt) xt_unregister_target(AF_INET6, tgt)
-
-#define ip6t_register_match(match) xt_register_match(AF_INET6, match)
-#define ip6t_unregister_match(match) xt_unregister_match(AF_INET6, match)
+#define ip6t_register_target(tgt) 		\
+({	(tgt)->family = AF_INET6;		\
+ 	xt_register_target(tgt); })
+#define ip6t_unregister_target(tgt) xt_unregister_target(tgt)
+
+#define ip6t_register_match(match)		\
+({	(match)->family = AF_INET6;		\
+	xt_register_match(match); })
+#define ip6t_unregister_match(match) xt_unregister_match(match)
 
 extern int ip6t_register_table(struct ip6t_table *table,
 			       const struct ip6t_replace *repl);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f7efb3f27bf5..ff0c594a4198 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1146,12 +1146,14 @@ void arpt_unregister_table(struct arpt_table *table)
 static struct arpt_target arpt_standard_target = {
 	.name		= ARPT_STANDARD_TARGET,
 	.targetsize	= sizeof(int),
+	.family		= NF_ARP,
 };
 
 static struct arpt_target arpt_error_target = {
 	.name		= ARPT_ERROR_TARGET,
 	.target		= arpt_error,
 	.targetsize	= ARPT_FUNCTION_MAXNAMELEN,
+	.family		= NF_ARP,
 };
 
 static struct nf_sockopt_ops arpt_sockopts = {
@@ -1171,8 +1173,8 @@ static int __init init(void)
 	xt_proto_init(NF_ARP);
 
 	/* Noone else will be downing sem now, so we won't sleep */
-	xt_register_target(NF_ARP, &arpt_standard_target);
-	xt_register_target(NF_ARP, &arpt_error_target);
+	xt_register_target(&arpt_standard_target);
+	xt_register_target(&arpt_error_target);
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&arpt_sockopts);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 39705f9bc154..a7b194c4d79d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1335,12 +1335,14 @@ icmp_checkentry(const char *tablename,
 static struct ipt_target ipt_standard_target = {
 	.name		= IPT_STANDARD_TARGET,
 	.targetsize	= sizeof(int),
+	.family		= AF_INET,
 };
 
 static struct ipt_target ipt_error_target = {
 	.name		= IPT_ERROR_TARGET,
 	.target		= ipt_error,
 	.targetsize	= IPT_FUNCTION_MAXNAMELEN,
+	.family		= AF_INET,
 };
 
 static struct nf_sockopt_ops ipt_sockopts = {
@@ -1358,6 +1360,7 @@ static struct ipt_match icmp_matchstruct = {
 	.match		= icmp_match,
 	.matchsize	= sizeof(struct ipt_icmp),
 	.proto		= IPPROTO_ICMP,
+	.family		= AF_INET,
 	.checkentry	= icmp_checkentry,
 };
 
@@ -1368,9 +1371,9 @@ static int __init init(void)
 	xt_proto_init(AF_INET);
 
 	/* Noone else will be downing sem now, so we won't sleep */
-	xt_register_target(AF_INET, &ipt_standard_target);
-	xt_register_target(AF_INET, &ipt_error_target);
-	xt_register_match(AF_INET, &icmp_matchstruct);
+	xt_register_target(&ipt_standard_target);
+	xt_register_target(&ipt_error_target);
+	xt_register_match(&icmp_matchstruct);
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&ipt_sockopts);
@@ -1387,9 +1390,9 @@ static void __exit fini(void)
 {
 	nf_unregister_sockopt(&ipt_sockopts);
 
-	xt_unregister_match(AF_INET, &icmp_matchstruct);
-	xt_unregister_target(AF_INET, &ipt_error_target);
-	xt_unregister_target(AF_INET, &ipt_standard_target);
+	xt_unregister_match(&icmp_matchstruct);
+	xt_unregister_target(&ipt_error_target);
+	xt_unregister_target(&ipt_standard_target);
 
 	xt_proto_fini(AF_INET);
 }
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 5a2063bda676..db3c9ae98e95 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1377,12 +1377,14 @@ icmp6_checkentry(const char *tablename,
 static struct ip6t_target ip6t_standard_target = {
 	.name		= IP6T_STANDARD_TARGET,
 	.targetsize	= sizeof(int),
+	.family		= AF_INET6,
 };
 
 static struct ip6t_target ip6t_error_target = {
 	.name		= IP6T_ERROR_TARGET,
 	.target		= ip6t_error,
 	.targetsize	= IP6T_FUNCTION_MAXNAMELEN,
+	.family		= AF_INET6,
 };
 
 static struct nf_sockopt_ops ip6t_sockopts = {
@@ -1401,6 +1403,7 @@ static struct ip6t_match icmp6_matchstruct = {
 	.matchsize	= sizeof(struct ip6t_icmp),
 	.checkentry	= icmp6_checkentry,
 	.proto		= IPPROTO_ICMPV6,
+	.family		= AF_INET6,
 };
 
 static int __init init(void)
@@ -1410,9 +1413,9 @@ static int __init init(void)
 	xt_proto_init(AF_INET6);
 
 	/* Noone else will be downing sem now, so we won't sleep */
-	xt_register_target(AF_INET6, &ip6t_standard_target);
-	xt_register_target(AF_INET6, &ip6t_error_target);
-	xt_register_match(AF_INET6, &icmp6_matchstruct);
+	xt_register_target(&ip6t_standard_target);
+	xt_register_target(&ip6t_error_target);
+	xt_register_match(&icmp6_matchstruct);
 
 	/* Register setsockopt */
 	ret = nf_register_sockopt(&ip6t_sockopts);
@@ -1429,9 +1432,9 @@ static int __init init(void)
 static void __exit fini(void)
 {
 	nf_unregister_sockopt(&ip6t_sockopts);
-	xt_unregister_match(AF_INET6, &icmp6_matchstruct);
-	xt_unregister_target(AF_INET6, &ip6t_error_target);
-	xt_unregister_target(AF_INET6, &ip6t_standard_target);
+	xt_unregister_match(&icmp6_matchstruct);
+	xt_unregister_target(&ip6t_error_target);
+	xt_unregister_target(&ip6t_standard_target);
 	xt_proto_fini(AF_INET6);
 }
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 750b92829766..0a29a24d9a72 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -60,9 +60,9 @@ static const char *xt_prefix[NPROTO] = {
 
 /* Registration hooks for targets. */
 int
-xt_register_target(int af, struct xt_target *target)
+xt_register_target(struct xt_target *target)
 {
-	int ret;
+	int ret, af = target->family;
 
 	ret = down_interruptible(&xt[af].mutex);
 	if (ret != 0)
@@ -74,8 +74,10 @@ xt_register_target(int af, struct xt_target *target)
 EXPORT_SYMBOL(xt_register_target);
 
 void
-xt_unregister_target(int af, struct xt_target *target)
+xt_unregister_target(struct xt_target *target)
 {
+	int af = target->family;
+
 	down(&xt[af].mutex);
 	LIST_DELETE(&xt[af].target, target);
 	up(&xt[af].mutex);
@@ -83,9 +85,9 @@ xt_unregister_target(int af, struct xt_target *target)
 EXPORT_SYMBOL(xt_unregister_target);
 
 int
-xt_register_match(int af, struct xt_match *match)
+xt_register_match(struct xt_match *match)
 {
-	int ret;
+	int ret, af = match->family;
 
 	ret = down_interruptible(&xt[af].mutex);
 	if (ret != 0)
@@ -99,8 +101,10 @@ xt_register_match(int af, struct xt_match *match)
 EXPORT_SYMBOL(xt_register_match);
 
 void
-xt_unregister_match(int af, struct xt_match *match)
+xt_unregister_match(struct xt_match *match)
 {
+	int af =  match->family;
+
 	down(&xt[af].mutex);
 	LIST_DELETE(&xt[af].match, match);
 	up(&xt[af].mutex);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index 3224ed87d4c7..3cd2ac90a25b 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -47,6 +47,7 @@ static struct xt_target classify_reg = {
 	.table		= "mangle",
 	.hooks		= (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
 		          (1 << NF_IP_POST_ROUTING),
+	.family		= AF_INET,
 	.me 		= THIS_MODULE,
 };
 static struct xt_target classify6_reg = { 
@@ -56,6 +57,7 @@ static struct xt_target classify6_reg = {
 	.table		= "mangle",
 	.hooks		= (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
 		          (1 << NF_IP_POST_ROUTING),
+	.family		= AF_INET6,
 	.me 		= THIS_MODULE,
 };
 
@@ -64,21 +66,21 @@ static int __init init(void)
 {
 	int ret;
 
-	ret = xt_register_target(AF_INET, &classify_reg);
+	ret = xt_register_target(&classify_reg);
 	if (ret)
 		return ret;
 
-	ret = xt_register_target(AF_INET6, &classify6_reg);
+	ret = xt_register_target(&classify6_reg);
 	if (ret)
-		xt_unregister_target(AF_INET, &classify_reg);
+		xt_unregister_target(&classify_reg);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_target(AF_INET, &classify_reg);
-	xt_unregister_target(AF_INET6, &classify6_reg);
+	xt_unregister_target(&classify_reg);
+	xt_unregister_target(&classify6_reg);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index df2486a3efd5..35448b8e6883 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -102,6 +102,7 @@ static struct xt_target connmark_reg = {
 	.target		= target,
 	.targetsize	= sizeof(struct xt_connmark_target_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET,
 	.me		= THIS_MODULE
 };
 
@@ -110,6 +111,7 @@ static struct xt_target connmark6_reg = {
 	.target		= target,
 	.targetsize	= sizeof(struct xt_connmark_target_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE
 };
 
@@ -119,21 +121,21 @@ static int __init init(void)
 
 	need_conntrack();
 
-	ret = xt_register_target(AF_INET, &connmark_reg);
+	ret = xt_register_target(&connmark_reg);
 	if (ret)
 		return ret;
 
-	ret = xt_register_target(AF_INET6, &connmark6_reg);
+	ret = xt_register_target(&connmark6_reg);
 	if (ret)
-		xt_unregister_target(AF_INET, &connmark_reg);
+		xt_unregister_target(&connmark_reg);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_target(AF_INET, &connmark_reg);
-	xt_unregister_target(AF_INET6, &connmark6_reg);
+	xt_unregister_target(&connmark_reg);
+	xt_unregister_target(&connmark6_reg);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index dcb5266efae0..73bdd5c80e17 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -119,6 +119,7 @@ static struct xt_target ipt_mark_reg_v0 = {
 	.table		= "mangle",
 	.checkentry	= checkentry_v0,
 	.me		= THIS_MODULE,
+	.family		= AF_INET,
 	.revision	= 0,
 };
 
@@ -129,6 +130,7 @@ static struct xt_target ipt_mark_reg_v1 = {
 	.table		= "mangle",
 	.checkentry	= checkentry_v1,
 	.me		= THIS_MODULE,
+	.family		= AF_INET,
 	.revision	= 1,
 };
 
@@ -139,6 +141,7 @@ static struct xt_target ip6t_mark_reg_v0 = {
 	.table		= "mangle",
 	.checkentry	= checkentry_v0,
 	.me		= THIS_MODULE,
+	.family		= AF_INET6,
 	.revision	= 0,
 };
 
@@ -146,18 +149,18 @@ static int __init init(void)
 {
 	int err;
 
-	err = xt_register_target(AF_INET, &ipt_mark_reg_v0);
+	err = xt_register_target(&ipt_mark_reg_v0);
 	if (err)
 		return err;
 
-	err = xt_register_target(AF_INET, &ipt_mark_reg_v1);
+	err = xt_register_target(&ipt_mark_reg_v1);
 	if (err)
-		xt_unregister_target(AF_INET, &ipt_mark_reg_v0);
+		xt_unregister_target(&ipt_mark_reg_v0);
 
-	err = xt_register_target(AF_INET6, &ip6t_mark_reg_v0);
+	err = xt_register_target(&ip6t_mark_reg_v0);
 	if (err) {
-		xt_unregister_target(AF_INET, &ipt_mark_reg_v0);
-		xt_unregister_target(AF_INET, &ipt_mark_reg_v1);
+		xt_unregister_target(&ipt_mark_reg_v0);
+		xt_unregister_target(&ipt_mark_reg_v1);
 	}
 
 	return err;
@@ -165,9 +168,9 @@ static int __init init(void)
 
 static void __exit fini(void)
 {
-	xt_unregister_target(AF_INET, &ipt_mark_reg_v0);
-	xt_unregister_target(AF_INET, &ipt_mark_reg_v1);
-	xt_unregister_target(AF_INET6, &ip6t_mark_reg_v0);
+	xt_unregister_target(&ipt_mark_reg_v0);
+	xt_unregister_target(&ipt_mark_reg_v1);
+	xt_unregister_target(&ip6t_mark_reg_v0);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 39a963edf16b..2873e1c60f68 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -41,6 +41,7 @@ static struct xt_target ipt_NFQ_reg = {
 	.name		= "NFQUEUE",
 	.target		= target,
 	.targetsize	= sizeof(struct xt_NFQ_info),
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -48,6 +49,7 @@ static struct xt_target ip6t_NFQ_reg = {
 	.name		= "NFQUEUE",
 	.target		= target,
 	.targetsize	= sizeof(struct xt_NFQ_info),
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -55,36 +57,37 @@ static struct xt_target arpt_NFQ_reg = {
 	.name		= "NFQUEUE",
 	.target		= target,
 	.targetsize	= sizeof(struct xt_NFQ_info),
+	.family		= NF_ARP,
 	.me		= THIS_MODULE,
 };
 
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_target(AF_INET, &ipt_NFQ_reg);
+	ret = xt_register_target(&ipt_NFQ_reg);
 	if (ret)
 		return ret;
-	ret = xt_register_target(AF_INET6, &ip6t_NFQ_reg);
+	ret = xt_register_target(&ip6t_NFQ_reg);
 	if (ret)
 		goto out_ip;
-	ret = xt_register_target(NF_ARP, &arpt_NFQ_reg);
+	ret = xt_register_target(&arpt_NFQ_reg);
 	if (ret)
 		goto out_ip6;
 
 	return ret;
 out_ip6:
-	xt_unregister_target(AF_INET6, &ip6t_NFQ_reg);
+	xt_unregister_target(&ip6t_NFQ_reg);
 out_ip:
-	xt_unregister_target(AF_INET, &ipt_NFQ_reg);
+	xt_unregister_target(&ipt_NFQ_reg);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_target(NF_ARP, &arpt_NFQ_reg);
-	xt_unregister_target(AF_INET6, &ip6t_NFQ_reg);
-	xt_unregister_target(AF_INET, &ipt_NFQ_reg);
+	xt_unregister_target(&arpt_NFQ_reg);
+	xt_unregister_target(&ip6t_NFQ_reg);
+	xt_unregister_target(&ipt_NFQ_reg);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index b8634e3f6169..cf2ebd76fd6f 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -39,6 +39,7 @@ static struct xt_target notrack_reg = {
 	.target		= target,
 	.targetsize	= 0,
 	.table		= "raw",
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -47,6 +48,7 @@ static struct xt_target notrack6_reg = {
 	.target		= target,
 	.targetsize	= 0,
 	.table		= "raw",
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -54,21 +56,21 @@ static int __init init(void)
 {
 	int ret;
 
-	ret = xt_register_target(AF_INET, &notrack_reg);
+	ret = xt_register_target(&notrack_reg);
 	if (ret)
 		return ret;
 
-	ret = xt_register_target(AF_INET6, &notrack6_reg);
+	ret = xt_register_target(&notrack6_reg);
 	if (ret)
-		xt_unregister_target(AF_INET, &notrack_reg);
+		xt_unregister_target(&notrack_reg);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_target(AF_INET6, &notrack6_reg);
-	xt_unregister_target(AF_INET, &notrack_reg);
+	xt_unregister_target(&notrack6_reg);
+	xt_unregister_target(&notrack_reg);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
index 03d9d741231c..2637724b498d 100644
--- a/net/netfilter/xt_comment.c
+++ b/net/netfilter/xt_comment.c
@@ -33,6 +33,7 @@ static struct xt_match comment_match = {
 	.name		= "comment",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_comment_info),
+	.family		= AF_INET,
 	.me		= THIS_MODULE
 };
 
@@ -40,6 +41,7 @@ static struct xt_match comment6_match = {
 	.name		= "comment",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_comment_info),
+	.family		= AF_INET6,
 	.me		= THIS_MODULE
 };
 
@@ -47,21 +49,21 @@ static int __init init(void)
 {
 	int ret;
 
-	ret = xt_register_match(AF_INET, &comment_match);
+	ret = xt_register_match(&comment_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &comment6_match);
+	ret = xt_register_match(&comment6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &comment_match);
+		xt_unregister_match(&comment_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &comment_match);
-	xt_unregister_match(AF_INET6, &comment6_match);
+	xt_unregister_match(&comment_match);
+	xt_unregister_match(&comment6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index f34ecb9485c7..4985f5ec58ca 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -148,6 +148,7 @@ static struct xt_match connbytes_match = {
 	.match		= match,
 	.checkentry	= check,
 	.matchsize	= sizeof(struct xt_connbytes_info),
+	.family		= AF_INET,
 	.me		= THIS_MODULE
 };
 static struct xt_match connbytes6_match = {
@@ -155,26 +156,27 @@ static struct xt_match connbytes6_match = {
 	.match		= match,
 	.checkentry	= check,
 	.matchsize	= sizeof(struct xt_connbytes_info),
+	.family		= AF_INET6,
 	.me		= THIS_MODULE
 };
 
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &connbytes_match);
+	ret = xt_register_match(&connbytes_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &connbytes6_match);
+	ret = xt_register_match(&connbytes6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &connbytes_match);
+		xt_unregister_match(&connbytes_match);
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &connbytes_match);
-	xt_unregister_match(AF_INET6, &connbytes6_match);
+	xt_unregister_match(&connbytes_match);
+	xt_unregister_match(&connbytes6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 51822471e919..e810600345e3 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -72,6 +72,7 @@ static struct xt_match connmark_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_connmark_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET,
 	.me		= THIS_MODULE
 };
 
@@ -80,6 +81,7 @@ static struct xt_match connmark6_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_connmark_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE
 };
 
@@ -89,20 +91,20 @@ static int __init init(void)
 
 	need_conntrack();
 
-	ret = xt_register_match(AF_INET, &connmark_match);
+	ret = xt_register_match(&connmark_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &connmark6_match);
+	ret = xt_register_match(&connmark6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &connmark_match);
+		xt_unregister_match(&connmark_match);
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET6, &connmark6_match);
-	xt_unregister_match(AF_INET, &connmark_match);
+	xt_unregister_match(&connmark6_match);
+	xt_unregister_match(&connmark_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 39fc29496e00..7d20caa0d605 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -207,6 +207,7 @@ static struct xt_match conntrack_match = {
 	.name		= "conntrack",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_conntrack_info),
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -214,14 +215,14 @@ static int __init init(void)
 {
 	int ret;
 	need_conntrack();
-	ret = xt_register_match(AF_INET, &conntrack_match);
+	ret = xt_register_match(&conntrack_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &conntrack_match);
+	xt_unregister_match(&conntrack_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index db6b70cdc770..2f331decd151 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -149,6 +149,7 @@ static struct xt_match dccp_match =
 	.matchsize	= sizeof(struct xt_dccp_info),
 	.proto		= IPPROTO_DCCP,
 	.checkentry	= checkentry,
+	.family		= AF_INET,
 	.me 		= THIS_MODULE,
 };
 static struct xt_match dccp6_match = 
@@ -158,6 +159,7 @@ static struct xt_match dccp6_match =
 	.matchsize	= sizeof(struct xt_dccp_info),
 	.proto		= IPPROTO_DCCP,
 	.checkentry	= checkentry,
+	.family		= AF_INET6,
 	.me 		= THIS_MODULE,
 };
 
@@ -172,17 +174,17 @@ static int __init init(void)
 	dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
 	if (!dccp_optbuf)
 		return -ENOMEM;
-	ret = xt_register_match(AF_INET, &dccp_match);
+	ret = xt_register_match(&dccp_match);
 	if (ret)
 		goto out_kfree;
-	ret = xt_register_match(AF_INET6, &dccp6_match);
+	ret = xt_register_match(&dccp6_match);
 	if (ret)
 		goto out_unreg;
 
 	return ret;
 
 out_unreg:
-	xt_unregister_match(AF_INET, &dccp_match);
+	xt_unregister_match(&dccp_match);
 out_kfree:
 	kfree(dccp_optbuf);
 
@@ -191,8 +193,8 @@ out_kfree:
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET6, &dccp6_match);
-	xt_unregister_match(AF_INET, &dccp_match);
+	xt_unregister_match(&dccp6_match);
+	xt_unregister_match(&dccp_match);
 	kfree(dccp_optbuf);
 }
 
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index ef8e54d40c92..7d2d68b9155f 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -153,6 +153,7 @@ static struct xt_match helper_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_helper_info),
 	.checkentry	= check,
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 static struct xt_match helper6_match = {
@@ -160,6 +161,7 @@ static struct xt_match helper6_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_helper_info),
 	.checkentry	= check,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -168,21 +170,21 @@ static int __init init(void)
 	int ret;
 	need_conntrack();
 
-	ret = xt_register_match(AF_INET, &helper_match);
+	ret = xt_register_match(&helper_match);
 	if (ret < 0)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &helper6_match);
+	ret = xt_register_match(&helper6_match);
 	if (ret < 0)
-		xt_unregister_match(AF_INET, &helper_match);
+		xt_unregister_match(&helper_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &helper_match);
-	xt_unregister_match(AF_INET6, &helper6_match);
+	xt_unregister_match(&helper_match);
+	xt_unregister_match(&helper6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index b9e60f041a64..38560caef757 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -56,6 +56,7 @@ static struct xt_match length_match = {
 	.name		= "length",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_length_info),
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -63,26 +64,27 @@ static struct xt_match length6_match = {
 	.name		= "length",
 	.match		= match6,
 	.matchsize	= sizeof(struct xt_length_info),
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &length_match);
+	ret = xt_register_match(&length_match);
 	if (ret)
 		return ret;
-	ret = xt_register_match(AF_INET6, &length6_match);
+	ret = xt_register_match(&length6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &length_match);
+		xt_unregister_match(&length_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &length_match);
-	xt_unregister_match(AF_INET6, &length6_match);
+	xt_unregister_match(&length_match);
+	xt_unregister_match(&length6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 3049e6f8889e..e91c1a444e77 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -141,6 +141,7 @@ static struct xt_match ipt_limit_reg = {
 	.match		= ipt_limit_match,
 	.matchsize	= sizeof(struct xt_rateinfo),
 	.checkentry	= ipt_limit_checkentry,
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 static struct xt_match limit6_reg = {
@@ -148,6 +149,7 @@ static struct xt_match limit6_reg = {
 	.match		= ipt_limit_match,
 	.matchsize	= sizeof(struct xt_rateinfo),
 	.checkentry	= ipt_limit_checkentry,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -155,21 +157,21 @@ static int __init init(void)
 {
 	int ret;
 	
-	ret = xt_register_match(AF_INET, &ipt_limit_reg);
+	ret = xt_register_match(&ipt_limit_reg);
 	if (ret)
 		return ret;
 	
-	ret = xt_register_match(AF_INET6, &limit6_reg);
+	ret = xt_register_match(&limit6_reg);
 	if (ret)
-		xt_unregister_match(AF_INET, &ipt_limit_reg);
+		xt_unregister_match(&ipt_limit_reg);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &ipt_limit_reg);
-	xt_unregister_match(AF_INET6, &limit6_reg);
+	xt_unregister_match(&ipt_limit_reg);
+	xt_unregister_match(&limit6_reg);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index b4559a46dce8..f4defa28a6ec 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -49,6 +49,7 @@ static struct xt_match mac_match = {
 	.matchsize	= sizeof(struct xt_mac_info),
 	.hooks		= (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) |
 			  (1 << NF_IP_FORWARD),
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 static struct xt_match mac6_match = {
@@ -57,27 +58,28 @@ static struct xt_match mac6_match = {
 	.matchsize	= sizeof(struct xt_mac_info),
 	.hooks		= (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) |
 			  (1 << NF_IP_FORWARD),
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &mac_match);
+	ret = xt_register_match(&mac_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &mac6_match);
+	ret = xt_register_match(&mac6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &mac_match);
+		xt_unregister_match(&mac_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &mac_match);
-	xt_unregister_match(AF_INET6, &mac6_match);
+	xt_unregister_match(&mac_match);
+	xt_unregister_match(&mac6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index c1a8f0f587f0..ce0badfeef9a 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -56,6 +56,7 @@ static struct xt_match mark_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_mark_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -64,27 +65,28 @@ static struct xt_match mark6_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_mark_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &mark_match);
+	ret = xt_register_match(&mark_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &mark6_match);
+	ret = xt_register_match(&mark6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &mark_match);
+		xt_unregister_match(&mark_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &mark_match);
-	xt_unregister_match(AF_INET6, &mark6_match);
+	xt_unregister_match(&mark_match);
+	xt_unregister_match(&mark6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index f788e8e76254..089f4f7e8636 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -121,6 +121,7 @@ static struct xt_match physdev_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_physdev_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -129,6 +130,7 @@ static struct xt_match physdev6_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_physdev_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -136,21 +138,21 @@ static int __init init(void)
 {
 	int ret;
 
-	ret = xt_register_match(AF_INET, &physdev_match);
+	ret = xt_register_match(&physdev_match);
 	if (ret < 0)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &physdev6_match);
+	ret = xt_register_match(&physdev6_match);
 	if (ret < 0)
-		xt_unregister_match(AF_INET, &physdev_match);
+		xt_unregister_match(&physdev_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &physdev_match);
-	xt_unregister_match(AF_INET6, &physdev6_match);
+	xt_unregister_match(&physdev_match);
+	xt_unregister_match(&physdev6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index f38638dfd139..8b8bca988ac6 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -37,6 +37,7 @@ static struct xt_match pkttype_match = {
 	.name		= "pkttype",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_pkttype_info),
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -44,27 +45,28 @@ static struct xt_match pkttype6_match = {
 	.name		= "pkttype",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_pkttype_info),
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &pkttype_match);
+	ret = xt_register_match(&pkttype_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &pkttype6_match);
+	ret = xt_register_match(&pkttype6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &pkttype_match);
+		xt_unregister_match(&pkttype_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &pkttype_match);
-	xt_unregister_match(AF_INET6, &pkttype6_match);
+	xt_unregister_match(&pkttype_match);
+	xt_unregister_match(&pkttype6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index 1ec22082f04d..d57a611ae0d3 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -172,6 +172,7 @@ static struct xt_match policy_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_policy_info),
 	.checkentry 	= checkentry,
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -181,6 +182,7 @@ static struct xt_match policy6_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_policy_info),
 	.checkentry	= checkentry,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -188,19 +190,19 @@ static int __init init(void)
 {
 	int ret;
 
-	ret = xt_register_match(AF_INET, &policy_match);
+	ret = xt_register_match(&policy_match);
 	if (ret)
 		return ret;
-	ret = xt_register_match(AF_INET6, &policy6_match);
+	ret = xt_register_match(&policy6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &policy_match);
+		xt_unregister_match(&policy_match);
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET6, &policy6_match);
-	xt_unregister_match(AF_INET, &policy_match);
+	xt_unregister_match(&policy6_match);
+	xt_unregister_match(&policy_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index 57815a07db67..5e31a4a835bf 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -45,17 +45,18 @@ static struct xt_match realm_match = {
 	.matchsize	= sizeof(struct xt_realm_info),
 	.hooks		= (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
 			  (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN),
+	.family		= AF_INET,
 	.me		= THIS_MODULE
 };
 
 static int __init init(void)
 {
-	return xt_register_match(AF_INET, &realm_match);
+	return xt_register_match(&realm_match);
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &realm_match);
+	xt_unregister_match(&realm_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index f5d698ba03ca..c6eb24a2fe13 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -186,6 +186,7 @@ static struct xt_match sctp_match = {
 	.matchsize	= sizeof(struct xt_sctp_info),
 	.proto		= IPPROTO_SCTP,
 	.checkentry	= checkentry,
+	.family		= AF_INET,
 	.me		= THIS_MODULE
 };
 
@@ -195,27 +196,28 @@ static struct xt_match sctp6_match = {
 	.matchsize	= sizeof(struct xt_sctp_info),
 	.proto		= IPPROTO_SCTP,
 	.checkentry	= checkentry,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE
 };
 
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &sctp_match);
+	ret = xt_register_match(&sctp_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &sctp6_match);
+	ret = xt_register_match(&sctp6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &sctp_match);
+		xt_unregister_match(&sctp_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET6, &sctp6_match);
-	xt_unregister_match(AF_INET, &sctp_match);
+	xt_unregister_match(&sctp6_match);
+	xt_unregister_match(&sctp_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index b8ec00cd51fc..7cd557c932ba 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -48,6 +48,7 @@ static struct xt_match state_match = {
 	.name		= "state",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_state_info),
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -55,6 +56,7 @@ static struct xt_match state6_match = {
 	.name		= "state",
 	.match		= match,
 	.matchsize	= sizeof(struct xt_state_info),
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -64,21 +66,21 @@ static int __init init(void)
 
 	need_conntrack();
 
-	ret = xt_register_match(AF_INET, &state_match);
+	ret = xt_register_match(&state_match);
 	if (ret < 0)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &state6_match);
+	ret = xt_register_match(&state6_match);
 	if (ret < 0)
-		xt_unregister_match(AF_INET,&state_match);
+		xt_unregister_match(&state_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &state_match);
-	xt_unregister_match(AF_INET6, &state6_match);
+	xt_unregister_match(&state_match);
+	xt_unregister_match(&state6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index fccbad6a7f40..703d80fccacf 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -78,6 +78,7 @@ static struct xt_match string_match = {
 	.matchsize	= sizeof(struct xt_string_info),
 	.checkentry	= checkentry,
 	.destroy 	= destroy,
+	.family		= AF_INET,
 	.me 		= THIS_MODULE
 };
 static struct xt_match string6_match = {
@@ -86,6 +87,7 @@ static struct xt_match string6_match = {
 	.matchsize	= sizeof(struct xt_string_info),
 	.checkentry	= checkentry,
 	.destroy 	= destroy,
+	.family		= AF_INET6,
 	.me 		= THIS_MODULE
 };
 
@@ -93,20 +95,20 @@ static int __init init(void)
 {
 	int ret;
 
-	ret = xt_register_match(AF_INET, &string_match);
+	ret = xt_register_match(&string_match);
 	if (ret)
 		return ret;
-	ret = xt_register_match(AF_INET6, &string6_match);
+	ret = xt_register_match(&string6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &string_match);
+		xt_unregister_match(&string_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET, &string_match);
-	xt_unregister_match(AF_INET6, &string6_match);
+	xt_unregister_match(&string_match);
+	xt_unregister_match(&string6_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 4925fc98f4ae..70a8858ae3f1 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -98,6 +98,7 @@ static struct xt_match tcpmss_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_tcpmss_match_info),
 	.proto		= IPPROTO_TCP,
+	.family		= AF_INET,
 	.me		= THIS_MODULE,
 };
 
@@ -106,6 +107,7 @@ static struct xt_match tcpmss6_match = {
 	.match		= match,
 	.matchsize	= sizeof(struct xt_tcpmss_match_info),
 	.proto		= IPPROTO_TCP,
+	.family		= AF_INET6,
 	.me		= THIS_MODULE,
 };
 
@@ -113,21 +115,21 @@ static struct xt_match tcpmss6_match = {
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &tcpmss_match);
+	ret = xt_register_match(&tcpmss_match);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &tcpmss6_match);
+	ret = xt_register_match(&tcpmss6_match);
 	if (ret)
-		xt_unregister_match(AF_INET, &tcpmss_match);
+		xt_unregister_match(&tcpmss_match);
 
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET6, &tcpmss6_match);
-	xt_unregister_match(AF_INET, &tcpmss_match);
+	xt_unregister_match(&tcpmss6_match);
+	xt_unregister_match(&tcpmss_match);
 }
 
 module_init(init);
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index b5cd0dd4e41f..14a990eb666a 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -204,6 +204,7 @@ static struct xt_match tcp_matchstruct = {
 	.match		= tcp_match,
 	.matchsize	= sizeof(struct xt_tcp),
 	.proto		= IPPROTO_TCP,
+	.family		= AF_INET,
 	.checkentry	= tcp_checkentry,
 	.me		= THIS_MODULE,
 };
@@ -213,6 +214,7 @@ static struct xt_match tcp6_matchstruct = {
 	.match		= tcp_match,
 	.matchsize	= sizeof(struct xt_tcp),
 	.proto		= IPPROTO_TCP,
+	.family		= AF_INET6,
 	.checkentry	= tcp_checkentry,
 	.me		= THIS_MODULE,
 };
@@ -222,6 +224,7 @@ static struct xt_match udp_matchstruct = {
 	.match		= udp_match,
 	.matchsize	= sizeof(struct xt_udp),
 	.proto		= IPPROTO_UDP,
+	.family		= AF_INET,
 	.checkentry	= udp_checkentry,
 	.me		= THIS_MODULE,
 };
@@ -230,6 +233,7 @@ static struct xt_match udp6_matchstruct = {
 	.match		= udp_match,
 	.matchsize	= sizeof(struct xt_udp),
 	.proto		= IPPROTO_UDP,
+	.family		= AF_INET6,
 	.checkentry	= udp_checkentry,
 	.me		= THIS_MODULE,
 };
@@ -237,39 +241,39 @@ static struct xt_match udp6_matchstruct = {
 static int __init init(void)
 {
 	int ret;
-	ret = xt_register_match(AF_INET, &tcp_matchstruct);
+	ret = xt_register_match(&tcp_matchstruct);
 	if (ret)
 		return ret;
 
-	ret = xt_register_match(AF_INET6, &tcp6_matchstruct);
+	ret = xt_register_match(&tcp6_matchstruct);
 	if (ret)
 		goto out_unreg_tcp;
 
-	ret = xt_register_match(AF_INET, &udp_matchstruct);
+	ret = xt_register_match(&udp_matchstruct);
 	if (ret)
 		goto out_unreg_tcp6;
 	
-	ret = xt_register_match(AF_INET6, &udp6_matchstruct);
+	ret = xt_register_match(&udp6_matchstruct);
 	if (ret)
 		goto out_unreg_udp;
 
 	return ret;
 
 out_unreg_udp:
-	xt_unregister_match(AF_INET, &tcp_matchstruct);
+	xt_unregister_match(&tcp_matchstruct);
 out_unreg_tcp6:
-	xt_unregister_match(AF_INET6, &tcp6_matchstruct);
+	xt_unregister_match(&tcp6_matchstruct);
 out_unreg_tcp:
-	xt_unregister_match(AF_INET, &tcp_matchstruct);
+	xt_unregister_match(&tcp_matchstruct);
 	return ret;
 }
 
 static void __exit fini(void)
 {
-	xt_unregister_match(AF_INET6, &udp6_matchstruct);
-	xt_unregister_match(AF_INET, &udp_matchstruct);
-	xt_unregister_match(AF_INET6, &tcp6_matchstruct);
-	xt_unregister_match(AF_INET, &tcp_matchstruct);
+	xt_unregister_match(&udp6_matchstruct);
+	xt_unregister_match(&udp_matchstruct);
+	xt_unregister_match(&tcp6_matchstruct);
+	xt_unregister_match(&tcp_matchstruct);
 }
 
 module_init(init);
-- 
cgit v1.2.3


From 1e30a014e311e627b91489ff5ec1b54496d308af Mon Sep 17 00:00:00 2001
From: Dmitry Mishin <dim@openvz.org>
Date: Wed, 22 Mar 2006 13:56:56 -0800
Subject: [NETFILTER]: futher {ip,ip6,arp}_tables unification

This patch moves {ip,ip6,arp}t_entry_{match,target} definitions to
x_tables.h. This move simplifies code and future compatibility fixes.

Signed-off-by: Dmitry Mishin <dim@openvz.org>
Acked-off-by: Kirill Korotaev <dev@openvz.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h        | 56 +++++++++++++++++++++++++++++
 include/linux/netfilter_arp/arp_tables.h  | 31 ++---------------
 include/linux/netfilter_ipv4/ip_tables.h  | 58 ++-----------------------------
 include/linux/netfilter_ipv6/ip6_tables.h | 57 ++----------------------------
 include/net/tc_act/tc_ipt.h               |  4 +--
 5 files changed, 66 insertions(+), 140 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index bf71efb63007..1350e47b0234 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -4,6 +4,62 @@
 #define XT_FUNCTION_MAXNAMELEN 30
 #define XT_TABLE_MAXNAMELEN 32
 
+struct xt_entry_match
+{
+	union {
+		struct {
+			u_int16_t match_size;
+
+			/* Used by userspace */
+			char name[XT_FUNCTION_MAXNAMELEN-1];
+
+			u_int8_t revision;
+		} user;
+		struct {
+			u_int16_t match_size;
+
+			/* Used inside the kernel */
+			struct xt_match *match;
+		} kernel;
+
+		/* Total length */
+		u_int16_t match_size;
+	} u;
+
+	unsigned char data[0];
+};
+
+struct xt_entry_target
+{
+	union {
+		struct {
+			u_int16_t target_size;
+
+			/* Used by userspace */
+			char name[XT_FUNCTION_MAXNAMELEN-1];
+
+			u_int8_t revision;
+		} user;
+		struct {
+			u_int16_t target_size;
+
+			/* Used inside the kernel */
+			struct xt_target *target;
+		} kernel;
+
+		/* Total length */
+		u_int16_t target_size;
+	} u;
+
+	unsigned char data[0];
+};
+
+struct xt_standard_target
+{
+	struct xt_entry_target target;
+	int verdict;
+};
+
 /* The argument to IPT_SO_GET_REVISION_*.  Returns highest revision
  * kernel supports, if >= revision. */
 struct xt_get_revision
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index a27be05f67f0..62cc27daca4e 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -65,35 +65,8 @@ struct arpt_arp {
 	u_int16_t invflags;
 };
 
-struct arpt_entry_target
-{
-	union {
-		struct {
-			u_int16_t target_size;
-
-			/* Used by userspace */
-			char name[ARPT_FUNCTION_MAXNAMELEN-1];
-			u_int8_t revision;
-		} user;
-		struct {
-			u_int16_t target_size;
-
-			/* Used inside the kernel */
-			struct arpt_target *target;
-		} kernel;
-
-		/* Total length */
-		u_int16_t target_size;
-	} u;
-
-	unsigned char data[0];
-};
-
-struct arpt_standard_target
-{
-	struct arpt_entry_target target;
-	int verdict;
-};
+#define arpt_entry_target xt_entry_target
+#define arpt_standard_target xt_standard_target
 
 /* Values for "flag" field in struct arpt_ip (general arp structure).
  * No flags defined yet.
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index ee262b5344e1..d5b8c0d6a12b 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -52,61 +52,9 @@ struct ipt_ip {
 	u_int8_t invflags;
 };
 
-struct ipt_entry_match
-{
-	union {
-		struct {
-			u_int16_t match_size;
-
-			/* Used by userspace */
-			char name[IPT_FUNCTION_MAXNAMELEN-1];
-
-			u_int8_t revision;
-		} user;
-		struct {
-			u_int16_t match_size;
-
-			/* Used inside the kernel */
-			struct ipt_match *match;
-		} kernel;
-
-		/* Total length */
-		u_int16_t match_size;
-	} u;
-
-	unsigned char data[0];
-};
-
-struct ipt_entry_target
-{
-	union {
-		struct {
-			u_int16_t target_size;
-
-			/* Used by userspace */
-			char name[IPT_FUNCTION_MAXNAMELEN-1];
-
-			u_int8_t revision;
-		} user;
-		struct {
-			u_int16_t target_size;
-
-			/* Used inside the kernel */
-			struct ipt_target *target;
-		} kernel;
-
-		/* Total length */
-		u_int16_t target_size;
-	} u;
-
-	unsigned char data[0];
-};
-
-struct ipt_standard_target
-{
-	struct ipt_entry_target target;
-	int verdict;
-};
+#define ipt_entry_match xt_entry_match
+#define ipt_entry_target xt_entry_target
+#define ipt_standard_target xt_standard_target
 
 #define ipt_counters xt_counters
 
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 7107f942eb05..d0d5d1ee4be3 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -56,60 +56,9 @@ struct ip6t_ip6 {
 	u_int8_t invflags;
 };
 
-/* FIXME: If alignment in kernel different from userspace? --RR */
-struct ip6t_entry_match
-{
-	union {
-		struct {
-			u_int16_t match_size;
-
-			/* Used by userspace */
-			char name[IP6T_FUNCTION_MAXNAMELEN-1];
-			u_int8_t revision;
-		} user;
-		struct {
-			u_int16_t match_size;
-
-			/* Used inside the kernel */
-			struct ip6t_match *match;
-		} kernel;
-
-		/* Total length */
-		u_int16_t match_size;
-	} u;
-
-	unsigned char data[0];
-};
-
-struct ip6t_entry_target
-{
-	union {
-		struct {
-			u_int16_t target_size;
-
-			/* Used by userspace */
-			char name[IP6T_FUNCTION_MAXNAMELEN-1];
-			u_int8_t revision;
-		} user;
-		struct {
-			u_int16_t target_size;
-
-			/* Used inside the kernel */
-			struct ip6t_target *target;
-		} kernel;
-
-		/* Total length */
-		u_int16_t target_size;
-	} u;
-
-	unsigned char data[0];
-};
-
-struct ip6t_standard_target
-{
-	struct ip6t_entry_target target;
-	int verdict;
-};
+#define ip6t_entry_match xt_entry_match
+#define ip6t_entry_target xt_entry_target
+#define ip6t_standard_target xt_standard_target
 
 #define ip6t_counters	xt_counters
 
diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h
index 02eccebd55ae..cb37ad08427f 100644
--- a/include/net/tc_act/tc_ipt.h
+++ b/include/net/tc_act/tc_ipt.h
@@ -3,14 +3,14 @@
 
 #include <net/act_api.h>
 
-struct ipt_entry_target;
+struct xt_entry_target;
 
 struct tcf_ipt
 {
 	tca_gen(ipt);
 	u32 hook;
 	char *tname;
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 };
 
 #endif
-- 
cgit v1.2.3


From a5cdc030030ef5a16e48aebeb65067bdc3120899 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 23 Mar 2006 01:16:06 -0800
Subject: [IPV4]: Add fib rule netlink notifications

To really make sense of route notifications in the presence of
multiple tables, userspace also needs to be notified about routing
rule updates.  Notifications are sent to the so far unused
RTNLGRP_NOP1 (now RTNLGRP_RULE) group.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |  4 +++-
 net/ipv4/fib_rules.c      | 27 ++++++++++++++++++++++++---
 2 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index d572d5376319..df0cdd41085c 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -839,6 +839,7 @@ enum
 #define RTMGRP_IPV4_IFADDR	0x10
 #define RTMGRP_IPV4_MROUTE	0x20
 #define RTMGRP_IPV4_ROUTE	0x40
+#define RTMGRP_IPV4_RULE	0x80
 
 #define RTMGRP_IPV6_IFADDR	0x100
 #define RTMGRP_IPV6_MROUTE	0x200
@@ -869,7 +870,8 @@ enum rtnetlink_groups {
 #define	RTNLGRP_IPV4_MROUTE	RTNLGRP_IPV4_MROUTE
 	RTNLGRP_IPV4_ROUTE,
 #define RTNLGRP_IPV4_ROUTE	RTNLGRP_IPV4_ROUTE
-	RTNLGRP_NOP1,
+	RTNLGRP_IPV4_RULE,
+#define RTNLGRP_IPV4_RULE	RTNLGRP_IPV4_RULE
 	RTNLGRP_IPV6_IFADDR,
 #define RTNLGRP_IPV6_IFADDR	RTNLGRP_IPV6_IFADDR
 	RTNLGRP_IPV6_MROUTE,
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 768e8f5d7daa..ec566f3e66c7 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -104,6 +104,8 @@ static struct hlist_head fib_rules;
 
 /* writer func called from netlink -- rtnl_sem hold*/
 
+static void rtmsg_rule(int, struct fib_rule *);
+
 int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
 	struct rtattr **rta = arg;
@@ -131,6 +133,7 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 			hlist_del_rcu(&r->hlist);
 			r->r_dead = 1;
+			rtmsg_rule(RTM_DELRULE, r);
 			fib_rule_put(r);
 			err = 0;
 			break;
@@ -253,6 +256,7 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	else
 		hlist_add_before_rcu(&new_r->hlist, &r->hlist);
 
+	rtmsg_rule(RTM_NEWRULE, new_r);
 	return 0;
 }
 
@@ -382,14 +386,14 @@ static struct notifier_block fib_rules_notifier = {
 
 static __inline__ int inet_fill_rule(struct sk_buff *skb,
 				     struct fib_rule *r,
-				     struct netlink_callback *cb,
+				     u32 pid, u32 seq, int event,
 				     unsigned int flags)
 {
 	struct rtmsg *rtm;
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
 	rtm = NLMSG_DATA(nlh);
 	rtm->rtm_family = AF_INET;
 	rtm->rtm_dst_len = r->r_dst_len;
@@ -430,6 +434,21 @@ rtattr_failure:
 
 /* callers should hold rtnl semaphore */
 
+static void rtmsg_rule(int event, struct fib_rule *r)
+{
+	int size = NLMSG_SPACE(sizeof(struct rtmsg) + 128);
+	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
+
+	if (!skb)
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_RULE, ENOBUFS);
+	else if (inet_fill_rule(skb, r, 0, 0, event, 0) < 0) {
+		kfree_skb(skb);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_RULE, EINVAL);
+	} else {
+		netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_RULE, GFP_KERNEL);
+	}
+}
+
 int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int idx = 0;
@@ -442,7 +461,9 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
 
 		if (idx < s_idx)
 			continue;
-		if (inet_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
+		if (inet_fill_rule(skb, r, NETLINK_CB(cb->skb).pid,
+				   cb->nlh->nlmsg_seq,
+				   RTM_NEWRULE, NLM_F_MULTI) < 0)
 			break;
 		idx++;
 	}
-- 
cgit v1.2.3


From af36e6b6d7f4ad7a5ccfd14dfa71ec941255f93d Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Thu, 23 Mar 2006 01:28:06 -0800
Subject: [TG3]: Add 5755 support

Add support for new chip 5755 which is very similar to 5787.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 37 +++++++++++++++++++++++++++++++------
 drivers/net/tg3.h       |  6 +++++-
 include/linux/pci_ids.h |  2 ++
 3 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 88829eb9568e..c69c8df088d0 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -225,6 +225,10 @@ static struct pci_device_id tg3_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5754M,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5755,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5755M,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787M,
@@ -4557,6 +4561,7 @@ static int tg3_chip_reset(struct tg3 *tp)
 	}
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
 		tw32(GRC_FASTBOOT_PC, 0);
 
@@ -6152,6 +6157,9 @@ static int tg3_reset_hw(struct tg3 *tp)
 			gpio_mask |= GRC_LCLCTRL_GPIO_OE3 |
 				     GRC_LCLCTRL_GPIO_OUTPUT3;
 
+		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755)
+			gpio_mask |= GRC_LCLCTRL_GPIO_UART_SEL;
+
 		tp->grc_local_ctrl |= tr32(GRC_LOCAL_CTRL) & gpio_mask;
 
 		/* GPIO1 must be driven high for eeprom write protect */
@@ -6191,7 +6199,8 @@ static int tg3_reset_hw(struct tg3 *tp)
 	}
 
 	/* Enable host coalescing bug fix */
-	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
+	if ((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755) ||
+	    (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787))
 		val |= (1 << 29);
 
 	tw32_f(WDMAC_MODE, val);
@@ -6249,6 +6258,9 @@ static int tg3_reset_hw(struct tg3 *tp)
 	udelay(100);
 
 	tp->rx_mode = RX_MODE_ENABLE;
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755)
+		tp->rx_mode |= RX_MODE_IPV6_CSUM_ENABLE;
+
 	tw32_f(MAC_RX_MODE, tp->rx_mode);
 	udelay(10);
 
@@ -7907,7 +7919,8 @@ static int tg3_set_tx_csum(struct net_device *dev, u32 data)
   		return 0;
   	}
   
-	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
 		ethtool_op_set_tx_hw_csum(dev, data);
 	else
 		ethtool_op_set_tx_csum(dev, data);
@@ -8332,7 +8345,8 @@ static int tg3_test_memory(struct tg3 *tp)
 	int i;
 
 	if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) {
-		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
+		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
+		    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
 			mem_tbl = mem_tbl_5755;
 		else
 			mem_tbl = mem_tbl_5705;
@@ -9310,6 +9324,7 @@ static int tg3_nvram_write_block_buffered(struct tg3 *tp, u32 offset, u32 len,
 			nvram_cmd |= NVRAM_CMD_LAST;
 
 		if ((GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5752) &&
+		    (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5755) &&
 		    (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5787) &&
 		    (tp->nvram_jedecnum == JEDEC_ST) &&
 		    (nvram_cmd & NVRAM_CMD_FIRST)) {
@@ -10044,6 +10059,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5750 ||
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
 	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787 ||
 	    (tp->tg3_flags2 & TG3_FLG2_5780_CLASS))
 		tp->tg3_flags2 |= TG3_FLG2_5750_PLUS;
@@ -10053,7 +10069,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		tp->tg3_flags2 |= TG3_FLG2_5705_PLUS;
 
 	if (tp->tg3_flags2 & TG3_FLG2_5750_PLUS) {
-		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787) {
+		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
+		    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787) {
 			tp->tg3_flags2 |= TG3_FLG2_HW_TSO_2;
 			tp->tg3_flags2 |= TG3_FLG2_1SHOT_MSI;
 		} else
@@ -10063,6 +10080,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5705 &&
 	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5750 &&
 	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5752 &&
+	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5755 &&
 	    GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5787)
 		tp->tg3_flags2 |= TG3_FLG2_JUMBO_CAPABLE;
 
@@ -10219,6 +10237,9 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	else if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752)
 		tp->grc_local_ctrl |= GRC_LCLCTRL_GPIO_OE3;
 
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755)
+		tp->grc_local_ctrl |= GRC_LCLCTRL_GPIO_UART_SEL;
+
 	/* Force the chip into D0. */
 	err = tg3_set_power_state(tp, PCI_D0);
 	if (err) {
@@ -10274,6 +10295,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		tp->tg3_flags2 |= TG3_FLG2_PHY_5704_A0_BUG;
 
 	if ((tp->tg3_flags2 & TG3_FLG2_5705_PLUS) &&
+	    (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5755) &&
 	    (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5787))
 		tp->tg3_flags2 |= TG3_FLG2_PHY_BER_BUG;
 
@@ -10413,7 +10435,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	/* All chips before 5787 can get confused if TX buffers
 	 * straddle the 4GB address boundary in some cases.
 	 */
-	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
 		tp->dev->hard_start_xmit = tg3_start_xmit;
 	else
 		tp->dev->hard_start_xmit = tg3_start_xmit_dma_bug;
@@ -11002,6 +11025,7 @@ static char * __devinit tg3_phy_string(struct tg3 *tp)
 	case PHY_ID_BCM5752:	return "5752";
 	case PHY_ID_BCM5714:	return "5714";
 	case PHY_ID_BCM5780:	return "5780";
+	case PHY_ID_BCM5755:	return "5755";
 	case PHY_ID_BCM5787:	return "5787";
 	case PHY_ID_BCM8002:	return "8002/serdes";
 	case 0:			return "serdes";
@@ -11350,7 +11374,8 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 	 * checksumming.
 	 */
 	if ((tp->tg3_flags & TG3_FLAG_BROKEN_CHECKSUMS) == 0) {
-		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
+		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
+		    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787)
 			dev->features |= NETIF_F_HW_CSUM;
 		else
 			dev->features |= NETIF_F_IP_CSUM;
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index baa34c4721db..672f375ef711 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -138,6 +138,7 @@
 #define   ASIC_REV_5752			 0x06
 #define   ASIC_REV_5780			 0x08
 #define   ASIC_REV_5714			 0x09
+#define   ASIC_REV_5755			 0x0a
 #define   ASIC_REV_5787			 0x0b
 #define  GET_CHIP_REV(CHIP_REV_ID)	((CHIP_REV_ID) >> 8)
 #define   CHIPREV_5700_AX		 0x70
@@ -456,6 +457,7 @@
 #define  RX_MODE_PROMISC		 0x00000100
 #define  RX_MODE_NO_CRC_CHECK		 0x00000200
 #define  RX_MODE_KEEP_VLAN_TAG		 0x00000400
+#define  RX_MODE_IPV6_CSUM_ENABLE	 0x01000000
 #define MAC_RX_STATUS			0x0000046c
 #define  RX_STATUS_REMOTE_TX_XOFFED	 0x00000001
 #define  RX_STATUS_XOFF_RCVD		 0x00000002
@@ -1340,6 +1342,7 @@
 #define  GRC_LCLCTRL_CLEARINT		0x00000002
 #define  GRC_LCLCTRL_SETINT		0x00000004
 #define  GRC_LCLCTRL_INT_ON_ATTN	0x00000008
+#define  GRC_LCLCTRL_GPIO_UART_SEL	0x00000010	/* 5755 only */
 #define  GRC_LCLCTRL_USE_SIG_DETECT	0x00000010	/* 5714/5780 only */
 #define  GRC_LCLCTRL_USE_EXT_SIG_DETECT	0x00000020	/* 5714/5780 only */
 #define  GRC_LCLCTRL_GPIO_INPUT3	0x00000020
@@ -2259,6 +2262,7 @@ struct tg3 {
 #define PHY_ID_BCM5752			0x60008100
 #define PHY_ID_BCM5714			0x60008340
 #define PHY_ID_BCM5780			0x60008350
+#define PHY_ID_BCM5755			0xbc050cc0
 #define PHY_ID_BCM5787			0xbc050ce0
 #define PHY_ID_BCM8002			0x60010140
 #define PHY_ID_INVALID			0xffffffff
@@ -2286,7 +2290,7 @@ struct tg3 {
 	 (X) == PHY_ID_BCM5705 || (X) == PHY_ID_BCM5750 || \
 	 (X) == PHY_ID_BCM5752 || (X) == PHY_ID_BCM5714 || \
 	 (X) == PHY_ID_BCM5780 || (X) == PHY_ID_BCM5787 || \
-	 (X) == PHY_ID_BCM8002)
+	 (X) == PHY_ID_BCM5755 || (X) == PHY_ID_BCM8002)
 
 	struct tg3_hw_stats		*hw_stats;
 	dma_addr_t			stats_mapping;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index ec3c32932620..989a9d00dec1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1864,11 +1864,13 @@
 #define PCI_DEVICE_ID_TIGON3_5780S	0x166b
 #define PCI_DEVICE_ID_TIGON3_5705F	0x166e
 #define PCI_DEVICE_ID_TIGON3_5754M	0x1672
+#define PCI_DEVICE_ID_TIGON3_5755M	0x1673
 #define PCI_DEVICE_ID_TIGON3_5750	0x1676
 #define PCI_DEVICE_ID_TIGON3_5751	0x1677
 #define PCI_DEVICE_ID_TIGON3_5715	0x1678
 #define PCI_DEVICE_ID_TIGON3_5715S	0x1679
 #define PCI_DEVICE_ID_TIGON3_5754	0x167a
+#define PCI_DEVICE_ID_TIGON3_5755	0x167b
 #define PCI_DEVICE_ID_TIGON3_5750M	0x167c
 #define PCI_DEVICE_ID_TIGON3_5751M	0x167d
 #define PCI_DEVICE_ID_TIGON3_5751F	0x167e
-- 
cgit v1.2.3


From 711e2c33ac9221a419a9e28d05dd78a6a9c5fd4d Mon Sep 17 00:00:00 2001
From: Jean Tourrilhes <jt@hpl.hp.com>
Date: Wed, 22 Feb 2006 15:10:56 -0800
Subject: [PATCH] WE-20 for kernel 2.6.16

	This is version 20 of the Wireless Extensions. This is the
completion of the RtNetlink work I started early 2004, it enables the
full Wireless Extension API over RtNetlink.

	Few comments on the patch :
	o totally driver transparent, no change in drivers needed.
	o iwevent were already RtNetlink based since they were created
(around 2.5.7). This adds all the regular SET and GET requests over
RtNetlink, using the exact same mechanism and data format as iwevents.
	o This is a Kconfig option, as currently most people have no
need for it. Surprisingly, patch is actually small and well
encapsulated.
	o Tested on SMP, attention as been paid to make it 64 bits clean.
	o Code do probably too many checks and could be further
optimised, but better safe than sorry.
	o RtNetlink based version of the Wireless Tools available on
my web page for people inclined to try out this stuff.

	I would also like to thank Alexey Kuznetsov for his helpful
suggestions to make this patch better.

Signed-off-by: Jean Tourrilhes <jt@hpl.hp.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/Kconfig |   9 +
 include/linux/wireless.h     |  10 +-
 include/net/iw_handler.h     |  12 +-
 net/core/rtnetlink.c         |  98 ++++-
 net/core/wireless.c          | 911 ++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 947 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/Kconfig b/drivers/net/wireless/Kconfig
index 5b0a19a5058d..6a1033ec06cf 100644
--- a/drivers/net/wireless/Kconfig
+++ b/drivers/net/wireless/Kconfig
@@ -25,6 +25,15 @@ config NET_RADIO
 	  the tools from
 	  <http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Tools.html>.
 
+config NET_WIRELESS_RTNETLINK
+	bool "Wireless Extension API over RtNetlink"
+	---help---
+	  Support the Wireless Extension API over the RtNetlink socket
+	  in addition to the traditional ioctl interface (selected above).
+
+	  For now, few tools use this facility, but it might grow in the
+	  future. The only downside is that it adds 4.5 kB to your kernel.
+
 # Note : the cards are obsolete (can't buy them anymore), but the drivers
 # are not, as people are still using them...
 comment "Obsolete Wireless cards support (pre-802.11)"
diff --git a/include/linux/wireless.h b/include/linux/wireless.h
index a555a0f7a7b4..13588564b42b 100644
--- a/include/linux/wireless.h
+++ b/include/linux/wireless.h
@@ -1,10 +1,10 @@
 /*
  * This file define a set of standard wireless extensions
  *
- * Version :	19	18.3.05
+ * Version :	20	17.2.06
  *
  * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 1997-2005 Jean Tourrilhes, All Rights Reserved.
+ * Copyright (c) 1997-2006 Jean Tourrilhes, All Rights Reserved.
  */
 
 #ifndef _LINUX_WIRELESS_H
@@ -80,7 +80,7 @@
  * (there is some stuff that will be added in the future...)
  * I just plan to increment with each new version.
  */
-#define WIRELESS_EXT	19
+#define WIRELESS_EXT	20
 
 /*
  * Changes :
@@ -204,6 +204,10 @@
  *	- Add IW_QUAL_ALL_UPDATED and IW_QUAL_ALL_INVALID macros
  *	- Add explicit flag to tell stats are in dBm : IW_QUAL_DBM
  *	- Add IW_IOCTL_IDX() and IW_EVENT_IDX() macros
+ *
+ * V19 to V20
+ * ----------
+ *	- RtNetlink requests support (SET/GET)
  */
 
 /**************************** CONSTANTS ****************************/
diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h
index a2c5e0b88422..10559e937d27 100644
--- a/include/net/iw_handler.h
+++ b/include/net/iw_handler.h
@@ -4,7 +4,7 @@
  * Version :	7	18.3.05
  *
  * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 2001-2005 Jean Tourrilhes, All Rights Reserved.
+ * Copyright (c) 2001-2006 Jean Tourrilhes, All Rights Reserved.
  */
 
 #ifndef _IW_HANDLER_H
@@ -436,6 +436,16 @@ extern int dev_get_wireless_info(char * buffer, char **start, off_t offset,
 /* Handle IOCTLs, called in net/core/dev.c */
 extern int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd);
 
+/* Handle RtNetlink requests, called in net/core/rtnetlink.c */
+extern int wireless_rtnetlink_set(struct net_device *	dev,
+				  char *		data,
+				  int			len);
+extern int wireless_rtnetlink_get(struct net_device *	dev,
+				  char *		data,
+				  int			len,
+				  char **		p_buf,
+				  int *			p_len);
+
 /* Second : functions that may be called by driver modules */
 
 /* Send a single event to user space */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ae10d3740faa..3fcfa9c59e1f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -51,6 +51,10 @@
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/netlink.h>
+#ifdef CONFIG_NET_WIRELESS_RTNETLINK
+#include <linux/wireless.h>
+#include <net/iw_handler.h>
+#endif	/* CONFIG_NET_WIRELESS_RTNETLINK */
 
 static DEFINE_MUTEX(rtnl_mutex);
 
@@ -467,6 +471,17 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 			goto out;
 	}
 
+#ifdef CONFIG_NET_WIRELESS_RTNETLINK
+	if (ida[IFLA_WIRELESS - 1]) {
+
+		/* Call Wireless Extensions.
+		 * Various stuff checked in there... */
+		err = wireless_rtnetlink_set(dev, RTA_DATA(ida[IFLA_WIRELESS - 1]), ida[IFLA_WIRELESS - 1]->rta_len);
+		if (err)
+			goto out;
+	}
+#endif	/* CONFIG_NET_WIRELESS_RTNETLINK */
+
 	err = 0;
 
 out:
@@ -477,6 +492,83 @@ out:
 	return err;
 }
 
+#ifdef CONFIG_NET_WIRELESS_RTNETLINK
+static int do_getlink(struct sk_buff *in_skb, struct nlmsghdr* in_nlh, void *arg)
+{
+	struct ifinfomsg  *ifm = NLMSG_DATA(in_nlh);
+	struct rtattr    **ida = arg;
+	struct net_device *dev;
+	struct ifinfomsg *r;
+	struct nlmsghdr  *nlh;
+	int err = -ENOBUFS;
+	struct sk_buff *skb;
+	unsigned char	 *b;
+	char *iw_buf = NULL;
+	int iw_buf_len = 0;
+
+	if (ifm->ifi_index >= 0)
+		dev = dev_get_by_index(ifm->ifi_index);
+	else
+		return -EINVAL;
+	if (!dev)
+		return -ENODEV;
+
+#ifdef CONFIG_NET_WIRELESS_RTNETLINK
+	if (ida[IFLA_WIRELESS - 1]) {
+
+		/* Call Wireless Extensions. We need to know the size before
+		 * we can alloc. Various stuff checked in there... */
+		err = wireless_rtnetlink_get(dev, RTA_DATA(ida[IFLA_WIRELESS - 1]), ida[IFLA_WIRELESS - 1]->rta_len, &iw_buf, &iw_buf_len);
+		if (err)
+			goto out;
+	}
+#endif	/* CONFIG_NET_WIRELESS_RTNETLINK */
+
+	/* Create a skb big enough to include all the data.
+	 * Some requests are way bigger than 4k... Jean II */
+	skb = alloc_skb((NLMSG_LENGTH(sizeof(*r))) + (RTA_SPACE(iw_buf_len)),
+			GFP_KERNEL);
+	if (!skb)
+		goto out;
+	b = skb->tail;
+
+	/* Put in the message the usual good stuff */
+	nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, in_nlh->nlmsg_seq,
+			RTM_NEWLINK, sizeof(*r));
+	r = NLMSG_DATA(nlh);
+	r->ifi_family = AF_UNSPEC;
+	r->__ifi_pad = 0;
+	r->ifi_type = dev->type;
+	r->ifi_index = dev->ifindex;
+	r->ifi_flags = dev->flags;
+	r->ifi_change = 0;
+
+	/* Put the wireless payload if it exist */
+	if(iw_buf != NULL)
+		RTA_PUT(skb, IFLA_WIRELESS, iw_buf_len,
+			iw_buf + IW_EV_POINT_OFF);
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	/* Needed ? */
+	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
+
+	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+out:
+	if(iw_buf != NULL)
+		kfree(iw_buf);
+	dev_put(dev);
+	return err;
+
+rtattr_failure:
+nlmsg_failure:
+	kfree_skb(skb);
+	goto out;
+}
+#endif	/* CONFIG_NET_WIRELESS_RTNETLINK */
+
 static int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int idx;
@@ -642,7 +734,11 @@ static void rtnetlink_rcv(struct sock *sk, int len)
 
 static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
 {
-	[RTM_GETLINK     - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
+	[RTM_GETLINK     - RTM_BASE] = {
+#ifdef CONFIG_NET_WIRELESS_RTNETLINK
+					 .doit   = do_getlink,
+#endif	/* CONFIG_NET_WIRELESS_RTNETLINK */
+					 .dumpit = rtnetlink_dump_ifinfo },
 	[RTM_SETLINK     - RTM_BASE] = { .doit   = do_setlink		 },
 	[RTM_GETADDR     - RTM_BASE] = { .dumpit = rtnetlink_dump_all	 },
 	[RTM_GETROUTE    - RTM_BASE] = { .dumpit = rtnetlink_dump_all	 },
diff --git a/net/core/wireless.c b/net/core/wireless.c
index 2add7ed609e9..81d6995fcfdb 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -2,7 +2,7 @@
  * This file implement the Wireless Extensions APIs.
  *
  * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 1997-2005 Jean Tourrilhes, All Rights Reserved.
+ * Copyright (c) 1997-2006 Jean Tourrilhes, All Rights Reserved.
  *
  * (As all part of the Linux kernel, this file is GPL)
  */
@@ -65,6 +65,9 @@
  *	o Start deprecating dev->get_wireless_stats, output a warning
  *	o If IW_QUAL_DBM is set, show dBm values in /proc/net/wireless
  *	o Don't loose INVALID/DBM flags when clearing UPDATED flags (iwstats)
+ *
+ * v8 - 17.02.06 - Jean II
+ *	o RtNetlink requests support (SET/GET)
  */
 
 /***************************** INCLUDES *****************************/
@@ -89,11 +92,13 @@
 
 /* Debugging stuff */
 #undef WE_IOCTL_DEBUG		/* Debug IOCTL API */
+#undef WE_RTNETLINK_DEBUG	/* Debug RtNetlink API */
 #undef WE_EVENT_DEBUG		/* Debug Event dispatcher */
 #undef WE_SPY_DEBUG		/* Debug enhanced spy support */
 
 /* Options */
-#define WE_EVENT_NETLINK	/* Propagate events using rtnetlink */
+//CONFIG_NET_WIRELESS_RTNETLINK	/* Wireless requests over RtNetlink */
+#define WE_EVENT_RTNETLINK	/* Propagate events using RtNetlink */
 #define WE_SET_EVENT		/* Generate an event on some set commands */
 
 /************************* GLOBAL VARIABLES *************************/
@@ -156,13 +161,18 @@ static const struct iw_ioctl_description standard_ioctl[] = {
 		.header_type	= IW_HEADER_TYPE_NULL,
 	},
 	[SIOCGIWPRIV	- SIOCIWFIRST] = { /* (handled directly by us) */
-		.header_type	= IW_HEADER_TYPE_NULL,
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= sizeof(struct iw_priv_args),
+		.max_tokens	= 16,
+		.flags		= IW_DESCR_FLAG_NOMAX,
 	},
 	[SIOCSIWSTATS	- SIOCIWFIRST] = {
 		.header_type	= IW_HEADER_TYPE_NULL,
 	},
 	[SIOCGIWSTATS	- SIOCIWFIRST] = { /* (handled directly by us) */
-		.header_type	= IW_HEADER_TYPE_NULL,
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= sizeof(struct iw_statistics),
 		.flags		= IW_DESCR_FLAG_DUMP,
 	},
 	[SIOCSIWSPY	- SIOCIWFIRST] = {
@@ -529,6 +539,70 @@ static inline int adjust_priv_size(__u16		args,
 	return num * iw_priv_type_size[type];
 }
 
+/* ---------------------------------------------------------------- */
+/*
+ * Standard Wireless Handler : get wireless stats
+ *	Allow programatic access to /proc/net/wireless even if /proc
+ *	doesn't exist... Also more efficient...
+ */
+static int iw_handler_get_iwstats(struct net_device *		dev,
+				  struct iw_request_info *	info,
+				  union iwreq_data *		wrqu,
+				  char *			extra)
+{
+	/* Get stats from the driver */
+	struct iw_statistics *stats;
+
+	stats = get_wireless_stats(dev);
+	if (stats != (struct iw_statistics *) NULL) {
+
+		/* Copy statistics to extra */
+		memcpy(extra, stats, sizeof(struct iw_statistics));
+		wrqu->data.length = sizeof(struct iw_statistics);
+
+		/* Check if we need to clear the updated flag */
+		if(wrqu->data.flags != 0)
+			stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
+		return 0;
+	} else
+		return -EOPNOTSUPP;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Standard Wireless Handler : get iwpriv definitions
+ * Export the driver private handler definition
+ * They will be picked up by tools like iwpriv...
+ */
+static int iw_handler_get_private(struct net_device *		dev,
+				  struct iw_request_info *	info,
+				  union iwreq_data *		wrqu,
+				  char *			extra)
+{
+	/* Check if the driver has something to export */
+	if((dev->wireless_handlers->num_private_args == 0) ||
+	   (dev->wireless_handlers->private_args == NULL))
+		return -EOPNOTSUPP;
+
+	/* Check if there is enough buffer up there */
+	if(wrqu->data.length < dev->wireless_handlers->num_private_args) {
+		/* User space can't know in advance how large the buffer
+		 * needs to be. Give it a hint, so that we can support
+		 * any size buffer we want somewhat efficiently... */
+		wrqu->data.length = dev->wireless_handlers->num_private_args;
+		return -E2BIG;
+	}
+
+	/* Set the number of available ioctls. */
+	wrqu->data.length = dev->wireless_handlers->num_private_args;
+
+	/* Copy structure to the user buffer. */
+	memcpy(extra, dev->wireless_handlers->private_args,
+	       sizeof(struct iw_priv_args) * wrqu->data.length);
+
+	return 0;
+}
+
 
 /******************** /proc/net/wireless SUPPORT ********************/
 /*
@@ -628,83 +702,16 @@ int __init wireless_proc_init(void)
  * or just call the driver ioctl handler.
  */
 
-/* ---------------------------------------------------------------- */
-/*
- *	Allow programatic access to /proc/net/wireless even if /proc
- *	doesn't exist... Also more efficient...
- */
-static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr)
-{
-	/* Get stats from the driver */
-	struct iw_statistics *stats;
-
-	stats = get_wireless_stats(dev);
-	if (stats != (struct iw_statistics *) NULL) {
-		struct iwreq *	wrq = (struct iwreq *)ifr;
-
-		/* Copy statistics to the user buffer */
-		if(copy_to_user(wrq->u.data.pointer, stats,
-				sizeof(struct iw_statistics)))
-			return -EFAULT;
-
-		/* Check if we need to clear the updated flag */
-		if(wrq->u.data.flags != 0)
-			stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
-		return 0;
-	} else
-		return -EOPNOTSUPP;
-}
-
-/* ---------------------------------------------------------------- */
-/*
- * Export the driver private handler definition
- * They will be picked up by tools like iwpriv...
- */
-static inline int ioctl_export_private(struct net_device *	dev,
-				       struct ifreq *		ifr)
-{
-	struct iwreq *				iwr = (struct iwreq *) ifr;
-
-	/* Check if the driver has something to export */
-	if((dev->wireless_handlers->num_private_args == 0) ||
-	   (dev->wireless_handlers->private_args == NULL))
-		return -EOPNOTSUPP;
-
-	/* Check NULL pointer */
-	if(iwr->u.data.pointer == NULL)
-		return -EFAULT;
-
-	/* Check if there is enough buffer up there */
-	if(iwr->u.data.length < dev->wireless_handlers->num_private_args) {
-		/* User space can't know in advance how large the buffer
-		 * needs to be. Give it a hint, so that we can support
-		 * any size buffer we want somewhat efficiently... */
-		iwr->u.data.length = dev->wireless_handlers->num_private_args;
-		return -E2BIG;
-	}
-
-	/* Set the number of available ioctls. */
-	iwr->u.data.length = dev->wireless_handlers->num_private_args;
-
-	/* Copy structure to the user buffer. */
-	if (copy_to_user(iwr->u.data.pointer,
-			 dev->wireless_handlers->private_args,
-			 sizeof(struct iw_priv_args) * iwr->u.data.length))
-		return -EFAULT;
-
-	return 0;
-}
-
 /* ---------------------------------------------------------------- */
 /*
  * Wrapper to call a standard Wireless Extension handler.
  * We do various checks and also take care of moving data between
  * user space and kernel space.
  */
-static inline int ioctl_standard_call(struct net_device *	dev,
-				      struct ifreq *		ifr,
-				      unsigned int		cmd,
-				      iw_handler		handler)
+static int ioctl_standard_call(struct net_device *	dev,
+			       struct ifreq *		ifr,
+			       unsigned int		cmd,
+			       iw_handler		handler)
 {
 	struct iwreq *				iwr = (struct iwreq *) ifr;
 	const struct iw_ioctl_description *	descr;
@@ -1048,14 +1055,20 @@ int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd)
 	{
 		case SIOCGIWSTATS:
 			/* Get Wireless Stats */
-			return dev_iwstats(dev, ifr);
+			return ioctl_standard_call(dev,
+						   ifr,
+						   cmd,
+						   &iw_handler_get_iwstats);
 
 		case SIOCGIWPRIV:
 			/* Check if we have some wireless handlers defined */
 			if(dev->wireless_handlers != NULL) {
 				/* We export to user space the definition of
 				 * the private handler ourselves */
-				return ioctl_export_private(dev, ifr);
+				return ioctl_standard_call(dev,
+							   ifr,
+							   cmd,
+							   &iw_handler_get_private);
 			}
 			// ## Fall-through for old API ##
 		default:
@@ -1088,16 +1101,739 @@ int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd)
 	return -EINVAL;
 }
 
+/********************** RTNETLINK REQUEST API **********************/
+/*
+ * The alternate user space API to configure all those Wireless Extensions
+ * is through RtNetlink.
+ * This API support only the new driver API (iw_handler).
+ *
+ * This RtNetlink API use the same query/reply model as the ioctl API.
+ * Maximum effort has been done to fit in the RtNetlink model, and
+ * we support both RtNetlink Set and RtNelink Get operations.
+ * On the other hand, we don't offer Dump operations because of the
+ * following reasons :
+ *	o Large number of parameters, most optional
+ *	o Large size of some parameters (> 100 bytes)
+ *	o Each parameters need to be extracted from hardware
+ *	o Scan requests can take seconds and disable network activity.
+ * Because of this high cost/overhead, we want to return only the
+ * parameters the user application is really interested in.
+ * We could offer partial Dump using the IW_DESCR_FLAG_DUMP flag.
+ *
+ * The API uses the standard RtNetlink socket. When the RtNetlink code
+ * find a IFLA_WIRELESS field in a RtNetlink SET_LINK request,
+ * it calls here.
+ */
+
+#ifdef CONFIG_NET_WIRELESS_RTNETLINK
+/* ---------------------------------------------------------------- */
+/*
+ * Wrapper to call a standard Wireless Extension GET handler.
+ * We do various checks and call the handler with the proper args.
+ */
+static int rtnetlink_standard_get(struct net_device *	dev,
+				  struct iw_event *	request,
+				  int			request_len,
+				  iw_handler		handler,
+				  char **		p_buf,
+				  int *			p_len)
+{
+	const struct iw_ioctl_description *	descr = NULL;
+	unsigned int				cmd;
+	union iwreq_data *			wrqu;
+	int					hdr_len;
+	struct iw_request_info			info;
+	char *					buffer = NULL;
+	int					buffer_size = 0;
+	int					ret = -EINVAL;
+
+	/* Get the description of the Request */
+	cmd = request->cmd;
+	if((cmd - SIOCIWFIRST) >= standard_ioctl_num)
+		return -EOPNOTSUPP;
+	descr = &(standard_ioctl[cmd - SIOCIWFIRST]);
+
+#ifdef WE_RTNETLINK_DEBUG
+	printk(KERN_DEBUG "%s (WE.r) : Found standard handler for 0x%04X\n",
+	       dev->name, cmd);
+	printk(KERN_DEBUG "%s (WE.r) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+	/* Check if wrqu is complete */
+	hdr_len = event_type_size[descr->header_type];
+	if(request_len < hdr_len) {
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG
+		       "%s (WE.r) : Wireless request too short (%d)\n",
+		       dev->name, request_len);
+#endif	/* WE_RTNETLINK_DEBUG */
+		return -EINVAL;
+	}
+
+	/* Prepare the call */
+	info.cmd = cmd;
+	info.flags = 0;
+
+	/* Check if we have extra data in the reply or not */
+	if(descr->header_type != IW_HEADER_TYPE_POINT) {
+
+		/* Create the kernel buffer that we will return.
+		 * It's at an offset to match the TYPE_POINT case... */
+		buffer_size = request_len + IW_EV_POINT_OFF;
+		buffer = kmalloc(buffer_size, GFP_KERNEL);
+		if (buffer == NULL) {
+			return -ENOMEM;
+		}
+		/* Copy event data */
+		memcpy(buffer + IW_EV_POINT_OFF, request, request_len);
+		/* Use our own copy of wrqu */
+		wrqu = (union iwreq_data *) (buffer + IW_EV_POINT_OFF
+					     + IW_EV_LCP_LEN);
+
+		/* No extra arguments. Trivial to handle */
+		ret = handler(dev, &info, wrqu, NULL);
+
+	} else {
+		union iwreq_data	wrqu_point;
+		char *			extra = NULL;
+		int			extra_size = 0;
+
+		/* Get a temp copy of wrqu (skip pointer) */
+		memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF,
+		       ((char *) request) + IW_EV_LCP_LEN,
+		       IW_EV_POINT_LEN - IW_EV_LCP_LEN);
+
+		/* Calculate space needed by arguments. Always allocate
+		 * for max space. Easier, and won't last long... */
+		extra_size = descr->max_tokens * descr->token_size;
+		/* Support for very large requests */
+		if((descr->flags & IW_DESCR_FLAG_NOMAX) &&
+		   (wrqu_point.data.length > descr->max_tokens))
+			extra_size = (wrqu_point.data.length
+				      * descr->token_size);
+		buffer_size = extra_size + IW_EV_POINT_LEN + IW_EV_POINT_OFF;
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes (%d bytes)\n",
+		       dev->name, extra_size, buffer_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+		/* Create the kernel buffer that we will return */
+		buffer = kmalloc(buffer_size, GFP_KERNEL);
+		if (buffer == NULL) {
+			return -ENOMEM;
+		}
+
+		/* Put wrqu in the right place (just before extra).
+		 * Leave space for IWE header and dummy pointer...
+		 * Note that IW_EV_LCP_LEN==4 bytes, so it's still aligned...
+		 */
+		memcpy(buffer + IW_EV_LCP_LEN + IW_EV_POINT_OFF,
+		       ((char *) &wrqu_point) + IW_EV_POINT_OFF,
+		       IW_EV_POINT_LEN - IW_EV_LCP_LEN);
+		wrqu = (union iwreq_data *) (buffer + IW_EV_LCP_LEN);
+
+		/* Extra comes logically after that. Offset +12 bytes. */
+		extra = buffer + IW_EV_POINT_OFF + IW_EV_POINT_LEN;
+
+		/* Call the handler */
+		ret = handler(dev, &info, wrqu, extra);
+
+		/* Calculate real returned length */
+		extra_size = (wrqu->data.length * descr->token_size);
+		/* Re-adjust reply size */
+		request->len = extra_size + IW_EV_POINT_LEN;
+
+		/* Put the iwe header where it should, i.e. scrap the
+		 * dummy pointer. */
+		memcpy(buffer + IW_EV_POINT_OFF, request, IW_EV_LCP_LEN);
+
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG "%s (WE.r) : Reply 0x%04X, hdr_len %d, tokens %d, extra_size %d, buffer_size %d\n", dev->name, cmd, hdr_len, wrqu->data.length, extra_size, buffer_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+		/* Check if there is enough buffer up there */
+		if(wrqu_point.data.length < wrqu->data.length)
+			ret = -E2BIG;
+	}
+
+	/* Return the buffer to the caller */
+	if (!ret) {
+		*p_buf = buffer;
+		*p_len = request->len;
+	} else {
+		/* Cleanup */
+		if(buffer)
+			kfree(buffer);
+	}
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Wrapper to call a standard Wireless Extension SET handler.
+ * We do various checks and call the handler with the proper args.
+ */
+static inline int rtnetlink_standard_set(struct net_device *	dev,
+					 struct iw_event *	request,
+					 int			request_len,
+					 iw_handler		handler)
+{
+	const struct iw_ioctl_description *	descr = NULL;
+	unsigned int				cmd;
+	union iwreq_data *			wrqu;
+	union iwreq_data			wrqu_point;
+	int					hdr_len;
+	char *					extra = NULL;
+	int					extra_size = 0;
+	struct iw_request_info			info;
+	int					ret = -EINVAL;
+
+	/* Get the description of the Request */
+	cmd = request->cmd;
+	if((cmd - SIOCIWFIRST) >= standard_ioctl_num)
+		return -EOPNOTSUPP;
+	descr = &(standard_ioctl[cmd - SIOCIWFIRST]);
+
+#ifdef WE_RTNETLINK_DEBUG
+	printk(KERN_DEBUG "%s (WE.r) : Found standard SET handler for 0x%04X\n",
+	       dev->name, cmd);
+	printk(KERN_DEBUG "%s (WE.r) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+	/* Extract fixed header from request. This is properly aligned. */
+	wrqu = &request->u;
+
+	/* Check if wrqu is complete */
+	hdr_len = event_type_size[descr->header_type];
+	if(request_len < hdr_len) {
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG
+		       "%s (WE.r) : Wireless request too short (%d)\n",
+		       dev->name, request_len);
+#endif	/* WE_RTNETLINK_DEBUG */
+		return -EINVAL;
+	}
+
+	/* Prepare the call */
+	info.cmd = cmd;
+	info.flags = 0;
+
+	/* Check if we have extra data in the request or not */
+	if(descr->header_type != IW_HEADER_TYPE_POINT) {
+
+		/* No extra arguments. Trivial to handle */
+		ret = handler(dev, &info, wrqu, NULL);
+
+	} else {
+		int	extra_len;
+
+		/* Put wrqu in the right place (skip pointer) */
+		memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF,
+		       wrqu, IW_EV_POINT_LEN - IW_EV_LCP_LEN);
+		/* Don't forget about the event code... */
+		wrqu = &wrqu_point;
+
+		/* Check if number of token fits within bounds */
+		if(wrqu_point.data.length > descr->max_tokens)
+			return -E2BIG;
+		if(wrqu_point.data.length < descr->min_tokens)
+			return -EINVAL;
+
+		/* Real length of payload */
+		extra_len = wrqu_point.data.length * descr->token_size;
+
+		/* Check if request is self consistent */
+		if((request_len - hdr_len) < extra_len) {
+#ifdef WE_RTNETLINK_DEBUG
+			printk(KERN_DEBUG "%s (WE.r) : Wireless request data too short (%d)\n",
+			       dev->name, extra_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+			return -EINVAL;
+		}
+
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes\n",
+		       dev->name, extra_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+		/* Always allocate for max space. Easier, and won't last
+		 * long... */
+		extra_size = descr->max_tokens * descr->token_size;
+		extra = kmalloc(extra_size, GFP_KERNEL);
+		if (extra == NULL)
+			return -ENOMEM;
+
+		/* Copy extra in aligned buffer */
+		memcpy(extra, ((char *) request) + hdr_len, extra_len);
+
+		/* Call the handler */
+		ret = handler(dev, &info, &wrqu_point, extra);
+	}
+
+#ifdef WE_SET_EVENT
+	/* Generate an event to notify listeners of the change */
+	if((descr->flags & IW_DESCR_FLAG_EVENT) &&
+	   ((ret == 0) || (ret == -EIWCOMMIT))) {
+		if(descr->flags & IW_DESCR_FLAG_RESTRICT)
+			/* If the event is restricted, don't
+			 * export the payload */
+			wireless_send_event(dev, cmd, wrqu, NULL);
+		else
+			wireless_send_event(dev, cmd, wrqu, extra);
+	}
+#endif	/* WE_SET_EVENT */
+
+	/* Cleanup - I told you it wasn't that long ;-) */
+	if(extra)
+		kfree(extra);
+
+	/* Call commit handler if needed and defined */
+	if(ret == -EIWCOMMIT)
+		ret = call_commit_handler(dev);
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Wrapper to call a private Wireless Extension GET handler.
+ * Same as above...
+ * It's not as nice and slimline as the standard wrapper. The cause
+ * is struct iw_priv_args, which was not really designed for the
+ * job we are going here.
+ *
+ * IMPORTANT : This function prevent to set and get data on the same
+ * IOCTL and enforce the SET/GET convention. Not doing it would be
+ * far too hairy...
+ * If you need to set and get data at the same time, please don't use
+ * a iw_handler but process it in your ioctl handler (i.e. use the
+ * old driver API).
+ */
+static inline int rtnetlink_private_get(struct net_device *	dev,
+					struct iw_event *	request,
+					int			request_len,
+					iw_handler		handler,
+					char **			p_buf,
+					int *			p_len)
+{
+	const struct iw_priv_args *	descr = NULL;
+	unsigned int			cmd;
+	union iwreq_data *		wrqu;
+	int				hdr_len;
+	struct iw_request_info		info;
+	int				extra_size = 0;
+	int				i;
+	char *				buffer = NULL;
+	int				buffer_size = 0;
+	int				ret = -EINVAL;
+
+	/* Get the description of the Request */
+	cmd = request->cmd;
+	for(i = 0; i < dev->wireless_handlers->num_private_args; i++)
+		if(cmd == dev->wireless_handlers->private_args[i].cmd) {
+			descr = &(dev->wireless_handlers->private_args[i]);
+			break;
+		}
+	if(descr == NULL)
+		return -EOPNOTSUPP;
+
+#ifdef WE_RTNETLINK_DEBUG
+	printk(KERN_DEBUG "%s (WE.r) : Found private handler for 0x%04X\n",
+	       dev->name, cmd);
+	printk(KERN_DEBUG "%s (WE.r) : Name %s, set %X, get %X\n",
+	       dev->name, descr->name, descr->set_args, descr->get_args);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+	/* Compute the max size of the get arguments */
+	extra_size = get_priv_size(descr->get_args);
+
+	/* Does it fits in wrqu ? */
+	if((descr->get_args & IW_PRIV_SIZE_FIXED) &&
+	   (extra_size <= IFNAMSIZ)) {
+		hdr_len = extra_size;
+		extra_size = 0;
+	} else {
+		hdr_len = IW_EV_POINT_LEN;
+	}
+
+	/* Check if wrqu is complete */
+	if(request_len < hdr_len) {
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG
+		       "%s (WE.r) : Wireless request too short (%d)\n",
+		       dev->name, request_len);
+#endif	/* WE_RTNETLINK_DEBUG */
+		return -EINVAL;
+	}
+
+	/* Prepare the call */
+	info.cmd = cmd;
+	info.flags = 0;
+
+	/* Check if we have a pointer to user space data or not. */
+	if(extra_size == 0) {
+
+		/* Create the kernel buffer that we will return.
+		 * It's at an offset to match the TYPE_POINT case... */
+		buffer_size = request_len + IW_EV_POINT_OFF;
+		buffer = kmalloc(buffer_size, GFP_KERNEL);
+		if (buffer == NULL) {
+			return -ENOMEM;
+		}
+		/* Copy event data */
+		memcpy(buffer + IW_EV_POINT_OFF, request, request_len);
+		/* Use our own copy of wrqu */
+		wrqu = (union iwreq_data *) (buffer + IW_EV_POINT_OFF
+					     + IW_EV_LCP_LEN);
+
+		/* No extra arguments. Trivial to handle */
+		ret = handler(dev, &info, wrqu, (char *) wrqu);
+
+	} else {
+		char *	extra;
+
+		/* Buffer for full reply */
+		buffer_size = extra_size + IW_EV_POINT_LEN + IW_EV_POINT_OFF;
+
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes (%d bytes)\n",
+		       dev->name, extra_size, buffer_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+		/* Create the kernel buffer that we will return */
+		buffer = kmalloc(buffer_size, GFP_KERNEL);
+		if (buffer == NULL) {
+			return -ENOMEM;
+		}
+
+		/* Put wrqu in the right place (just before extra).
+		 * Leave space for IWE header and dummy pointer...
+		 * Note that IW_EV_LCP_LEN==4 bytes, so it's still aligned...
+		 */
+		memcpy(buffer + IW_EV_LCP_LEN + IW_EV_POINT_OFF,
+		       ((char *) request) + IW_EV_LCP_LEN,
+		       IW_EV_POINT_LEN - IW_EV_LCP_LEN);
+		wrqu = (union iwreq_data *) (buffer + IW_EV_LCP_LEN);
+
+		/* Extra comes logically after that. Offset +12 bytes. */
+		extra = buffer + IW_EV_POINT_OFF + IW_EV_POINT_LEN;
+
+		/* Call the handler */
+		ret = handler(dev, &info, wrqu, extra);
+
+		/* Adjust for the actual length if it's variable,
+		 * avoid leaking kernel bits outside. */
+		if (!(descr->get_args & IW_PRIV_SIZE_FIXED))
+			extra_size = adjust_priv_size(descr->get_args, wrqu);
+		/* Re-adjust reply size */
+		request->len = extra_size + IW_EV_POINT_LEN;
+
+		/* Put the iwe header where it should, i.e. scrap the
+		 * dummy pointer. */
+		memcpy(buffer + IW_EV_POINT_OFF, request, IW_EV_LCP_LEN);
+
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG "%s (WE.r) : Reply 0x%04X, hdr_len %d, tokens %d, extra_size %d, buffer_size %d\n", dev->name, cmd, hdr_len, wrqu->data.length, extra_size, buffer_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+	}
+
+	/* Return the buffer to the caller */
+	if (!ret) {
+		*p_buf = buffer;
+		*p_len = request->len;
+	} else {
+		/* Cleanup */
+		if(buffer)
+			kfree(buffer);
+	}
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Wrapper to call a private Wireless Extension SET handler.
+ * Same as above...
+ * It's not as nice and slimline as the standard wrapper. The cause
+ * is struct iw_priv_args, which was not really designed for the
+ * job we are going here.
+ *
+ * IMPORTANT : This function prevent to set and get data on the same
+ * IOCTL and enforce the SET/GET convention. Not doing it would be
+ * far too hairy...
+ * If you need to set and get data at the same time, please don't use
+ * a iw_handler but process it in your ioctl handler (i.e. use the
+ * old driver API).
+ */
+static inline int rtnetlink_private_set(struct net_device *	dev,
+					struct iw_event *	request,
+					int			request_len,
+					iw_handler		handler)
+{
+	const struct iw_priv_args *	descr = NULL;
+	unsigned int			cmd;
+	union iwreq_data *		wrqu;
+	union iwreq_data		wrqu_point;
+	int				hdr_len;
+	char *				extra = NULL;
+	int				extra_size = 0;
+	int				offset = 0;	/* For sub-ioctls */
+	struct iw_request_info		info;
+	int				i;
+	int				ret = -EINVAL;
+
+	/* Get the description of the Request */
+	cmd = request->cmd;
+	for(i = 0; i < dev->wireless_handlers->num_private_args; i++)
+		if(cmd == dev->wireless_handlers->private_args[i].cmd) {
+			descr = &(dev->wireless_handlers->private_args[i]);
+			break;
+		}
+	if(descr == NULL)
+		return -EOPNOTSUPP;
+
+#ifdef WE_RTNETLINK_DEBUG
+	printk(KERN_DEBUG "%s (WE.r) : Found private handler for 0x%04X\n",
+	       ifr->ifr_name, cmd);
+	printk(KERN_DEBUG "%s (WE.r) : Name %s, set %X, get %X\n",
+	       dev->name, descr->name, descr->set_args, descr->get_args);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+	/* Compute the size of the set arguments */
+	/* Check for sub-ioctl handler */
+	if(descr->name[0] == '\0')
+		/* Reserve one int for sub-ioctl index */
+		offset = sizeof(__u32);
+
+	/* Size of set arguments */
+	extra_size = get_priv_size(descr->set_args);
+
+	/* Does it fits in wrqu ? */
+	if((descr->set_args & IW_PRIV_SIZE_FIXED) &&
+	   (extra_size <= IFNAMSIZ)) {
+		hdr_len = IW_EV_LCP_LEN + extra_size;
+		extra_size = 0;
+	} else {
+		hdr_len = IW_EV_POINT_LEN;
+	}
+
+	/* Extract fixed header from request. This is properly aligned. */
+	wrqu = &request->u;
+
+	/* Check if wrqu is complete */
+	if(request_len < hdr_len) {
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG
+		       "%s (WE.r) : Wireless request too short (%d)\n",
+		       dev->name, request_len);
+#endif	/* WE_RTNETLINK_DEBUG */
+		return -EINVAL;
+	}
+
+	/* Prepare the call */
+	info.cmd = cmd;
+	info.flags = 0;
+
+	/* Check if we have a pointer to user space data or not. */
+	if(extra_size == 0) {
+
+		/* No extra arguments. Trivial to handle */
+		ret = handler(dev, &info, wrqu, (char *) wrqu);
+
+	} else {
+		int	extra_len;
+
+		/* Put wrqu in the right place (skip pointer) */
+		memcpy(((char *) &wrqu_point) + IW_EV_POINT_OFF,
+		       wrqu, IW_EV_POINT_LEN - IW_EV_LCP_LEN);
+
+		/* Does it fits within bounds ? */
+		if(wrqu_point.data.length > (descr->set_args &
+					     IW_PRIV_SIZE_MASK))
+			return -E2BIG;
+
+		/* Real length of payload */
+		extra_len = adjust_priv_size(descr->set_args, &wrqu_point);
+
+		/* Check if request is self consistent */
+		if((request_len - hdr_len) < extra_len) {
+#ifdef WE_RTNETLINK_DEBUG
+			printk(KERN_DEBUG "%s (WE.r) : Wireless request data too short (%d)\n",
+			       dev->name, extra_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+			return -EINVAL;
+		}
+
+#ifdef WE_RTNETLINK_DEBUG
+		printk(KERN_DEBUG "%s (WE.r) : Malloc %d bytes\n",
+		       dev->name, extra_size);
+#endif	/* WE_RTNETLINK_DEBUG */
+
+		/* Always allocate for max space. Easier, and won't last
+		 * long... */
+		extra = kmalloc(extra_size, GFP_KERNEL);
+		if (extra == NULL)
+			return -ENOMEM;
+
+		/* Copy extra in aligned buffer */
+		memcpy(extra, ((char *) request) + hdr_len, extra_len);
+
+		/* Call the handler */
+		ret = handler(dev, &info, &wrqu_point, extra);
+
+		/* Cleanup - I told you it wasn't that long ;-) */
+		kfree(extra);
+	}
+
+	/* Call commit handler if needed and defined */
+	if(ret == -EIWCOMMIT)
+		ret = call_commit_handler(dev);
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Main RtNetlink dispatcher. Called from the main networking code
+ * (do_getlink() in net/core/rtnetlink.c).
+ * Check the type of Request and call the appropriate wrapper...
+ */
+int wireless_rtnetlink_get(struct net_device *	dev,
+			   char *		data,
+			   int			len,
+			   char **		p_buf,
+			   int *		p_len)
+{
+	struct iw_event *	request = (struct iw_event *) data;
+	iw_handler		handler;
+
+	/* Check length */
+	if(len < IW_EV_LCP_LEN) {
+		printk(KERN_DEBUG "%s (WE.r) : RtNetlink request too short (%d)\n",
+		       dev->name, len);
+		return -EINVAL;
+	}
+
+	/* ReCheck length (len may have padding) */
+	if(request->len > len) {
+		printk(KERN_DEBUG "%s (WE.r) : RtNetlink request len invalid (%d-%d)\n",
+		       dev->name, request->len, len);
+		return -EINVAL;
+	}
+
+	/* Only accept GET requests in here */
+	if(!IW_IS_GET(request->cmd))
+		return -EOPNOTSUPP;
+
+	/* Special cases */
+	if(request->cmd == SIOCGIWSTATS)
+		/* Get Wireless Stats */
+		return rtnetlink_standard_get(dev,
+					      request,
+					      request->len,
+					      &iw_handler_get_iwstats,
+					      p_buf, p_len);
+	if(request->cmd == SIOCGIWPRIV) {
+		/* Check if we have some wireless handlers defined */
+		if(dev->wireless_handlers == NULL)
+			return -EOPNOTSUPP;
+		/* Get Wireless Stats */
+		return rtnetlink_standard_get(dev,
+					      request,
+					      request->len,
+					      &iw_handler_get_private,
+					      p_buf, p_len);
+	}
+
+	/* Basic check */
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	/* Try to find the handler */
+	handler = get_handler(dev, request->cmd);
+	if(handler != NULL) {
+		/* Standard and private are not the same */
+		if(request->cmd < SIOCIWFIRSTPRIV)
+			return rtnetlink_standard_get(dev,
+						      request,
+						      request->len,
+						      handler,
+						      p_buf, p_len);
+		else
+			return rtnetlink_private_get(dev,
+						     request,
+						     request->len,
+						     handler,
+						     p_buf, p_len);
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Main RtNetlink dispatcher. Called from the main networking code
+ * (do_setlink() in net/core/rtnetlink.c).
+ * Check the type of Request and call the appropriate wrapper...
+ */
+int wireless_rtnetlink_set(struct net_device *	dev,
+			   char *		data,
+			   int			len)
+{
+	struct iw_event *	request = (struct iw_event *) data;
+	iw_handler		handler;
+
+	/* Check length */
+	if(len < IW_EV_LCP_LEN) {
+		printk(KERN_DEBUG "%s (WE.r) : RtNetlink request too short (%d)\n",
+		       dev->name, len);
+		return -EINVAL;
+	}
+
+	/* ReCheck length (len may have padding) */
+	if(request->len > len) {
+		printk(KERN_DEBUG "%s (WE.r) : RtNetlink request len invalid (%d-%d)\n",
+		       dev->name, request->len, len);
+		return -EINVAL;
+	}
+
+	/* Only accept SET requests in here */
+	if(!IW_IS_SET(request->cmd))
+		return -EOPNOTSUPP;
+
+	/* Basic check */
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	/* New driver API : try to find the handler */
+	handler = get_handler(dev, request->cmd);
+	if(handler != NULL) {
+		/* Standard and private are not the same */
+		if(request->cmd < SIOCIWFIRSTPRIV)
+			return rtnetlink_standard_set(dev,
+						      request,
+						      request->len,
+						      handler);
+		else
+			return rtnetlink_private_set(dev,
+						     request,
+						     request->len,
+						     handler);
+	}
+
+	return -EOPNOTSUPP;
+}
+#endif	/* CONFIG_NET_WIRELESS_RTNETLINK */
+
+
 /************************* EVENT PROCESSING *************************/
 /*
  * Process events generated by the wireless layer or the driver.
  * Most often, the event will be propagated through rtnetlink
  */
 
-#ifdef WE_EVENT_NETLINK
-/* "rtnl" is defined in net/core/rtnetlink.c, but we need it here.
- * It is declared in <linux/rtnetlink.h> */
-
+#ifdef WE_EVENT_RTNETLINK
 /* ---------------------------------------------------------------- */
 /*
  * Fill a rtnetlink message with our event data.
@@ -1121,12 +1857,11 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff *	skb,
 	r->__ifi_pad = 0;
 	r->ifi_type = dev->type;
 	r->ifi_index = dev->ifindex;
-	r->ifi_flags = dev->flags;
+	r->ifi_flags = dev_get_flags(dev);
 	r->ifi_change = 0;	/* Wireless changes don't affect those flags */
 
 	/* Add the wireless events in the netlink packet */
-	RTA_PUT(skb, IFLA_WIRELESS,
-		event_len, event);
+	RTA_PUT(skb, IFLA_WIRELESS, event_len, event);
 
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
@@ -1163,7 +1898,7 @@ static inline void rtmsg_iwinfo(struct net_device *	dev,
 	NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
 }
-#endif	/* WE_EVENT_NETLINK */
+#endif	/* WE_EVENT_RTNETLINK */
 
 /* ---------------------------------------------------------------- */
 /*
@@ -1255,10 +1990,10 @@ void wireless_send_event(struct net_device *	dev,
 	if(extra != NULL)
 		memcpy(((char *) event) + hdr_len, extra, extra_len);
 
-#ifdef WE_EVENT_NETLINK
-	/* rtnetlink event channel */
+#ifdef WE_EVENT_RTNETLINK
+	/* Send via the RtNetlink event channel */
 	rtmsg_iwinfo(dev, (char *) event, event_len);
-#endif	/* WE_EVENT_NETLINK */
+#endif	/* WE_EVENT_RTNETLINK */
 
 	/* Cleanup */
 	kfree(event);
-- 
cgit v1.2.3


From 9e71f9c848e669f003b1319da52f03cb8e7d7403 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 23 Mar 2006 02:59:22 -0800
Subject: [PATCH] DM: Fix bug: BIO_RW_BARRIER requests to md/raid1 hang.

Both R1BIO_Barrier and R1BIO_Returned are 4 !!!!

This means that barrier requests don't get returned (i.e.  b_endio called)
because it looks like they already have been.

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/raid/raid1.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 9d5494aaac0f..3009c813d83d 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -130,6 +130,6 @@ struct r1bio_s {
  * with failure when last write completes (and all failed).
  * Record that bi_end_io was called with this flag...
  */
-#define	R1BIO_Returned 4
+#define	R1BIO_Returned 6
 
 #endif
-- 
cgit v1.2.3


From f577eb30afdc68233f25d4d82b04102129262365 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 02:59:59 -0800
Subject: [PATCH] swsusp: low level interface

Introduce the low level interface that can be used for handling the
snapshot of the system memory by the in-kernel swap-writing/reading code of
swsusp and the userland interface code (to be introduced shortly).

Also change the way in which swsusp records the allocated swap pages and,
consequently, simplifies the in-kernel swap-writing/reading code (this is
necessary for the userland interface too).  To this end, it introduces two
helper functions in mm/swapfile.c, so that the swsusp code does not refer
directly to the swap internals.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h    |   5 +-
 kernel/power/disk.c     |  12 +-
 kernel/power/power.h    |  26 +-
 kernel/power/snapshot.c | 326 +++++++++++++++++++++-
 kernel/power/swsusp.c   | 723 +++++++++++++++++-------------------------------
 mm/swapfile.c           |  55 +++-
 6 files changed, 656 insertions(+), 491 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 12415dd94451..54eac8a39a4c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -234,14 +234,15 @@ extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *v
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
-extern struct swap_info_struct swap_info[];
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
-extern swp_entry_t get_swap_page_of_type(int type);
+extern swp_entry_t get_swap_page_of_type(int);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
 extern void free_swap_and_cache(swp_entry_t);
+extern int swap_type_of(dev_t);
+extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc980..4eb464b71347 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -26,9 +26,9 @@ extern suspend_disk_method_t pm_disk_mode;
 
 extern int swsusp_shrink_memory(void);
 extern int swsusp_suspend(void);
-extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
+extern int swsusp_write(void);
 extern int swsusp_check(void);
-extern int swsusp_read(struct pbe **pblist_ptr);
+extern int swsusp_read(void);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
 
@@ -70,10 +70,6 @@ static void power_down(suspend_disk_method_t mode)
 	while(1);
 }
 
-
-static int in_suspend __nosavedata = 0;
-
-
 static inline void platform_finish(void)
 {
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -145,7 +141,7 @@ int pm_suspend_disk(void)
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write(pagedir_nosave, nr_copy_pages);
+		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -216,7 +212,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read(&pagedir_nosave))) {
+	if ((error = swsusp_read())) {
 		swsusp_free();
 		goto Thaw;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba680841..ea7132ed029b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -37,21 +37,31 @@ extern struct subsystem power_subsys;
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
-extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 
+extern int in_suspend;
+
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
-extern void free_pagedir(struct pbe *pblist);
-extern void release_eaten_pages(void);
-extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void swsusp_free(void);
-extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
-extern unsigned int snapshot_nr_pages(void);
-extern struct pbe *snapshot_pblist(void);
-extern void snapshot_pblist_set(struct pbe *pblist);
+
+struct snapshot_handle {
+	loff_t		offset;
+	unsigned int	page;
+	unsigned int	page_offset;
+	unsigned int	prev;
+	struct pbe	*pbe;
+	void		*buffer;
+	unsigned int	buf_offset;
+};
+
+#define data_of(handle)	((handle).buffer + (handle).buf_offset)
+
+extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
+extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+int snapshot_image_loaded(struct snapshot_handle *handle);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d621..cc349437fb72 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
  */
 
 
+#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/suspend.h>
@@ -34,7 +35,8 @@
 #include "power.h"
 
 struct pbe *pagedir_nosave;
-unsigned int nr_copy_pages;
+static unsigned int nr_copy_pages;
+static unsigned int nr_meta_pages;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -235,7 +237,7 @@ static void copy_data_pages(struct pbe *pblist)
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
 
-void free_pagedir(struct pbe *pblist)
+static void free_pagedir(struct pbe *pblist)
 {
 	struct pbe *pbe;
 
@@ -301,7 +303,7 @@ struct eaten_page {
 
 static struct eaten_page *eaten_pages = NULL;
 
-void release_eaten_pages(void)
+static void release_eaten_pages(void)
 {
 	struct eaten_page *p, *q;
 
@@ -376,7 +378,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 	if (!nr_pages)
 		return NULL;
 
-	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
 	pblist = alloc_image_page(gfp_mask, safe_needed);
 	/* FIXME: rewrite this ugly loop */
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -414,6 +415,9 @@ void swsusp_free(void)
 				}
 			}
 	}
+	nr_copy_pages = 0;
+	nr_meta_pages = 0;
+	pagedir_nosave = NULL;
 }
 
 
@@ -437,7 +441,7 @@ static int enough_free_mem(unsigned int nr_pages)
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
 {
 	struct pbe *p;
 
@@ -504,7 +508,319 @@ asmlinkage int swsusp_save(void)
 	 */
 
 	nr_copy_pages = nr_pages;
+	nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
 	return 0;
 }
+
+static void init_header(struct swsusp_info *info)
+{
+	memset(info, 0, sizeof(struct swsusp_info));
+	info->version_code = LINUX_VERSION_CODE;
+	info->num_physpages = num_physpages;
+	memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+	info->cpus = num_online_cpus();
+	info->image_pages = nr_copy_pages;
+	info->pages = nr_copy_pages + nr_meta_pages + 1;
+}
+
+/**
+ *	pack_orig_addresses - the .orig_address fields of the PBEs from the
+ *	list starting at @pbe are stored in the array @buf[] (1 page)
+ */
+
+static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		buf[j] = pbe->orig_address;
+		pbe = pbe->next;
+	}
+	if (!pbe)
+		for (; j < PAGE_SIZE / sizeof(long); j++)
+			buf[j] = 0;
+	return pbe;
+}
+
+/**
+ *	snapshot_read_next - used for reading the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to read from the snapshot.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to read up to the returned number of bytes from the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the read would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the end of data stream condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_read_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+
+	if (handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset) {
+		init_header((struct swsusp_info *)buffer);
+		handle->buffer = buffer;
+		handle->pbe = pagedir_nosave;
+	}
+	if (handle->prev < handle->page) {
+		if (handle->page <= nr_meta_pages) {
+			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe)
+				handle->pbe = pagedir_nosave;
+		} else {
+			handle->buffer = (void *)handle->pbe->address;
+			handle->pbe = handle->pbe->next;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+/**
+ *	mark_unsafe_pages - mark the pages that cannot be used for storing
+ *	the image during resume, because they conflict with the pages that
+ *	had been used before suspend
+ */
+
+static int mark_unsafe_pages(struct pbe *pblist)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *p;
+
+	if (!pblist) /* a sanity check */
+		return -EINVAL;
+
+	/* Clear page flags */
+	for_each_zone (zone) {
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+				ClearPageNosaveFree(pfn_to_page(zone_pfn +
+					zone->zone_start_pfn));
+	}
+
+	/* Mark orig addresses */
+	for_each_pbe (p, pblist) {
+		if (virt_addr_valid(p->orig_address))
+			SetPageNosaveFree(virt_to_page(p->orig_address));
+		else
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+{
+	/* We assume both lists contain the same number of elements */
+	while (src) {
+		dst->orig_address = src->orig_address;
+		dst = dst->next;
+		src = src->next;
+	}
+}
+
+static int check_header(struct swsusp_info *info)
+{
+	char *reason = NULL;
+
+	if (info->version_code != LINUX_VERSION_CODE)
+		reason = "kernel version";
+	if (info->num_physpages != num_physpages)
+		reason = "memory size";
+	if (strcmp(info->uts.sysname,system_utsname.sysname))
+		reason = "system type";
+	if (strcmp(info->uts.release,system_utsname.release))
+		reason = "kernel release";
+	if (strcmp(info->uts.version,system_utsname.version))
+		reason = "version";
+	if (strcmp(info->uts.machine,system_utsname.machine))
+		reason = "machine";
+	if (reason) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
+		return -EPERM;
+	}
+	return 0;
+}
+
+/**
+ *	load header - check the image header and copy data from it
+ */
+
+static int load_header(struct snapshot_handle *handle,
+                              struct swsusp_info *info)
+{
+	int error;
+	struct pbe *pblist;
+
+	error = check_header(info);
+	if (!error) {
+		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
+		if (!pblist)
+			return -ENOMEM;
+		pagedir_nosave = pblist;
+		handle->pbe = pblist;
+		nr_copy_pages = info->image_pages;
+		nr_meta_pages = info->pages - info->image_pages - 1;
+	}
+	return error;
+}
+
+/**
+ *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ *	the PBEs in the list starting at @pbe
+ */
+
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+                                                struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		pbe->orig_address = buf[j];
+		pbe = pbe->next;
+	}
+	return pbe;
+}
+
+/**
+ *	create_image - use metadata contained in the PBE list
+ *	pointed to by pagedir_nosave to mark the pages that will
+ *	be overwritten in the process of restoring the system
+ *	memory state from the image and allocate memory for
+ *	the image avoiding these pages
+ */
+
+static int create_image(struct snapshot_handle *handle)
+{
+	int error = 0;
+	struct pbe *p, *pblist;
+
+	p = pagedir_nosave;
+	error = mark_unsafe_pages(p);
+	if (!error) {
+		pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+		if (pblist)
+			copy_page_backup_list(pblist, p);
+		free_pagedir(p);
+		if (!pblist)
+			error = -ENOMEM;
+	}
+	if (!error)
+		error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+	if (!error) {
+		release_eaten_pages();
+		pagedir_nosave = pblist;
+	} else {
+		pagedir_nosave = NULL;
+		handle->pbe = NULL;
+		nr_copy_pages = 0;
+		nr_meta_pages = 0;
+	}
+	return error;
+}
+
+/**
+ *	snapshot_write_next - used for writing the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to write to the image.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to write up to the returned number of bytes to the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the write would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the "end of file" condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+	int error = 0;
+
+	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset)
+		handle->buffer = buffer;
+	if (handle->prev < handle->page) {
+		if (!handle->prev) {
+			error = load_header(handle, (struct swsusp_info *)buffer);
+			if (error)
+				return error;
+		} else if (handle->prev <= nr_meta_pages) {
+			handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe) {
+				error = create_image(handle);
+				if (error)
+					return error;
+				handle->pbe = pagedir_nosave;
+				handle->buffer = (void *)handle->pbe->address;
+			}
+		} else {
+			handle->pbe = handle->pbe->next;
+			handle->buffer = (void *)handle->pbe->address;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
+		handle->page <= nr_meta_pages + nr_copy_pages);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e87..457084f50010 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -77,6 +77,8 @@
  */
 unsigned long image_size = 500 * 1024 * 1024;
 
+int in_suspend __nosavedata = 0;
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
 int save_highmem(void);
@@ -98,8 +100,6 @@ static struct swsusp_header {
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
 
-static struct swsusp_info swsusp_info;
-
 /*
  * Saving part...
  */
@@ -129,255 +129,261 @@ static int mark_swapfiles(swp_entry_t start)
 	return error;
 }
 
-/*
- * Check whether the swap device is the specified resume
- * device, irrespective of whether they are specified by
- * identical names.
- *
- * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
- * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
- * and they'll be considered the same device.  This is *necessary* for
- * devfs, since the resume code can only recognize the form /dev/hda4,
- * but the suspend code would see the long name.)
+/**
+ *	swsusp_swap_check - check if the resume device is a swap device
+ *	and get its index (if so)
  */
-static inline int is_resume_device(const struct swap_info_struct *swap_info)
-{
-	struct file *file = swap_info->swap_file;
-	struct inode *inode = file->f_dentry->d_inode;
-
-	return S_ISBLK(inode->i_mode) &&
-		swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
-}
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int i;
-
-	if (!swsusp_resume_device)
-		return -ENODEV;
-	spin_lock(&swap_lock);
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		if (!(swap_info[i].flags & SWP_WRITEOK))
-			continue;
-		if (is_resume_device(swap_info + i)) {
-			spin_unlock(&swap_lock);
-			root_swap = i;
-			return 0;
-		}
+	int res = swap_type_of(swsusp_resume_device);
+
+	if (res >= 0) {
+		root_swap = res;
+		return 0;
 	}
-	spin_unlock(&swap_lock);
-	return -ENODEV;
+	return res;
 }
 
 /**
- *	write_page - Write one page to a fresh swap location.
- *	@addr:	Address we're writing.
- *	@loc:	Place to store the entry we used.
+ *	The bitmap is used for tracing allocated swap pages
  *
- *	Allocate a new swap entry and 'sync' it. Note we discard -EIO
- *	errors. That is an artifact left over from swsusp. It did not
- *	check the return of rw_swap_page_sync() at all, since most pages
- *	written back to swap would return -EIO.
- *	This is a partial improvement, since we will at least return other
- *	errors, though we need to eventually fix the damn code.
+ *	The entire bitmap consists of a number of bitmap_page
+ *	structures linked with the help of the .next member.
+ *	Thus each page can be allocated individually, so we only
+ *	need to make 0-order memory allocations to create
+ *	the bitmap.
  */
-static int write_page(unsigned long addr, swp_entry_t *loc)
-{
-	swp_entry_t entry;
-	int error = -ENOSPC;
 
-	entry = get_swap_page_of_type(root_swap);
-	if (swp_offset(entry)) {
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
-		if (!error || error == -EIO)
-			*loc = entry;
-	}
-	return error;
-}
+#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
+#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
+#define BITS_PER_CHUNK		(sizeof(long) * 8)
+#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
+
+struct bitmap_page {
+	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
+	struct bitmap_page	*next;
+};
 
 /**
- *	Swap map-handling functions
+ *	The following functions are used for tracing the allocated
+ *	swap pages, so that they can be freed in case of an error.
  *
- *	The swap map is a data structure used for keeping track of each page
- *	written to the swap.  It consists of many swap_map_page structures
- *	that contain each an array of MAP_PAGE_SIZE swap entries.
- *	These structures are linked together with the help of either the
- *	.next (in memory) or the .next_swap (in swap) member.
- *
- *	The swap map is created during suspend.  At that time we need to keep
- *	it in memory, because we have to free all of the allocated swap
- *	entries if an error occurs.  The memory needed is preallocated
- *	so that we know in advance if there's enough of it.
- *
- *	The first swap_map_page structure is filled with the swap entries that
- *	correspond to the first MAP_PAGE_SIZE data pages written to swap and
- *	so on.  After the all of the data pages have been written, the order
- *	of the swap_map_page structures in the map is reversed so that they
- *	can be read from swap in the original order.  This causes the data
- *	pages to be loaded in exactly the same order in which they have been
- *	saved.
- *
- *	During resume we only need to use one swap_map_page structure
- *	at a time, which means that we only need to use two memory pages for
- *	reading the image - one for reading the swap_map_page structures
- *	and the second for reading the data pages from swap.
+ *	The functions operate on a linked bitmap structure defined
+ *	above
  */
 
-#define MAP_PAGE_SIZE	((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
-			/ sizeof(swp_entry_t))
-
-struct swap_map_page {
-	swp_entry_t		entries[MAP_PAGE_SIZE];
-	swp_entry_t		next_swap;
-	struct swap_map_page	*next;
-};
-
-static inline void free_swap_map(struct swap_map_page *swap_map)
+static void free_bitmap(struct bitmap_page *bitmap)
 {
-	struct swap_map_page *swp;
+	struct bitmap_page *bp;
 
-	while (swap_map) {
-		swp = swap_map->next;
-		free_page((unsigned long)swap_map);
-		swap_map = swp;
+	while (bitmap) {
+		bp = bitmap->next;
+		free_page((unsigned long)bitmap);
+		bitmap = bp;
 	}
 }
 
-static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+static struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
 {
-	struct swap_map_page *swap_map, *swp;
-	unsigned n = 0;
+	struct bitmap_page *bitmap, *bp;
+	unsigned int n;
 
-	if (!nr_pages)
+	if (!nr_bits)
 		return NULL;
 
-	pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
-	swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-	swp = swap_map;
-	for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
-		swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-		swp = swp->next;
-		if (!swp) {
-			free_swap_map(swap_map);
+	bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+	bp = bitmap;
+	for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
+		bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+		bp = bp->next;
+		if (!bp) {
+			free_bitmap(bitmap);
 			return NULL;
 		}
 	}
-	return swap_map;
+	return bitmap;
 }
 
-/**
- *	reverse_swap_map - reverse the order of pages in the swap map
- *	@swap_map
- */
-
-static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
+static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 {
-	struct swap_map_page *prev, *next;
-
-	prev = NULL;
-	while (swap_map) {
-		next = swap_map->next;
-		swap_map->next = prev;
-		prev = swap_map;
-		swap_map = next;
+	unsigned int n;
+
+	n = BITMAP_PAGE_BITS;
+	while (bitmap && n <= bit) {
+		n += BITMAP_PAGE_BITS;
+		bitmap = bitmap->next;
 	}
-	return prev;
+	if (!bitmap)
+		return -EINVAL;
+	n -= BITMAP_PAGE_BITS;
+	bit -= n;
+	n = 0;
+	while (bit >= BITS_PER_CHUNK) {
+		bit -= BITS_PER_CHUNK;
+		n++;
+	}
+	bitmap->chunks[n] |= (1UL << bit);
+	return 0;
 }
 
-/**
- *	free_swap_map_entries - free the swap entries allocated to store
- *	the swap map @swap_map (this is only called in case of an error)
- */
-static inline void free_swap_map_entries(struct swap_map_page *swap_map)
+static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 {
-	while (swap_map) {
-		if (swap_map->next_swap.val)
-			swap_free(swap_map->next_swap);
-		swap_map = swap_map->next;
+	unsigned long offset;
+
+	offset = swp_offset(get_swap_page_of_type(swap));
+	if (offset) {
+		if (bitmap_set(bitmap, offset)) {
+			swap_free(swp_entry(swap, offset));
+			offset = 0;
+		}
 	}
+	return offset;
 }
 
-/**
- *	save_swap_map - save the swap map used for tracing the data pages
- *	stored in the swap
- */
-
-static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
+static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 {
-	swp_entry_t entry = (swp_entry_t){0};
-	int error;
+	unsigned int bit, n;
+	unsigned long test;
 
-	while (swap_map) {
-		swap_map->next_swap = entry;
-		if ((error = write_page((unsigned long)swap_map, &entry)))
-			return error;
-		swap_map = swap_map->next;
+	bit = 0;
+	while (bitmap) {
+		for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
+			for (test = 1UL; test; test <<= 1) {
+				if (bitmap->chunks[n] & test)
+					swap_free(swp_entry(swap, bit));
+				bit++;
+			}
+		bitmap = bitmap->next;
 	}
-	*start = entry;
-	return 0;
 }
 
 /**
- *	free_image_entries - free the swap entries allocated to store
- *	the image data pages (this is only called in case of an error)
+ *	write_page - Write one page to given swap location.
+ *	@buf:		Address we're writing.
+ *	@offset:	Offset of the swap page we're writing to.
  */
 
-static inline void free_image_entries(struct swap_map_page *swp)
+static int write_page(void *buf, unsigned long offset)
 {
-	unsigned k;
+	swp_entry_t entry;
+	int error = -ENOSPC;
 
-	while (swp) {
-		for (k = 0; k < MAP_PAGE_SIZE; k++)
-			if (swp->entries[k].val)
-				swap_free(swp->entries[k]);
-		swp = swp->next;
+	if (offset) {
+		entry = swp_entry(root_swap, offset);
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
 	}
+	return error;
 }
 
+/*
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to a swap partition.  It consists of many swap_map_page
+ *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are stored on the swap and linked together with the
+ *	help of the .next_swap member.
+ *
+ *	The swap map is created during suspend.  The swap map pages are
+ *	allocated and populated one at a time, so we only need one memory
+ *	page to set up the entire structure.
+ *
+ *	During resume we also only need to use one swap_map_page structure
+ *	at a time.
+ */
+
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+
+struct swap_map_page {
+	unsigned long		entries[MAP_PAGE_ENTRIES];
+	unsigned long		next_swap;
+};
+
 /**
- *	The swap_map_handle structure is used for handling the swap map in
+ *	The swap_map_handle structure is used for handling swap in
  *	a file-alike way
  */
 
 struct swap_map_handle {
 	struct swap_map_page *cur;
+	unsigned long cur_swap;
+	struct bitmap_page *bitmap;
 	unsigned int k;
 };
 
-static inline void init_swap_map_handle(struct swap_map_handle *handle,
-                                        struct swap_map_page *map)
+static void release_swap_writer(struct swap_map_handle *handle)
 {
-	handle->cur = map;
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+	if (handle->bitmap)
+		free_bitmap(handle->bitmap);
+	handle->bitmap = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+	if (!handle->cur)
+		return -ENOMEM;
+	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
+	if (!handle->bitmap) {
+		release_swap_writer(handle);
+		return -ENOMEM;
+	}
+	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	if (!handle->cur_swap) {
+		release_swap_writer(handle);
+		return -ENOSPC;
+	}
 	handle->k = 0;
+	return 0;
 }
 
-static inline int swap_map_write_page(struct swap_map_handle *handle,
-                                      unsigned long addr)
+static int swap_write_page(struct swap_map_handle *handle, void *buf)
 {
 	int error;
+	unsigned long offset;
 
-	error = write_page(addr, handle->cur->entries + handle->k);
+	if (!handle->cur)
+		return -EINVAL;
+	offset = alloc_swap_page(root_swap, handle->bitmap);
+	error = write_page(buf, offset);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
-		handle->cur = handle->cur->next;
+	handle->cur->entries[handle->k++] = offset;
+	if (handle->k >= MAP_PAGE_ENTRIES) {
+		offset = alloc_swap_page(root_swap, handle->bitmap);
+		if (!offset)
+			return -ENOSPC;
+		handle->cur->next_swap = offset;
+		error = write_page(handle->cur, handle->cur_swap);
+		if (error)
+			return error;
+		memset(handle->cur, 0, PAGE_SIZE);
+		handle->cur_swap = offset;
 		handle->k = 0;
 	}
 	return 0;
 }
 
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur && handle->cur_swap)
+		return write_page(handle->cur, handle->cur_swap);
+	else
+		return -EINVAL;
+}
+
 /**
- *	save_image_data - save the data pages pointed to by the PBEs
- *	from the list @pblist using the swap map handle @handle
- *	(assume there are @nr_pages data pages to save)
+ *	save_image - save the suspend image data
  */
 
-static int save_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int save_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
 	unsigned int m;
-	struct pbe *p;
+	int ret;
 	int error = 0;
 
 	printk("Saving image data pages (%u pages) ...     ", nr_pages);
@@ -385,98 +391,22 @@ static int save_image_data(struct pbe *pblist,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	for_each_pbe (p, pblist) {
-		error = swap_map_write_page(handle, p->address);
-		if (error)
-			break;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_read_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_write_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
 	return error;
 }
 
-static void dump_info(void)
-{
-	pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
-	pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
-	pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
-	pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
-	pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
-	pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
-	pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
-	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
-	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
-	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
-	pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
-}
-
-static void init_header(unsigned int nr_pages)
-{
-	memset(&swsusp_info, 0, sizeof(swsusp_info));
-	swsusp_info.version_code = LINUX_VERSION_CODE;
-	swsusp_info.num_physpages = num_physpages;
-	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
-
-	swsusp_info.cpus = num_online_cpus();
-	swsusp_info.image_pages = nr_pages;
-	swsusp_info.pages = nr_pages +
-		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
-}
-
-/**
- *	pack_orig_addresses - the .orig_address fields of the PBEs from the
- *	list starting at @pbe are stored in the array @buf[] (1 page)
- */
-
-static inline struct pbe *pack_orig_addresses(unsigned long *buf,
-                                              struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		buf[j] = pbe->orig_address;
-		pbe = pbe->next;
-	}
-	if (!pbe)
-		for (; j < PAGE_SIZE / sizeof(long); j++)
-			buf[j] = 0;
-	return pbe;
-}
-
-/**
- *	save_image_metadata - save the .orig_address fields of the PBEs
- *	from the list @pblist using the swap map handle @handle
- */
-
-static int save_image_metadata(struct pbe *pblist,
-                               struct swap_map_handle *handle)
-{
-	unsigned long *buf;
-	unsigned int n = 0;
-	struct pbe *p;
-	int error = 0;
-
-	printk("Saving image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		p = pack_orig_addresses(buf, p);
-		error = swap_map_write_page(handle, (unsigned long)buf);
-		if (error)
-			break;
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages saved)\n", n);
-	return error;
-}
-
 /**
  *	enough_swap - Make sure we have enough swap to save the image.
  *
@@ -486,8 +416,7 @@ static int save_image_metadata(struct pbe *pblist,
 
 static int enough_swap(unsigned int nr_pages)
 {
-	unsigned int free_swap = swap_info[root_swap].pages -
-		swap_info[root_swap].inuse_pages;
+	unsigned int free_swap = count_swap_pages(root_swap, 1);
 
 	pr_debug("swsusp: free swap pages: %u\n", free_swap);
 	return free_swap > (nr_pages + PAGES_FOR_IO +
@@ -503,57 +432,44 @@ static int enough_swap(unsigned int nr_pages)
  *	correctly, we'll mark system clean, anyway.)
  */
 
-int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
+int swsusp_write(void)
 {
-	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
-	swp_entry_t start;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+	unsigned long start;
 	int error;
 
 	if ((error = swsusp_swap_check())) {
 		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
 		return error;
 	}
-	if (!enough_swap(nr_pages)) {
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_read_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	if (!enough_swap(header->pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
-
-	init_header(nr_pages);
-	swap_map = alloc_swap_map(swsusp_info.pages);
-	if (!swap_map)
-		return -ENOMEM;
-	init_swap_map_handle(&handle, swap_map);
-
-	error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
-	if (!error)
-		error = save_image_metadata(pblist, &handle);
+	error = get_swap_writer(&handle);
+	if (!error) {
+		start = handle.cur_swap;
+		error = swap_write_page(&handle, header);
+	}
 	if (!error)
-		error = save_image_data(pblist, &handle, nr_pages);
-	if (error)
-		goto Free_image_entries;
-
-	swap_map = reverse_swap_map(swap_map);
-	error = save_swap_map(swap_map, &start);
-	if (error)
-		goto Free_map_entries;
-
-	dump_info();
-	printk( "S" );
-	error = mark_swapfiles(start);
-	printk( "|\n" );
+		error = save_image(&handle, &snapshot, header->pages - 1);
+	if (!error) {
+		flush_swap_writer(&handle);
+		printk("S");
+		error = mark_swapfiles(swp_entry(root_swap, start));
+		printk("|\n");
+	}
 	if (error)
-		goto Free_map_entries;
-
-Free_swap_map:
-	free_swap_map(swap_map);
+		free_all_swap_pages(root_swap, handle.bitmap);
+	release_swap_writer(&handle);
 	return error;
-
-Free_map_entries:
-	free_swap_map_entries(swap_map);
-Free_image_entries:
-	free_image_entries(swap_map);
-	goto Free_swap_map;
 }
 
 /**
@@ -663,45 +579,6 @@ int swsusp_resume(void)
 	return error;
 }
 
-/**
- *	mark_unsafe_pages - mark the pages that cannot be used for storing
- *	the image during resume, because they conflict with the pages that
- *	had been used before suspend
- */
-
-static void mark_unsafe_pages(struct pbe *pblist)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	struct pbe *p;
-
-	if (!pblist) /* a sanity check */
-		return;
-
-	/* Clear page flags */
-	for_each_zone (zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-				ClearPageNosaveFree(pfn_to_page(zone_pfn +
-					zone->zone_start_pfn));
-	}
-
-	/* Mark orig addresses */
-	for_each_pbe (p, pblist)
-		SetPageNosaveFree(virt_to_page(p->orig_address));
-
-}
-
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
-{
-	/* We assume both lists contain the same number of elements */
-	while (src) {
-		dst->orig_address = src->orig_address;
-		dst = dst->next;
-		src = src->next;
-	}
-}
-
 /*
  *	Using bio to read from swap.
  *	This code requires a bit more work than just using buffer heads
@@ -779,14 +656,14 @@ static int bio_write_page(pgoff_t page_off, void *page)
  *	in a file-alike way
  */
 
-static inline void release_swap_map_reader(struct swap_map_handle *handle)
+static void release_swap_reader(struct swap_map_handle *handle)
 {
 	if (handle->cur)
 		free_page((unsigned long)handle->cur);
 	handle->cur = NULL;
 }
 
-static inline int get_swap_map_reader(struct swap_map_handle *handle,
+static int get_swap_reader(struct swap_map_handle *handle,
                                       swp_entry_t start)
 {
 	int error;
@@ -798,149 +675,80 @@ static inline int get_swap_map_reader(struct swap_map_handle *handle,
 		return -ENOMEM;
 	error = bio_read_page(swp_offset(start), handle->cur);
 	if (error) {
-		release_swap_map_reader(handle);
+		release_swap_reader(handle);
 		return error;
 	}
 	handle->k = 0;
 	return 0;
 }
 
-static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf)
 {
 	unsigned long offset;
 	int error;
 
 	if (!handle->cur)
 		return -EINVAL;
-	offset = swp_offset(handle->cur->entries[handle->k]);
+	offset = handle->cur->entries[handle->k];
 	if (!offset)
-		return -EINVAL;
+		return -EFAULT;
 	error = bio_read_page(offset, buf);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
+	if (++handle->k >= MAP_PAGE_ENTRIES) {
 		handle->k = 0;
-		offset = swp_offset(handle->cur->next_swap);
+		offset = handle->cur->next_swap;
 		if (!offset)
-			release_swap_map_reader(handle);
+			release_swap_reader(handle);
 		else
 			error = bio_read_page(offset, handle->cur);
 	}
 	return error;
 }
 
-static int check_header(void)
-{
-	char *reason = NULL;
-
-	dump_info();
-	if (swsusp_info.version_code != LINUX_VERSION_CODE)
-		reason = "kernel version";
-	if (swsusp_info.num_physpages != num_physpages)
-		reason = "memory size";
-	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
-		reason = "system type";
-	if (strcmp(swsusp_info.uts.release,system_utsname.release))
-		reason = "kernel release";
-	if (strcmp(swsusp_info.uts.version,system_utsname.version))
-		reason = "version";
-	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
-		reason = "machine";
-	if (reason) {
-		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
-		return -EPERM;
-	}
-	return 0;
-}
-
 /**
- *	load_image_data - load the image data using the swap map handle
- *	@handle and store them using the page backup list @pblist
+ *	load_image - load the image using the swap map handle
+ *	@handle and the snapshot handle @snapshot
  *	(assume there are @nr_pages pages to load)
  */
 
-static int load_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int load_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
-	int error;
 	unsigned int m;
-	struct pbe *p;
+	int ret;
+	int error = 0;
 
-	if (!pblist)
-		return -EINVAL;
 	printk("Loading image data pages (%u pages) ...     ", nr_pages);
 	m = nr_pages / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, (void *)p->address);
-		if (error)
-			break;
-		p = p->next;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_read_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
+	if (!snapshot_image_loaded(snapshot))
+		error = -ENODATA;
 	return error;
 }
 
-/**
- *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- *	the PBEs in the list starting at @pbe
- */
-
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
-                                                struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		pbe->orig_address = buf[j];
-		pbe = pbe->next;
-	}
-	return pbe;
-}
-
-/**
- *	load_image_metadata - load the image metadata using the swap map
- *	handle @handle and put them into the PBEs in the list @pblist
- */
-
-static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
-{
-	struct pbe *p;
-	unsigned long *buf;
-	unsigned int n = 0;
-	int error = 0;
-
-	printk("Loading image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, buf);
-		if (error)
-			break;
-		p = unpack_orig_addresses(buf, p);
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages loaded)\n", n);
-	return error;
-}
-
-int swsusp_read(struct pbe **pblist_ptr)
+int swsusp_read(void)
 {
 	int error;
-	struct pbe *p, *pblist;
 	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
 	unsigned int nr_pages;
 
 	if (IS_ERR(resume_bdev)) {
@@ -948,38 +756,19 @@ int swsusp_read(struct pbe **pblist_ptr)
 		return PTR_ERR(resume_bdev);
 	}
 
-	error = get_swap_map_reader(&handle, swsusp_header.image);
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_write_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	error = get_swap_reader(&handle, swsusp_header.image);
 	if (!error)
-		error = swap_map_read_page(&handle, &swsusp_info);
-	if (!error)
-		error = check_header();
-	if (error)
-		return error;
-	nr_pages = swsusp_info.image_pages;
-	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
-	if (!p)
-		return -ENOMEM;
-	error = load_image_metadata(p, &handle);
+		error = swap_read_page(&handle, header);
 	if (!error) {
-		mark_unsafe_pages(p);
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
-		if (pblist)
-			copy_page_backup_list(pblist, p);
-		free_pagedir(p);
-		if (!pblist)
-			error = -ENOMEM;
-
-		/* Allocate memory for the image and read the data from swap */
-		if (!error)
-			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
-		if (!error) {
-			release_eaten_pages();
-			error = load_image_data(pblist, &handle, nr_pages);
-		}
-		if (!error)
-			*pblist_ptr = pblist;
+		nr_pages = header->image_pages;
+		error = load_image(&handle, &snapshot, nr_pages);
 	}
-	release_swap_map_reader(&handle);
+	release_swap_reader(&handle);
 
 	blkdev_put(resume_bdev);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 365ed6ff182d..4d11f9d84666 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
 
 struct swap_list_t swap_list = {-1, -1};
 
-struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
 
@@ -417,6 +417,59 @@ void free_swap_and_cache(swp_entry_t entry)
 	}
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Find the swap type that corresponds to given device (if any)
+ *
+ * This is needed for software suspend and is done in such a way that inode
+ * aliasing is allowed.
+ */
+int swap_type_of(dev_t device)
+{
+	int i;
+
+	if (!device)
+		return -EINVAL;
+	spin_lock(&swap_lock);
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct inode *inode;
+
+		if (!(swap_info[i].flags & SWP_WRITEOK))
+			continue;
+		inode = swap_info->swap_file->f_dentry->d_inode;
+		if (S_ISBLK(inode->i_mode) &&
+		    device == MKDEV(imajor(inode), iminor(inode))) {
+			spin_unlock(&swap_lock);
+			return i;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return -ENODEV;
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+	unsigned int n = 0;
+
+	if (type < nr_swapfiles) {
+		spin_lock(&swap_lock);
+		if (swap_info[type].flags & SWP_WRITEOK) {
+			n = swap_info[type].pages;
+			if (free)
+				n -= swap_info[type].inuse_pages;
+		}
+		spin_unlock(&swap_lock);
+	}
+	return n;
+}
+#endif
+
 /*
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
-- 
cgit v1.2.3


From 74c7e2efbe37378026f00ad9e7253796d7b2fc99 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 23 Mar 2006 03:00:01 -0800
Subject: [PATCH] kernel/power: move externs to header files

Move externs from C source files to header files.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pm.h   |  3 ++-
 kernel/power/disk.c  | 11 -----------
 kernel/power/power.h |  6 +++++-
 3 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 5be87ba3b7ac..6df2585c0169 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -188,6 +188,8 @@ extern void device_power_up(void);
 extern void device_resume(void);
 
 #ifdef CONFIG_PM
+extern suspend_disk_method_t pm_disk_mode;
+
 extern int device_suspend(pm_message_t state);
 
 #define device_set_wakeup_enable(dev,val) \
@@ -215,7 +217,6 @@ static inline int dpm_runtime_suspend(struct device * dev, pm_message_t state)
 
 static inline void dpm_runtime_resume(struct device * dev)
 {
-
 }
 
 #endif
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4eb464b71347..4bd68f482f2b 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
 #include "power.h"
 
 
-extern suspend_disk_method_t pm_disk_mode;
-
-extern int swsusp_shrink_memory(void);
-extern int swsusp_suspend(void);
-extern int swsusp_write(void);
-extern int swsusp_check(void);
-extern int swsusp_read(void);
-extern void swsusp_close(void);
-extern int swsusp_resume(void);
-
-
 static int noresume = 0;
 char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 089c84bed895..5d1abffbb9ce 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -48,7 +48,6 @@ extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
-extern void swsusp_free(void);
 
 struct snapshot_handle {
 	loff_t		offset;
@@ -91,6 +90,11 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
 extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
 extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
 
+extern int swsusp_check(void);
 extern int swsusp_shrink_memory(void);
+extern void swsusp_free(void);
 extern int swsusp_suspend(void);
 extern int swsusp_resume(void);
+extern int swsusp_read(void);
+extern int swsusp_write(void);
+extern void swsusp_close(void);
-- 
cgit v1.2.3


From ff4da2e262d2509fe1bacff70dd00934be569c66 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:07 -0800
Subject: [PATCH] swsusp: add check for suspension of X-controlled devices

It is unsafe to suspend devices if the hardware is controlled by X.  Add an
extra check to prevent this from happening.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/power/suspend.c | 5 ++++-
 drivers/char/vt.c            | 8 ++++++++
 include/linux/vt_kern.h      | 5 +++++
 3 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/suspend.c b/drivers/base/power/suspend.c
index 8660779fb288..bdb60663f2ef 100644
--- a/drivers/base/power/suspend.c
+++ b/drivers/base/power/suspend.c
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/vt_kern.h>
 #include <linux/device.h>
 #include "../base.h"
 #include "power.h"
@@ -62,7 +63,6 @@ int suspend_device(struct device * dev, pm_message_t state)
 	return error;
 }
 
-
 /**
  *	device_suspend - Save state and stop all devices in system.
  *	@state:		Power state to put each device in.
@@ -82,6 +82,9 @@ int device_suspend(pm_message_t state)
 {
 	int error = 0;
 
+	if (!is_console_suspend_safe())
+		return -EINVAL;
+
 	down(&dpm_sem);
 	down(&dpm_list_sem);
 	while (!list_empty(&dpm_active) && error == 0) {
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 0900d1dbee59..86b31b87eb85 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -3234,6 +3234,14 @@ void vcs_scr_writew(struct vc_data *vc, u16 val, u16 *org)
 	}
 }
 
+int is_console_suspend_safe(void)
+{
+	/* It is unsafe to suspend devices while X has control of the
+	 * hardware. Make sure we are running on a kernel-controlled console.
+	 */
+	return vc_cons[fg_console].d->vc_mode == KD_TEXT;
+}
+
 /*
  *	Visible symbols for modules
  */
diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h
index fab5aed8ca31..530ae3f4248c 100644
--- a/include/linux/vt_kern.h
+++ b/include/linux/vt_kern.h
@@ -73,6 +73,11 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc);
 int vt_waitactive(int vt);
 void change_console(struct vc_data *new_vc);
 void reset_vc(struct vc_data *vc);
+#ifdef CONFIG_VT
+int is_console_suspend_safe(void);
+#else
+static inline int is_console_suspend_safe(void) { return 1; }
+#endif
 
 /*
  * vc_screen.c shares this temporary buffer with the console write code so that
-- 
cgit v1.2.3


From d8733c2956968a01394a4d2a9e97a8b431a78776 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:00:11 -0800
Subject: [PATCH] ext3_readdir: use generic readahead

Linus points out that ext3_readdir's readahead only cuts in when
ext3_readdir() is operating at the very start of the directory.  So for large
directories we end up performing no readahead at all and we suck.

So take it all out and use the core VM's page_cache_readahead().  This means
that ext3 directory reads will use all of readahead's dynamic sizing goop.

Note that we're using the directory's filp->f_ra to hold the readahead state,
but readahead is actually being performed against the underlying blockdev's
address_space.  Fortunately the readahead code is all set up to handle this.

Tested with printk.  It works.  I was struggling to find a real workload which
actually cared.

(The patch also exports page_cache_readahead() to GPL modules)

Cc: "Stephen C. Tweedie" <sct@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/dir.c           | 52 +++++++++++++++++++++++--------------------------
 fs/ext3/inode.c         |  2 +-
 include/linux/ext3_fs.h |  9 ++++++---
 mm/readahead.c          |  1 +
 4 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 832867aef3dc..773459164bb2 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -95,11 +95,10 @@ static int ext3_readdir(struct file * filp,
 			 void * dirent, filldir_t filldir)
 {
 	int error = 0;
-	unsigned long offset, blk;
-	int i, num, stored;
-	struct buffer_head * bh, * tmp, * bha[16];
-	struct ext3_dir_entry_2 * de;
-	struct super_block * sb;
+	unsigned long offset;
+	int i, stored;
+	struct ext3_dir_entry_2 *de;
+	struct super_block *sb;
 	int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	int ret = 0;
@@ -124,12 +123,29 @@ static int ext3_readdir(struct file * filp,
 	}
 #endif
 	stored = 0;
-	bh = NULL;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
 	while (!error && !stored && filp->f_pos < inode->i_size) {
-		blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
-		bh = ext3_bread(NULL, inode, blk, 0, &err);
+		unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+		struct buffer_head map_bh;
+		struct buffer_head *bh = NULL;
+
+		map_bh.b_state = 0;
+		err = ext3_get_block_handle(NULL, inode, blk, &map_bh, 0, 0);
+		if (!err) {
+			page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
+				&filp->f_ra,
+				filp,
+				map_bh.b_blocknr >>
+					(PAGE_CACHE_SHIFT - inode->i_blkbits),
+				1);
+			bh = ext3_bread(NULL, inode, blk, 0, &err);
+		}
+
+		/*
+		 * We ignore I/O errors on directories so users have a chance
+		 * of recovering data when there's a bad sector
+		 */
 		if (!bh) {
 			ext3_error (sb, "ext3_readdir",
 				"directory #%lu contains a hole at offset %lu",
@@ -138,26 +154,6 @@ static int ext3_readdir(struct file * filp,
 			continue;
 		}
 
-		/*
-		 * Do the readahead
-		 */
-		if (!offset) {
-			for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
-			     i > 0; i--) {
-				tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
-				if (tmp && !buffer_uptodate(tmp) &&
-						!buffer_locked(tmp))
-					bha[num++] = tmp;
-				else
-					brelse (tmp);
-			}
-			if (num) {
-				ll_rw_block (READA, num, bha);
-				for (i = 0; i < num; i++)
-					brelse (bha[i]);
-			}
-		}
-
 revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 0384e539b88f..d59d5a667b0b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -671,7 +671,7 @@ err_out:
  * The BKL may not be held on entry here.  Be sure to take it early.
  */
 
-static int
+int
 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 		struct buffer_head *bh_result, int create, int extend_disksize)
 {
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index c0272d73ab20..e7239f2f97a1 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -772,9 +772,12 @@ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
 
 
 /* inode.c */
-extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
-extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
-extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_get_block_handle(handle_t *handle, struct inode *inode,
+	sector_t iblock, struct buffer_head *bh_result, int create,
+	int extend_disksize);
 
 extern void ext3_read_inode (struct inode *);
 extern int  ext3_write_inode (struct inode *, int);
diff --git a/mm/readahead.c b/mm/readahead.c
index 301b36c4a0ce..0f142a40984b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -555,6 +555,7 @@ recheck:
 out:
 	return ra->prev_page + 1;
 }
+EXPORT_SYMBOL_GPL(page_cache_readahead);
 
 /*
  * handle_ra_miss() is called when it is known that a page which should have
-- 
cgit v1.2.3


From 0c9e63fd38a2fb2181668a0cdd622a3c23cfd567 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 23 Mar 2006 03:00:12 -0800
Subject: [PATCH] Shrinks sizeof(files_struct) and better layout

1) Reduce the size of (struct fdtable) to exactly 64 bytes on 32bits
   platforms, lowering kmalloc() allocated space by 50%.

2) Reduce the size of (files_struct), using a special 32 bits (or
   64bits) embedded_fd_set, instead of a 1024 bits fd_set for the
   close_on_exec_init and open_fds_init fields.  This save some ram (248
   bytes per task) as most tasks dont open more than 32 files.  D-Cache
   footprint for such tasks is also reduced to the minimum.

3) Reduce size of allocated fdset.  Currently two full pages are
   allocated, that is 32768 bits on x86 for example, and way too much.  The
   minimum is now L1_CACHE_BYTES.

UP and SMP should benefit from this patch, because most tasks will touch
only one cache line when open()/close() stdin/stdout/stderr (0/1/2),
(next_fd, close_on_exec_init, open_fds_init, fd_array[0 ..  2] being in the
same cache line)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fcntl.c                |  9 ++++-----
 fs/file.c                 | 34 ++++++++++++++--------------------
 fs/open.c                 |  8 ++++----
 include/linux/file.h      | 28 ++++++++++++++++++++++++----
 include/linux/init_task.h | 10 +++++-----
 kernel/fork.c             |  8 ++++----
 6 files changed, 55 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index dc4a7007f4e7..03c789560fb8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -73,8 +73,8 @@ repeat:
 	 * orig_start..fdt->next_fd
 	 */
 	start = orig_start;
-	if (start < fdt->next_fd)
-		start = fdt->next_fd;
+	if (start < files->next_fd)
+		start = files->next_fd;
 
 	newfd = start;
 	if (start < fdt->max_fdset) {
@@ -102,9 +102,8 @@ repeat:
 	 * we reacquire the fdtable pointer and use it while holding
 	 * the lock, no one can free it during that time.
 	 */
-	fdt = files_fdtable(files);
-	if (start <= fdt->next_fd)
-		fdt->next_fd = newfd + 1;
+	if (start <= files->next_fd)
+		files->next_fd = newfd + 1;
 
 	error = newfd;
 	
diff --git a/fs/file.c b/fs/file.c
index cea7cbea11d0..bbc743314730 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -125,7 +125,8 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 		kmem_cache_free(files_cachep, fdt->free_files);
 		return;
 	}
-	if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
+	if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE &&
+		fdt->max_fds <= NR_OPEN_DEFAULT) {
 		/*
 		 * The fdtable was embedded
 		 */
@@ -155,8 +156,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 
 void free_fdtable(struct fdtable *fdt)
 {
-	if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
-					fdt->max_fds > NR_OPEN_DEFAULT)
+	if (fdt->free_files ||
+		fdt->max_fdset > EMBEDDED_FD_SET_SIZE ||
+		fdt->max_fds > NR_OPEN_DEFAULT)
 		call_rcu(&fdt->rcu, free_fdtable_rcu);
 }
 
@@ -199,7 +201,6 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
 		       (nfdt->max_fds - fdt->max_fds) *
 					sizeof(struct file *));
 	}
-	nfdt->next_fd = fdt->next_fd;
 }
 
 /*
@@ -220,11 +221,9 @@ fd_set * alloc_fdset(int num)
 
 void free_fdset(fd_set *array, int num)
 {
-	int size = num / 8;
-
-	if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */
+	if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */
 		return;
-	else if (size <= PAGE_SIZE)
+	else if (num <= 8 * PAGE_SIZE)
 		kfree(array);
 	else
 		vfree(array);
@@ -237,22 +236,17 @@ static struct fdtable *alloc_fdtable(int nr)
   	fd_set *new_openset = NULL, *new_execset = NULL;
 	struct file **new_fds;
 
-	fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
+	fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
 	if (!fdt)
   		goto out;
-	memset(fdt, 0, sizeof(*fdt));
 
-	nfds = __FD_SETSIZE;
+	nfds = 8 * L1_CACHE_BYTES;
   	/* Expand to the max in easy steps */
-  	do {
-		if (nfds < (PAGE_SIZE * 8))
-			nfds = PAGE_SIZE * 8;
-		else {
-			nfds = nfds * 2;
-			if (nfds > NR_OPEN)
-				nfds = NR_OPEN;
-		}
-	} while (nfds <= nr);
+  	while (nfds <= nr) {
+		nfds = nfds * 2;
+		if (nfds > NR_OPEN)
+			nfds = NR_OPEN;
+	}
 
   	new_openset = alloc_fdset(nfds);
   	new_execset = alloc_fdset(nfds);
diff --git a/fs/open.c b/fs/open.c
index 70e0230d8e77..1091dadd6c38 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -973,7 +973,7 @@ repeat:
 	fdt = files_fdtable(files);
  	fd = find_next_zero_bit(fdt->open_fds->fds_bits,
 				fdt->max_fdset,
-				fdt->next_fd);
+				files->next_fd);
 
 	/*
 	 * N.B. For clone tasks sharing a files structure, this test
@@ -998,7 +998,7 @@ repeat:
 
 	FD_SET(fd, fdt->open_fds);
 	FD_CLR(fd, fdt->close_on_exec);
-	fdt->next_fd = fd + 1;
+	files->next_fd = fd + 1;
 #if 1
 	/* Sanity check */
 	if (fdt->fd[fd] != NULL) {
@@ -1019,8 +1019,8 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = files_fdtable(files);
 	__FD_CLR(fd, fdt->open_fds);
-	if (fd < fdt->next_fd)
-		fdt->next_fd = fd;
+	if (fd < files->next_fd)
+		files->next_fd = fd;
 }
 
 void fastcall put_unused_fd(unsigned int fd)
diff --git a/include/linux/file.h b/include/linux/file.h
index 9901b850f2e4..9f7c2513866f 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -10,6 +10,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/types.h>
 
 /*
  * The default fd array needs to be at least BITS_PER_LONG,
@@ -17,10 +18,22 @@
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
 
+/*
+ * The embedded_fd_set is a small fd_set,
+ * suitable for most tasks (which open <= BITS_PER_LONG files)
+ */
+struct embedded_fd_set {
+	unsigned long fds_bits[1];
+};
+
+/*
+ * More than this number of fds: we use a separately allocated fd_set
+ */
+#define EMBEDDED_FD_SET_SIZE (BITS_PER_BYTE * sizeof(struct embedded_fd_set))
+
 struct fdtable {
 	unsigned int max_fds;
 	int max_fdset;
-	int next_fd;
 	struct file ** fd;      /* current fd array */
 	fd_set *close_on_exec;
 	fd_set *open_fds;
@@ -33,13 +46,20 @@ struct fdtable {
  * Open file table structure
  */
 struct files_struct {
+  /*
+   * read mostly part
+   */
 	atomic_t count;
 	struct fdtable *fdt;
 	struct fdtable fdtab;
-	fd_set close_on_exec_init;
-	fd_set open_fds_init;
+  /*
+   * written part on a separate cache line in SMP
+   */
+	spinlock_t file_lock ____cacheline_aligned_in_smp;
+	int next_fd;
+	struct embedded_fd_set close_on_exec_init;
+	struct embedded_fd_set open_fds_init;
 	struct file * fd_array[NR_OPEN_DEFAULT];
-	spinlock_t file_lock;     /* Protects concurrent writers.  Nests inside tsk->alloc_lock */
 };
 
 #define files_fdtable(files) (rcu_dereference((files)->fdt))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index dcfd2ecccb5d..92146f3b7423 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -7,11 +7,10 @@
 #define INIT_FDTABLE \
 {							\
 	.max_fds	= NR_OPEN_DEFAULT, 		\
-	.max_fdset	= __FD_SETSIZE, 		\
-	.next_fd	= 0, 				\
+	.max_fdset	= EMBEDDED_FD_SET_SIZE,		\
 	.fd		= &init_files.fd_array[0], 	\
-	.close_on_exec	= &init_files.close_on_exec_init, \
-	.open_fds	= &init_files.open_fds_init, 	\
+	.close_on_exec	= (fd_set *)&init_files.close_on_exec_init, \
+	.open_fds	= (fd_set *)&init_files.open_fds_init, 	\
 	.rcu		= RCU_HEAD_INIT, 		\
 	.free_files	= NULL,		 		\
 	.next		= NULL,		 		\
@@ -20,9 +19,10 @@
 #define INIT_FILES \
 { 							\
 	.count		= ATOMIC_INIT(1), 		\
-	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
 	.fdt		= &init_files.fdtab, 		\
 	.fdtab		= INIT_FDTABLE,			\
+	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
+	.next_fd	= 0, 				\
 	.close_on_exec_init = { { 0, } }, 		\
 	.open_fds_init	= { { 0, } }, 			\
 	.fd_array	= { NULL, } 			\
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd7b65ee418..c79ae0b19a49 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->next_fd = 0;
 	fdt = &newf->fdtab;
-	fdt->next_fd = 0;
 	fdt->max_fds = NR_OPEN_DEFAULT;
-	fdt->max_fdset = __FD_SETSIZE;
-	fdt->close_on_exec = &newf->close_on_exec_init;
-	fdt->open_fds = &newf->open_fds_init;
+	fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
+	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+	fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
 	INIT_RCU_HEAD(&fdt->rcu);
 	fdt->free_files = NULL;
-- 
cgit v1.2.3


From 6a2900b67652421b51fe25e4b86ecfec742b1f30 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Mar 2006 03:00:15 -0800
Subject: [PATCH] kill cdrom ->dev_ioctl method

Since early 2.4.x all cdrom drivers implement the block_device methods
themselves, so they can handle additional ioctls directly instead of going
through the cdrom layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/cdrom/cdrom.c   |    7 -
 drivers/cdrom/cdu31a.c  |    8 +-
 drivers/cdrom/cm206.c   |   44 +-
 drivers/cdrom/sbpcd.c   | 1890 +++++++++++++++++++++++------------------------
 drivers/cdrom/viocd.c   |    2 +-
 drivers/ide/ide-cd.c    |   99 +--
 drivers/scsi/sr.c       |   37 +-
 drivers/scsi/sr.h       |    1 -
 drivers/scsi/sr_ioctl.c |   19 -
 include/linux/cdrom.h   |    5 +-
 10 files changed, 1040 insertions(+), 1072 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index d6653fc03b92..a59876a0bfa1 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -407,7 +407,6 @@ int register_cdrom(struct cdrom_device_info *cdi)
 	ENSURE(get_mcn, CDC_MCN);
 	ENSURE(reset, CDC_RESET);
 	ENSURE(audio_ioctl, CDC_PLAY_AUDIO);
-	ENSURE(dev_ioctl, CDC_IOCTLS);
 	ENSURE(generic_packet, CDC_GENERIC_PACKET);
 	cdi->mc_flags = 0;
 	cdo->n_minors = 0;
@@ -2776,12 +2775,6 @@ int cdrom_ioctl(struct file * file, struct cdrom_device_info *cdi,
 		return cdrom_ioctl_audioctl(cdi, cmd);
 	}
 
-	/*
-	 * Finally, do the device specific ioctls
-	 */
-	if (CDROM_CAN(CDC_IOCTLS))
-		return cdi->ops->dev_ioctl(cdi, cmd, arg);
-
 	return -ENOSYS;
 }
 
diff --git a/drivers/cdrom/cdu31a.c b/drivers/cdrom/cdu31a.c
index 378e88d20757..72ffd64e8b1e 100644
--- a/drivers/cdrom/cdu31a.c
+++ b/drivers/cdrom/cdu31a.c
@@ -2668,7 +2668,7 @@ static int scd_audio_ioctl(struct cdrom_device_info *cdi,
 	return retval;
 }
 
-static int scd_dev_ioctl(struct cdrom_device_info *cdi,
+static int scd_read_audio(struct cdrom_device_info *cdi,
 			 unsigned int cmd, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
@@ -2894,11 +2894,10 @@ static struct cdrom_device_ops scd_dops = {
 	.get_mcn		= scd_get_mcn,
 	.reset			= scd_reset,
 	.audio_ioctl		= scd_audio_ioctl,
-	.dev_ioctl		= scd_dev_ioctl,
 	.capability		= CDC_OPEN_TRAY | CDC_CLOSE_TRAY | CDC_LOCK |
 				  CDC_SELECT_SPEED | CDC_MULTI_SESSION |
 				  CDC_MCN | CDC_MEDIA_CHANGED | CDC_PLAY_AUDIO |
-				  CDC_RESET | CDC_IOCTLS | CDC_DRIVE_STATUS,
+				  CDC_RESET | CDC_DRIVE_STATUS,
 	.n_minors		= 1,
 };
 
@@ -2936,6 +2935,9 @@ static int scd_block_ioctl(struct inode *inode, struct file *file,
 		case CDROMCLOSETRAY:
 			retval = scd_tray_move(&scd_info, 0);
 			break;
+		case CDROMREADAUDIO:
+			retval = scd_read_audio(&scd_info, CDROMREADAUDIO, arg);
+			break;
 		default:
 			retval = cdrom_ioctl(file, &scd_info, inode, cmd, arg);
 	}
diff --git a/drivers/cdrom/cm206.c b/drivers/cdrom/cm206.c
index ce127f7ec0f6..fad27a87ce35 100644
--- a/drivers/cdrom/cm206.c
+++ b/drivers/cdrom/cm206.c
@@ -1157,32 +1157,6 @@ static int cm206_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
 	}
 }
 
-/* Ioctl. These ioctls are specific to the cm206 driver. I have made
-   some driver statistics accessible through ioctl calls.
- */
-
-static int cm206_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
-		       unsigned long arg)
-{
-	switch (cmd) {
-#ifdef STATISTICS
-	case CM206CTL_GET_STAT:
-		if (arg >= NR_STATS)
-			return -EINVAL;
-		else
-			return cd->stats[arg];
-	case CM206CTL_GET_LAST_STAT:
-		if (arg >= NR_STATS)
-			return -EINVAL;
-		else
-			return cd->last_stat[arg];
-#endif
-	default:
-		debug(("Unknown ioctl call 0x%x\n", cmd));
-		return -EINVAL;
-	}
-}
-
 static int cm206_media_changed(struct cdrom_device_info *cdi, int disc_nr)
 {
 	if (cd != NULL) {
@@ -1321,11 +1295,10 @@ static struct cdrom_device_ops cm206_dops = {
 	.get_mcn		= cm206_get_upc,
 	.reset			= cm206_reset,
 	.audio_ioctl		= cm206_audio_ioctl,
-	.dev_ioctl		= cm206_ioctl,
 	.capability		= CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK |
 				  CDC_MULTI_SESSION | CDC_MEDIA_CHANGED |
 				  CDC_MCN | CDC_PLAY_AUDIO | CDC_SELECT_SPEED |
-				  CDC_IOCTLS | CDC_DRIVE_STATUS,
+				  CDC_DRIVE_STATUS,
 	.n_minors		= 1,
 };
 
@@ -1350,6 +1323,21 @@ static int cm206_block_release(struct inode *inode, struct file *file)
 static int cm206_block_ioctl(struct inode *inode, struct file *file,
 				unsigned cmd, unsigned long arg)
 {
+	switch (cmd) {
+#ifdef STATISTICS
+	case CM206CTL_GET_STAT:
+		if (arg >= NR_STATS)
+			return -EINVAL;
+		return cd->stats[arg];
+	case CM206CTL_GET_LAST_STAT:
+		if (arg >= NR_STATS)
+			return -EINVAL;
+		return cd->last_stat[arg];
+#endif
+	default:
+		break;
+	}
+
 	return cdrom_ioctl(file, &cm206_info, inode, cmd, arg);
 }
 
diff --git a/drivers/cdrom/sbpcd.c b/drivers/cdrom/sbpcd.c
index 466e9c2974bd..4760f515f591 100644
--- a/drivers/cdrom/sbpcd.c
+++ b/drivers/cdrom/sbpcd.c
@@ -4160,18 +4160,13 @@ static int sbpcd_get_last_session(struct cdrom_device_info *cdi, struct cdrom_mu
 	return  0;
 }
 
-/*==========================================================================*/
-/*==========================================================================*/
-/*
- * ioctl support
- */
-static int sbpcd_dev_ioctl(struct cdrom_device_info *cdi, u_int cmd,
-		      u_long arg)
+static int sbpcd_audio_ioctl(struct cdrom_device_info *cdi, u_int cmd,
+		       void * arg)
 {
 	struct sbpcd_drive *p = cdi->handle;
-	int i;
+	int i, st, j;
 	
-	msg(DBG_IO2,"ioctl(%s, 0x%08lX, 0x%08lX)\n", cdi->name, cmd, arg);
+	msg(DBG_IO2,"ioctl(%s, 0x%08lX, 0x%08p)\n", cdi->name, cmd, arg);
 	if (p->drv_id==-1) {
 		msg(DBG_INF, "ioctl: bad device: %s\n", cdi->name);
 		return (-ENXIO);             /* no such drive */
@@ -4183,484 +4178,163 @@ static int sbpcd_dev_ioctl(struct cdrom_device_info *cdi, u_int cmd,
 	msg(DBG_IO2,"ioctl: device %s, request %04X\n",cdi->name,cmd);
 	switch (cmd) 		/* Sun-compatible */
 	{
-	case DDIOCSDBG:		/* DDI Debug */
-		if (!capable(CAP_SYS_ADMIN)) RETURN_UP(-EPERM);
-		i=sbpcd_dbg_ioctl(arg,1);
-		RETURN_UP(i);
-	case CDROMRESET:      /* hard reset the drive */
-		msg(DBG_IOC,"ioctl: CDROMRESET entered.\n");
-		i=DriveReset();
-		current_drive->audio_state=0;
-		RETURN_UP(i);
 		
-	case CDROMREADMODE1:
-		msg(DBG_IOC,"ioctl: CDROMREADMODE1 requested.\n");
+	case CDROMPAUSE:     /* Pause the drive */
+		msg(DBG_IOC,"ioctl: CDROMPAUSE entered.\n");
+		/* pause the drive unit when it is currently in PLAY mode,         */
+		/* or reset the starting and ending locations when in PAUSED mode. */
+		/* If applicable, at the next stopping point it reaches            */
+		/* the drive will discontinue playing.                             */
+		switch (current_drive->audio_state)
+		{
+		case audio_playing:
+			if (famL_drive) i=cc_ReadSubQ();
+			else i=cc_Pause_Resume(1);
+			if (i<0) RETURN_UP(-EIO);
+			if (famL_drive) i=cc_Pause_Resume(1);
+			else i=cc_ReadSubQ();
+			if (i<0) RETURN_UP(-EIO);
+			current_drive->pos_audio_start=current_drive->SubQ_run_tot;
+			current_drive->audio_state=audio_pausing;
+			RETURN_UP(0);
+		case audio_pausing:
+			i=cc_Seek(current_drive->pos_audio_start,1);
+			if (i<0) RETURN_UP(-EIO);
+			RETURN_UP(0);
+		default:
+			RETURN_UP(-EINVAL);
+		}
+
+	case CDROMRESUME: /* resume paused audio play */
+		msg(DBG_IOC,"ioctl: CDROMRESUME entered.\n");
+		/* resume playing audio tracks when a previous PLAY AUDIO call has  */
+		/* been paused with a PAUSE command.                                */
+		/* It will resume playing from the location saved in SubQ_run_tot.  */
+		if (current_drive->audio_state!=audio_pausing) RETURN_UP(-EINVAL);
+		if (famL_drive)
+			i=cc_PlayAudio(current_drive->pos_audio_start,
+				       current_drive->pos_audio_end);
+		else i=cc_Pause_Resume(3);
+		if (i<0) RETURN_UP(-EIO);
+		current_drive->audio_state=audio_playing;
+		RETURN_UP(0);
+
+	case CDROMPLAYMSF:
+		msg(DBG_IOC,"ioctl: CDROMPLAYMSF entered.\n");
 #ifdef SAFE_MIXED
 		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
 #endif /* SAFE_MIXED */
-		cc_ModeSelect(CD_FRAMESIZE);
-		cc_ModeSense();
-		current_drive->mode=READ_M1;
+		if (current_drive->audio_state==audio_playing)
+		{
+			i=cc_Pause_Resume(1);
+			if (i<0) RETURN_UP(-EIO);
+			i=cc_ReadSubQ();
+			if (i<0) RETURN_UP(-EIO);
+			current_drive->pos_audio_start=current_drive->SubQ_run_tot;
+			i=cc_Seek(current_drive->pos_audio_start,1);
+		}
+		memcpy(&msf, (void *) arg, sizeof(struct cdrom_msf));
+		/* values come as msf-bin */
+		current_drive->pos_audio_start = (msf.cdmsf_min0<<16) |
+                        (msf.cdmsf_sec0<<8) |
+				msf.cdmsf_frame0;
+		current_drive->pos_audio_end = (msf.cdmsf_min1<<16) |
+			(msf.cdmsf_sec1<<8) |
+				msf.cdmsf_frame1;
+		msg(DBG_IOX,"ioctl: CDROMPLAYMSF %08X %08X\n",
+		    current_drive->pos_audio_start,current_drive->pos_audio_end);
+		i=cc_PlayAudio(current_drive->pos_audio_start,current_drive->pos_audio_end);
+		if (i<0)
+		{
+			msg(DBG_INF,"ioctl: cc_PlayAudio returns %d\n",i);
+			DriveReset();
+			current_drive->audio_state=0;
+			RETURN_UP(-EIO);
+		}
+		current_drive->audio_state=audio_playing;
 		RETURN_UP(0);
 		
-	case CDROMREADMODE2: /* not usable at the moment */
-		msg(DBG_IOC,"ioctl: CDROMREADMODE2 requested.\n");
+	case CDROMPLAYTRKIND: /* Play a track.  This currently ignores index. */
+		msg(DBG_IOC,"ioctl: CDROMPLAYTRKIND entered.\n");
 #ifdef SAFE_MIXED
 		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
 #endif /* SAFE_MIXED */
-		cc_ModeSelect(CD_FRAMESIZE_RAW1);
-		cc_ModeSense();
-		current_drive->mode=READ_M2;
+		if (current_drive->audio_state==audio_playing)
+		{
+			msg(DBG_IOX,"CDROMPLAYTRKIND: already audio_playing.\n");
+#if 1
+			RETURN_UP(0); /* just let us play on */
+#else
+			RETURN_UP(-EINVAL); /* play on, but say "error" */
+#endif
+		}
+		memcpy(&ti,(void *) arg,sizeof(struct cdrom_ti));
+		msg(DBG_IOX,"ioctl: trk0: %d, ind0: %d, trk1:%d, ind1:%d\n",
+		    ti.cdti_trk0,ti.cdti_ind0,ti.cdti_trk1,ti.cdti_ind1);
+		if (ti.cdti_trk0<current_drive->n_first_track) RETURN_UP(-EINVAL);
+		if (ti.cdti_trk0>current_drive->n_last_track) RETURN_UP(-EINVAL);
+		if (ti.cdti_trk1<ti.cdti_trk0) ti.cdti_trk1=ti.cdti_trk0;
+		if (ti.cdti_trk1>current_drive->n_last_track) ti.cdti_trk1=current_drive->n_last_track;
+		current_drive->pos_audio_start=current_drive->TocBuffer[ti.cdti_trk0].address;
+		current_drive->pos_audio_end=current_drive->TocBuffer[ti.cdti_trk1+1].address;
+		i=cc_PlayAudio(current_drive->pos_audio_start,current_drive->pos_audio_end);
+		if (i<0)
+		{
+			msg(DBG_INF,"ioctl: cc_PlayAudio returns %d\n",i);
+			DriveReset();
+			current_drive->audio_state=0;
+			RETURN_UP(-EIO);
+		}
+		current_drive->audio_state=audio_playing;
 		RETURN_UP(0);
 		
-	case CDROMAUDIOBUFSIZ: /* configure the audio buffer size */
-		msg(DBG_IOC,"ioctl: CDROMAUDIOBUFSIZ entered.\n");
-		if (current_drive->sbp_audsiz>0)
-			vfree(current_drive->aud_buf);
-		current_drive->aud_buf=NULL;
-		current_drive->sbp_audsiz=arg;
+	case CDROMREADTOCHDR:        /* Read the table of contents header */
+		msg(DBG_IOC,"ioctl: CDROMREADTOCHDR entered.\n");
+		tochdr.cdth_trk0=current_drive->n_first_track;
+		tochdr.cdth_trk1=current_drive->n_last_track;
+		memcpy((void *) arg, &tochdr, sizeof(struct cdrom_tochdr));
+		RETURN_UP(0);
 		
-		if (current_drive->sbp_audsiz>16)
-		{
-			current_drive->sbp_audsiz = 0;
-			RETURN_UP(current_drive->sbp_audsiz);
-		}
-	
-		if (current_drive->sbp_audsiz>0)
+	case CDROMREADTOCENTRY:      /* Read an entry in the table of contents */
+		msg(DBG_IOC,"ioctl: CDROMREADTOCENTRY entered.\n");
+		memcpy(&tocentry, (void *) arg, sizeof(struct cdrom_tocentry));
+		i=tocentry.cdte_track;
+		if (i==CDROM_LEADOUT) i=current_drive->n_last_track+1;
+		else if (i<current_drive->n_first_track||i>current_drive->n_last_track)
+                  RETURN_UP(-EINVAL);
+		tocentry.cdte_adr=current_drive->TocBuffer[i].ctl_adr&0x0F;
+		tocentry.cdte_ctrl=(current_drive->TocBuffer[i].ctl_adr>>4)&0x0F;
+		tocentry.cdte_datamode=current_drive->TocBuffer[i].format;
+		if (tocentry.cdte_format==CDROM_MSF) /* MSF-bin required */
 		{
-			current_drive->aud_buf=(u_char *) vmalloc(current_drive->sbp_audsiz*CD_FRAMESIZE_RAW);
-			if (current_drive->aud_buf==NULL)
-			{
-				msg(DBG_INF,"audio buffer (%d frames) not available.\n",current_drive->sbp_audsiz);
-				current_drive->sbp_audsiz=0;
-			}
-			else msg(DBG_INF,"audio buffer size: %d frames.\n",current_drive->sbp_audsiz);
+			tocentry.cdte_addr.msf.minute=(current_drive->TocBuffer[i].address>>16)&0x00FF;
+			tocentry.cdte_addr.msf.second=(current_drive->TocBuffer[i].address>>8)&0x00FF;
+			tocentry.cdte_addr.msf.frame=current_drive->TocBuffer[i].address&0x00FF;
 		}
-		RETURN_UP(current_drive->sbp_audsiz);
-
-	case CDROMREADAUDIO:
-	{ /* start of CDROMREADAUDIO */
-		int i=0, j=0, frame, block=0;
-		u_int try=0;
-		u_long timeout;
-		u_char *p;
-		u_int data_tries = 0;
-		u_int data_waits = 0;
-		u_int data_retrying = 0;
-		int status_tries;
-		int error_flag;
+		else if (tocentry.cdte_format==CDROM_LBA) /* blk required */
+			tocentry.cdte_addr.lba=msf2blk(current_drive->TocBuffer[i].address);
+		else RETURN_UP(-EINVAL);
+		memcpy((void *) arg, &tocentry, sizeof(struct cdrom_tocentry));
+		RETURN_UP(0);
 		
-		msg(DBG_IOC,"ioctl: CDROMREADAUDIO entered.\n");
-		if (fam0_drive) RETURN_UP(-EINVAL);
-		if (famL_drive) RETURN_UP(-EINVAL);
-		if (famV_drive) RETURN_UP(-EINVAL);
-		if (famT_drive) RETURN_UP(-EINVAL);
+	case CDROMSTOP:      /* Spin down the drive */
+		msg(DBG_IOC,"ioctl: CDROMSTOP entered.\n");
 #ifdef SAFE_MIXED
 		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
 #endif /* SAFE_MIXED */ 
-		if (current_drive->aud_buf==NULL) RETURN_UP(-EINVAL);
-		if (copy_from_user(&read_audio, (void __user *)arg,
-				   sizeof(struct cdrom_read_audio)))
-			RETURN_UP(-EFAULT);
-		if (read_audio.nframes < 0 || read_audio.nframes>current_drive->sbp_audsiz) RETURN_UP(-EINVAL);
-		if (!access_ok(VERIFY_WRITE, read_audio.buf,
-			      read_audio.nframes*CD_FRAMESIZE_RAW))
-                	RETURN_UP(-EFAULT);
-		
-		if (read_audio.addr_format==CDROM_MSF) /* MSF-bin specification of where to start */
-			block=msf2lba(&read_audio.addr.msf.minute);
-		else if (read_audio.addr_format==CDROM_LBA) /* lba specification of where to start */
-			block=read_audio.addr.lba;
-		else RETURN_UP(-EINVAL);
-#if 000
-		i=cc_SetSpeed(speed_150,0,0);
-		if (i) msg(DBG_AUD,"read_audio: SetSpeed error %d\n", i);
-#endif
-		msg(DBG_AUD,"read_audio: lba: %d, msf: %06X\n",
-		    block, blk2msf(block));
-		msg(DBG_AUD,"read_audio: before cc_ReadStatus.\n");
-#if OLD_BUSY
-		while (busy_data) sbp_sleep(HZ/10); /* wait a bit */
-		busy_audio=1;
-#endif /* OLD_BUSY */ 
-		error_flag=0;
-		for (data_tries=5; data_tries>0; data_tries--)
-		{
-			msg(DBG_AUD,"data_tries=%d ...\n", data_tries);
-			current_drive->mode=READ_AU;
-			cc_ModeSelect(CD_FRAMESIZE_RAW);
-			cc_ModeSense();
-			for (status_tries=3; status_tries > 0; status_tries--)
-			{
-				flags_cmd_out |= f_respo3;
-				cc_ReadStatus();
-				if (sbp_status() != 0) break;
-				if (st_check) cc_ReadError();
-				sbp_sleep(1);    /* wait a bit, try again */
-			}
-			if (status_tries == 0)
-			{
-				msg(DBG_AUD,"read_audio: sbp_status: failed after 3 tries in line %d.\n", __LINE__);
-				continue;
-			}
-			msg(DBG_AUD,"read_audio: sbp_status: ok.\n");
-			
-			flags_cmd_out = f_putcmd | f_respo2 | f_ResponseStatus | f_obey_p_check;
-			if (fam0L_drive)
-			{
-				flags_cmd_out |= f_lopsta | f_getsta | f_bit1;
-				cmd_type=READ_M2;
-				drvcmd[0]=CMD0_READ_XA; /* "read XA frames", old drives */
-				drvcmd[1]=(block>>16)&0x000000ff;
-				drvcmd[2]=(block>>8)&0x000000ff;
-				drvcmd[3]=block&0x000000ff;
-				drvcmd[4]=0;
-				drvcmd[5]=read_audio.nframes; /* # of frames */
-				drvcmd[6]=0;
-			}
-			else if (fam1_drive)
-			{
-				drvcmd[0]=CMD1_READ; /* "read frames", new drives */
-				lba2msf(block,&drvcmd[1]); /* msf-bin format required */
-				drvcmd[4]=0;
-				drvcmd[5]=0;
-				drvcmd[6]=read_audio.nframes; /* # of frames */
-			}
-			else if (fam2_drive)
-			{
-				drvcmd[0]=CMD2_READ_XA2;
-				lba2msf(block,&drvcmd[1]); /* msf-bin format required */
-				drvcmd[4]=0;
-				drvcmd[5]=read_audio.nframes; /* # of frames */
-				drvcmd[6]=0x11; /* raw mode */
-			}
-			else if (famT_drive) /* CD-55A: not tested yet */
-			{
-			}
-			msg(DBG_AUD,"read_audio: before giving \"read\" command.\n");
-			flags_cmd_out=f_putcmd;
-			response_count=0;
-			i=cmd_out();
-			if (i<0) msg(DBG_INF,"error giving READ AUDIO command: %0d\n", i);
-			sbp_sleep(0);
-			msg(DBG_AUD,"read_audio: after giving \"read\" command.\n");
-			for (frame=1;frame<2 && !error_flag; frame++)
-			{
-				try=maxtim_data;
-				for (timeout=jiffies+9*HZ; ; )
-				{
-					for ( ; try!=0;try--)
-					{
-						j=inb(CDi_status);
-						if (!(j&s_not_data_ready)) break;
-						if (!(j&s_not_result_ready)) break;
-						if (fam0L_drive) if (j&s_attention) break;
-					}
-					if (try != 0 || time_after_eq(jiffies, timeout)) break;
-					if (data_retrying == 0) data_waits++;
-					data_retrying = 1;
-					sbp_sleep(1);
-					try = 1;
-				}
-				if (try==0)
-				{
-					msg(DBG_INF,"read_audio: sbp_data: CDi_status timeout.\n");
-					error_flag++;
-					break;
-				}
-				msg(DBG_AUD,"read_audio: sbp_data: CDi_status ok.\n");
-				if (j&s_not_data_ready)
-				{
-					msg(DBG_INF, "read_audio: sbp_data: DATA_READY timeout.\n");
-					error_flag++;
-					break;
-				}
-				msg(DBG_AUD,"read_audio: before reading data.\n");
-				error_flag=0;
-				p = current_drive->aud_buf;
-				if (sbpro_type==1) OUT(CDo_sel_i_d,1);
-				if (do_16bit)
-				{
-					u_short *p2 = (u_short *) p;
-
-					for (; (u_char *) p2 < current_drive->aud_buf + read_audio.nframes*CD_FRAMESIZE_RAW;)
-				  	{
-						if ((inb_p(CDi_status)&s_not_data_ready)) continue;
-
-						/* get one sample */
-						*p2++ = inw_p(CDi_data);
-						*p2++ = inw_p(CDi_data);
-					}
-				} else {
-					for (; p < current_drive->aud_buf + read_audio.nframes*CD_FRAMESIZE_RAW;)
-				  	{
-						if ((inb_p(CDi_status)&s_not_data_ready)) continue;
-
-						/* get one sample */
-						*p++ = inb_p(CDi_data);
-						*p++ = inb_p(CDi_data);
-						*p++ = inb_p(CDi_data);
-						*p++ = inb_p(CDi_data);
-					}
-				}
-				if (sbpro_type==1) OUT(CDo_sel_i_d,0);
-				data_retrying = 0;
-			}
-			msg(DBG_AUD,"read_audio: after reading data.\n");
-			if (error_flag)    /* must have been spurious D_RDY or (ATTN&&!D_RDY) */
-			{
-				msg(DBG_AUD,"read_audio: read aborted by drive\n");
-#if 0000
-				i=cc_DriveReset();                /* ugly fix to prevent a hang */
-#else
-				i=cc_ReadError();
-#endif
-				continue;
-			}
-			if (fam0L_drive)
-			{
-				i=maxtim_data;
-				for (timeout=jiffies+9*HZ; time_before(jiffies, timeout); timeout--)
-				{
-					for ( ;i!=0;i--)
-					{
-						j=inb(CDi_status);
-						if (!(j&s_not_data_ready)) break;
-						if (!(j&s_not_result_ready)) break;
-						if (j&s_attention) break;
-					}
-					if (i != 0 || time_after_eq(jiffies, timeout)) break;
-					sbp_sleep(0);
-					i = 1;
-				}
-				if (i==0) msg(DBG_AUD,"read_audio: STATUS TIMEOUT AFTER READ");
-				if (!(j&s_attention))
-				{
-					msg(DBG_AUD,"read_audio: sbp_data: timeout waiting DRV_ATTN - retrying\n");
-					i=cc_DriveReset();  /* ugly fix to prevent a hang */
-					continue;
-				}
-			}
-			do
-			{
-				if (fam0L_drive) cc_ReadStatus();
-				i=ResponseStatus();  /* builds status_bits, returns orig. status (old) or faked p_success (new) */
-				if (i<0) { msg(DBG_AUD,
-					       "read_audio: cc_ReadStatus error after read: %02X\n",
-					       current_drive->status_bits);
-					   continue; /* FIXME */
-				   }
-			}
-			while ((fam0L_drive)&&(!st_check)&&(!(i&p_success)));
-			if (st_check)
-			{
-				i=cc_ReadError();
-				msg(DBG_AUD,"read_audio: cc_ReadError was necessary after read: %02X\n",i);
-				continue;
-			}
-			if (copy_to_user(read_audio.buf,
-					 current_drive->aud_buf,
-					 read_audio.nframes * CD_FRAMESIZE_RAW))
-				RETURN_UP(-EFAULT);
-			msg(DBG_AUD,"read_audio: copy_to_user done.\n");
-			break;
-		}
-		cc_ModeSelect(CD_FRAMESIZE);
-		cc_ModeSense();
-		current_drive->mode=READ_M1;
-#if OLD_BUSY
-		busy_audio=0;
-#endif /* OLD_BUSY */ 
-		if (data_tries == 0)
-		{
-			msg(DBG_AUD,"read_audio: failed after 5 tries in line %d.\n", __LINE__);
-			RETURN_UP(-EIO);
-		}
-		msg(DBG_AUD,"read_audio: successful return.\n");
-		RETURN_UP(0);
-	} /* end of CDROMREADAUDIO */
-		
-	default:
-		msg(DBG_IOC,"ioctl: unknown function request %04X\n", cmd);
-		RETURN_UP(-EINVAL);
-	} /* end switch(cmd) */
-}
-
-static int sbpcd_audio_ioctl(struct cdrom_device_info *cdi, u_int cmd,
-		       void * arg)
-{
-	struct sbpcd_drive *p = cdi->handle;
-	int i, st, j;
-	
-	msg(DBG_IO2,"ioctl(%s, 0x%08lX, 0x%08p)\n", cdi->name, cmd, arg);
-	if (p->drv_id==-1) {
-		msg(DBG_INF, "ioctl: bad device: %s\n", cdi->name);
-		return (-ENXIO);             /* no such drive */
-	}
-	down(&ioctl_read_sem);
-	if (p != current_drive)
-		switch_drive(p);
-	
-	msg(DBG_IO2,"ioctl: device %s, request %04X\n",cdi->name,cmd);
-	switch (cmd) 		/* Sun-compatible */
-	{
-		
-	case CDROMPAUSE:     /* Pause the drive */
-		msg(DBG_IOC,"ioctl: CDROMPAUSE entered.\n");
-		/* pause the drive unit when it is currently in PLAY mode,         */
-		/* or reset the starting and ending locations when in PAUSED mode. */
-		/* If applicable, at the next stopping point it reaches            */
-		/* the drive will discontinue playing.                             */
-		switch (current_drive->audio_state)
-		{
-		case audio_playing:
-			if (famL_drive) i=cc_ReadSubQ();
-			else i=cc_Pause_Resume(1);
-			if (i<0) RETURN_UP(-EIO);
-			if (famL_drive) i=cc_Pause_Resume(1);
-			else i=cc_ReadSubQ();
-			if (i<0) RETURN_UP(-EIO);
-			current_drive->pos_audio_start=current_drive->SubQ_run_tot;
-			current_drive->audio_state=audio_pausing;
-			RETURN_UP(0);
-		case audio_pausing:
-			i=cc_Seek(current_drive->pos_audio_start,1);
-			if (i<0) RETURN_UP(-EIO);
-			RETURN_UP(0);
-		default:
-			RETURN_UP(-EINVAL);
-		}
-		
-	case CDROMRESUME: /* resume paused audio play */
-		msg(DBG_IOC,"ioctl: CDROMRESUME entered.\n");
-		/* resume playing audio tracks when a previous PLAY AUDIO call has  */
-		/* been paused with a PAUSE command.                                */
-		/* It will resume playing from the location saved in SubQ_run_tot.  */
-		if (current_drive->audio_state!=audio_pausing) RETURN_UP(-EINVAL);
-		if (famL_drive)
-			i=cc_PlayAudio(current_drive->pos_audio_start,
-				       current_drive->pos_audio_end);
-		else i=cc_Pause_Resume(3);
-		if (i<0) RETURN_UP(-EIO);
-		current_drive->audio_state=audio_playing;
-		RETURN_UP(0);
-		
-	case CDROMPLAYMSF:
-		msg(DBG_IOC,"ioctl: CDROMPLAYMSF entered.\n");
-#ifdef SAFE_MIXED
-		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
-#endif /* SAFE_MIXED */ 
-		if (current_drive->audio_state==audio_playing)
-		{
-			i=cc_Pause_Resume(1);
-			if (i<0) RETURN_UP(-EIO);
-			i=cc_ReadSubQ();
-			if (i<0) RETURN_UP(-EIO);
-			current_drive->pos_audio_start=current_drive->SubQ_run_tot;
-			i=cc_Seek(current_drive->pos_audio_start,1);
-		}
-		memcpy(&msf, (void *) arg, sizeof(struct cdrom_msf));
-		/* values come as msf-bin */
-		current_drive->pos_audio_start = (msf.cdmsf_min0<<16) |
-                        (msf.cdmsf_sec0<<8) |
-				msf.cdmsf_frame0;
-		current_drive->pos_audio_end = (msf.cdmsf_min1<<16) |
-			(msf.cdmsf_sec1<<8) |
-				msf.cdmsf_frame1;
-		msg(DBG_IOX,"ioctl: CDROMPLAYMSF %08X %08X\n",
-		    current_drive->pos_audio_start,current_drive->pos_audio_end);
-		i=cc_PlayAudio(current_drive->pos_audio_start,current_drive->pos_audio_end);
-		if (i<0)
-		{
-			msg(DBG_INF,"ioctl: cc_PlayAudio returns %d\n",i);
-			DriveReset();
-			current_drive->audio_state=0;
-			RETURN_UP(-EIO);
-		}
-		current_drive->audio_state=audio_playing;
-		RETURN_UP(0);
-		
-	case CDROMPLAYTRKIND: /* Play a track.  This currently ignores index. */
-		msg(DBG_IOC,"ioctl: CDROMPLAYTRKIND entered.\n");
-#ifdef SAFE_MIXED
-		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
-#endif /* SAFE_MIXED */ 
-		if (current_drive->audio_state==audio_playing)
-		{
-			msg(DBG_IOX,"CDROMPLAYTRKIND: already audio_playing.\n");
-#if 1
-			RETURN_UP(0); /* just let us play on */
-#else
-			RETURN_UP(-EINVAL); /* play on, but say "error" */
-#endif
-		}
-		memcpy(&ti,(void *) arg,sizeof(struct cdrom_ti));
-		msg(DBG_IOX,"ioctl: trk0: %d, ind0: %d, trk1:%d, ind1:%d\n",
-		    ti.cdti_trk0,ti.cdti_ind0,ti.cdti_trk1,ti.cdti_ind1);
-		if (ti.cdti_trk0<current_drive->n_first_track) RETURN_UP(-EINVAL);
-		if (ti.cdti_trk0>current_drive->n_last_track) RETURN_UP(-EINVAL);
-		if (ti.cdti_trk1<ti.cdti_trk0) ti.cdti_trk1=ti.cdti_trk0;
-		if (ti.cdti_trk1>current_drive->n_last_track) ti.cdti_trk1=current_drive->n_last_track;
-		current_drive->pos_audio_start=current_drive->TocBuffer[ti.cdti_trk0].address;
-		current_drive->pos_audio_end=current_drive->TocBuffer[ti.cdti_trk1+1].address;
-		i=cc_PlayAudio(current_drive->pos_audio_start,current_drive->pos_audio_end);
-		if (i<0)
-		{
-			msg(DBG_INF,"ioctl: cc_PlayAudio returns %d\n",i);
-			DriveReset();
-			current_drive->audio_state=0;
-			RETURN_UP(-EIO);
-		}
-		current_drive->audio_state=audio_playing;
-		RETURN_UP(0);
-		
-	case CDROMREADTOCHDR:        /* Read the table of contents header */
-		msg(DBG_IOC,"ioctl: CDROMREADTOCHDR entered.\n");
-		tochdr.cdth_trk0=current_drive->n_first_track;
-		tochdr.cdth_trk1=current_drive->n_last_track;
-		memcpy((void *) arg, &tochdr, sizeof(struct cdrom_tochdr));
-		RETURN_UP(0);
-		
-	case CDROMREADTOCENTRY:      /* Read an entry in the table of contents */
-		msg(DBG_IOC,"ioctl: CDROMREADTOCENTRY entered.\n");
-		memcpy(&tocentry, (void *) arg, sizeof(struct cdrom_tocentry));
-		i=tocentry.cdte_track;
-		if (i==CDROM_LEADOUT) i=current_drive->n_last_track+1;
-		else if (i<current_drive->n_first_track||i>current_drive->n_last_track)
-                  RETURN_UP(-EINVAL);
-		tocentry.cdte_adr=current_drive->TocBuffer[i].ctl_adr&0x0F;
-		tocentry.cdte_ctrl=(current_drive->TocBuffer[i].ctl_adr>>4)&0x0F;
-		tocentry.cdte_datamode=current_drive->TocBuffer[i].format;
-		if (tocentry.cdte_format==CDROM_MSF) /* MSF-bin required */
-		{
-			tocentry.cdte_addr.msf.minute=(current_drive->TocBuffer[i].address>>16)&0x00FF;
-			tocentry.cdte_addr.msf.second=(current_drive->TocBuffer[i].address>>8)&0x00FF;
-			tocentry.cdte_addr.msf.frame=current_drive->TocBuffer[i].address&0x00FF;
-		}
-		else if (tocentry.cdte_format==CDROM_LBA) /* blk required */
-			tocentry.cdte_addr.lba=msf2blk(current_drive->TocBuffer[i].address);
-		else RETURN_UP(-EINVAL);
-		memcpy((void *) arg, &tocentry, sizeof(struct cdrom_tocentry));
-		RETURN_UP(0);
-		
-	case CDROMSTOP:      /* Spin down the drive */
-		msg(DBG_IOC,"ioctl: CDROMSTOP entered.\n");
-#ifdef SAFE_MIXED
-		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
-#endif /* SAFE_MIXED */ 
-		i=cc_Pause_Resume(1);
-		current_drive->audio_state=0;
-#if 0
-		cc_DriveReset();
+		i=cc_Pause_Resume(1);
+		current_drive->audio_state=0;
+#if 0
+		cc_DriveReset();
 #endif
 		RETURN_UP(i);
-		
+
 	case CDROMSTART:  /* Spin up the drive */
 		msg(DBG_IOC,"ioctl: CDROMSTART entered.\n");
 		cc_SpinUp();
 		current_drive->audio_state=0;
 		RETURN_UP(0);
-		
+
 	case CDROMVOLCTRL:   /* Volume control */
 		msg(DBG_IOC,"ioctl: CDROMVOLCTRL entered.\n");
 		memcpy(&volctrl,(char *) arg,sizeof(volctrl));
@@ -4670,7 +4344,7 @@ static int sbpcd_audio_ioctl(struct cdrom_device_info *cdi, u_int cmd,
 		current_drive->vol_ctrl1=volctrl.channel1;
 		i=cc_SetVolume();
 		RETURN_UP(0);
-		
+
 	case CDROMVOLREAD:   /* read Volume settings from drive */
 		msg(DBG_IOC,"ioctl: CDROMVOLREAD entered.\n");
 		st=cc_GetVolume();
@@ -4694,7 +4368,7 @@ static int sbpcd_audio_ioctl(struct cdrom_device_info *cdi, u_int cmd,
 		if (i<0) {
 			j=cc_ReadError(); /* clear out error status from drive */
 			current_drive->audio_state=CDROM_AUDIO_NO_STATUS;
-			/* get and set the disk state here, 
+			/* get and set the disk state here,
 			probably not the right place, but who cares!
 			It makes it work properly! --AJK */
 			if (current_drive->CD_changed==0xFF) {
@@ -4715,8 +4389,8 @@ static int sbpcd_audio_ioctl(struct cdrom_device_info *cdi, u_int cmd,
 			}
 		}
 		memcpy(&SC, (void *) arg, sizeof(struct cdrom_subchnl));
-		/* 
-			This virtual crap is very bogus! 
+		/*
+			This virtual crap is very bogus!
 			It doesn't detect when the cd is done playing audio!
 			Lets do this right with proper hardware register reading!
 		*/
@@ -4775,7 +4449,7 @@ static int sbpcd_audio_ioctl(struct cdrom_device_info *cdi, u_int cmd,
 		    SC.cdsc_trk,SC.cdsc_ind,
 		    SC.cdsc_absaddr,SC.cdsc_reladdr);
 		RETURN_UP(0);
-		
+
 	default:
 		msg(DBG_IOC,"ioctl: unknown function request %04X\n", cmd);
 		RETURN_UP(-EINVAL);
@@ -4788,7 +4462,7 @@ static int sbpcd_audio_ioctl(struct cdrom_device_info *cdi, u_int cmd,
 static void sbp_transfer(struct request *req)
 {
 	long offs;
-	
+
 	while ( (req->nr_sectors > 0) &&
 	       (req->sector/4 >= current_drive->sbp_first_frame) &&
 	       (req->sector/4 <= current_drive->sbp_last_frame) )
@@ -4807,11 +4481,11 @@ static void sbp_transfer(struct request *req)
  *
  *  This is a kludge so we don't need to modify end_request.
  *  We put the req we take out after INIT_REQUEST in the requests list,
- *  so that end_request will discard it. 
+ *  so that end_request will discard it.
  *
  *  The bug could be present in other block devices, perhaps we
  *  should modify INIT_REQUEST and end_request instead, and
- *  change every block device.. 
+ *  change every block device..
  *
  *  Could be a race here?? Could e.g. a timer interrupt schedule() us?
  *  If so, we should copy end_request here, and do it right.. (or
@@ -4831,546 +4505,865 @@ static void sbp_transfer(struct request *req)
 
 /*==========================================================================*/
 /*
- *  I/O request routine, called from Linux kernel.
+ *  I/O request routine, called from Linux kernel.
+ */
+static void do_sbpcd_request(request_queue_t * q)
+{
+	u_int block;
+	u_int nsect;
+	int status_tries, data_tries;
+	struct request *req;
+	struct sbpcd_drive *p;
+#ifdef DEBUG_GTL
+	static int xx_nr=0;
+	int xnr;
+#endif
+
+ request_loop:
+#ifdef DEBUG_GTL
+	xnr=++xx_nr;
+
+	req = elv_next_request(q);
+
+	if (!req)
+	{
+		printk( "do_sbpcd_request[%di](NULL), Pid:%d, Time:%li\n",
+			xnr, current->pid, jiffies);
+		printk( "do_sbpcd_request[%do](NULL) end 0 (null), Time:%li\n",
+			xnr, jiffies);
+		return;
+	}
+
+	printk(" do_sbpcd_request[%di](%p:%ld+%ld), Pid:%d, Time:%li\n",
+		xnr, req, req->sector, req->nr_sectors, current->pid, jiffies);
+#endif
+
+	req = elv_next_request(q);	/* take out our request so no other */
+	if (!req)
+		return;
+
+	if (req -> sector == -1)
+		end_request(req, 0);
+	spin_unlock_irq(q->queue_lock);
+
+	down(&ioctl_read_sem);
+	if (rq_data_dir(elv_next_request(q)) != READ)
+	{
+		msg(DBG_INF, "bad cmd %d\n", req->cmd[0]);
+		goto err_done;
+	}
+	p = req->rq_disk->private_data;
+#if OLD_BUSY
+	while (busy_audio) sbp_sleep(HZ); /* wait a bit */
+	busy_data=1;
+#endif /* OLD_BUSY */
+
+	if (p->audio_state==audio_playing) goto err_done;
+	if (p != current_drive)
+		switch_drive(p);
+
+	block = req->sector; /* always numbered as 512-byte-pieces */
+	nsect = req->nr_sectors; /* always counted as 512-byte-pieces */
+
+	msg(DBG_BSZ,"read sector %d (%d sectors)\n", block, nsect);
+#if 0
+	msg(DBG_MUL,"read LBA %d\n", block/4);
+#endif
+
+	sbp_transfer(req);
+	/* if we satisfied the request from the buffer, we're done. */
+	if (req->nr_sectors == 0)
+	{
+#ifdef DEBUG_GTL
+		printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 2, Time:%li\n",
+			xnr, req, req->sector, req->nr_sectors, jiffies);
+#endif
+		up(&ioctl_read_sem);
+		spin_lock_irq(q->queue_lock);
+		end_request(req, 1);
+		goto request_loop;
+	}
+
+#ifdef FUTURE
+	i=prepare(0,0); /* at moment not really a hassle check, but ... */
+	if (i!=0)
+		msg(DBG_INF,"\"prepare\" tells error %d -- ignored\n", i);
+#endif /* FUTURE */
+
+	if (!st_spinning) cc_SpinUp();
+
+	for (data_tries=n_retries; data_tries > 0; data_tries--)
+	{
+		for (status_tries=3; status_tries > 0; status_tries--)
+		{
+			flags_cmd_out |= f_respo3;
+			cc_ReadStatus();
+			if (sbp_status() != 0) break;
+			if (st_check) cc_ReadError();
+			sbp_sleep(1);    /* wait a bit, try again */
+		}
+		if (status_tries == 0)
+		{
+			msg(DBG_INF,"sbp_status: failed after 3 tries in line %d\n", __LINE__);
+			break;
+		}
+		
+		sbp_read_cmd(req);
+		sbp_sleep(0);
+		if (sbp_data(req) != 0)
+		{
+#ifdef SAFE_MIXED
+			current_drive->has_data=2; /* is really a data disk */
+#endif /* SAFE_MIXED */
+#ifdef DEBUG_GTL
+			printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 3, Time:%li\n",
+				xnr, req, req->sector, req->nr_sectors, jiffies);
+#endif
+			up(&ioctl_read_sem);
+			spin_lock_irq(q->queue_lock);
+			end_request(req, 1);
+			goto request_loop;
+		}
+	}
+
+ err_done:
+#if OLD_BUSY
+	busy_data=0;
+#endif /* OLD_BUSY */
+#ifdef DEBUG_GTL
+	printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 4 (error), Time:%li\n",
+		xnr, req, req->sector, req->nr_sectors, jiffies);
+#endif
+	up(&ioctl_read_sem);
+	sbp_sleep(0);    /* wait a bit, try again */
+	spin_lock_irq(q->queue_lock);
+	end_request(req, 0);
+	goto request_loop;
+}
+/*==========================================================================*/
+/*
+ *  build and send the READ command.
+ */
+static void sbp_read_cmd(struct request *req)
+{
+#undef OLD
+
+	int i;
+	int block;
+
+	current_drive->sbp_first_frame=current_drive->sbp_last_frame=-1;      /* purge buffer */
+	current_drive->sbp_current = 0;
+	block=req->sector/4;
+	if (block+current_drive->sbp_bufsiz <= current_drive->CDsize_frm)
+		current_drive->sbp_read_frames = current_drive->sbp_bufsiz;
+	else
+	{
+		current_drive->sbp_read_frames=current_drive->CDsize_frm-block;
+		/* avoid reading past end of data */
+		if (current_drive->sbp_read_frames < 1)
+		{
+			msg(DBG_INF,"requested frame %d, CD size %d ???\n",
+			    block, current_drive->CDsize_frm);
+			current_drive->sbp_read_frames=1;
+		}
+	}
+
+	flags_cmd_out = f_putcmd | f_respo2 | f_ResponseStatus | f_obey_p_check;
+	clr_cmdbuf();
+	if (famV_drive)
+	  {
+	    drvcmd[0]=CMDV_READ;
+	    lba2msf(block,&drvcmd[1]); /* msf-bcd format required */
+	    bin2bcdx(&drvcmd[1]);
+	    bin2bcdx(&drvcmd[2]);
+	    bin2bcdx(&drvcmd[3]);
+	    drvcmd[4]=current_drive->sbp_read_frames>>8;
+	    drvcmd[5]=current_drive->sbp_read_frames&0xff;
+	    drvcmd[6]=0x02; /* flag "msf-bcd" */
+	}
+	else if (fam0L_drive)
+	{
+		flags_cmd_out |= f_lopsta | f_getsta | f_bit1;
+		if (current_drive->xa_byte==0x20)
+		{
+			cmd_type=READ_M2;
+			drvcmd[0]=CMD0_READ_XA; /* "read XA frames", old drives */
+			drvcmd[1]=(block>>16)&0x0ff;
+			drvcmd[2]=(block>>8)&0x0ff;
+			drvcmd[3]=block&0x0ff;
+			drvcmd[4]=(current_drive->sbp_read_frames>>8)&0x0ff;
+			drvcmd[5]=current_drive->sbp_read_frames&0x0ff;
+		}
+		else
+		{
+			drvcmd[0]=CMD0_READ; /* "read frames", old drives */
+			if (current_drive->drv_type>=drv_201)
+			{
+				lba2msf(block,&drvcmd[1]); /* msf-bcd format required */
+				bin2bcdx(&drvcmd[1]);
+				bin2bcdx(&drvcmd[2]);
+				bin2bcdx(&drvcmd[3]);
+			}
+			else
+			{
+				drvcmd[1]=(block>>16)&0x0ff;
+				drvcmd[2]=(block>>8)&0x0ff;
+				drvcmd[3]=block&0x0ff;
+			}
+			drvcmd[4]=(current_drive->sbp_read_frames>>8)&0x0ff;
+			drvcmd[5]=current_drive->sbp_read_frames&0x0ff;
+			drvcmd[6]=(current_drive->drv_type<drv_201)?0:2; /* flag "lba or msf-bcd format" */
+		}
+	}
+	else if (fam1_drive)
+	{
+		drvcmd[0]=CMD1_READ;
+		lba2msf(block,&drvcmd[1]); /* msf-bin format required */
+		drvcmd[5]=(current_drive->sbp_read_frames>>8)&0x0ff;
+		drvcmd[6]=current_drive->sbp_read_frames&0x0ff;
+	}
+	else if (fam2_drive)
+	{
+		drvcmd[0]=CMD2_READ;
+		lba2msf(block,&drvcmd[1]); /* msf-bin format required */
+		drvcmd[4]=(current_drive->sbp_read_frames>>8)&0x0ff;
+		drvcmd[5]=current_drive->sbp_read_frames&0x0ff;
+		drvcmd[6]=0x02;
+	}
+	else if (famT_drive)
+	{
+		drvcmd[0]=CMDT_READ;
+		drvcmd[2]=(block>>24)&0x0ff;
+		drvcmd[3]=(block>>16)&0x0ff;
+		drvcmd[4]=(block>>8)&0x0ff;
+		drvcmd[5]=block&0x0ff;
+		drvcmd[7]=(current_drive->sbp_read_frames>>8)&0x0ff;
+		drvcmd[8]=current_drive->sbp_read_frames&0x0ff;
+	}
+	flags_cmd_out=f_putcmd;
+	response_count=0;
+	i=cmd_out();
+	if (i<0) msg(DBG_INF,"error giving READ command: %0d\n", i);
+	return;
+}
+/*==========================================================================*/
+/*
+ *  Check the completion of the read-data command.  On success, read
+ *  the current_drive->sbp_bufsiz * 2048 bytes of data from the disk into buffer.
  */
-static void do_sbpcd_request(request_queue_t * q)
+static int sbp_data(struct request *req)
 {
-	u_int block;
-	u_int nsect;
-	int status_tries, data_tries;
-	struct request *req;
-	struct sbpcd_drive *p;
-#ifdef DEBUG_GTL
-	static int xx_nr=0;
-	int xnr;
-#endif
-
- request_loop:
-#ifdef DEBUG_GTL
-	xnr=++xx_nr;
-
-	req = elv_next_request(q);
-
-	if (!req)
-	{
-		printk( "do_sbpcd_request[%di](NULL), Pid:%d, Time:%li\n",
-			xnr, current->pid, jiffies);
-		printk( "do_sbpcd_request[%do](NULL) end 0 (null), Time:%li\n",
-			xnr, jiffies);
-		return;
-	}
+	int i=0, j=0, l, frame;
+	u_int try=0;
+	u_long timeout;
+	u_char *p;
+	u_int data_tries = 0;
+	u_int data_waits = 0;
+	u_int data_retrying = 0;
+	int error_flag;
+	int xa_count;
+	int max_latency;
+	int success;
+	int wait;
+	int duration;
 
-	printk(" do_sbpcd_request[%di](%p:%ld+%ld), Pid:%d, Time:%li\n",
-		xnr, req, req->sector, req->nr_sectors, current->pid, jiffies);
+	error_flag=0;
+	success=0;
+#if LONG_TIMING
+	max_latency=9*HZ;
+#else
+	if (current_drive->f_multisession) max_latency=15*HZ;
+	else max_latency=5*HZ;
 #endif
+	duration=jiffies;
+	for (frame=0;frame<current_drive->sbp_read_frames&&!error_flag; frame++)
+	{
+		SBPCD_CLI;
 
-	req = elv_next_request(q);	/* take out our request so no other */
-	if (!req)
-		return;
+		del_timer(&data_timer);
+		data_timer.expires=jiffies+max_latency;
+		timed_out_data=0;
+		add_timer(&data_timer);
+		while (!timed_out_data)
+		{
+			if (current_drive->f_multisession) try=maxtim_data*4;
+			else try=maxtim_data;
+			msg(DBG_000,"sbp_data: CDi_status loop: try=%d.\n",try);
+			for ( ; try!=0;try--)
+			{
+				j=inb(CDi_status);
+				if (!(j&s_not_data_ready)) break;
+				if (!(j&s_not_result_ready)) break;
+				if (fam0LV_drive) if (j&s_attention) break;
+			}
+			if (!(j&s_not_data_ready)) goto data_ready;
+			if (try==0)
+			{
+				if (data_retrying == 0) data_waits++;
+				data_retrying = 1;
+				msg(DBG_000,"sbp_data: CDi_status loop: sleeping.\n");
+				sbp_sleep(1);
+				try = 1;
+			}
+		}
+		msg(DBG_INF,"sbp_data: CDi_status loop expired.\n");
+	data_ready:
+		del_timer(&data_timer);
 
-	if (req -> sector == -1)
-		end_request(req, 0);
-	spin_unlock_irq(q->queue_lock);
+		if (timed_out_data)
+		{
+			msg(DBG_INF,"sbp_data: CDi_status timeout (timed_out_data) (%02X).\n", j);
+			error_flag++;
+		}
+		if (try==0)
+		{
+			msg(DBG_INF,"sbp_data: CDi_status timeout (try=0) (%02X).\n", j);
+			error_flag++;
+		}
+		if (!(j&s_not_result_ready))
+		{
+			msg(DBG_INF, "sbp_data: RESULT_READY where DATA_READY awaited (%02X).\n", j);
+			response_count=20;
+			j=ResponseInfo();
+			j=inb(CDi_status);
+		}
+		if (j&s_not_data_ready)
+		{
+			if ((current_drive->ored_ctl_adr&0x40)==0)
+				msg(DBG_INF, "CD contains no data tracks.\n");
+			else msg(DBG_INF, "sbp_data: DATA_READY timeout (%02X).\n", j);
+			error_flag++;
+		}
+		SBPCD_STI;
+		if (error_flag) break;
 
-	down(&ioctl_read_sem);
-	if (rq_data_dir(elv_next_request(q)) != READ)
-	{
-		msg(DBG_INF, "bad cmd %d\n", req->cmd[0]);
-		goto err_done;
+		msg(DBG_000, "sbp_data: beginning to read.\n");
+		p = current_drive->sbp_buf + frame *  CD_FRAMESIZE;
+		if (sbpro_type==1) OUT(CDo_sel_i_d,1);
+		if (cmd_type==READ_M2) {
+                        if (do_16bit) insw(CDi_data, xa_head_buf, CD_XA_HEAD>>1);
+                        else insb(CDi_data, xa_head_buf, CD_XA_HEAD);
+		}
+		if (do_16bit) insw(CDi_data, p, CD_FRAMESIZE>>1);
+		else insb(CDi_data, p, CD_FRAMESIZE);
+		if (cmd_type==READ_M2) {
+                        if (do_16bit) insw(CDi_data, xa_tail_buf, CD_XA_TAIL>>1);
+                        else insb(CDi_data, xa_tail_buf, CD_XA_TAIL);
+		}
+		current_drive->sbp_current++;
+		if (sbpro_type==1) OUT(CDo_sel_i_d,0);
+		if (cmd_type==READ_M2)
+		{
+			for (xa_count=0;xa_count<CD_XA_HEAD;xa_count++)
+				sprintf(&msgbuf[xa_count*3], " %02X", xa_head_buf[xa_count]);
+			msgbuf[xa_count*3]=0;
+			msg(DBG_XA1,"xa head:%s\n", msgbuf);
+		}
+		data_retrying = 0;
+		data_tries++;
+		if (data_tries >= 1000)
+		{
+			msg(DBG_INF,"sbp_data() statistics: %d waits in %d frames.\n", data_waits, data_tries);
+			data_waits = data_tries = 0;
+		}
 	}
-	p = req->rq_disk->private_data;
-#if OLD_BUSY
-	while (busy_audio) sbp_sleep(HZ); /* wait a bit */
-	busy_data=1;
-#endif /* OLD_BUSY */
-	
-	if (p->audio_state==audio_playing) goto err_done;
-	if (p != current_drive)
-		switch_drive(p);
-
-	block = req->sector; /* always numbered as 512-byte-pieces */
-	nsect = req->nr_sectors; /* always counted as 512-byte-pieces */
-	
-	msg(DBG_BSZ,"read sector %d (%d sectors)\n", block, nsect);
+	duration=jiffies-duration;
+	msg(DBG_TEA,"time to read %d frames: %d jiffies .\n",frame,duration);
+	if (famT_drive)
+	{
+		wait=8;
+		do
+		{
+			if (teac==2)
+                          {
+                            if ((i=CDi_stat_loop_T()) == -1) break;
+                          }
+                        else
+                          {
+                            sbp_sleep(1);
+                            OUT(CDo_sel_i_d,0);
+                            i=inb(CDi_status);
+                          }
+			if (!(i&s_not_data_ready))
+			{
+				OUT(CDo_sel_i_d,1);
+				j=0;
+				do
+				{
+					if (do_16bit) i=inw(CDi_data);
+					else i=inb(CDi_data);
+					j++;
+					i=inb(CDi_status);
+				}
+				while (!(i&s_not_data_ready));
+				msg(DBG_TEA, "==========too much data (%d bytes/words)==============.\n", j);
+			}
+			if (!(i&s_not_result_ready))
+			{
+				OUT(CDo_sel_i_d,0);
+				l=0;
+				do
+				{
+					infobuf[l++]=inb(CDi_info);
+					i=inb(CDi_status);
+				}
+				while (!(i&s_not_result_ready));
+				if (infobuf[0]==0x00) success=1;
+#if 1
+				for (j=0;j<l;j++) sprintf(&msgbuf[j*3], " %02X", infobuf[j]);
+				msgbuf[j*3]=0;
+				msg(DBG_TEA,"sbp_data info response:%s\n", msgbuf);
+#endif
+				if (infobuf[0]==0x02)
+				{
+					error_flag++;
+					do
+					{
+						++recursion;
+						if (recursion>1) msg(DBG_TEA,"cmd_out_T READ_ERR recursion (sbp_data): %d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",recursion);
+						else msg(DBG_TEA,"sbp_data: CMDT_READ_ERR necessary.\n");
+						clr_cmdbuf();
+						drvcmd[0]=CMDT_READ_ERR;
+						j=cmd_out_T(); /* !!! recursive here !!! */
+						--recursion;
+						sbp_sleep(1);
+					}
+					while (j<0);
+					current_drive->error_state=infobuf[2];
+					current_drive->b3=infobuf[3];
+					current_drive->b4=infobuf[4];
+				}
+				break;
+			}
+			else
+			{
 #if 0
-	msg(DBG_MUL,"read LBA %d\n", block/4);
+				msg(DBG_TEA, "============= waiting for result=================.\n");
+				sbp_sleep(1);
 #endif
-	
-	sbp_transfer(req);
-	/* if we satisfied the request from the buffer, we're done. */
-	if (req->nr_sectors == 0)
+			}
+		}
+		while (wait--);
+	}
+
+	if (error_flag) /* must have been spurious D_RDY or (ATTN&&!D_RDY) */
 	{
-#ifdef DEBUG_GTL
-		printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 2, Time:%li\n",
-			xnr, req, req->sector, req->nr_sectors, jiffies);
+		msg(DBG_TEA, "================error flag: %d=================.\n", error_flag);
+		msg(DBG_INF,"sbp_data: read aborted by drive.\n");
+#if 1
+		i=cc_DriveReset(); /* ugly fix to prevent a hang */
+#else
+		i=cc_ReadError();
 #endif
-		up(&ioctl_read_sem);
-		spin_lock_irq(q->queue_lock);
-		end_request(req, 1);
-		goto request_loop;
+		return (0);
 	}
-
-#ifdef FUTURE
-	i=prepare(0,0); /* at moment not really a hassle check, but ... */
-	if (i!=0)
-		msg(DBG_INF,"\"prepare\" tells error %d -- ignored\n", i);
-#endif /* FUTURE */ 
-	
-	if (!st_spinning) cc_SpinUp();
 	
-	for (data_tries=n_retries; data_tries > 0; data_tries--)
+	if (fam0LV_drive)
 	{
-		for (status_tries=3; status_tries > 0; status_tries--)
+		SBPCD_CLI;
+		i=maxtim_data;
+		for (timeout=jiffies+HZ; time_before(jiffies, timeout); timeout--)
 		{
-			flags_cmd_out |= f_respo3;
-			cc_ReadStatus();
-			if (sbp_status() != 0) break;
-			if (st_check) cc_ReadError();
-			sbp_sleep(1);    /* wait a bit, try again */
+			for ( ;i!=0;i--)
+			{
+				j=inb(CDi_status);
+				if (!(j&s_not_data_ready)) break;
+				if (!(j&s_not_result_ready)) break;
+				if (j&s_attention) break;
+			}
+			if (i != 0 || time_after_eq(jiffies, timeout)) break;
+			sbp_sleep(0);
+			i = 1;
 		}
-		if (status_tries == 0)
+		if (i==0) msg(DBG_INF,"status timeout after READ.\n");
+		if (!(j&s_attention))
 		{
-			msg(DBG_INF,"sbp_status: failed after 3 tries in line %d\n", __LINE__);
-			break;
+			msg(DBG_INF,"sbp_data: timeout waiting DRV_ATTN - retrying.\n");
+			i=cc_DriveReset();  /* ugly fix to prevent a hang */
+			SBPCD_STI;
+			return (0);
 		}
-		
-		sbp_read_cmd(req);
-		sbp_sleep(0);
-		if (sbp_data(req) != 0)
+		SBPCD_STI;
+	}
+
+#if 0
+	if (!success)
+#endif
+		do
 		{
-#ifdef SAFE_MIXED
-			current_drive->has_data=2; /* is really a data disk */
-#endif /* SAFE_MIXED */ 
-#ifdef DEBUG_GTL
-			printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 3, Time:%li\n",
-				xnr, req, req->sector, req->nr_sectors, jiffies);
+			if (fam0LV_drive) cc_ReadStatus();
+#if 1
+			if (famT_drive) msg(DBG_TEA, "================before ResponseStatus=================.\n", i);
 #endif
-			up(&ioctl_read_sem);
-			spin_lock_irq(q->queue_lock);
-			end_request(req, 1);
-			goto request_loop;
+			i=ResponseStatus();  /* builds status_bits, returns orig. status (old) or faked p_success (new) */
+#if 1
+			if (famT_drive)	msg(DBG_TEA, "================ResponseStatus: %d=================.\n", i);
+#endif
+			if (i<0)
+			{
+				msg(DBG_INF,"bad cc_ReadStatus after read: %02X\n", current_drive->status_bits);
+				return (0);
+			}
 		}
+		while ((fam0LV_drive)&&(!st_check)&&(!(i&p_success)));
+	if (st_check)
+	{
+		i=cc_ReadError();
+		msg(DBG_INF,"cc_ReadError was necessary after read: %d\n",i);
+		return (0);
+	}
+	if (fatal_err)
+	{
+		fatal_err=0;
+		current_drive->sbp_first_frame=current_drive->sbp_last_frame=-1;      /* purge buffer */
+		current_drive->sbp_current = 0;
+		msg(DBG_INF,"sbp_data: fatal_err - retrying.\n");
+		return (0);
 	}
 	
- err_done:
-#if OLD_BUSY
-	busy_data=0;
-#endif /* OLD_BUSY */
-#ifdef DEBUG_GTL
-	printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 4 (error), Time:%li\n",
-		xnr, req, req->sector, req->nr_sectors, jiffies);
-#endif
-	up(&ioctl_read_sem);
-	sbp_sleep(0);    /* wait a bit, try again */
-	spin_lock_irq(q->queue_lock);
-	end_request(req, 0);
-	goto request_loop;
+	current_drive->sbp_first_frame = req -> sector / 4;
+	current_drive->sbp_last_frame = current_drive->sbp_first_frame + current_drive->sbp_read_frames - 1;
+	sbp_transfer(req);
+	return (1);
 }
 /*==========================================================================*/
-/*
- *  build and send the READ command.
- */
-static void sbp_read_cmd(struct request *req)
+
+static int sbpcd_block_open(struct inode *inode, struct file *file)
 {
-#undef OLD
+	struct sbpcd_drive *p = inode->i_bdev->bd_disk->private_data;
+	return cdrom_open(p->sbpcd_infop, inode, file);
+}
 
-	int i;
-	int block;
+static int sbpcd_block_release(struct inode *inode, struct file *file)
+{
+	struct sbpcd_drive *p = inode->i_bdev->bd_disk->private_data;
+	return cdrom_release(p->sbpcd_infop, file);
+}
+
+static int sbpcd_block_ioctl(struct inode *inode, struct file *file,
+				unsigned cmd, unsigned long arg)
+{
+	struct sbpcd_drive *p = inode->i_bdev->bd_disk->private_data;
+	struct cdrom_device_info *cdi = p->sbpcd_infop;
+	int ret, i;
+
+	ret = cdrom_ioctl(file, p->sbpcd_infop, inode, cmd, arg);
+	if (ret != -ENOSYS)
+		return ret;
+
+	msg(DBG_IO2,"ioctl(%s, 0x%08lX, 0x%08lX)\n", cdi->name, cmd, arg);
+	if (p->drv_id==-1) {
+		msg(DBG_INF, "ioctl: bad device: %s\n", cdi->name);
+		return (-ENXIO);             /* no such drive */
+	}
+	down(&ioctl_read_sem);
+	if (p != current_drive)
+		switch_drive(p);
 	
-	current_drive->sbp_first_frame=current_drive->sbp_last_frame=-1;      /* purge buffer */
-	current_drive->sbp_current = 0;
-	block=req->sector/4;
-	if (block+current_drive->sbp_bufsiz <= current_drive->CDsize_frm)
-		current_drive->sbp_read_frames = current_drive->sbp_bufsiz;
-	else
+	msg(DBG_IO2,"ioctl: device %s, request %04X\n",cdi->name,cmd);
+	switch (cmd) 		/* Sun-compatible */
 	{
-		current_drive->sbp_read_frames=current_drive->CDsize_frm-block;
-		/* avoid reading past end of data */
-		if (current_drive->sbp_read_frames < 1)
+	case DDIOCSDBG:		/* DDI Debug */
+		if (!capable(CAP_SYS_ADMIN)) RETURN_UP(-EPERM);
+		i=sbpcd_dbg_ioctl(arg,1);
+		RETURN_UP(i);
+	case CDROMRESET:      /* hard reset the drive */
+		msg(DBG_IOC,"ioctl: CDROMRESET entered.\n");
+		i=DriveReset();
+		current_drive->audio_state=0;
+		RETURN_UP(i);
+
+	case CDROMREADMODE1:
+		msg(DBG_IOC,"ioctl: CDROMREADMODE1 requested.\n");
+#ifdef SAFE_MIXED
+		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
+#endif /* SAFE_MIXED */
+		cc_ModeSelect(CD_FRAMESIZE);
+		cc_ModeSense();
+		current_drive->mode=READ_M1;
+		RETURN_UP(0);
+
+	case CDROMREADMODE2: /* not usable at the moment */
+		msg(DBG_IOC,"ioctl: CDROMREADMODE2 requested.\n");
+#ifdef SAFE_MIXED
+		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
+#endif /* SAFE_MIXED */
+		cc_ModeSelect(CD_FRAMESIZE_RAW1);
+		cc_ModeSense();
+		current_drive->mode=READ_M2;
+		RETURN_UP(0);
+
+	case CDROMAUDIOBUFSIZ: /* configure the audio buffer size */
+		msg(DBG_IOC,"ioctl: CDROMAUDIOBUFSIZ entered.\n");
+		if (current_drive->sbp_audsiz>0)
+			vfree(current_drive->aud_buf);
+		current_drive->aud_buf=NULL;
+		current_drive->sbp_audsiz=arg;
+
+		if (current_drive->sbp_audsiz>16)
 		{
-			msg(DBG_INF,"requested frame %d, CD size %d ???\n",
-			    block, current_drive->CDsize_frm);
-			current_drive->sbp_read_frames=1;
+			current_drive->sbp_audsiz = 0;
+			RETURN_UP(current_drive->sbp_audsiz);
 		}
-	}
 	
-	flags_cmd_out = f_putcmd | f_respo2 | f_ResponseStatus | f_obey_p_check;
-	clr_cmdbuf();
-	if (famV_drive)
-	  {
-	    drvcmd[0]=CMDV_READ;
-	    lba2msf(block,&drvcmd[1]); /* msf-bcd format required */
-	    bin2bcdx(&drvcmd[1]);
-	    bin2bcdx(&drvcmd[2]);
-	    bin2bcdx(&drvcmd[3]);
-	    drvcmd[4]=current_drive->sbp_read_frames>>8;
-	    drvcmd[5]=current_drive->sbp_read_frames&0xff;
-	    drvcmd[6]=0x02; /* flag "msf-bcd" */
-	}
-	else if (fam0L_drive)
-	{
-		flags_cmd_out |= f_lopsta | f_getsta | f_bit1;
-		if (current_drive->xa_byte==0x20)
+		if (current_drive->sbp_audsiz>0)
 		{
-			cmd_type=READ_M2;
-			drvcmd[0]=CMD0_READ_XA; /* "read XA frames", old drives */
-			drvcmd[1]=(block>>16)&0x0ff;
-			drvcmd[2]=(block>>8)&0x0ff;
-			drvcmd[3]=block&0x0ff;
-			drvcmd[4]=(current_drive->sbp_read_frames>>8)&0x0ff;
-			drvcmd[5]=current_drive->sbp_read_frames&0x0ff;
+			current_drive->aud_buf=(u_char *) vmalloc(current_drive->sbp_audsiz*CD_FRAMESIZE_RAW);
+			if (current_drive->aud_buf==NULL)
+			{
+				msg(DBG_INF,"audio buffer (%d frames) not available.\n",current_drive->sbp_audsiz);
+				current_drive->sbp_audsiz=0;
+			}
+			else msg(DBG_INF,"audio buffer size: %d frames.\n",current_drive->sbp_audsiz);
 		}
-		else
+		RETURN_UP(current_drive->sbp_audsiz);
+
+	case CDROMREADAUDIO:
+	{ /* start of CDROMREADAUDIO */
+		int i=0, j=0, frame, block=0;
+		u_int try=0;
+		u_long timeout;
+		u_char *p;
+		u_int data_tries = 0;
+		u_int data_waits = 0;
+		u_int data_retrying = 0;
+		int status_tries;
+		int error_flag;
+
+		msg(DBG_IOC,"ioctl: CDROMREADAUDIO entered.\n");
+		if (fam0_drive) RETURN_UP(-EINVAL);
+		if (famL_drive) RETURN_UP(-EINVAL);
+		if (famV_drive) RETURN_UP(-EINVAL);
+		if (famT_drive) RETURN_UP(-EINVAL);
+#ifdef SAFE_MIXED
+		if (current_drive->has_data>1) RETURN_UP(-EBUSY);
+#endif /* SAFE_MIXED */
+		if (current_drive->aud_buf==NULL) RETURN_UP(-EINVAL);
+		if (copy_from_user(&read_audio, (void __user *)arg,
+				   sizeof(struct cdrom_read_audio)))
+			RETURN_UP(-EFAULT);
+		if (read_audio.nframes < 0 || read_audio.nframes>current_drive->sbp_audsiz) RETURN_UP(-EINVAL);
+		if (!access_ok(VERIFY_WRITE, read_audio.buf,
+			      read_audio.nframes*CD_FRAMESIZE_RAW))
+                	RETURN_UP(-EFAULT);
+
+		if (read_audio.addr_format==CDROM_MSF) /* MSF-bin specification of where to start */
+			block=msf2lba(&read_audio.addr.msf.minute);
+		else if (read_audio.addr_format==CDROM_LBA) /* lba specification of where to start */
+			block=read_audio.addr.lba;
+		else RETURN_UP(-EINVAL);
+#if 000
+		i=cc_SetSpeed(speed_150,0,0);
+		if (i) msg(DBG_AUD,"read_audio: SetSpeed error %d\n", i);
+#endif
+		msg(DBG_AUD,"read_audio: lba: %d, msf: %06X\n",
+		    block, blk2msf(block));
+		msg(DBG_AUD,"read_audio: before cc_ReadStatus.\n");
+#if OLD_BUSY
+		while (busy_data) sbp_sleep(HZ/10); /* wait a bit */
+		busy_audio=1;
+#endif /* OLD_BUSY */
+		error_flag=0;
+		for (data_tries=5; data_tries>0; data_tries--)
 		{
-			drvcmd[0]=CMD0_READ; /* "read frames", old drives */
-			if (current_drive->drv_type>=drv_201)
+			msg(DBG_AUD,"data_tries=%d ...\n", data_tries);
+			current_drive->mode=READ_AU;
+			cc_ModeSelect(CD_FRAMESIZE_RAW);
+			cc_ModeSense();
+			for (status_tries=3; status_tries > 0; status_tries--)
 			{
-				lba2msf(block,&drvcmd[1]); /* msf-bcd format required */
-				bin2bcdx(&drvcmd[1]);
-				bin2bcdx(&drvcmd[2]);
-				bin2bcdx(&drvcmd[3]);
+				flags_cmd_out |= f_respo3;
+				cc_ReadStatus();
+				if (sbp_status() != 0) break;
+				if (st_check) cc_ReadError();
+				sbp_sleep(1);    /* wait a bit, try again */
 			}
-			else
+			if (status_tries == 0)
 			{
-				drvcmd[1]=(block>>16)&0x0ff;
-				drvcmd[2]=(block>>8)&0x0ff;
-				drvcmd[3]=block&0x0ff;
+				msg(DBG_AUD,"read_audio: sbp_status: failed after 3 tries in line %d.\n", __LINE__);
+				continue;
 			}
-			drvcmd[4]=(current_drive->sbp_read_frames>>8)&0x0ff;
-			drvcmd[5]=current_drive->sbp_read_frames&0x0ff;
-			drvcmd[6]=(current_drive->drv_type<drv_201)?0:2; /* flag "lba or msf-bcd format" */
-		}
-	}
-	else if (fam1_drive)
-	{
-		drvcmd[0]=CMD1_READ;
-		lba2msf(block,&drvcmd[1]); /* msf-bin format required */
-		drvcmd[5]=(current_drive->sbp_read_frames>>8)&0x0ff;
-		drvcmd[6]=current_drive->sbp_read_frames&0x0ff;
-	}
-	else if (fam2_drive)
-	{
-		drvcmd[0]=CMD2_READ;
-		lba2msf(block,&drvcmd[1]); /* msf-bin format required */
-		drvcmd[4]=(current_drive->sbp_read_frames>>8)&0x0ff;
-		drvcmd[5]=current_drive->sbp_read_frames&0x0ff;
-		drvcmd[6]=0x02;
-	}
-	else if (famT_drive)
-	{
-		drvcmd[0]=CMDT_READ;
-		drvcmd[2]=(block>>24)&0x0ff;
-		drvcmd[3]=(block>>16)&0x0ff;
-		drvcmd[4]=(block>>8)&0x0ff;
-		drvcmd[5]=block&0x0ff;
-		drvcmd[7]=(current_drive->sbp_read_frames>>8)&0x0ff;
-		drvcmd[8]=current_drive->sbp_read_frames&0x0ff;
-	}
-	flags_cmd_out=f_putcmd;
-	response_count=0;
-	i=cmd_out();
-	if (i<0) msg(DBG_INF,"error giving READ command: %0d\n", i);
-	return;
-}
-/*==========================================================================*/
-/*
- *  Check the completion of the read-data command.  On success, read
- *  the current_drive->sbp_bufsiz * 2048 bytes of data from the disk into buffer.
- */
-static int sbp_data(struct request *req)
-{
-	int i=0, j=0, l, frame;
-	u_int try=0;
-	u_long timeout;
-	u_char *p;
-	u_int data_tries = 0;
-	u_int data_waits = 0;
-	u_int data_retrying = 0;
-	int error_flag;
-	int xa_count;
-	int max_latency;
-	int success;
-	int wait;
-	int duration;
-	
-	error_flag=0;
-	success=0;
-#if LONG_TIMING
-	max_latency=9*HZ;
-#else
-	if (current_drive->f_multisession) max_latency=15*HZ;
-	else max_latency=5*HZ;
-#endif
-	duration=jiffies;
-	for (frame=0;frame<current_drive->sbp_read_frames&&!error_flag; frame++)
-	{
-		SBPCD_CLI;
-		
-		del_timer(&data_timer);
-		data_timer.expires=jiffies+max_latency;
-		timed_out_data=0;
-		add_timer(&data_timer);
-		while (!timed_out_data) 
-		{
-			if (current_drive->f_multisession) try=maxtim_data*4;
-			else try=maxtim_data;
-			msg(DBG_000,"sbp_data: CDi_status loop: try=%d.\n",try);
-			for ( ; try!=0;try--)
+			msg(DBG_AUD,"read_audio: sbp_status: ok.\n");
+
+			flags_cmd_out = f_putcmd | f_respo2 | f_ResponseStatus | f_obey_p_check;
+			if (fam0L_drive)
 			{
-				j=inb(CDi_status);
-				if (!(j&s_not_data_ready)) break;
-				if (!(j&s_not_result_ready)) break;
-				if (fam0LV_drive) if (j&s_attention) break;
+				flags_cmd_out |= f_lopsta | f_getsta | f_bit1;
+				cmd_type=READ_M2;
+				drvcmd[0]=CMD0_READ_XA; /* "read XA frames", old drives */
+				drvcmd[1]=(block>>16)&0x000000ff;
+				drvcmd[2]=(block>>8)&0x000000ff;
+				drvcmd[3]=block&0x000000ff;
+				drvcmd[4]=0;
+				drvcmd[5]=read_audio.nframes; /* # of frames */
+				drvcmd[6]=0;
 			}
-			if (!(j&s_not_data_ready)) goto data_ready;
-			if (try==0)
+			else if (fam1_drive)
 			{
-				if (data_retrying == 0) data_waits++;
-				data_retrying = 1;
-				msg(DBG_000,"sbp_data: CDi_status loop: sleeping.\n");
-				sbp_sleep(1);
-				try = 1;
+				drvcmd[0]=CMD1_READ; /* "read frames", new drives */
+				lba2msf(block,&drvcmd[1]); /* msf-bin format required */
+				drvcmd[4]=0;
+				drvcmd[5]=0;
+				drvcmd[6]=read_audio.nframes; /* # of frames */
 			}
-		}
-		msg(DBG_INF,"sbp_data: CDi_status loop expired.\n");
-	data_ready:
-		del_timer(&data_timer);
-
-		if (timed_out_data)
-		{
-			msg(DBG_INF,"sbp_data: CDi_status timeout (timed_out_data) (%02X).\n", j);
-			error_flag++;
-		}
-		if (try==0)
-		{
-			msg(DBG_INF,"sbp_data: CDi_status timeout (try=0) (%02X).\n", j);
-			error_flag++;
-		}
-		if (!(j&s_not_result_ready))
-		{
-			msg(DBG_INF, "sbp_data: RESULT_READY where DATA_READY awaited (%02X).\n", j);
-			response_count=20;
-			j=ResponseInfo();
-			j=inb(CDi_status);
-		}
-		if (j&s_not_data_ready)
-		{
-			if ((current_drive->ored_ctl_adr&0x40)==0)
-				msg(DBG_INF, "CD contains no data tracks.\n");
-			else msg(DBG_INF, "sbp_data: DATA_READY timeout (%02X).\n", j);
-			error_flag++;
-		}
-		SBPCD_STI;
-		if (error_flag) break;
-
-		msg(DBG_000, "sbp_data: beginning to read.\n");
-		p = current_drive->sbp_buf + frame *  CD_FRAMESIZE;
-		if (sbpro_type==1) OUT(CDo_sel_i_d,1);
-		if (cmd_type==READ_M2) {
-                        if (do_16bit) insw(CDi_data, xa_head_buf, CD_XA_HEAD>>1);
-                        else insb(CDi_data, xa_head_buf, CD_XA_HEAD);
-		}
-		if (do_16bit) insw(CDi_data, p, CD_FRAMESIZE>>1);
-		else insb(CDi_data, p, CD_FRAMESIZE);
-		if (cmd_type==READ_M2) {
-                        if (do_16bit) insw(CDi_data, xa_tail_buf, CD_XA_TAIL>>1);
-                        else insb(CDi_data, xa_tail_buf, CD_XA_TAIL);
-		}
-		current_drive->sbp_current++;
-		if (sbpro_type==1) OUT(CDo_sel_i_d,0);
-		if (cmd_type==READ_M2)
-		{
-			for (xa_count=0;xa_count<CD_XA_HEAD;xa_count++)
-				sprintf(&msgbuf[xa_count*3], " %02X", xa_head_buf[xa_count]);
-			msgbuf[xa_count*3]=0;
-			msg(DBG_XA1,"xa head:%s\n", msgbuf);
-		}
-		data_retrying = 0;
-		data_tries++;
-		if (data_tries >= 1000)
-		{
-			msg(DBG_INF,"sbp_data() statistics: %d waits in %d frames.\n", data_waits, data_tries);
-			data_waits = data_tries = 0;
-		}
-	}
-	duration=jiffies-duration;
-	msg(DBG_TEA,"time to read %d frames: %d jiffies .\n",frame,duration);
-	if (famT_drive)
-	{
-		wait=8;
-		do
-		{
-			if (teac==2)
-                          {
-                            if ((i=CDi_stat_loop_T()) == -1) break;
-                          }
-                        else
-                          {
-                            sbp_sleep(1);
-                            OUT(CDo_sel_i_d,0); 
-                            i=inb(CDi_status);
-                          } 
-			if (!(i&s_not_data_ready))
+			else if (fam2_drive)
 			{
-				OUT(CDo_sel_i_d,1);
-				j=0;
-				do
+				drvcmd[0]=CMD2_READ_XA2;
+				lba2msf(block,&drvcmd[1]); /* msf-bin format required */
+				drvcmd[4]=0;
+				drvcmd[5]=read_audio.nframes; /* # of frames */
+				drvcmd[6]=0x11; /* raw mode */
+			}
+			else if (famT_drive) /* CD-55A: not tested yet */
+			{
+			}
+			msg(DBG_AUD,"read_audio: before giving \"read\" command.\n");
+			flags_cmd_out=f_putcmd;
+			response_count=0;
+			i=cmd_out();
+			if (i<0) msg(DBG_INF,"error giving READ AUDIO command: %0d\n", i);
+			sbp_sleep(0);
+			msg(DBG_AUD,"read_audio: after giving \"read\" command.\n");
+			for (frame=1;frame<2 && !error_flag; frame++)
+			{
+				try=maxtim_data;
+				for (timeout=jiffies+9*HZ; ; )
 				{
-					if (do_16bit) i=inw(CDi_data);
-					else i=inb(CDi_data);
-					j++;
-					i=inb(CDi_status);
+					for ( ; try!=0;try--)
+					{
+						j=inb(CDi_status);
+						if (!(j&s_not_data_ready)) break;
+						if (!(j&s_not_result_ready)) break;
+						if (fam0L_drive) if (j&s_attention) break;
+					}
+					if (try != 0 || time_after_eq(jiffies, timeout)) break;
+					if (data_retrying == 0) data_waits++;
+					data_retrying = 1;
+					sbp_sleep(1);
+					try = 1;
 				}
-				while (!(i&s_not_data_ready));
-				msg(DBG_TEA, "==========too much data (%d bytes/words)==============.\n", j);
-			}
-			if (!(i&s_not_result_ready))
-			{
-				OUT(CDo_sel_i_d,0);
-				l=0;
-				do
+				if (try==0)
 				{
-					infobuf[l++]=inb(CDi_info);
-					i=inb(CDi_status);
+					msg(DBG_INF,"read_audio: sbp_data: CDi_status timeout.\n");
+					error_flag++;
+					break;
 				}
-				while (!(i&s_not_result_ready));
-				if (infobuf[0]==0x00) success=1;
-#if 1
-				for (j=0;j<l;j++) sprintf(&msgbuf[j*3], " %02X", infobuf[j]);
-				msgbuf[j*3]=0;
-				msg(DBG_TEA,"sbp_data info response:%s\n", msgbuf);
-#endif
-				if (infobuf[0]==0x02)
+				msg(DBG_AUD,"read_audio: sbp_data: CDi_status ok.\n");
+				if (j&s_not_data_ready)
 				{
+					msg(DBG_INF, "read_audio: sbp_data: DATA_READY timeout.\n");
 					error_flag++;
-					do
-					{
-						++recursion;
-						if (recursion>1) msg(DBG_TEA,"cmd_out_T READ_ERR recursion (sbp_data): %d !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",recursion);
-						else msg(DBG_TEA,"sbp_data: CMDT_READ_ERR necessary.\n");
-						clr_cmdbuf();
-						drvcmd[0]=CMDT_READ_ERR;
-						j=cmd_out_T(); /* !!! recursive here !!! */
-						--recursion;
-						sbp_sleep(1);
+					break;
+				}
+				msg(DBG_AUD,"read_audio: before reading data.\n");
+				error_flag=0;
+				p = current_drive->aud_buf;
+				if (sbpro_type==1) OUT(CDo_sel_i_d,1);
+				if (do_16bit)
+				{
+					u_short *p2 = (u_short *) p;
+
+					for (; (u_char *) p2 < current_drive->aud_buf + read_audio.nframes*CD_FRAMESIZE_RAW;)
+				  	{
+						if ((inb_p(CDi_status)&s_not_data_ready)) continue;
+
+						/* get one sample */
+						*p2++ = inw_p(CDi_data);
+						*p2++ = inw_p(CDi_data);
+					}
+				} else {
+					for (; p < current_drive->aud_buf + read_audio.nframes*CD_FRAMESIZE_RAW;)
+				  	{
+						if ((inb_p(CDi_status)&s_not_data_ready)) continue;
+
+						/* get one sample */
+						*p++ = inb_p(CDi_data);
+						*p++ = inb_p(CDi_data);
+						*p++ = inb_p(CDi_data);
+						*p++ = inb_p(CDi_data);
 					}
-					while (j<0);
-					current_drive->error_state=infobuf[2];
-					current_drive->b3=infobuf[3];
-					current_drive->b4=infobuf[4];
 				}
-				break;
+				if (sbpro_type==1) OUT(CDo_sel_i_d,0);
+				data_retrying = 0;
 			}
-			else
+			msg(DBG_AUD,"read_audio: after reading data.\n");
+			if (error_flag)    /* must have been spurious D_RDY or (ATTN&&!D_RDY) */
 			{
-#if 0
-				msg(DBG_TEA, "============= waiting for result=================.\n");
-				sbp_sleep(1);
-#endif
-			}
-		}
-		while (wait--);
-	}
-
-	if (error_flag) /* must have been spurious D_RDY or (ATTN&&!D_RDY) */
-	{
-		msg(DBG_TEA, "================error flag: %d=================.\n", error_flag);
-		msg(DBG_INF,"sbp_data: read aborted by drive.\n");
-#if 1
-		i=cc_DriveReset(); /* ugly fix to prevent a hang */
+				msg(DBG_AUD,"read_audio: read aborted by drive\n");
+#if 0000
+				i=cc_DriveReset();                /* ugly fix to prevent a hang */
 #else
-		i=cc_ReadError();
+				i=cc_ReadError();
 #endif
-		return (0);
-	}
-	
-	if (fam0LV_drive)
-	{
-		SBPCD_CLI;
-		i=maxtim_data;
-		for (timeout=jiffies+HZ; time_before(jiffies, timeout); timeout--)
-		{
-			for ( ;i!=0;i--)
+				continue;
+			}
+			if (fam0L_drive)
 			{
-				j=inb(CDi_status);
-				if (!(j&s_not_data_ready)) break;
-				if (!(j&s_not_result_ready)) break;
-				if (j&s_attention) break;
+				i=maxtim_data;
+				for (timeout=jiffies+9*HZ; time_before(jiffies, timeout); timeout--)
+				{
+					for ( ;i!=0;i--)
+					{
+						j=inb(CDi_status);
+						if (!(j&s_not_data_ready)) break;
+						if (!(j&s_not_result_ready)) break;
+						if (j&s_attention) break;
+					}
+					if (i != 0 || time_after_eq(jiffies, timeout)) break;
+					sbp_sleep(0);
+					i = 1;
+				}
+				if (i==0) msg(DBG_AUD,"read_audio: STATUS TIMEOUT AFTER READ");
+				if (!(j&s_attention))
+				{
+					msg(DBG_AUD,"read_audio: sbp_data: timeout waiting DRV_ATTN - retrying\n");
+					i=cc_DriveReset();  /* ugly fix to prevent a hang */
+					continue;
+				}
 			}
-			if (i != 0 || time_after_eq(jiffies, timeout)) break;
-			sbp_sleep(0);
-			i = 1;
-		}
-		if (i==0) msg(DBG_INF,"status timeout after READ.\n");
-		if (!(j&s_attention))
-		{
-			msg(DBG_INF,"sbp_data: timeout waiting DRV_ATTN - retrying.\n");
-			i=cc_DriveReset();  /* ugly fix to prevent a hang */
-			SBPCD_STI;
-			return (0);
-		}
-		SBPCD_STI;
-	}
-	
-#if 0
-	if (!success)
-#endif
-		do
-		{
-			if (fam0LV_drive) cc_ReadStatus();
-#if 1
-			if (famT_drive) msg(DBG_TEA, "================before ResponseStatus=================.\n", i);
-#endif
-			i=ResponseStatus();  /* builds status_bits, returns orig. status (old) or faked p_success (new) */
-#if 1
-			if (famT_drive)	msg(DBG_TEA, "================ResponseStatus: %d=================.\n", i);
-#endif
-			if (i<0)
+			do
 			{
-				msg(DBG_INF,"bad cc_ReadStatus after read: %02X\n", current_drive->status_bits);
-				return (0);
+				if (fam0L_drive) cc_ReadStatus();
+				i=ResponseStatus();  /* builds status_bits, returns orig. status (old) or faked p_success (new) */
+				if (i<0) { msg(DBG_AUD,
+					       "read_audio: cc_ReadStatus error after read: %02X\n",
+					       current_drive->status_bits);
+					   continue; /* FIXME */
+				   }
+			}
+			while ((fam0L_drive)&&(!st_check)&&(!(i&p_success)));
+			if (st_check)
+			{
+				i=cc_ReadError();
+				msg(DBG_AUD,"read_audio: cc_ReadError was necessary after read: %02X\n",i);
+				continue;
 			}
+			if (copy_to_user(read_audio.buf,
+					 current_drive->aud_buf,
+					 read_audio.nframes * CD_FRAMESIZE_RAW))
+				RETURN_UP(-EFAULT);
+			msg(DBG_AUD,"read_audio: copy_to_user done.\n");
+			break;
 		}
-		while ((fam0LV_drive)&&(!st_check)&&(!(i&p_success)));
-	if (st_check)
-	{
-		i=cc_ReadError();
-		msg(DBG_INF,"cc_ReadError was necessary after read: %d\n",i);
-		return (0);
-	}
-	if (fatal_err)
-	{
-		fatal_err=0;
-		current_drive->sbp_first_frame=current_drive->sbp_last_frame=-1;      /* purge buffer */
-		current_drive->sbp_current = 0;
-		msg(DBG_INF,"sbp_data: fatal_err - retrying.\n");
-		return (0);
-	}
-	
-	current_drive->sbp_first_frame = req -> sector / 4;
-	current_drive->sbp_last_frame = current_drive->sbp_first_frame + current_drive->sbp_read_frames - 1;
-	sbp_transfer(req);
-	return (1);
-}
-/*==========================================================================*/
-
-static int sbpcd_block_open(struct inode *inode, struct file *file)
-{
-	struct sbpcd_drive *p = inode->i_bdev->bd_disk->private_data;
-	return cdrom_open(p->sbpcd_infop, inode, file);
-}
-
-static int sbpcd_block_release(struct inode *inode, struct file *file)
-{
-	struct sbpcd_drive *p = inode->i_bdev->bd_disk->private_data;
-	return cdrom_release(p->sbpcd_infop, file);
-}
+		cc_ModeSelect(CD_FRAMESIZE);
+		cc_ModeSense();
+		current_drive->mode=READ_M1;
+#if OLD_BUSY
+		busy_audio=0;
+#endif /* OLD_BUSY */
+		if (data_tries == 0)
+		{
+			msg(DBG_AUD,"read_audio: failed after 5 tries in line %d.\n", __LINE__);
+			RETURN_UP(-EIO);
+		}
+		msg(DBG_AUD,"read_audio: successful return.\n");
+		RETURN_UP(0);
+	} /* end of CDROMREADAUDIO */
 
-static int sbpcd_block_ioctl(struct inode *inode, struct file *file,
-				unsigned cmd, unsigned long arg)
-{
-	struct sbpcd_drive *p = inode->i_bdev->bd_disk->private_data;
-	return cdrom_ioctl(file, p->sbpcd_infop, inode, cmd, arg);
+	default:
+		msg(DBG_IOC,"ioctl: unknown function request %04X\n", cmd);
+		RETURN_UP(-EINVAL);
+	} /* end switch(cmd) */
 }
 
 static int sbpcd_block_media_changed(struct gendisk *disk)
@@ -5478,10 +5471,9 @@ static struct cdrom_device_ops sbpcd_dops = {
 	.get_mcn		= sbpcd_get_mcn,
 	.reset			= sbpcd_reset,
 	.audio_ioctl		= sbpcd_audio_ioctl,
-	.dev_ioctl		= sbpcd_dev_ioctl,
 	.capability		= CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK |
 				CDC_MULTI_SESSION | CDC_MEDIA_CHANGED |
-				CDC_MCN | CDC_PLAY_AUDIO | CDC_IOCTLS,
+				CDC_MCN | CDC_PLAY_AUDIO,
 	.n_minors		= 1,
 };
 
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index e27617259552..c0f817ba7adb 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -627,7 +627,7 @@ static struct cdrom_device_ops viocd_dops = {
 	.media_changed = viocd_media_changed,
 	.lock_door = viocd_lock_door,
 	.generic_packet = viocd_packet,
-	.capability = CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK | CDC_SELECT_SPEED | CDC_SELECT_DISC | CDC_MULTI_SESSION | CDC_MCN | CDC_MEDIA_CHANGED | CDC_PLAY_AUDIO | CDC_RESET | CDC_IOCTLS | CDC_DRIVE_STATUS | CDC_GENERIC_PACKET | CDC_CD_R | CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_RAM
+	.capability = CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK | CDC_SELECT_SPEED | CDC_SELECT_DISC | CDC_MULTI_SESSION | CDC_MCN | CDC_MEDIA_CHANGED | CDC_PLAY_AUDIO | CDC_RESET | CDC_DRIVE_STATUS | CDC_GENERIC_PACKET | CDC_CD_R | CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM | CDC_RAM
 };
 
 static int __init find_capability(const char *type)
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3325660f7248..430d8af35cb1 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -2470,52 +2470,6 @@ static int ide_cdrom_packet(struct cdrom_device_info *cdi,
 	return cgc->stat;
 }
 
-static
-int ide_cdrom_dev_ioctl (struct cdrom_device_info *cdi,
-			 unsigned int cmd, unsigned long arg)
-{
-	struct packet_command cgc;
-	char buffer[16];
-	int stat;
-
-	init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_UNKNOWN);
-
-	/* These will be moved into the Uniform layer shortly... */
-	switch (cmd) {
- 	case CDROMSETSPINDOWN: {
- 		char spindown;
- 
- 		if (copy_from_user(&spindown, (void __user *) arg, sizeof(char)))
-			return -EFAULT;
- 
-                if ((stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CDROM_PAGE, 0)))
-			return stat;
-
- 		buffer[11] = (buffer[11] & 0xf0) | (spindown & 0x0f);
-
- 		return cdrom_mode_select(cdi, &cgc);
- 	} 
- 
- 	case CDROMGETSPINDOWN: {
- 		char spindown;
- 
-                if ((stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CDROM_PAGE, 0)))
-			return stat;
- 
- 		spindown = buffer[11] & 0x0f;
- 
-		if (copy_to_user((void __user *) arg, &spindown, sizeof (char)))
-			return -EFAULT;
- 
- 		return 0;
- 	}
-  
-	default:
-		return -EINVAL;
-	}
-
-}
-
 static
 int ide_cdrom_audio_ioctl (struct cdrom_device_info *cdi,
 			   unsigned int cmd, void *arg)
@@ -2852,12 +2806,11 @@ static struct cdrom_device_ops ide_cdrom_dops = {
 	.get_mcn		= ide_cdrom_get_mcn,
 	.reset			= ide_cdrom_reset,
 	.audio_ioctl		= ide_cdrom_audio_ioctl,
-	.dev_ioctl		= ide_cdrom_dev_ioctl,
 	.capability		= CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK |
 				CDC_SELECT_SPEED | CDC_SELECT_DISC |
 				CDC_MULTI_SESSION | CDC_MCN |
 				CDC_MEDIA_CHANGED | CDC_PLAY_AUDIO | CDC_RESET |
-				CDC_IOCTLS | CDC_DRIVE_STATUS | CDC_CD_R |
+				CDC_DRIVE_STATUS | CDC_CD_R |
 				CDC_CD_RW | CDC_DVD | CDC_DVD_R| CDC_DVD_RAM |
 				CDC_GENERIC_PACKET | CDC_MO_DRIVE | CDC_MRW |
 				CDC_MRW_W | CDC_RAM,
@@ -3367,6 +3320,45 @@ static int idecd_release(struct inode * inode, struct file * file)
 	return 0;
 }
 
+static int idecd_set_spindown(struct cdrom_device_info *cdi, unsigned long arg)
+{
+	struct packet_command cgc;
+	char buffer[16];
+	int stat;
+	char spindown;
+
+	if (copy_from_user(&spindown, (void __user *)arg, sizeof(char)))
+		return -EFAULT;
+
+	init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_UNKNOWN);
+
+	stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CDROM_PAGE, 0);
+	if (stat)
+		return stat;
+
+	buffer[11] = (buffer[11] & 0xf0) | (spindown & 0x0f);
+	return cdrom_mode_select(cdi, &cgc);
+}
+
+static int idecd_get_spindown(struct cdrom_device_info *cdi, unsigned long arg)
+{
+	struct packet_command cgc;
+	char buffer[16];
+	int stat;
+ 	char spindown;
+
+	init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_UNKNOWN);
+
+	stat = cdrom_mode_sense(cdi, &cgc, GPMODE_CDROM_PAGE, 0);
+	if (stat)
+		return stat;
+
+	spindown = buffer[11] & 0x0f;
+	if (copy_to_user((void __user *)arg, &spindown, sizeof (char)))
+		return -EFAULT;
+	return 0;
+}
+
 static int idecd_ioctl (struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
@@ -3374,7 +3366,16 @@ static int idecd_ioctl (struct inode *inode, struct file *file,
 	struct cdrom_info *info = ide_cd_g(bdev->bd_disk);
 	int err;
 
-	err  = generic_ide_ioctl(info->drive, file, bdev, cmd, arg);
+	switch (cmd) {
+ 	case CDROMSETSPINDOWN:
+		return idecd_set_spindown(&info->devinfo, arg);
+ 	case CDROMGETSPINDOWN:
+		return idecd_get_spindown(&info->devinfo, arg);
+	default:
+		break;
+ 	}
+
+	err = generic_ide_ioctl(info->drive, file, bdev, cmd, arg);
 	if (err == -EINVAL)
 		err = cdrom_ioctl(file, &info->devinfo, inode, cmd, arg);
 
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index f9c1192dc15e..7c80711e18ed 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -71,7 +71,7 @@ MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_CDROM_MAJOR);
 #define SR_CAPABILITIES \
 	(CDC_CLOSE_TRAY|CDC_OPEN_TRAY|CDC_LOCK|CDC_SELECT_SPEED| \
 	 CDC_SELECT_DISC|CDC_MULTI_SESSION|CDC_MCN|CDC_MEDIA_CHANGED| \
-	 CDC_PLAY_AUDIO|CDC_RESET|CDC_IOCTLS|CDC_DRIVE_STATUS| \
+	 CDC_PLAY_AUDIO|CDC_RESET|CDC_DRIVE_STATUS| \
 	 CDC_CD_R|CDC_CD_RW|CDC_DVD|CDC_DVD_R|CDC_DVD_RAM|CDC_GENERIC_PACKET| \
 	 CDC_MRW|CDC_MRW_W|CDC_RAM)
 
@@ -118,7 +118,6 @@ static struct cdrom_device_ops sr_dops = {
 	.get_mcn		= sr_get_mcn,
 	.reset			= sr_reset,
 	.audio_ioctl		= sr_audio_ioctl,
-	.dev_ioctl		= sr_dev_ioctl,
 	.capability		= SR_CAPABILITIES,
 	.generic_packet		= sr_packet,
 };
@@ -456,17 +455,33 @@ static int sr_block_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 {
 	struct scsi_cd *cd = scsi_cd(inode->i_bdev->bd_disk);
 	struct scsi_device *sdev = cd->device;
+	void __user *argp = (void __user *)arg;
+	int ret;
 
-        /*
-         * Send SCSI addressing ioctls directly to mid level, send other
-         * ioctls to cdrom/block level.
-         */
-        switch (cmd) {
-                case SCSI_IOCTL_GET_IDLUN:
-                case SCSI_IOCTL_GET_BUS_NUMBER:
-                        return scsi_ioctl(sdev, cmd, (void __user *)arg);
+	/*
+	 * Send SCSI addressing ioctls directly to mid level, send other
+	 * ioctls to cdrom/block level.
+	 */
+	switch (cmd) {
+	case SCSI_IOCTL_GET_IDLUN:
+	case SCSI_IOCTL_GET_BUS_NUMBER:
+		return scsi_ioctl(sdev, cmd, argp);
 	}
-	return cdrom_ioctl(file, &cd->cdi, inode, cmd, arg);
+
+	ret = cdrom_ioctl(file, &cd->cdi, inode, cmd, arg);
+	if (ret != ENOSYS)
+		return ret;
+
+	/*
+	 * ENODEV means that we didn't recognise the ioctl, or that we
+	 * cannot execute it in the current device state.  In either
+	 * case fall through to scsi_ioctl, which will return ENDOEV again
+	 * if it doesn't recognise the ioctl
+	 */
+	ret = scsi_nonblockable_ioctl(sdev, cmd, argp, NULL);
+	if (ret != -ENODEV)
+		return ret;
+	return scsi_ioctl(sdev, cmd, argp);
 }
 
 static int sr_block_media_changed(struct gendisk *disk)
diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h
index d2bcd99c272f..d65de9621b27 100644
--- a/drivers/scsi/sr.h
+++ b/drivers/scsi/sr.h
@@ -55,7 +55,6 @@ int sr_get_mcn(struct cdrom_device_info *, struct cdrom_mcn *);
 int sr_reset(struct cdrom_device_info *);
 int sr_select_speed(struct cdrom_device_info *cdi, int speed);
 int sr_audio_ioctl(struct cdrom_device_info *, unsigned int, void *);
-int sr_dev_ioctl(struct cdrom_device_info *, unsigned int, unsigned long);
 
 int sr_is_xa(Scsi_CD *);
 
diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c
index b65462f76484..d1268cb46837 100644
--- a/drivers/scsi/sr_ioctl.c
+++ b/drivers/scsi/sr_ioctl.c
@@ -562,22 +562,3 @@ int sr_is_xa(Scsi_CD *cd)
 #endif
 	return is_xa;
 }
-
-int sr_dev_ioctl(struct cdrom_device_info *cdi,
-		 unsigned int cmd, unsigned long arg)
-{
-	Scsi_CD *cd = cdi->handle;
-	int ret;
-	
-	ret = scsi_nonblockable_ioctl(cd->device, cmd,
-				      (void __user *)arg, NULL);
-	/*
-	 * ENODEV means that we didn't recognise the ioctl, or that we
-	 * cannot execute it in the current device state.  In either
-	 * case fall through to scsi_ioctl, which will return ENDOEV again
-	 * if it doesn't recognise the ioctl
-	 */
-	if (ret != -ENODEV)
-		return ret;
-	return scsi_ioctl(cd->device, cmd, (void __user *)arg);
-}
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index b68fdf1f3156..3c9b0bc05123 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -378,7 +378,6 @@ struct cdrom_generic_command
 #define CDC_MEDIA_CHANGED 	0x80    /* media changed */
 #define CDC_PLAY_AUDIO		0x100   /* audio functions */
 #define CDC_RESET               0x200   /* hard reset device */
-#define CDC_IOCTLS              0x400   /* driver has non-standard ioctls */
 #define CDC_DRIVE_STATUS        0x800   /* driver implements drive status */
 #define CDC_GENERIC_PACKET	0x1000	/* driver implements generic packets */
 #define CDC_CD_R		0x2000	/* drive is a CD-R */
@@ -974,9 +973,7 @@ struct cdrom_device_ops {
 	int (*reset) (struct cdrom_device_info *);
 	/* play stuff */
 	int (*audio_ioctl) (struct cdrom_device_info *,unsigned int, void *);
-	/* dev-specific */
- 	int (*dev_ioctl) (struct cdrom_device_info *,
-			  unsigned int, unsigned long);
+
 /* driver specifications */
 	const int capability;   /* capability flags */
 	int n_minors;           /* number of active minor devices */
-- 
cgit v1.2.3


From 804f1594cc3deb161e531a43d90c501f0db2635a Mon Sep 17 00:00:00 2001
From: Kyle McMartin <kyle@parisc-linux.org>
Date: Thu, 23 Mar 2006 03:00:16 -0800
Subject: [PATCH] Move read_mostly definition to asm/cache.h

Seems like needless clutter having a bunch of #if defined(CONFIG_$ARCH) in
include/linux/cache.h.  Move the per architecture section definition to
asm/cache.h, and keep the if-not-defined dummy case in linux/cache.h to
catch architectures which don't implement the section.

Verified that symbols still go in .data.read_mostly on parisc,
and the compile doesn't break.

Signed-off-by: Kyle McMartin <kyle@parisc-linux.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/cache.h    | 2 ++
 include/asm-ia64/cache.h    | 2 ++
 include/asm-parisc/cache.h  | 2 ++
 include/asm-sparc64/cache.h | 2 ++
 include/asm-x86_64/cache.h  | 2 ++
 include/linux/cache.h       | 4 +---
 6 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/cache.h b/include/asm-i386/cache.h
index 615911e5bd24..ca15c9c665cf 100644
--- a/include/asm-i386/cache.h
+++ b/include/asm-i386/cache.h
@@ -10,4 +10,6 @@
 #define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
 #define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)
 
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+
 #endif
diff --git a/include/asm-ia64/cache.h b/include/asm-ia64/cache.h
index 40dd25195d65..f0a104db8f20 100644
--- a/include/asm-ia64/cache.h
+++ b/include/asm-ia64/cache.h
@@ -25,4 +25,6 @@
 # define SMP_CACHE_BYTES	(1 << 3)
 #endif
 
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+
 #endif /* _ASM_IA64_CACHE_H */
diff --git a/include/asm-parisc/cache.h b/include/asm-parisc/cache.h
index 93f179f13ce8..ae50f8e12eed 100644
--- a/include/asm-parisc/cache.h
+++ b/include/asm-parisc/cache.h
@@ -29,6 +29,8 @@
 
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+
 extern void flush_data_cache_local(void *);  /* flushes local data-cache only */
 extern void flush_instruction_cache_local(void *); /* flushes local code-cache only */
 #ifdef CONFIG_SMP
diff --git a/include/asm-sparc64/cache.h b/include/asm-sparc64/cache.h
index f7d35a2ae9b8..e9df17acedde 100644
--- a/include/asm-sparc64/cache.h
+++ b/include/asm-sparc64/cache.h
@@ -13,4 +13,6 @@
 #define        SMP_CACHE_BYTES_SHIFT	6
 #define        SMP_CACHE_BYTES		(1 << SMP_CACHE_BYTES_SHIFT) /* L2 cache line size. */
 
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+
 #endif
diff --git a/include/asm-x86_64/cache.h b/include/asm-x86_64/cache.h
index 263f0a211ed7..c8043a16152e 100644
--- a/include/asm-x86_64/cache.h
+++ b/include/asm-x86_64/cache.h
@@ -20,6 +20,8 @@
        __attribute__((__section__(".data.page_aligned")))
 #endif
 
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+
 #endif
 
 #endif
diff --git a/include/linux/cache.h b/include/linux/cache.h
index d22e632f41fb..cc4b3aafad9a 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -13,9 +13,7 @@
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 #endif
 
-#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_IA64) || defined(CONFIG_PARISC)
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
-#else
+#ifndef __read_mostly
 #define __read_mostly
 #endif
 
-- 
cgit v1.2.3


From c039e3134ae62863bbc8e8429b29e3c43cf21b2a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Thu, 23 Mar 2006 03:00:28 -0800
Subject: [PATCH] sem2mutex: blockdev #2

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ioctl.c                   | 22 +++++++++++-----------
 drivers/block/rd.c              |  4 ++--
 drivers/s390/block/dasd_ioctl.c |  8 ++++----
 fs/block_dev.c                  | 28 ++++++++++++++--------------
 fs/buffer.c                     |  6 +++---
 fs/super.c                      |  4 ++--
 include/linux/fs.h              |  4 ++--
 7 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index e1109491c234..35fdb7dc6512 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -42,9 +42,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 					return -EINVAL;
 			}
 			/* partition number in use? */
-			down(&bdev->bd_sem);
+			mutex_lock(&bdev->bd_mutex);
 			if (disk->part[part - 1]) {
-				up(&bdev->bd_sem);
+				mutex_unlock(&bdev->bd_mutex);
 				return -EBUSY;
 			}
 			/* overlap? */
@@ -55,13 +55,13 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 					continue;
 				if (!(start+length <= s->start_sect ||
 				      start >= s->start_sect + s->nr_sects)) {
-					up(&bdev->bd_sem);
+					mutex_unlock(&bdev->bd_mutex);
 					return -EBUSY;
 				}
 			}
 			/* all seems OK */
 			add_partition(disk, part, start, length);
-			up(&bdev->bd_sem);
+			mutex_unlock(&bdev->bd_mutex);
 			return 0;
 		case BLKPG_DEL_PARTITION:
 			if (!disk->part[part-1])
@@ -71,9 +71,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			bdevp = bdget_disk(disk, part);
 			if (!bdevp)
 				return -ENOMEM;
-			down(&bdevp->bd_sem);
+			mutex_lock(&bdevp->bd_mutex);
 			if (bdevp->bd_openers) {
-				up(&bdevp->bd_sem);
+				mutex_unlock(&bdevp->bd_mutex);
 				bdput(bdevp);
 				return -EBUSY;
 			}
@@ -81,10 +81,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			fsync_bdev(bdevp);
 			invalidate_bdev(bdevp, 0);
 
-			down(&bdev->bd_sem);
+			mutex_lock(&bdev->bd_mutex);
 			delete_partition(disk, part);
-			up(&bdev->bd_sem);
-			up(&bdevp->bd_sem);
+			mutex_unlock(&bdev->bd_mutex);
+			mutex_unlock(&bdevp->bd_mutex);
 			bdput(bdevp);
 
 			return 0;
@@ -102,10 +102,10 @@ static int blkdev_reread_part(struct block_device *bdev)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
-	if (down_trylock(&bdev->bd_sem))
+	if (!mutex_trylock(&bdev->bd_mutex))
 		return -EBUSY;
 	res = rescan_partitions(disk, bdev);
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	return res;
 }
 
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index ffd6abd6d5a0..1c54f46d3f70 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -310,12 +310,12 @@ static int rd_ioctl(struct inode *inode, struct file *file,
 	 * cache
 	 */
 	error = -EBUSY;
-	down(&bdev->bd_sem);
+	mutex_lock(&bdev->bd_mutex);
 	if (bdev->bd_openers <= 2) {
 		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
 		error = 0;
 	}
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	return error;
 }
 
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index fafeeae52675..f9930552ab54 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -151,9 +151,9 @@ dasd_ioctl_enable(struct block_device *bdev, int no, long args)
 		return -ENODEV;
 	dasd_enable_device(device);
 	/* Formatting the dasd device can change the capacity. */
-	down(&bdev->bd_sem);
+	mutex_lock(&bdev->bd_mutex);
 	i_size_write(bdev->bd_inode, (loff_t)get_capacity(device->gdp) << 9);
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	return 0;
 }
 
@@ -184,9 +184,9 @@ dasd_ioctl_disable(struct block_device *bdev, int no, long args)
 	 * Set i_size to zero, since read, write, etc. check against this
 	 * value.
 	 */
-	down(&bdev->bd_sem);
+	mutex_lock(&bdev->bd_mutex);
 	i_size_write(bdev->bd_inode, 0);
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	return 0;
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6e50346fb1ee..44d05e6e34db 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -265,8 +265,8 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 	    SLAB_CTOR_CONSTRUCTOR)
 	{
 		memset(bdev, 0, sizeof(*bdev));
-		sema_init(&bdev->bd_sem, 1);
-		sema_init(&bdev->bd_mount_sem, 1);
+		mutex_init(&bdev->bd_mutex);
+		mutex_init(&bdev->bd_mount_mutex);
 		INIT_LIST_HEAD(&bdev->bd_inodes);
 		INIT_LIST_HEAD(&bdev->bd_list);
 		inode_init_once(&ei->vfs_inode);
@@ -574,7 +574,7 @@ static int do_open(struct block_device *bdev, struct file *file)
 	}
 	owner = disk->fops->owner;
 
-	down(&bdev->bd_sem);
+	mutex_lock(&bdev->bd_mutex);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
 		bdev->bd_contains = bdev;
@@ -605,21 +605,21 @@ static int do_open(struct block_device *bdev, struct file *file)
 			if (ret)
 				goto out_first;
 			bdev->bd_contains = whole;
-			down(&whole->bd_sem);
+			mutex_lock(&whole->bd_mutex);
 			whole->bd_part_count++;
 			p = disk->part[part - 1];
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
 			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
 				whole->bd_part_count--;
-				up(&whole->bd_sem);
+				mutex_unlock(&whole->bd_mutex);
 				ret = -ENXIO;
 				goto out_first;
 			}
 			kobject_get(&p->kobj);
 			bdev->bd_part = p;
 			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
-			up(&whole->bd_sem);
+			mutex_unlock(&whole->bd_mutex);
 		}
 	} else {
 		put_disk(disk);
@@ -633,13 +633,13 @@ static int do_open(struct block_device *bdev, struct file *file)
 			if (bdev->bd_invalidated)
 				rescan_partitions(bdev->bd_disk, bdev);
 		} else {
-			down(&bdev->bd_contains->bd_sem);
+			mutex_lock(&bdev->bd_contains->bd_mutex);
 			bdev->bd_contains->bd_part_count++;
-			up(&bdev->bd_contains->bd_sem);
+			mutex_unlock(&bdev->bd_contains->bd_mutex);
 		}
 	}
 	bdev->bd_openers++;
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	unlock_kernel();
 	return 0;
 
@@ -652,7 +652,7 @@ out_first:
 	put_disk(disk);
 	module_put(owner);
 out:
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	unlock_kernel();
 	if (ret)
 		bdput(bdev);
@@ -714,7 +714,7 @@ int blkdev_put(struct block_device *bdev)
 	struct inode *bd_inode = bdev->bd_inode;
 	struct gendisk *disk = bdev->bd_disk;
 
-	down(&bdev->bd_sem);
+	mutex_lock(&bdev->bd_mutex);
 	lock_kernel();
 	if (!--bdev->bd_openers) {
 		sync_blockdev(bdev);
@@ -724,9 +724,9 @@ int blkdev_put(struct block_device *bdev)
 		if (disk->fops->release)
 			ret = disk->fops->release(bd_inode, NULL);
 	} else {
-		down(&bdev->bd_contains->bd_sem);
+		mutex_lock(&bdev->bd_contains->bd_mutex);
 		bdev->bd_contains->bd_part_count--;
-		up(&bdev->bd_contains->bd_sem);
+		mutex_unlock(&bdev->bd_contains->bd_mutex);
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
@@ -746,7 +746,7 @@ int blkdev_put(struct block_device *bdev)
 		bdev->bd_contains = NULL;
 	}
 	unlock_kernel();
-	up(&bdev->bd_sem);
+	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	return ret;
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index 1d3683d496f8..0d6ca7bac6c8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -201,7 +201,7 @@ int fsync_bdev(struct block_device *bdev)
  * freeze_bdev  --  lock a filesystem and force it into a consistent state
  * @bdev:	blockdevice to lock
  *
- * This takes the block device bd_mount_sem to make sure no new mounts
+ * This takes the block device bd_mount_mutex to make sure no new mounts
  * happen on bdev until thaw_bdev() is called.
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
@@ -210,7 +210,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 {
 	struct super_block *sb;
 
-	down(&bdev->bd_mount_sem);
+	mutex_lock(&bdev->bd_mount_mutex);
 	sb = get_super(bdev);
 	if (sb && !(sb->s_flags & MS_RDONLY)) {
 		sb->s_frozen = SB_FREEZE_WRITE;
@@ -264,7 +264,7 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 		drop_super(sb);
 	}
 
-	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_mount_mutex);
 }
 EXPORT_SYMBOL(thaw_bdev);
 
diff --git a/fs/super.c b/fs/super.c
index e20b5580afd5..8f9c9b3af70c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -693,9 +693,9 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 	 * will protect the lockfs code from trying to start a snapshot
 	 * while we are mounting
 	 */
-	down(&bdev->bd_mount_sem);
+	mutex_lock(&bdev->bd_mount_mutex);
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
-	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_mount_mutex);
 	if (IS_ERR(s))
 		goto out;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 128d0082522c..009ac96053fe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -397,8 +397,8 @@ struct block_device {
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
 	struct inode *		bd_inode;	/* will die */
 	int			bd_openers;
-	struct semaphore	bd_sem;	/* open/close mutex */
-	struct semaphore	bd_mount_sem;	/* mount mutex */
+	struct mutex		bd_mutex;	/* open/close mutex */
+	struct mutex		bd_mount_mutex;	/* mount mutex */
 	struct list_head	bd_inodes;
 	void *			bd_holder;
 	int			bd_holders;
-- 
cgit v1.2.3


From d3be915fc5e7d19a2283ad9b0fe0782a74675d0a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:29 -0800
Subject: [PATCH] sem2mutex: quota

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dquot.c            | 117 +++++++++++++++++++++++++-------------------------
 fs/ext3/super.c       |   4 +-
 fs/quota.c            |   6 +--
 fs/quota_v2.c         |   2 +-
 fs/super.c            |   4 +-
 include/linux/quota.h |   7 +--
 6 files changed, 71 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dquot.c b/fs/dquot.c
index 9376a4378988..acf07e581f8c 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -103,12 +103,12 @@
  * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
  * for altering the flag i_mutex is also needed).  If operation is holding
  * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
- * dqonoff_sem.
+ * dqonoff_mutex.
  * This locking assures that:
  *   a) update/access to dquot pointers in inode is serialized
  *   b) everyone is guarded against invalidate_dquots()
  *
- * Each dquot has its dq_lock semaphore. Locked dquots might not be referenced
+ * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
  * from inodes (dquot_alloc_space() and such don't check the dq_lock).
  * Currently dquot is locked only when it is being read to memory (or space for
  * it is being allocated) on the first dqget() and when it is being released on
@@ -118,8 +118,9 @@
  * spinlock to internal buffers before writing.
  *
  * Lock ordering (including related VFS locks) is the following:
- *  i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > dqio_sem
- * i_mutex on quota files is special (it's below dqio_sem)
+ *   i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock >
+ *   dqio_mutex
+ * i_mutex on quota files is special (it's below dqio_mutex)
  */
 
 static DEFINE_SPINLOCK(dq_list_lock);
@@ -280,8 +281,8 @@ static inline void remove_inuse(struct dquot *dquot)
 
 static void wait_on_dquot(struct dquot *dquot)
 {
-	down(&dquot->dq_lock);
-	up(&dquot->dq_lock);
+	mutex_lock(&dquot->dq_lock);
+	mutex_unlock(&dquot->dq_lock);
 }
 
 #define mark_dquot_dirty(dquot) ((dquot)->dq_sb->dq_op->mark_dirty(dquot))
@@ -320,8 +321,8 @@ int dquot_acquire(struct dquot *dquot)
 	int ret = 0, ret2 = 0;
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 
-	down(&dquot->dq_lock);
-	down(&dqopt->dqio_sem);
+	mutex_lock(&dquot->dq_lock);
+	mutex_lock(&dqopt->dqio_mutex);
 	if (!test_bit(DQ_READ_B, &dquot->dq_flags))
 		ret = dqopt->ops[dquot->dq_type]->read_dqblk(dquot);
 	if (ret < 0)
@@ -342,8 +343,8 @@ int dquot_acquire(struct dquot *dquot)
 	}
 	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
 out_iolock:
-	up(&dqopt->dqio_sem);
-	up(&dquot->dq_lock);
+	mutex_unlock(&dqopt->dqio_mutex);
+	mutex_unlock(&dquot->dq_lock);
 	return ret;
 }
 
@@ -355,7 +356,7 @@ int dquot_commit(struct dquot *dquot)
 	int ret = 0, ret2 = 0;
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 
-	down(&dqopt->dqio_sem);
+	mutex_lock(&dqopt->dqio_mutex);
 	spin_lock(&dq_list_lock);
 	if (!clear_dquot_dirty(dquot)) {
 		spin_unlock(&dq_list_lock);
@@ -372,7 +373,7 @@ int dquot_commit(struct dquot *dquot)
 			ret = ret2;
 	}
 out_sem:
-	up(&dqopt->dqio_sem);
+	mutex_unlock(&dqopt->dqio_mutex);
 	return ret;
 }
 
@@ -384,11 +385,11 @@ int dquot_release(struct dquot *dquot)
 	int ret = 0, ret2 = 0;
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 
-	down(&dquot->dq_lock);
+	mutex_lock(&dquot->dq_lock);
 	/* Check whether we are not racing with some other dqget() */
 	if (atomic_read(&dquot->dq_count) > 1)
 		goto out_dqlock;
-	down(&dqopt->dqio_sem);
+	mutex_lock(&dqopt->dqio_mutex);
 	if (dqopt->ops[dquot->dq_type]->release_dqblk) {
 		ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot);
 		/* Write the info */
@@ -398,9 +399,9 @@ int dquot_release(struct dquot *dquot)
 			ret = ret2;
 	}
 	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
-	up(&dqopt->dqio_sem);
+	mutex_unlock(&dqopt->dqio_mutex);
 out_dqlock:
-	up(&dquot->dq_lock);
+	mutex_unlock(&dquot->dq_lock);
 	return ret;
 }
 
@@ -464,7 +465,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	struct quota_info *dqopt = sb_dqopt(sb);
 	int cnt;
 
-	down(&dqopt->dqonoff_sem);
+	mutex_lock(&dqopt->dqonoff_mutex);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
@@ -499,7 +500,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	spin_lock(&dq_list_lock);
 	dqstats.syncs++;
 	spin_unlock(&dq_list_lock);
-	up(&dqopt->dqonoff_sem);
+	mutex_unlock(&dqopt->dqonoff_mutex);
 
 	return 0;
 }
@@ -540,7 +541,7 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 /*
  * Put reference to dquot
  * NOTE: If you change this function please check whether dqput_blocks() works right...
- * MUST be called with either dqptr_sem or dqonoff_sem held
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
 static void dqput(struct dquot *dquot)
 {
@@ -605,7 +606,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 		return NODQUOT;
 
 	memset((caddr_t)dquot, 0, sizeof(struct dquot));
-	sema_init(&dquot->dq_lock, 1);
+	mutex_init(&dquot->dq_lock);
 	INIT_LIST_HEAD(&dquot->dq_free);
 	INIT_LIST_HEAD(&dquot->dq_inuse);
 	INIT_HLIST_NODE(&dquot->dq_hash);
@@ -620,7 +621,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 
 /*
  * Get reference to dquot
- * MUST be called with either dqptr_sem or dqonoff_sem held
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
 static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
@@ -686,7 +687,7 @@ static int dqinit_needed(struct inode *inode, int type)
 	return 0;
 }
 
-/* This routine is guarded by dqonoff_sem semaphore */
+/* This routine is guarded by dqonoff_mutex mutex */
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct list_head *p;
@@ -964,8 +965,8 @@ int dquot_initialize(struct inode *inode, int type)
 	unsigned int id = 0;
 	int cnt, ret = 0;
 
-	/* First test before acquiring semaphore - solves deadlocks when we
-         * re-enter the quota code and are already holding the semaphore */
+	/* First test before acquiring mutex - solves deadlocks when we
+         * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return 0;
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1028,8 +1029,8 @@ int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
 
-	/* First test before acquiring semaphore - solves deadlocks when we
-         * re-enter the quota code and are already holding the semaphore */
+	/* First test before acquiring mutex - solves deadlocks when we
+         * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode)) {
 out_add:
 		inode_add_bytes(inode, number);
@@ -1077,8 +1078,8 @@ int dquot_alloc_inode(const struct inode *inode, unsigned long number)
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
 
-	/* First test before acquiring semaphore - solves deadlocks when we
-         * re-enter the quota code and are already holding the semaphore */
+	/* First test before acquiring mutex - solves deadlocks when we
+         * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return QUOTA_OK;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
@@ -1121,8 +1122,8 @@ int dquot_free_space(struct inode *inode, qsize_t number)
 {
 	unsigned int cnt;
 
-	/* First test before acquiring semaphore - solves deadlocks when we
-         * re-enter the quota code and are already holding the semaphore */
+	/* First test before acquiring mutex - solves deadlocks when we
+         * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode)) {
 out_sub:
 		inode_sub_bytes(inode, number);
@@ -1157,8 +1158,8 @@ int dquot_free_inode(const struct inode *inode, unsigned long number)
 {
 	unsigned int cnt;
 
-	/* First test before acquiring semaphore - solves deadlocks when we
-         * re-enter the quota code and are already holding the semaphore */
+	/* First test before acquiring mutex - solves deadlocks when we
+         * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return QUOTA_OK;
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1197,8 +1198,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	    chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
 	char warntype[MAXQUOTAS];
 
-	/* First test before acquiring semaphore - solves deadlocks when we
-         * re-enter the quota code and are already holding the semaphore */
+	/* First test before acquiring mutex - solves deadlocks when we
+         * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return QUOTA_OK;
 	/* Clear the arrays */
@@ -1292,9 +1293,9 @@ int dquot_commit_info(struct super_block *sb, int type)
 	int ret;
 	struct quota_info *dqopt = sb_dqopt(sb);
 
-	down(&dqopt->dqio_sem);
+	mutex_lock(&dqopt->dqio_mutex);
 	ret = dqopt->ops[type]->write_file_info(sb, type);
-	up(&dqopt->dqio_sem);
+	mutex_unlock(&dqopt->dqio_mutex);
 	return ret;
 }
 
@@ -1350,7 +1351,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 	struct inode *toputinode[MAXQUOTAS];
 
 	/* We need to serialize quota_off() for device */
-	down(&dqopt->dqonoff_sem);
+	mutex_lock(&dqopt->dqonoff_mutex);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		toputinode[cnt] = NULL;
 		if (type != -1 && cnt != type)
@@ -1379,7 +1380,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 		dqopt->info[cnt].dqi_bgrace = 0;
 		dqopt->ops[cnt] = NULL;
 	}
-	up(&dqopt->dqonoff_sem);
+	mutex_unlock(&dqopt->dqonoff_mutex);
 	/* Sync the superblock so that buffers with quota data are written to
 	 * disk (and so userspace sees correct data afterwards). */
 	if (sb->s_op->sync_fs)
@@ -1392,7 +1393,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 	 * changes done by userspace on the next quotaon() */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		if (toputinode[cnt]) {
-			down(&dqopt->dqonoff_sem);
+			mutex_lock(&dqopt->dqonoff_mutex);
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
 			if (!sb_has_quota_enabled(sb, cnt)) {
@@ -1404,7 +1405,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 				mark_inode_dirty(toputinode[cnt]);
 				iput(toputinode[cnt]);
 			}
-			up(&dqopt->dqonoff_sem);
+			mutex_unlock(&dqopt->dqonoff_mutex);
 		}
 	if (sb->s_bdev)
 		invalidate_bdev(sb->s_bdev, 0);
@@ -1445,7 +1446,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	/* And now flush the block cache so that kernel sees the changes */
 	invalidate_bdev(sb->s_bdev, 0);
 	mutex_lock(&inode->i_mutex);
-	down(&dqopt->dqonoff_sem);
+	mutex_lock(&dqopt->dqonoff_mutex);
 	if (sb_has_quota_enabled(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
@@ -1470,17 +1471,17 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	dqopt->ops[type] = fmt->qf_ops;
 	dqopt->info[type].dqi_format = fmt;
 	INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
-	down(&dqopt->dqio_sem);
+	mutex_lock(&dqopt->dqio_mutex);
 	if ((error = dqopt->ops[type]->read_file_info(sb, type)) < 0) {
-		up(&dqopt->dqio_sem);
+		mutex_unlock(&dqopt->dqio_mutex);
 		goto out_file_init;
 	}
-	up(&dqopt->dqio_sem);
+	mutex_unlock(&dqopt->dqio_mutex);
 	mutex_unlock(&inode->i_mutex);
 	set_enable_flags(dqopt, type);
 
 	add_dquot_ref(sb, type);
-	up(&dqopt->dqonoff_sem);
+	mutex_unlock(&dqopt->dqonoff_mutex);
 
 	return 0;
 
@@ -1488,7 +1489,7 @@ out_file_init:
 	dqopt->files[type] = NULL;
 	iput(inode);
 out_lock:
-	up(&dqopt->dqonoff_sem);
+	mutex_unlock(&dqopt->dqonoff_mutex);
 	if (oldflags != -1) {
 		down_write(&dqopt->dqptr_sem);
 		/* Set the flags back (in the case of accidental quotaon()
@@ -1576,14 +1577,14 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	down(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	if (!(dquot = dqget(sb, id, type))) {
-		up(&sb_dqopt(sb)->dqonoff_sem);
+		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 	}
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
-	up(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
 
@@ -1645,14 +1646,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	down(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	if (!(dquot = dqget(sb, id, type))) {
-		up(&sb_dqopt(sb)->dqonoff_sem);
+		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 	}
 	do_set_dqblk(dquot, di);
 	dqput(dquot);
-	up(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
 
@@ -1661,9 +1662,9 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
   
-	down(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	if (!sb_has_quota_enabled(sb, type)) {
-		up(&sb_dqopt(sb)->dqonoff_sem);
+		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 	}
 	mi = sb_dqopt(sb)->info + type;
@@ -1673,7 +1674,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	ii->dqi_flags = mi->dqi_flags & DQF_MASK;
 	ii->dqi_valid = IIF_ALL;
 	spin_unlock(&dq_data_lock);
-	up(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
 
@@ -1682,9 +1683,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 
-	down(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	if (!sb_has_quota_enabled(sb, type)) {
-		up(&sb_dqopt(sb)->dqonoff_sem);
+		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 	}
 	mi = sb_dqopt(sb)->info + type;
@@ -1699,7 +1700,7 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mark_info_dirty(sb, type);
 	/* Force write to disk */
 	sb->dq_op->write_info(sb, type);
-	up(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 56bf76586019..efa832059143 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2382,8 +2382,8 @@ static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
  * Process 1                         Process 2
  * ext3_create()                     quota_sync()
  *   journal_start()                   write_dquot()
- *   DQUOT_INIT()                        down(dqio_sem)
- *     down(dqio_sem)                    journal_start()
+ *   DQUOT_INIT()                        down(dqio_mutex)
+ *     down(dqio_mutex)                    journal_start()
  *
  */
 
diff --git a/fs/quota.c b/fs/quota.c
index ba9e0bf32f67..d6a2be826e29 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -170,10 +170,10 @@ static void quota_sync_sb(struct super_block *sb, int type)
 
 	/* Now when everything is written we can discard the pagecache so
 	 * that userspace sees the changes. We need i_mutex and so we could
-	 * not do it inside dqonoff_sem. Moreover we need to be carefull
+	 * not do it inside dqonoff_mutex. Moreover we need to be carefull
 	 * about races with quotaoff() (that is the reason why we have own
 	 * reference to inode). */
-	down(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		discard[cnt] = NULL;
 		if (type != -1 && cnt != type)
@@ -182,7 +182,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 			continue;
 		discard[cnt] = igrab(sb_dqopt(sb)->files[cnt]);
 	}
-	up(&sb_dqopt(sb)->dqonoff_sem);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (discard[cnt]) {
 			mutex_lock(&discard[cnt]->i_mutex);
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b4199ec3ece4..c519a583e681 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -394,7 +394,7 @@ static int v2_write_dquot(struct dquot *dquot)
 	ssize_t ret;
 	struct v2_disk_dqblk ddquot, empty;
 
-	/* dq_off is guarded by dqio_sem */
+	/* dq_off is guarded by dqio_mutex */
 	if (!dquot->dq_off)
 		if ((ret = dq_insert_tree(dquot)) < 0) {
 			printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
diff --git a/fs/super.c b/fs/super.c
index 8f9c9b3af70c..9cc6545dfa4c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -77,8 +77,8 @@ static struct super_block *alloc_super(void)
 		s->s_count = S_BIAS;
 		atomic_set(&s->s_active, 1);
 		sema_init(&s->s_vfs_rename_sem,1);
-		sema_init(&s->s_dquot.dqio_sem, 1);
-		sema_init(&s->s_dquot.dqonoff_sem, 1);
+		mutex_init(&s->s_dquot.dqio_mutex);
+		mutex_init(&s->s_dquot.dqonoff_mutex);
 		init_rwsem(&s->s_dquot.dqptr_sem);
 		init_waitqueue_head(&s->s_wait_unfrozen);
 		s->s_maxbytes = MAX_NON_LFS;
diff --git a/include/linux/quota.h b/include/linux/quota.h
index f33aeb22c26a..8dc2d04a103f 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -38,6 +38,7 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
+#include <linux/mutex.h>
 
 #define __DQUOT_VERSION__	"dquot_6.5.1"
 #define __DQUOT_NUM_VERSION__	6*10000+5*100+1
@@ -215,7 +216,7 @@ struct dquot {
 	struct list_head dq_inuse;	/* List of all quotas */
 	struct list_head dq_free;	/* Free list element */
 	struct list_head dq_dirty;	/* List of dirty dquots */
-	struct semaphore dq_lock;	/* dquot IO lock */
+	struct mutex dq_lock;		/* dquot IO lock */
 	atomic_t dq_count;		/* Use count */
 	wait_queue_head_t dq_wait_unused;	/* Wait queue for dquot to become unused */
 	struct super_block *dq_sb;	/* superblock this applies to */
@@ -285,8 +286,8 @@ struct quota_format_type {
 
 struct quota_info {
 	unsigned int flags;			/* Flags for diskquotas on this device */
-	struct semaphore dqio_sem;		/* lock device while I/O in progress */
-	struct semaphore dqonoff_sem;		/* Serialize quotaon & quotaoff */
+	struct mutex dqio_mutex;		/* lock device while I/O in progress */
+	struct mutex dqonoff_mutex;		/* Serialize quotaon & quotaoff */
 	struct rw_semaphore dqptr_sem;		/* serialize ops using quota_info struct, pointers from inode to dquots */
 	struct inode *files[MAXQUOTAS];		/* inodes of quotafiles */
 	struct mem_dqinfo info[MAXQUOTAS];	/* Information for each quota type */
-- 
cgit v1.2.3


From d4f9af9dac4ecb75818f909168f87b441cc95653 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:30 -0800
Subject: [PATCH] sem2mutex: inotify

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Acked-by: Robert Love <rml@novell.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c         |   2 +-
 fs/inotify.c       | 110 ++++++++++++++++++++++++++---------------------------
 include/linux/fs.h |   2 +-
 3 files changed, 57 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index d0be6159eb7f..603e93ef0c6f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -206,7 +206,7 @@ void inode_init_once(struct inode *inode)
 	i_size_ordered_init(inode);
 #ifdef CONFIG_INOTIFY
 	INIT_LIST_HEAD(&inode->inotify_watches);
-	sema_init(&inode->inotify_sem, 1);
+	mutex_init(&inode->inotify_mutex);
 #endif
 }
 
diff --git a/fs/inotify.c b/fs/inotify.c
index 3041503bde02..60d9653d55b7 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -56,8 +56,8 @@ int inotify_max_queued_events;
  * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
  * iprune_sem (synchronize shrink_icache_memory())
  * 	inode_lock (protects the super_block->s_inodes list)
- * 	inode->inotify_sem (protects inode->inotify_watches and watches->i_list)
- * 		inotify_dev->sem (protects inotify_device and watches->d_list)
+ * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
+ * 		inotify_dev->mutex (protects inotify_device and watches->d_list)
  */
 
 /*
@@ -79,12 +79,12 @@ int inotify_max_queued_events;
 /*
  * struct inotify_device - represents an inotify instance
  *
- * This structure is protected by the semaphore 'sem'.
+ * This structure is protected by the mutex 'mutex'.
  */
 struct inotify_device {
 	wait_queue_head_t 	wq;		/* wait queue for i/o */
 	struct idr		idr;		/* idr mapping wd -> watch */
-	struct semaphore	sem;		/* protects this bad boy */
+	struct mutex		mutex;		/* protects this bad boy */
 	struct list_head 	events;		/* list of queued events */
 	struct list_head	watches;	/* list of watches */
 	atomic_t		count;		/* reference count */
@@ -101,7 +101,7 @@ struct inotify_device {
  * device.  In read(), this list is walked and all events that can fit in the
  * buffer are returned.
  *
- * Protected by dev->sem of the device in which we are queued.
+ * Protected by dev->mutex of the device in which we are queued.
  */
 struct inotify_kernel_event {
 	struct inotify_event	event;	/* the user-space event */
@@ -112,8 +112,8 @@ struct inotify_kernel_event {
 /*
  * struct inotify_watch - represents a watch request on a specific inode
  *
- * d_list is protected by dev->sem of the associated watch->dev.
- * i_list and mask are protected by inode->inotify_sem of the associated inode.
+ * d_list is protected by dev->mutex of the associated watch->dev.
+ * i_list and mask are protected by inode->inotify_mutex of the associated inode.
  * dev, inode, and wd are never written to once the watch is created.
  */
 struct inotify_watch {
@@ -261,7 +261,7 @@ static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
 /*
  * inotify_dev_get_event - return the next event in the given dev's queue
  *
- * Caller must hold dev->sem.
+ * Caller must hold dev->mutex.
  */
 static inline struct inotify_kernel_event *
 inotify_dev_get_event(struct inotify_device *dev)
@@ -272,7 +272,7 @@ inotify_dev_get_event(struct inotify_device *dev)
 /*
  * inotify_dev_queue_event - add a new event to the given device
  *
- * Caller must hold dev->sem.  Can sleep (calls kernel_event()).
+ * Caller must hold dev->mutex.  Can sleep (calls kernel_event()).
  */
 static void inotify_dev_queue_event(struct inotify_device *dev,
 				    struct inotify_watch *watch, u32 mask,
@@ -315,7 +315,7 @@ static void inotify_dev_queue_event(struct inotify_device *dev,
 /*
  * remove_kevent - cleans up and ultimately frees the given kevent
  *
- * Caller must hold dev->sem.
+ * Caller must hold dev->mutex.
  */
 static void remove_kevent(struct inotify_device *dev,
 			  struct inotify_kernel_event *kevent)
@@ -332,7 +332,7 @@ static void remove_kevent(struct inotify_device *dev,
 /*
  * inotify_dev_event_dequeue - destroy an event on the given device
  *
- * Caller must hold dev->sem.
+ * Caller must hold dev->mutex.
  */
 static void inotify_dev_event_dequeue(struct inotify_device *dev)
 {
@@ -346,7 +346,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
 /*
  * inotify_dev_get_wd - returns the next WD for use by the given dev
  *
- * Callers must hold dev->sem.  This function can sleep.
+ * Callers must hold dev->mutex.  This function can sleep.
  */
 static int inotify_dev_get_wd(struct inotify_device *dev,
 			      struct inotify_watch *watch)
@@ -383,7 +383,7 @@ static int find_inode(const char __user *dirname, struct nameidata *nd,
 /*
  * create_watch - creates a watch on the given device.
  *
- * Callers must hold dev->sem.  Calls inotify_dev_get_wd() so may sleep.
+ * Callers must hold dev->mutex.  Calls inotify_dev_get_wd() so may sleep.
  * Both 'dev' and 'inode' (by way of nameidata) need to be pinned.
  */
 static struct inotify_watch *create_watch(struct inotify_device *dev,
@@ -434,7 +434,7 @@ static struct inotify_watch *create_watch(struct inotify_device *dev,
 /*
  * inotify_find_dev - find the watch associated with the given inode and dev
  *
- * Callers must hold inode->inotify_sem.
+ * Callers must hold inode->inotify_mutex.
  */
 static struct inotify_watch *inode_find_dev(struct inode *inode,
 					    struct inotify_device *dev)
@@ -469,7 +469,7 @@ static void remove_watch_no_event(struct inotify_watch *watch,
  * the IN_IGNORED event to the given device signifying that the inode is no
  * longer watched.
  *
- * Callers must hold both inode->inotify_sem and dev->sem.  We drop a
+ * Callers must hold both inode->inotify_mutex and dev->mutex.  We drop a
  * reference to the inode before returning.
  *
  * The inode is not iput() so as to remain atomic.  If the inode needs to be
@@ -507,21 +507,21 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
 	if (!inotify_inode_watched(inode))
 		return;
 
-	down(&inode->inotify_sem);
+	mutex_lock(&inode->inotify_mutex);
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
 		u32 watch_mask = watch->mask;
 		if (watch_mask & mask) {
 			struct inotify_device *dev = watch->dev;
 			get_inotify_watch(watch);
-			down(&dev->sem);
+			mutex_lock(&dev->mutex);
 			inotify_dev_queue_event(dev, watch, mask, cookie, name);
 			if (watch_mask & IN_ONESHOT)
 				remove_watch_no_event(watch, dev);
-			up(&dev->sem);
+			mutex_unlock(&dev->mutex);
 			put_inotify_watch(watch);
 		}
 	}
-	up(&inode->inotify_sem);
+	mutex_unlock(&inode->inotify_mutex);
 }
 EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
 
@@ -626,16 +626,16 @@ void inotify_unmount_inodes(struct list_head *list)
 			iput(need_iput_tmp);
 
 		/* for each watch, send IN_UNMOUNT and then remove it */
-		down(&inode->inotify_sem);
+		mutex_lock(&inode->inotify_mutex);
 		watches = &inode->inotify_watches;
 		list_for_each_entry_safe(watch, next_w, watches, i_list) {
 			struct inotify_device *dev = watch->dev;
-			down(&dev->sem);
+			mutex_lock(&dev->mutex);
 			inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL);
 			remove_watch(watch, dev);
-			up(&dev->sem);
+			mutex_unlock(&dev->mutex);
 		}
-		up(&inode->inotify_sem);
+		mutex_unlock(&inode->inotify_mutex);
 		iput(inode);		
 
 		spin_lock(&inode_lock);
@@ -651,14 +651,14 @@ void inotify_inode_is_dead(struct inode *inode)
 {
 	struct inotify_watch *watch, *next;
 
-	down(&inode->inotify_sem);
+	mutex_lock(&inode->inotify_mutex);
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
 		struct inotify_device *dev = watch->dev;
-		down(&dev->sem);
+		mutex_lock(&dev->mutex);
 		remove_watch(watch, dev);
-		up(&dev->sem);
+		mutex_unlock(&dev->mutex);
 	}
-	up(&inode->inotify_sem);
+	mutex_unlock(&inode->inotify_mutex);
 }
 EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
 
@@ -670,10 +670,10 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
 	int ret = 0;
 
 	poll_wait(file, &dev->wq, wait);
-	down(&dev->sem);
+	mutex_lock(&dev->mutex);
 	if (!list_empty(&dev->events))
 		ret = POLLIN | POLLRDNORM;
-	up(&dev->sem);
+	mutex_unlock(&dev->mutex);
 
 	return ret;
 }
@@ -695,9 +695,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 
 		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
 
-		down(&dev->sem);
+		mutex_lock(&dev->mutex);
 		events = !list_empty(&dev->events);
-		up(&dev->sem);
+		mutex_unlock(&dev->mutex);
 		if (events) {
 			ret = 0;
 			break;
@@ -720,7 +720,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 
-	down(&dev->sem);
+	mutex_lock(&dev->mutex);
 	while (1) {
 		struct inotify_kernel_event *kevent;
 
@@ -750,7 +750,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 
 		remove_kevent(dev, kevent);
 	}
-	up(&dev->sem);
+	mutex_unlock(&dev->mutex);
 
 	return ret;
 }
@@ -763,37 +763,37 @@ static int inotify_release(struct inode *ignored, struct file *file)
 	 * Destroy all of the watches on this device.  Unfortunately, not very
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_sem before dev->sem.  The following works.
+	 * hold inode->inotify_mutex before dev->mutex.  The following works.
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
 		struct inode *inode;
 
-		down(&dev->sem);
+		mutex_lock(&dev->mutex);
 		watches = &dev->watches;
 		if (list_empty(watches)) {
-			up(&dev->sem);
+			mutex_unlock(&dev->mutex);
 			break;
 		}
 		watch = list_entry(watches->next, struct inotify_watch, d_list);
 		get_inotify_watch(watch);
-		up(&dev->sem);
+		mutex_unlock(&dev->mutex);
 
 		inode = watch->inode;
-		down(&inode->inotify_sem);
-		down(&dev->sem);
+		mutex_lock(&inode->inotify_mutex);
+		mutex_lock(&dev->mutex);
 		remove_watch_no_event(watch, dev);
-		up(&dev->sem);
-		up(&inode->inotify_sem);
+		mutex_unlock(&dev->mutex);
+		mutex_unlock(&inode->inotify_mutex);
 		put_inotify_watch(watch);
 	}
 
 	/* destroy all of the events on this device */
-	down(&dev->sem);
+	mutex_lock(&dev->mutex);
 	while (!list_empty(&dev->events))
 		inotify_dev_event_dequeue(dev);
-	up(&dev->sem);
+	mutex_unlock(&dev->mutex);
 
 	/* free this device: the put matching the get in inotify_init() */
 	put_inotify_dev(dev);
@@ -811,26 +811,26 @@ static int inotify_ignore(struct inotify_device *dev, s32 wd)
 	struct inotify_watch *watch;
 	struct inode *inode;
 
-	down(&dev->sem);
+	mutex_lock(&dev->mutex);
 	watch = idr_find(&dev->idr, wd);
 	if (unlikely(!watch)) {
-		up(&dev->sem);
+		mutex_unlock(&dev->mutex);
 		return -EINVAL;
 	}
 	get_inotify_watch(watch);
 	inode = watch->inode;
-	up(&dev->sem);
+	mutex_unlock(&dev->mutex);
 
-	down(&inode->inotify_sem);
-	down(&dev->sem);
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&dev->mutex);
 
 	/* make sure that we did not race */
 	watch = idr_find(&dev->idr, wd);
 	if (likely(watch))
 		remove_watch(watch, dev);
 
-	up(&dev->sem);
-	up(&inode->inotify_sem);
+	mutex_unlock(&dev->mutex);
+	mutex_unlock(&inode->inotify_mutex);
 	put_inotify_watch(watch);
 
 	return 0;
@@ -905,7 +905,7 @@ asmlinkage long sys_inotify_init(void)
 	INIT_LIST_HEAD(&dev->events);
 	INIT_LIST_HEAD(&dev->watches);
 	init_waitqueue_head(&dev->wq);
-	sema_init(&dev->sem, 1);
+	mutex_init(&dev->mutex);
 	dev->event_count = 0;
 	dev->queue_size = 0;
 	dev->max_events = inotify_max_queued_events;
@@ -960,8 +960,8 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	inode = nd.dentry->d_inode;
 	dev = filp->private_data;
 
-	down(&inode->inotify_sem);
-	down(&dev->sem);
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&dev->mutex);
 
 	if (mask & IN_MASK_ADD)
 		mask_add = 1;
@@ -998,8 +998,8 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	list_add(&watch->i_list, &inode->inotify_watches);
 	ret = watch->wd;
 out:
-	up(&dev->sem);
-	up(&inode->inotify_sem);
+	mutex_unlock(&dev->mutex);
+	mutex_unlock(&inode->inotify_mutex);
 	path_release(&nd);
 fput_and_out:
 	fput_light(filp, fput_needed);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 009ac96053fe..9ed1f36b6d54 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -509,7 +509,7 @@ struct inode {
 
 #ifdef CONFIG_INOTIFY
 	struct list_head	inotify_watches; /* watches on this inode */
-	struct semaphore	inotify_sem;	/* protects the watches list */
+	struct mutex		inotify_mutex;	/* protects the watches list */
 #endif
 
 	unsigned long		i_state;
-- 
cgit v1.2.3


From 70522e121a521aa09bd0f4e62e1aa68708b798e1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:31 -0800
Subject: [PATCH] sem2mutex: tty

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/n_tty.c  | 10 +++++-----
 drivers/char/tty_io.c | 50 +++++++++++++++++++++++++-------------------------
 drivers/char/vt.c     | 14 +++++++-------
 include/linux/tty.h   |  8 ++++----
 kernel/exit.c         |  4 ++--
 kernel/sys.c          |  4 ++--
 6 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index ccad7ae94541..ede365d05387 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -132,7 +132,7 @@ static void put_tty_queue(unsigned char c, struct tty_struct *tty)
  *	We test the TTY_THROTTLED bit first so that it always
  *	indicates the current state. The decision about whether
  *	it is worth allowing more input has been taken by the caller.
- *	Can sleep, may be called under the atomic_read semaphore but
+ *	Can sleep, may be called under the atomic_read_lock mutex but
  *	this is not guaranteed.
  */
  
@@ -1132,7 +1132,7 @@ static inline int input_available_p(struct tty_struct *tty, int amt)
  *	buffer, and once to drain the space from the (physical) beginning of
  *	the buffer to head pointer.
  *
- *	Called under the tty->atomic_read sem and with TTY_DONT_FLIP set
+ *	Called under the tty->atomic_read_lock sem and with TTY_DONT_FLIP set
  *
  */
  
@@ -1262,11 +1262,11 @@ do_it_again:
 	 *	Internal serialization of reads.
 	 */
 	if (file->f_flags & O_NONBLOCK) {
-		if (down_trylock(&tty->atomic_read))
+		if (!mutex_trylock(&tty->atomic_read_lock))
 			return -EAGAIN;
 	}
 	else {
-		if (down_interruptible(&tty->atomic_read))
+		if (mutex_lock_interruptible(&tty->atomic_read_lock))
 			return -ERESTARTSYS;
 	}
 
@@ -1393,7 +1393,7 @@ do_it_again:
 			timeout = time;
 	}
 	clear_bit(TTY_DONT_FLIP, &tty->flags);
-	up(&tty->atomic_read);
+	mutex_unlock(&tty->atomic_read_lock);
 	remove_wait_queue(&tty->read_wait, &wait);
 
 	if (!waitqueue_active(&tty->read_wait))
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 53d3d066554e..76592ee1fb38 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -130,7 +130,7 @@ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
 
 /* Semaphore to protect creating and releasing a tty. This is shared with
    vt.c for deeply disgusting hack reasons */
-DECLARE_MUTEX(tty_sem);
+DEFINE_MUTEX(tty_mutex);
 
 #ifdef CONFIG_UNIX98_PTYS
 extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
@@ -1188,11 +1188,11 @@ void disassociate_ctty(int on_exit)
 
 	lock_kernel();
 
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	tty = current->signal->tty;
 	if (tty) {
 		tty_pgrp = tty->pgrp;
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY)
 			tty_vhangup(tty);
 	} else {
@@ -1200,7 +1200,7 @@ void disassociate_ctty(int on_exit)
 			kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit);
 			kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit);
 		}
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		unlock_kernel();	
 		return;
 	}
@@ -1211,7 +1211,7 @@ void disassociate_ctty(int on_exit)
 	}
 
 	/* Must lock changes to tty_old_pgrp */
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	current->signal->tty_old_pgrp = 0;
 	tty->session = 0;
 	tty->pgrp = -1;
@@ -1222,7 +1222,7 @@ void disassociate_ctty(int on_exit)
 		p->signal->tty = NULL;
 	} while_each_task_pid(current->signal->session, PIDTYPE_SID, p);
 	read_unlock(&tasklist_lock);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	unlock_kernel();
 }
 
@@ -1306,7 +1306,7 @@ static inline ssize_t do_tty_write(
 	ssize_t ret = 0, written = 0;
 	unsigned int chunk;
 	
-	if (down_interruptible(&tty->atomic_write)) {
+	if (mutex_lock_interruptible(&tty->atomic_write_lock)) {
 		return -ERESTARTSYS;
 	}
 
@@ -1329,7 +1329,7 @@ static inline ssize_t do_tty_write(
 	if (count < chunk)
 		chunk = count;
 
-	/* write_buf/write_cnt is protected by the atomic_write semaphore */
+	/* write_buf/write_cnt is protected by the atomic_write_lock mutex */
 	if (tty->write_cnt < chunk) {
 		unsigned char *buf;
 
@@ -1338,7 +1338,7 @@ static inline ssize_t do_tty_write(
 
 		buf = kmalloc(chunk, GFP_KERNEL);
 		if (!buf) {
-			up(&tty->atomic_write);
+			mutex_unlock(&tty->atomic_write_lock);
 			return -ENOMEM;
 		}
 		kfree(tty->write_buf);
@@ -1374,7 +1374,7 @@ static inline ssize_t do_tty_write(
 		inode->i_mtime = current_fs_time(inode->i_sb);
 		ret = written;
 	}
-	up(&tty->atomic_write);
+	mutex_unlock(&tty->atomic_write_lock);
 	return ret;
 }
 
@@ -1442,8 +1442,8 @@ static inline void tty_line_name(struct tty_driver *driver, int index, char *p)
 
 /*
  * WSH 06/09/97: Rewritten to remove races and properly clean up after a
- * failed open.  The new code protects the open with a semaphore, so it's
- * really quite straightforward.  The semaphore locking can probably be
+ * failed open.  The new code protects the open with a mutex, so it's
+ * really quite straightforward.  The mutex locking can probably be
  * relaxed for the (most common) case of reopening a tty.
  */
 static int init_dev(struct tty_driver *driver, int idx,
@@ -1640,7 +1640,7 @@ fast_track:
 success:
 	*ret_tty = tty;
 	
-	/* All paths come through here to release the semaphore */
+	/* All paths come through here to release the mutex */
 end_init:
 	return retval;
 
@@ -1837,7 +1837,7 @@ static void release_dev(struct file * filp)
 		/* Guard against races with tty->count changes elsewhere and
 		   opens on /dev/tty */
 		   
-		down(&tty_sem);
+		mutex_lock(&tty_mutex);
 		tty_closing = tty->count <= 1;
 		o_tty_closing = o_tty &&
 			(o_tty->count <= (pty_master ? 1 : 0));
@@ -1868,7 +1868,7 @@ static void release_dev(struct file * filp)
 
 		printk(KERN_WARNING "release_dev: %s: read/write wait queue "
 				    "active!\n", tty_name(tty, buf));
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		schedule();
 	}	
 
@@ -1934,7 +1934,7 @@ static void release_dev(struct file * filp)
 		read_unlock(&tasklist_lock);
 	}
 
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 
 	/* check whether both sides are closing ... */
 	if (!tty_closing || (o_tty && !o_tty_closing))
@@ -2040,11 +2040,11 @@ retry_open:
 	index  = -1;
 	retval = 0;
 	
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 
 	if (device == MKDEV(TTYAUX_MAJOR,0)) {
 		if (!current->signal->tty) {
-			up(&tty_sem);
+			mutex_unlock(&tty_mutex);
 			return -ENXIO;
 		}
 		driver = current->signal->tty->driver;
@@ -2070,18 +2070,18 @@ retry_open:
 			noctty = 1;
 			goto got_driver;
 		}
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		return -ENODEV;
 	}
 
 	driver = get_tty_driver(device, &index);
 	if (!driver) {
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		return -ENODEV;
 	}
 got_driver:
 	retval = init_dev(driver, index, &tty);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	if (retval)
 		return retval;
 
@@ -2167,9 +2167,9 @@ static int ptmx_open(struct inode * inode, struct file * filp)
 	}
 	up(&allocated_ptys_lock);
 
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	retval = init_dev(ptm_driver, index, &tty);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	
 	if (retval)
 		goto out;
@@ -2915,8 +2915,8 @@ static void initialize_tty_struct(struct tty_struct *tty)
 	init_waitqueue_head(&tty->write_wait);
 	init_waitqueue_head(&tty->read_wait);
 	INIT_WORK(&tty->hangup_work, do_tty_hangup, tty);
-	sema_init(&tty->atomic_read, 1);
-	sema_init(&tty->atomic_write, 1);
+	mutex_init(&tty->atomic_read_lock);
+	mutex_init(&tty->atomic_write_lock);
 	spin_lock_init(&tty->read_lock);
 	INIT_LIST_HEAD(&tty->tty_files);
 	INIT_WORK(&tty->SAK_work, NULL, NULL);
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 86b31b87eb85..ca4844c527da 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2489,7 +2489,7 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 }
 
 /*
- * We take tty_sem in here to prevent another thread from coming in via init_dev
+ * We take tty_mutex in here to prevent another thread from coming in via init_dev
  * and taking a ref against the tty while we're in the process of forgetting
  * about it and cleaning things up.
  *
@@ -2497,7 +2497,7 @@ static int con_open(struct tty_struct *tty, struct file *filp)
  */
 static void con_close(struct tty_struct *tty, struct file *filp)
 {
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	acquire_console_sem();
 	if (tty && tty->count == 1) {
 		struct vc_data *vc = tty->driver_data;
@@ -2507,15 +2507,15 @@ static void con_close(struct tty_struct *tty, struct file *filp)
 		tty->driver_data = NULL;
 		release_console_sem();
 		vcs_remove_devfs(tty);
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		/*
-		 * tty_sem is released, but we still hold BKL, so there is
+		 * tty_mutex is released, but we still hold BKL, so there is
 		 * still exclusion against init_dev()
 		 */
 		return;
 	}
 	release_console_sem();
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 }
 
 static void vc_init(struct vc_data *vc, unsigned int rows,
@@ -2869,9 +2869,9 @@ void unblank_screen(void)
 }
 
 /*
- * We defer the timer blanking to work queue so it can take the console semaphore
+ * We defer the timer blanking to work queue so it can take the console mutex
  * (console operations can still happen at irq time, but only from printk which
- * has the console semaphore. Not perfect yet, but better than no locking
+ * has the console mutex. Not perfect yet, but better than no locking
  */
 static void blank_screen_t(unsigned long dummy)
 {
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f45cd74e6f24..f13f49afe198 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -24,6 +24,7 @@
 #include <linux/tty_driver.h>
 #include <linux/tty_ldisc.h>
 #include <linux/screen_info.h>
+#include <linux/mutex.h>
 
 #include <asm/system.h>
 
@@ -231,8 +232,8 @@ struct tty_struct {
 	int canon_data;
 	unsigned long canon_head;
 	unsigned int canon_column;
-	struct semaphore atomic_read;
-	struct semaphore atomic_write;
+	struct mutex atomic_read_lock;
+	struct mutex atomic_write_lock;
 	unsigned char *write_buf;
 	int write_cnt;
 	spinlock_t read_lock;
@@ -319,8 +320,7 @@ extern void tty_ldisc_put(int);
 extern void tty_wakeup(struct tty_struct *tty);
 extern void tty_ldisc_flush(struct tty_struct *tty);
 
-struct semaphore;
-extern struct semaphore tty_sem;
+extern struct mutex tty_mutex;
 
 /* n_tty.c */
 extern struct tty_ldisc tty_ldisc_N_TTY;
diff --git a/kernel/exit.c b/kernel/exit.c
index d1e8d500a7e1..8037405e136e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -345,9 +345,9 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	current->signal->tty = NULL;
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
diff --git a/kernel/sys.c b/kernel/sys.c
index 4941b9b14b97..c0fcad9f826c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1227,7 +1227,7 @@ asmlinkage long sys_setsid(void)
 	struct pid *pid;
 	int err = -EPERM;
 
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	write_lock_irq(&tasklist_lock);
 
 	pid = find_pid(PIDTYPE_PGID, group_leader->pid);
@@ -1241,7 +1241,7 @@ asmlinkage long sys_setsid(void)
 	err = process_group(group_leader);
 out:
 	write_unlock_irq(&tasklist_lock);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	return err;
 }
 
-- 
cgit v1.2.3


From a11f3a0574a5734db3e5de38922430d005d35118 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Thu, 23 Mar 2006 03:00:33 -0800
Subject: [PATCH] sem2mutex: vfs_rename_mutex

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/dir.c      |  8 ++++----
 fs/cifs/fcntl.c    |  4 ++--
 fs/cifs/file.c     |  4 ++--
 fs/cifs/inode.c    | 16 ++++++++--------
 fs/cifs/link.c     | 16 ++++++++--------
 fs/cifs/readdir.c  |  4 ++--
 fs/cifs/xattr.c    | 16 ++++++++--------
 fs/namei.c         | 12 ++++++------
 fs/super.c         |  2 +-
 include/linux/fs.h |  2 +-
 10 files changed, 42 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index fed55e3c53df..632561dd9c50 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -138,9 +138,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -317,9 +317,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if(full_path == NULL)
 		rc = -ENOMEM;
 	else if (pTcon->ses->capabilities & CAP_UNIX) {
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
index a7a47bb36bf3..ec4dfe9bf5ef 100644
--- a/fs/cifs/fcntl.c
+++ b/fs/cifs/fcntl.c
@@ -86,9 +86,9 @@ int cifs_dir_notify(struct file * file, unsigned long arg)
 	cifs_sb = CIFS_SB(file->f_dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&file->f_dentry->d_sb->s_vfs_rename_sem);
+	mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	up(&file->f_dentry->d_sb->s_vfs_rename_sem);
+	mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		rc = -ENOMEM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 675bd2568297..165d67426381 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -203,9 +203,9 @@ int cifs_open(struct inode *inode, struct file *file)
 		}
 	}
 
-	down(&inode->i_sb->s_vfs_rename_sem);
+	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	up(&inode->i_sb->s_vfs_rename_sem);
+	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 59359911f481..ff93a9f81d1c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -574,9 +574,9 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 
 	/* Unlink can be called from rename so we can not grab the sem here
 	   since we deadlock otherwise */
-/*	down(&direntry->d_sb->s_vfs_rename_sem);*/
+/*	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
 	full_path = build_path_from_dentry(direntry);
-/*	up(&direntry->d_sb->s_vfs_rename_sem);*/
+/*	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -718,9 +718,9 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&inode->i_sb->s_vfs_rename_sem);
+	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&inode->i_sb->s_vfs_rename_sem);
+	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -803,9 +803,9 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&inode->i_sb->s_vfs_rename_sem);
+	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&inode->i_sb->s_vfs_rename_sem);
+	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -1137,9 +1137,9 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 			rc = 0;
 	}
 		
-	down(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 0f99aae33162..8d0da7c87c7b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -48,10 +48,10 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 /* No need to check for cross device links since server will do that
    BB note DFS case in future though (when we may have to check) */
 
-	down(&inode->i_sb->s_vfs_rename_sem);
+	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	fromName = build_path_from_dentry(old_file);
 	toName = build_path_from_dentry(direntry);
-	up(&inode->i_sb->s_vfs_rename_sem);
+	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if((fromName == NULL) || (toName == NULL)) {
 		rc = -ENOMEM;
 		goto cifs_hl_exit;
@@ -103,9 +103,9 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 
 	xid = GetXid();
 
-	down(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&direntry->d_sb->s_vfs_rename_sem);
+	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 
 	if (!full_path)
 		goto out_no_free;
@@ -164,9 +164,9 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&inode->i_sb->s_vfs_rename_sem);
+	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&inode->i_sb->s_vfs_rename_sem);
+	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		FreeXid(xid);
@@ -232,9 +232,9 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 
 /* BB would it be safe against deadlock to grab this sem 
       even though rename itself grabs the sem and calls lookup? */
-/*       down(&inode->i_sb->s_vfs_rename_sem);*/
+/*       mutex_lock(&inode->i_sb->s_vfs_rename_mutex);*/
 	full_path = build_path_from_dentry(direntry);
-/*       up(&inode->i_sb->s_vfs_rename_sem);*/
+/*       mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);*/
 
 	if(full_path == NULL) {
 		FreeXid(xid);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 288cc048d37f..edb3b6eb34bc 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -404,9 +404,9 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	if(pTcon == NULL)
 		return -EINVAL;
 
-	down(&file->f_dentry->d_sb->s_vfs_rename_sem);
+	mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	up(&file->f_dentry->d_sb->s_vfs_rename_sem);
+	mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		return -ENOMEM;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 777e3363c2a4..3938444d87b2 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -62,9 +62,9 @@ int cifs_removexattr(struct dentry * direntry, const char * ea_name)
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
                                                                                      
-	down(&sb->s_vfs_rename_sem);
+	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&sb->s_vfs_rename_sem);
+	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -116,9 +116,9 @@ int cifs_setxattr(struct dentry * direntry, const char * ea_name,
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&sb->s_vfs_rename_sem);
+	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&sb->s_vfs_rename_sem);
+	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -223,9 +223,9 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name,
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&sb->s_vfs_rename_sem);
+	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&sb->s_vfs_rename_sem);
+	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -341,9 +341,9 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size)
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	down(&sb->s_vfs_rename_sem);
+	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	up(&sb->s_vfs_rename_sem);
+	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
diff --git a/fs/namei.c b/fs/namei.c
index 8dc2b038d5d9..c72b940797fc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -104,7 +104,7 @@
  */
 /*
  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
- * implemented.  Let's see if raised priority of ->s_vfs_rename_sem gives
+ * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
  * any extra contention...
  */
 
@@ -1422,7 +1422,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 		return NULL;
 	}
 
-	down(&p1->d_inode->i_sb->s_vfs_rename_sem);
+	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 
 	for (p = p1; p->d_parent != p; p = p->d_parent) {
 		if (p->d_parent == p2) {
@@ -1450,7 +1450,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 	mutex_unlock(&p1->d_inode->i_mutex);
 	if (p1 != p2) {
 		mutex_unlock(&p2->d_inode->i_mutex);
-		up(&p1->d_inode->i_sb->s_vfs_rename_sem);
+		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 	}
 }
 
@@ -2277,17 +2277,17 @@ asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
  *	a) we can get into loop creation. Check is done in is_subdir().
  *	b) race potential - two innocent renames can create a loop together.
  *	   That's where 4.4 screws up. Current fix: serialization on
- *	   sb->s_vfs_rename_sem. We might be more accurate, but that's another
+ *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
  *	   story.
  *	c) we have to lock _three_ objects - parents and victim (if it exists).
  *	   And that - after we got ->i_mutex on parents (until then we don't know
  *	   whether the target exists).  Solution: try to be smart with locking
  *	   order for inodes.  We rely on the fact that tree topology may change
- *	   only under ->s_vfs_rename_sem _and_ that parent of the object we
+ *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
  *	   move will be locked.  Thus we can rank directories by the tree
  *	   (ancestors first) and rank all non-directories after them.
  *	   That works since everybody except rename does "lock parent, lookup,
- *	   lock child" and rename is under ->s_vfs_rename_sem.
+ *	   lock child" and rename is under ->s_vfs_rename_mutex.
  *	   HOWEVER, it relies on the assumption that any object with ->lookup()
  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
  *	   we'd better make sure that there's no link(2) for them.
diff --git a/fs/super.c b/fs/super.c
index 9cc6545dfa4c..425861cb1caa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -76,7 +76,7 @@ static struct super_block *alloc_super(void)
 		down_write(&s->s_umount);
 		s->s_count = S_BIAS;
 		atomic_set(&s->s_active, 1);
-		sema_init(&s->s_vfs_rename_sem,1);
+		mutex_init(&s->s_vfs_rename_mutex);
 		mutex_init(&s->s_dquot.dqio_mutex);
 		mutex_init(&s->s_dquot.dqonoff_mutex);
 		init_rwsem(&s->s_dquot.dqptr_sem);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ed1f36b6d54..0f71ee730127 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -847,7 +847,7 @@ struct super_block {
 	 * The next field is for VFS *only*. No filesystems have any business
 	 * even looking at it. You had been warned.
 	 */
-	struct semaphore s_vfs_rename_sem;	/* Kludge */
+	struct mutex s_vfs_rename_mutex;	/* Kludge */
 
 	/* Granuality of c/m/atime in ns.
 	   Cannot be worse than a second */
-- 
cgit v1.2.3


From f24075bd0c1cd1cc2cf86d394f960aa0401de573 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:34 -0800
Subject: [PATCH] sem2mutex: iprune

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c         | 16 ++++++++--------
 fs/inotify.c       |  6 +++---
 include/linux/fs.h |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 603e93ef0c6f..25967b67903d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -84,14 +84,14 @@ static struct hlist_head *inode_hashtable;
 DEFINE_SPINLOCK(inode_lock);
 
 /*
- * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * iprune_mutex provides exclusion between the kswapd or try_to_free_pages
  * icache shrinking path, and the umount path.  Without this exclusion,
  * by the time prune_icache calls iput for the inode whose pages it has
  * been invalidating, or by the time it calls clear_inode & destroy_inode
  * from its final dispose_list, the struct super_block they refer to
  * (for inode->i_sb->s_op) may already have been freed and reused.
  */
-DECLARE_MUTEX(iprune_sem);
+DEFINE_MUTEX(iprune_mutex);
 
 /*
  * Statistics gathering..
@@ -319,7 +319,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 		/*
 		 * We can reschedule here without worrying about the list's
 		 * consistency because the per-sb list of inodes must not
-		 * change during umount anymore, and because iprune_sem keeps
+		 * change during umount anymore, and because iprune_mutex keeps
 		 * shrink_icache_memory() away.
 		 */
 		cond_resched_lock(&inode_lock);
@@ -355,14 +355,14 @@ int invalidate_inodes(struct super_block * sb)
 	int busy;
 	LIST_HEAD(throw_away);
 
-	down(&iprune_sem);
+	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&throw_away);
-	up(&iprune_sem);
+	mutex_unlock(&iprune_mutex);
 
 	return busy;
 }
@@ -377,7 +377,7 @@ int __invalidate_device(struct block_device *bdev)
 	if (sb) {
 		/*
 		 * no need to lock the super, get_super holds the
-		 * read semaphore so the filesystem cannot go away
+		 * read mutex so the filesystem cannot go away
 		 * under us (->put_super runs with the write lock
 		 * hold).
 		 */
@@ -423,7 +423,7 @@ static void prune_icache(int nr_to_scan)
 	int nr_scanned;
 	unsigned long reap = 0;
 
-	down(&iprune_sem);
+	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
@@ -459,7 +459,7 @@ static void prune_icache(int nr_to_scan)
 	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
-	up(&iprune_sem);
+	mutex_unlock(&iprune_mutex);
 
 	if (current_is_kswapd())
 		mod_page_state(kswapd_inodesteal, reap);
diff --git a/fs/inotify.c b/fs/inotify.c
index 60d9653d55b7..0ee39ef591c6 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -54,7 +54,7 @@ int inotify_max_queued_events;
  * Lock ordering:
  *
  * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
- * iprune_sem (synchronize shrink_icache_memory())
+ * iprune_mutex (synchronize shrink_icache_memory())
  * 	inode_lock (protects the super_block->s_inodes list)
  * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
  * 		inotify_dev->mutex (protects inotify_device and watches->d_list)
@@ -569,7 +569,7 @@ EXPORT_SYMBOL_GPL(inotify_get_cookie);
  * @list: list of inodes being unmounted (sb->s_inodes)
  *
  * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_sem held, keeping shrink_icache_memory() at bay.
+ * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
  * We temporarily drop inode_lock, however, and CAN block.
  */
 void inotify_unmount_inodes(struct list_head *list)
@@ -618,7 +618,7 @@ void inotify_unmount_inodes(struct list_head *list)
 		 * We can safely drop inode_lock here because we hold
 		 * references on both inode and next_i.  Also no new inodes
 		 * will be added since the umount has begun.  Finally,
-		 * iprune_sem keeps shrink_icache_memory() away.
+		 * iprune_mutex keeps shrink_icache_memory() away.
 		 */
 		spin_unlock(&inode_lock);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0f71ee730127..8fd8d9b90b00 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1534,7 +1534,7 @@ extern void destroy_inode(struct inode *);
 extern struct inode *new_inode(struct super_block *);
 extern int remove_suid(struct dentry *);
 extern void remove_dquot_ref(struct super_block *, int, struct list_head *);
-extern struct semaphore iprune_sem;
+extern struct mutex iprune_mutex;
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 extern void remove_inode_hash(struct inode *);
-- 
cgit v1.2.3


From 2c68ee754c40099c59828e59618a54726f76126a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Thu, 23 Mar 2006 03:00:35 -0800
Subject: [PATCH] sem2mutex: jbd, j_checkpoint_mutex

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/checkpoint.c  | 4 ++--
 fs/jbd/journal.c     | 4 ++--
 fs/jbd/transaction.c | 4 ++--
 include/linux/jbd.h  | 7 ++++---
 4 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 543ed543d1e5..3f5102b069db 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -85,7 +85,7 @@ void __log_wait_for_space(journal_t *journal)
 		if (journal->j_flags & JFS_ABORT)
 			return;
 		spin_unlock(&journal->j_state_lock);
-		down(&journal->j_checkpoint_sem);
+		mutex_lock(&journal->j_checkpoint_mutex);
 
 		/*
 		 * Test again, another process may have checkpointed while we
@@ -98,7 +98,7 @@ void __log_wait_for_space(journal_t *journal)
 			log_do_checkpoint(journal);
 			spin_lock(&journal->j_state_lock);
 		}
-		up(&journal->j_checkpoint_sem);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 }
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index e4b516ac4989..95a628d8cac8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -659,8 +659,8 @@ static journal_t * journal_init_common (void)
 	init_waitqueue_head(&journal->j_wait_checkpoint);
 	init_waitqueue_head(&journal->j_wait_commit);
 	init_waitqueue_head(&journal->j_wait_updates);
-	init_MUTEX(&journal->j_barrier);
-	init_MUTEX(&journal->j_checkpoint_sem);
+	mutex_init(&journal->j_barrier);
+	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
 	spin_lock_init(&journal->j_list_lock);
 	spin_lock_init(&journal->j_state_lock);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ca917973c2c0..5fc40888f4cf 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -455,7 +455,7 @@ void journal_lock_updates(journal_t *journal)
 	 * to make sure that we serialise special journal-locked operations
 	 * too.
 	 */
-	down(&journal->j_barrier);
+	mutex_lock(&journal->j_barrier);
 }
 
 /**
@@ -470,7 +470,7 @@ void journal_unlock_updates (journal_t *journal)
 {
 	J_ASSERT(journal->j_barrier_count != 0);
 
-	up(&journal->j_barrier);
+	mutex_unlock(&journal->j_barrier);
 	spin_lock(&journal->j_state_lock);
 	--journal->j_barrier_count;
 	spin_unlock(&journal->j_state_lock);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 41ee79962bb2..2ccbfb6340ba 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -28,6 +28,7 @@
 #include <linux/journal-head.h>
 #include <linux/stddef.h>
 #include <linux/bit_spinlock.h>
+#include <linux/mutex.h>
 #include <asm/semaphore.h>
 #endif
 
@@ -575,7 +576,7 @@ struct transaction_s
  * @j_wait_checkpoint:  Wait queue to trigger checkpointing
  * @j_wait_commit: Wait queue to trigger commit
  * @j_wait_updates: Wait queue to wait for updates to complete
- * @j_checkpoint_sem: Semaphore for locking against concurrent checkpoints
+ * @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
  * @j_head: Journal head - identifies the first unused block in the journal
  * @j_tail: Journal tail - identifies the oldest still-used block in the
  *  journal.
@@ -645,7 +646,7 @@ struct journal_s
 	int			j_barrier_count;
 
 	/* The barrier lock itself */
-	struct semaphore	j_barrier;
+	struct mutex		j_barrier;
 
 	/*
 	 * Transactions: The current running transaction...
@@ -687,7 +688,7 @@ struct journal_s
 	wait_queue_head_t	j_wait_updates;
 
 	/* Semaphore for locking against concurrent checkpoints */
-	struct semaphore 	j_checkpoint_sem;
+	struct mutex	 	j_checkpoint_mutex;
 
 	/*
 	 * Journal head: identifies the first unused block in the journal.
-- 
cgit v1.2.3


From 7a7d1cf95408863a657035701606b13644c9f55e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:35 -0800
Subject: [PATCH] sem2mutex: kprobes

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    |  4 ++--
 arch/powerpc/kernel/kprobes.c |  4 ++--
 arch/x86_64/kernel/kprobes.c  |  4 ++--
 include/linux/kprobes.h       |  3 ++-
 kernel/kprobes.c              | 14 +++++++-------
 5 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 694a13997637..7a59050242a7 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -84,9 +84,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 258039fb3016..cb1fe5878e8b 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -81,9 +81,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 8b866a8572cf..14f0ced613b6 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -222,9 +222,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 669756bc20a2..778adc0fa640 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -36,6 +36,7 @@
 #include <linux/percpu.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 
 #ifdef CONFIG_KPROBES
 #include <asm/kprobes.h>
@@ -152,7 +153,7 @@ struct kretprobe_instance {
 };
 
 extern spinlock_t kretprobe_lock;
-extern struct semaphore kprobe_mutex;
+extern struct mutex kprobe_mutex;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73ce..1fb9f753ef60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
-DECLARE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
+DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 	}
 
 	p->nmissed = 0;
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
   	arch_arm_kprobe(p);
 
 out:
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 
 	if (ret && probed_mod)
 		module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 	struct kprobe *old_p, *list_p;
 	int cleanup_p;
 
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
 	if (unlikely(!old_p)) {
-		up(&kprobe_mutex);
+		mutex_unlock(&kprobe_mutex);
 		return;
 	}
 	if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 			if (list_p == p)
 			/* kprobe p is a valid probe */
 				goto valid_p;
-		up(&kprobe_mutex);
+		mutex_unlock(&kprobe_mutex);
 		return;
 	}
 valid_p:
@@ -523,7 +523,7 @@ valid_p:
 		cleanup_p = 0;
 	}
 
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 
 	synchronize_sched();
 	if (p->mod_refcounted &&
-- 
cgit v1.2.3


From 0ac1759abc69fb62438c30a7e422f628a1120d67 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:37 -0800
Subject: [PATCH] sem2mutex: fs/seq_file.c

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/seq_file.c            | 10 +++++-----
 include/linux/seq_file.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7c40570b71dc..555b9ac04c25 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -37,7 +37,7 @@ int seq_open(struct file *file, struct seq_operations *op)
 		file->private_data = p;
 	}
 	memset(p, 0, sizeof(*p));
-	sema_init(&p->sem, 1);
+	mutex_init(&p->lock);
 	p->op = op;
 
 	/*
@@ -71,7 +71,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 	void *p;
 	int err = 0;
 
-	down(&m->sem);
+	mutex_lock(&m->lock);
 	/*
 	 * seq_file->op->..m_start/m_stop/m_next may do special actions
 	 * or optimisations based on the file->f_version, so we want to
@@ -164,7 +164,7 @@ Done:
 	else
 		*ppos += copied;
 	file->f_version = m->version;
-	up(&m->sem);
+	mutex_unlock(&m->lock);
 	return copied;
 Enomem:
 	err = -ENOMEM;
@@ -237,7 +237,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 	struct seq_file *m = (struct seq_file *)file->private_data;
 	long long retval = -EINVAL;
 
-	down(&m->sem);
+	mutex_lock(&m->lock);
 	m->version = file->f_version;
 	switch (origin) {
 		case 1:
@@ -260,7 +260,7 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 				}
 			}
 	}
-	up(&m->sem);
+	mutex_unlock(&m->lock);
 	file->f_version = m->version;
 	return retval;
 }
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 850a974ee505..b95f6eb7254c 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -4,7 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 struct seq_operations;
 struct file;
@@ -19,7 +19,7 @@ struct seq_file {
 	size_t count;
 	loff_t index;
 	loff_t version;
-	struct semaphore sem;
+	struct mutex lock;
 	struct seq_operations *op;
 	void *private;
 };
-- 
cgit v1.2.3


From f85221dd74f2708b78a2aa54de59944e44206d0e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:38 -0800
Subject: [PATCH] sem2mutex: drivers/block/loop.c

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/loop.c | 18 +++++++++---------
 include/linux/loop.h |  3 ++-
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0010704739e3..74bf0255e98f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1144,7 +1144,7 @@ static int lo_ioctl(struct inode * inode, struct file * file,
 	struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
 	int err;
 
-	down(&lo->lo_ctl_mutex);
+	mutex_lock(&lo->lo_ctl_mutex);
 	switch (cmd) {
 	case LOOP_SET_FD:
 		err = loop_set_fd(lo, file, inode->i_bdev, arg);
@@ -1170,7 +1170,7 @@ static int lo_ioctl(struct inode * inode, struct file * file,
 	default:
 		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
 	}
-	up(&lo->lo_ctl_mutex);
+	mutex_unlock(&lo->lo_ctl_mutex);
 	return err;
 }
 
@@ -1178,9 +1178,9 @@ static int lo_open(struct inode *inode, struct file *file)
 {
 	struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
 
-	down(&lo->lo_ctl_mutex);
+	mutex_lock(&lo->lo_ctl_mutex);
 	lo->lo_refcnt++;
-	up(&lo->lo_ctl_mutex);
+	mutex_unlock(&lo->lo_ctl_mutex);
 
 	return 0;
 }
@@ -1189,9 +1189,9 @@ static int lo_release(struct inode *inode, struct file *file)
 {
 	struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
 
-	down(&lo->lo_ctl_mutex);
+	mutex_lock(&lo->lo_ctl_mutex);
 	--lo->lo_refcnt;
-	up(&lo->lo_ctl_mutex);
+	mutex_unlock(&lo->lo_ctl_mutex);
 
 	return 0;
 }
@@ -1233,12 +1233,12 @@ int loop_unregister_transfer(int number)
 	xfer_funcs[n] = NULL;
 
 	for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
-		down(&lo->lo_ctl_mutex);
+		mutex_lock(&lo->lo_ctl_mutex);
 
 		if (lo->lo_encryption == xfer)
 			loop_release_xfer(lo);
 
-		up(&lo->lo_ctl_mutex);
+		mutex_unlock(&lo->lo_ctl_mutex);
 	}
 
 	return 0;
@@ -1285,7 +1285,7 @@ static int __init loop_init(void)
 		lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
 		if (!lo->lo_queue)
 			goto out_mem4;
-		init_MUTEX(&lo->lo_ctl_mutex);
+		mutex_init(&lo->lo_ctl_mutex);
 		init_completion(&lo->lo_done);
 		init_completion(&lo->lo_bh_done);
 		lo->lo_number = i;
diff --git a/include/linux/loop.h b/include/linux/loop.h
index f96506782ebe..e76c7611d6cc 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -17,6 +17,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/spinlock.h>
+#include <linux/mutex.h>
 
 /* Possible states of device */
 enum {
@@ -60,7 +61,7 @@ struct loop_device {
 	int			lo_state;
 	struct completion	lo_done;
 	struct completion	lo_bh_done;
-	struct semaphore	lo_ctl_mutex;
+	struct mutex		lo_ctl_mutex;
 	int			lo_pending;
 
 	request_queue_t		*lo_queue;
-- 
cgit v1.2.3


From 82d4dc5adb0055393248ad4ab8de392fac708a12 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:38 -0800
Subject: [PATCH] sem2mutex: drivers/block/nbd.c

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Paul Clements <Paul.Clements@steeleye.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/nbd.c | 16 ++++++++--------
 include/linux/nbd.h |  3 ++-
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6997d8e6bfb5..a9bde30dadad 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -459,9 +459,9 @@ static void do_nbd_request(request_queue_t * q)
 		req->errors = 0;
 		spin_unlock_irq(q->queue_lock);
 
-		down(&lo->tx_lock);
+		mutex_lock(&lo->tx_lock);
 		if (unlikely(!lo->sock)) {
-			up(&lo->tx_lock);
+			mutex_unlock(&lo->tx_lock);
 			printk(KERN_ERR "%s: Attempted send on closed socket\n",
 			       lo->disk->disk_name);
 			req->errors++;
@@ -484,7 +484,7 @@ static void do_nbd_request(request_queue_t * q)
 		}
 
 		lo->active_req = NULL;
-		up(&lo->tx_lock);
+		mutex_unlock(&lo->tx_lock);
 		wake_up_all(&lo->active_wq);
 
 		spin_lock_irq(q->queue_lock);
@@ -534,9 +534,9 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
  
 	case NBD_CLEAR_SOCK:
 		error = 0;
-		down(&lo->tx_lock);
+		mutex_lock(&lo->tx_lock);
 		lo->sock = NULL;
-		up(&lo->tx_lock);
+		mutex_unlock(&lo->tx_lock);
 		file = lo->file;
 		lo->file = NULL;
 		nbd_clear_que(lo);
@@ -590,7 +590,7 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
 		 * FIXME: This code is duplicated from sys_shutdown, but
 		 * there should be a more generic interface rather than
 		 * calling socket ops directly here */
-		down(&lo->tx_lock);
+		mutex_lock(&lo->tx_lock);
 		if (lo->sock) {
 			printk(KERN_WARNING "%s: shutting down socket\n",
 				lo->disk->disk_name);
@@ -598,7 +598,7 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
 				SEND_SHUTDOWN|RCV_SHUTDOWN);
 			lo->sock = NULL;
 		}
-		up(&lo->tx_lock);
+		mutex_unlock(&lo->tx_lock);
 		file = lo->file;
 		lo->file = NULL;
 		nbd_clear_que(lo);
@@ -683,7 +683,7 @@ static int __init nbd_init(void)
 		nbd_dev[i].flags = 0;
 		spin_lock_init(&nbd_dev[i].queue_lock);
 		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
-		init_MUTEX(&nbd_dev[i].tx_lock);
+		mutex_init(&nbd_dev[i].tx_lock);
 		init_waitqueue_head(&nbd_dev[i].active_wq);
 		nbd_dev[i].blksize = 1024;
 		nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index f95d51fae733..a6ce409ec6fc 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -38,6 +38,7 @@ enum {
 #ifdef __KERNEL__
 
 #include <linux/wait.h>
+#include <linux/mutex.h>
 
 /* values for flags field */
 #define NBD_READ_ONLY 0x0001
@@ -57,7 +58,7 @@ struct nbd_device {
 	struct request *active_req;
 	wait_queue_head_t active_wq;
 
-	struct semaphore tx_lock;
+	struct mutex tx_lock;
 	struct gendisk *disk;
 	int blksize;
 	u64 bytesize;
-- 
cgit v1.2.3


From 97461518610fb1679f67333bb699bb81136e49fe Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Thu, 23 Mar 2006 03:00:42 -0800
Subject: [PATCH] convert ext3's truncate_sem to a mutex

ext3's truncate_sem is always released in the same function it's taken
and it otherwise is a mutex as well..

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/file.c            |  4 ++--
 fs/ext3/inode.c           | 14 +++++++-------
 fs/ext3/ioctl.c           |  4 ++--
 fs/ext3/super.c           |  2 +-
 include/linux/ext3_fs_i.h |  7 ++++---
 5 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 98e78345ead9..59098ea56711 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -37,9 +37,9 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
 	{
-		down(&EXT3_I(inode)->truncate_sem);
+		mutex_lock(&EXT3_I(inode)->truncate_mutex);
 		ext3_discard_reservation(inode);
-		up(&EXT3_I(inode)->truncate_sem);
+		mutex_unlock(&EXT3_I(inode)->truncate_mutex);
 	}
 	if (is_dx(inode) && filp->private_data)
 		ext3_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index d59d5a667b0b..2c361377e0a5 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -702,7 +702,7 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 	if (!create || err == -EIO)
 		goto cleanup;
 
-	down(&ei->truncate_sem);
+	mutex_lock(&ei->truncate_mutex);
 
 	/*
 	 * If the indirect block is missing while we are reading
@@ -723,7 +723,7 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 		}
 		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
 		if (!partial) {
-			up(&ei->truncate_sem);
+			mutex_unlock(&ei->truncate_mutex);
 			if (err)
 				goto cleanup;
 			clear_buffer_new(bh_result);
@@ -759,13 +759,13 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 		err = ext3_splice_branch(handle, inode, iblock, chain,
 					 partial, left);
 	/*
-	 * i_disksize growing is protected by truncate_sem.  Don't forget to
+	 * i_disksize growing is protected by truncate_mutex.  Don't forget to
 	 * protect it if you're about to implement concurrent
 	 * ext3_get_block() -bzzz
 	*/
 	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
 		ei->i_disksize = inode->i_size;
-	up(&ei->truncate_sem);
+	mutex_unlock(&ei->truncate_mutex);
 	if (err)
 		goto cleanup;
 
@@ -1227,7 +1227,7 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
  *	ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
  *
  * Same applies to ext3_get_block().  We will deadlock on various things like
- * lock_journal and i_truncate_sem.
+ * lock_journal and i_truncate_mutex.
  *
  * Setting PF_MEMALLOC here doesn't work - too many internal memory
  * allocations fail.
@@ -2161,7 +2161,7 @@ void ext3_truncate(struct inode * inode)
 	 * From here we block out all ext3_get_block() callers who want to
 	 * modify the block allocation tree.
 	 */
-	down(&ei->truncate_sem);
+	mutex_lock(&ei->truncate_mutex);
 
 	if (n == 1) {		/* direct blocks */
 		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
@@ -2228,7 +2228,7 @@ do_indirects:
 
 	ext3_discard_reservation(inode);
 
-	up(&ei->truncate_sem);
+	mutex_unlock(&ei->truncate_mutex);
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	ext3_mark_inode_dirty(handle, inode);
 
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 556cd5510078..aaf1da17b6d4 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -182,7 +182,7 @@ flags_err:
 		 * need to allocate reservation structure for this inode
 		 * before set the window size
 		 */
-		down(&ei->truncate_sem);
+		mutex_lock(&ei->truncate_mutex);
 		if (!ei->i_block_alloc_info)
 			ext3_init_block_alloc_info(inode);
 
@@ -190,7 +190,7 @@ flags_err:
 			struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
 			rsv->rsv_goal_size = rsv_window_size;
 		}
-		up(&ei->truncate_sem);
+		mutex_unlock(&ei->truncate_mutex);
 		return 0;
 	}
 	case EXT3_IOC_GROUP_EXTEND: {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index efa832059143..efe5b20d7a5a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -472,7 +472,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 #ifdef CONFIG_EXT3_FS_XATTR
 		init_rwsem(&ei->xattr_sem);
 #endif
-		init_MUTEX(&ei->truncate_sem);
+		mutex_init(&ei->truncate_mutex);
 		inode_init_once(&ei->vfs_inode);
 	}
 }
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index e71dd98dbcae..7abf90147180 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -19,6 +19,7 @@
 #include <linux/rwsem.h>
 #include <linux/rbtree.h>
 #include <linux/seqlock.h>
+#include <linux/mutex.h>
 
 struct ext3_reserve_window {
 	__u32			_rsv_start;	/* First byte reserved */
@@ -122,16 +123,16 @@ struct ext3_inode_info {
 	__u16 i_extra_isize;
 
 	/*
-	 * truncate_sem is for serialising ext3_truncate() against
+	 * truncate_mutex is for serialising ext3_truncate() against
 	 * ext3_getblock().  In the 2.4 ext2 design, great chunks of inode's
 	 * data tree are chopped off during truncate. We can't do that in
 	 * ext3 because whenever we perform intermediate commits during
 	 * truncate, the inode and all the metadata blocks *must* be in a
 	 * consistent state which allows truncation of the orphans to restart
 	 * during recovery.  Hence we must fix the get_block-vs-truncate race
-	 * by other means, so we have truncate_sem.
+	 * by other means, so we have truncate_mutex.
 	 */
-	struct semaphore truncate_sem;
+	struct mutex truncate_mutex;
 	struct inode vfs_inode;
 };
 
-- 
cgit v1.2.3


From 8e3f90459b7052c31a9669417b837fb14aa6d313 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:43 -0800
Subject: [PATCH] sem2mutex: NCPFS

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ncpfs/file.c           |  4 ++--
 fs/ncpfs/inode.c          |  6 +++---
 fs/ncpfs/ncplib_kernel.c  |  4 ++--
 fs/ncpfs/sock.c           | 34 +++++++++++++++++-----------------
 include/linux/ncp_fs_i.h  |  2 +-
 include/linux/ncp_fs_sb.h |  5 +++--
 6 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 973b444d6914..ebdad8f6398f 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -46,7 +46,7 @@ int ncp_make_open(struct inode *inode, int right)
 		NCP_FINFO(inode)->volNumber, 
 		NCP_FINFO(inode)->dirEntNum);
 	error = -EACCES;
-	down(&NCP_FINFO(inode)->open_sem);
+	mutex_lock(&NCP_FINFO(inode)->open_mutex);
 	if (!atomic_read(&NCP_FINFO(inode)->opened)) {
 		struct ncp_entry_info finfo;
 		int result;
@@ -93,7 +93,7 @@ int ncp_make_open(struct inode *inode, int right)
 	}
 
 out_unlock:
-	up(&NCP_FINFO(inode)->open_sem);
+	mutex_unlock(&NCP_FINFO(inode)->open_mutex);
 out:
 	return error;
 }
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d277a58bd128..0b521d3d97ce 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -63,7 +63,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
-		init_MUTEX(&ei->open_sem);
+		mutex_init(&ei->open_mutex);
 		inode_init_once(&ei->vfs_inode);
 	}
 }
@@ -520,7 +520,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	}
 
 /*	server->lock = 0;	*/
-	init_MUTEX(&server->sem);
+	mutex_init(&server->mutex);
 	server->packet = NULL;
 /*	server->buffer_size = 0;	*/
 /*	server->conn_status = 0;	*/
@@ -557,7 +557,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	server->dentry_ttl = 0;	/* no caching */
 
 	INIT_LIST_HEAD(&server->tx.requests);
-	init_MUTEX(&server->rcv.creq_sem);
+	mutex_init(&server->rcv.creq_mutex);
 	server->tx.creq		= NULL;
 	server->rcv.creq	= NULL;
 	server->data_ready	= sock->sk->sk_data_ready;
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index c755e1848a42..d9ebf6439f59 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -291,7 +291,7 @@ ncp_make_closed(struct inode *inode)
 	int err;
 
 	err = 0;
-	down(&NCP_FINFO(inode)->open_sem);	
+	mutex_lock(&NCP_FINFO(inode)->open_mutex);
 	if (atomic_read(&NCP_FINFO(inode)->opened) == 1) {
 		atomic_set(&NCP_FINFO(inode)->opened, 0);
 		err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle);
@@ -301,7 +301,7 @@ ncp_make_closed(struct inode *inode)
 				NCP_FINFO(inode)->volNumber,
 				NCP_FINFO(inode)->dirEntNum, err);
 	}
-	up(&NCP_FINFO(inode)->open_sem);
+	mutex_unlock(&NCP_FINFO(inode)->open_mutex);
 	return err;
 }
 
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 6593a5ca88ba..8783eb7ec641 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -171,9 +171,9 @@ static inline void __ncp_abort_request(struct ncp_server *server, struct ncp_req
 
 static inline void ncp_abort_request(struct ncp_server *server, struct ncp_request_reply *req, int err)
 {
-	down(&server->rcv.creq_sem);
+	mutex_lock(&server->rcv.creq_mutex);
 	__ncp_abort_request(server, req, err);
-	up(&server->rcv.creq_sem);
+	mutex_unlock(&server->rcv.creq_mutex);
 }
 
 static inline void __ncptcp_abort(struct ncp_server *server)
@@ -303,20 +303,20 @@ static inline void __ncp_start_request(struct ncp_server *server, struct ncp_req
 
 static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *req)
 {
-	down(&server->rcv.creq_sem);
+	mutex_lock(&server->rcv.creq_mutex);
 	if (!ncp_conn_valid(server)) {
-		up(&server->rcv.creq_sem);
+		mutex_unlock(&server->rcv.creq_mutex);
 		printk(KERN_ERR "ncpfs: tcp: Server died\n");
 		return -EIO;
 	}
 	if (server->tx.creq || server->rcv.creq) {
 		req->status = RQ_QUEUED;
 		list_add_tail(&req->req, &server->tx.requests);
-		up(&server->rcv.creq_sem);
+		mutex_unlock(&server->rcv.creq_mutex);
 		return 0;
 	}
 	__ncp_start_request(server, req);
-	up(&server->rcv.creq_sem);
+	mutex_unlock(&server->rcv.creq_mutex);
 	return 0;
 }
 
@@ -400,7 +400,7 @@ void ncpdgram_rcv_proc(void *s)
 				info_server(server, 0, server->unexpected_packet.data, result);
 				continue;
 			}
-			down(&server->rcv.creq_sem);		
+			mutex_lock(&server->rcv.creq_mutex);
 			req = server->rcv.creq;
 			if (req && (req->tx_type == NCP_ALLOC_SLOT_REQUEST || (server->sequence == reply.sequence && 
 					server->connection == get_conn_number(&reply)))) {
@@ -430,11 +430,11 @@ void ncpdgram_rcv_proc(void *s)
 				     	server->rcv.creq = NULL;
 					ncp_finish_request(req, result);
 					__ncp_next_request(server);
-					up(&server->rcv.creq_sem);
+					mutex_unlock(&server->rcv.creq_mutex);
 					continue;
 				}
 			}
-			up(&server->rcv.creq_sem);
+			mutex_unlock(&server->rcv.creq_mutex);
 		}
 drop:;		
 		_recv(sock, &reply, sizeof(reply), MSG_DONTWAIT);
@@ -472,9 +472,9 @@ static void __ncpdgram_timeout_proc(struct ncp_server *server)
 void ncpdgram_timeout_proc(void *s)
 {
 	struct ncp_server *server = s;
-	down(&server->rcv.creq_sem);
+	mutex_lock(&server->rcv.creq_mutex);
 	__ncpdgram_timeout_proc(server);
-	up(&server->rcv.creq_sem);
+	mutex_unlock(&server->rcv.creq_mutex);
 }
 
 static inline void ncp_init_req(struct ncp_request_reply* req)
@@ -657,18 +657,18 @@ void ncp_tcp_rcv_proc(void *s)
 {
 	struct ncp_server *server = s;
 
-	down(&server->rcv.creq_sem);
+	mutex_lock(&server->rcv.creq_mutex);
 	__ncptcp_rcv_proc(server);
-	up(&server->rcv.creq_sem);
+	mutex_unlock(&server->rcv.creq_mutex);
 }
 
 void ncp_tcp_tx_proc(void *s)
 {
 	struct ncp_server *server = s;
 	
-	down(&server->rcv.creq_sem);
+	mutex_lock(&server->rcv.creq_mutex);
 	__ncptcp_try_send(server);
-	up(&server->rcv.creq_sem);
+	mutex_unlock(&server->rcv.creq_mutex);
 }
 
 static int do_ncp_rpc_call(struct ncp_server *server, int size,
@@ -833,7 +833,7 @@ int ncp_disconnect(struct ncp_server *server)
 
 void ncp_lock_server(struct ncp_server *server)
 {
-	down(&server->sem);
+	mutex_lock(&server->mutex);
 	if (server->lock)
 		printk(KERN_WARNING "ncp_lock_server: was locked!\n");
 	server->lock = 1;
@@ -846,5 +846,5 @@ void ncp_unlock_server(struct ncp_server *server)
 		return;
 	}
 	server->lock = 0;
-	up(&server->sem);
+	mutex_unlock(&server->mutex);
 }
diff --git a/include/linux/ncp_fs_i.h b/include/linux/ncp_fs_i.h
index 415be1ec6f98..bdb4c8ae6924 100644
--- a/include/linux/ncp_fs_i.h
+++ b/include/linux/ncp_fs_i.h
@@ -19,7 +19,7 @@ struct ncp_inode_info {
 	__le32	DosDirNum;
 	__u8	volNumber;
 	__le32	nwattr;
-	struct semaphore open_sem;
+	struct mutex open_mutex;
 	atomic_t	opened;
 	int	access;
 	int	flags;
diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h
index cf858eb80f0b..b089d9506283 100644
--- a/include/linux/ncp_fs_sb.h
+++ b/include/linux/ncp_fs_sb.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/ncp_mount.h>
 #include <linux/net.h>
+#include <linux/mutex.h>
 
 #ifdef __KERNEL__
 
@@ -51,7 +52,7 @@ struct ncp_server {
 				   receive replies */
 
 	int lock;		/* To prevent mismatch in protocols. */
-	struct semaphore sem;
+	struct mutex mutex;
 
 	int current_size;	/* for packet preparation */
 	int has_subfunction;
@@ -96,7 +97,7 @@ struct ncp_server {
 	struct {
 		struct work_struct tq;		/* STREAM/DGRAM: data/error ready */
 		struct ncp_request_reply* creq;	/* STREAM/DGRAM: awaiting reply from this request */
-		struct semaphore creq_sem;	/* DGRAM only: lock accesses to rcv.creq */
+		struct mutex creq_mutex;	/* DGRAM only: lock accesses to rcv.creq */
 
 		unsigned int state;		/* STREAM only: receiver state */
 		struct {
-- 
cgit v1.2.3


From 1e7933defd0fce79b2d8ecdbc7ca37fed0c188ed Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:44 -0800
Subject: [PATCH] sem2mutex: UDF

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/udf/balloc.c           | 36 ++++++++++++++++++------------------
 fs/udf/ialloc.c           |  8 ++++----
 fs/udf/super.c            |  2 +-
 include/linux/udf_fs_sb.h |  4 ++--
 4 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 201049ac8a96..ea521f846d97 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -152,7 +152,7 @@ static void udf_bitmap_free_blocks(struct super_block * sb,
 	int bitmap_nr;
 	unsigned long overflow;
 
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 	if (bloc.logicalBlockNum < 0 ||
 		(bloc.logicalBlockNum + count) > UDF_SB_PARTLEN(sb, bloc.partitionReferenceNum))
 	{
@@ -211,7 +211,7 @@ error_return:
 	sb->s_dirt = 1;
 	if (UDF_SB_LVIDBH(sb))
 		mark_buffer_dirty(UDF_SB_LVIDBH(sb));
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 	return;
 }
 
@@ -226,7 +226,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block * sb,
 	int nr_groups, bitmap_nr;
 	struct buffer_head *bh;
 
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 	if (first_block < 0 || first_block >= UDF_SB_PARTLEN(sb, partition))
 		goto out;
 
@@ -275,7 +275,7 @@ out:
 		mark_buffer_dirty(UDF_SB_LVIDBH(sb));
 	}
 	sb->s_dirt = 1;
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 	return alloc_count;
 }
 
@@ -291,7 +291,7 @@ static int udf_bitmap_new_block(struct super_block * sb,
 	int newblock = 0;
 
 	*err = -ENOSPC;
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 
 repeat:
 	if (goal < 0 || goal >= UDF_SB_PARTLEN(sb, partition))
@@ -364,7 +364,7 @@ repeat:
 	}
 	if (i >= (nr_groups*2))
 	{
-		up(&sbi->s_alloc_sem);
+		mutex_unlock(&sbi->s_alloc_mutex);
 		return newblock;
 	}
 	if (bit < sb->s_blocksize << 3)
@@ -373,7 +373,7 @@ repeat:
 		bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3, group_start << 3);
 	if (bit >= sb->s_blocksize << 3)
 	{
-		up(&sbi->s_alloc_sem);
+		mutex_unlock(&sbi->s_alloc_mutex);
 		return 0;
 	}
 
@@ -387,7 +387,7 @@ got_block:
 	 */
 	if (inode && DQUOT_ALLOC_BLOCK(inode, 1))
 	{
-		up(&sbi->s_alloc_sem);
+		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
 		return 0;
 	}
@@ -410,13 +410,13 @@ got_block:
 		mark_buffer_dirty(UDF_SB_LVIDBH(sb));
 	}
 	sb->s_dirt = 1;
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 	*err = 0;
 	return newblock;
 
 error_return:
 	*err = -EIO;
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 	return 0;
 }
 
@@ -433,7 +433,7 @@ static void udf_table_free_blocks(struct super_block * sb,
 	int8_t etype;
 	int i;
 
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 	if (bloc.logicalBlockNum < 0 ||
 		(bloc.logicalBlockNum + count) > UDF_SB_PARTLEN(sb, bloc.partitionReferenceNum))
 	{
@@ -666,7 +666,7 @@ static void udf_table_free_blocks(struct super_block * sb,
 
 error_return:
 	sb->s_dirt = 1;
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 	return;
 }
 
@@ -692,7 +692,7 @@ static int udf_table_prealloc_blocks(struct super_block * sb,
 	else
 		return 0;
 
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 	extoffset = sizeof(struct unallocSpaceEntry);
 	bloc = UDF_I_LOCATION(table);
 
@@ -736,7 +736,7 @@ static int udf_table_prealloc_blocks(struct super_block * sb,
 		mark_buffer_dirty(UDF_SB_LVIDBH(sb));
 		sb->s_dirt = 1;
 	}
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 	return alloc_count;
 }
 
@@ -761,7 +761,7 @@ static int udf_table_new_block(struct super_block * sb,
 	else
 		return newblock;
 
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 	if (goal < 0 || goal >= UDF_SB_PARTLEN(sb, partition))
 		goal = 0;
 
@@ -811,7 +811,7 @@ static int udf_table_new_block(struct super_block * sb,
 	if (spread == 0xFFFFFFFF)
 	{
 		udf_release_data(goal_bh);
-		up(&sbi->s_alloc_sem);
+		mutex_unlock(&sbi->s_alloc_mutex);
 		return 0;
 	}
 
@@ -827,7 +827,7 @@ static int udf_table_new_block(struct super_block * sb,
 	if (inode && DQUOT_ALLOC_BLOCK(inode, 1))
 	{
 		udf_release_data(goal_bh);
-		up(&sbi->s_alloc_sem);
+		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
 		return 0;
 	}
@@ -846,7 +846,7 @@ static int udf_table_new_block(struct super_block * sb,
 	}
 
 	sb->s_dirt = 1;
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 	*err = 0;
 	return newblock;
 }
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index c9b707b470ca..3873c672cb4c 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -42,7 +42,7 @@ void udf_free_inode(struct inode * inode)
 
 	clear_inode(inode);
 
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 	if (sbi->s_lvidbh) {
 		if (S_ISDIR(inode->i_mode))
 			UDF_SB_LVIDIU(sb)->numDirs =
@@ -53,7 +53,7 @@ void udf_free_inode(struct inode * inode)
 		
 		mark_buffer_dirty(sbi->s_lvidbh);
 	}
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 
 	udf_free_blocks(sb, NULL, UDF_I_LOCATION(inode), 0, 1);
 }
@@ -83,7 +83,7 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
 		return NULL;
 	}
 
-	down(&sbi->s_alloc_sem);
+	mutex_lock(&sbi->s_alloc_mutex);
 	UDF_I_UNIQUE(inode) = 0;
 	UDF_I_LENEXTENTS(inode) = 0;
 	UDF_I_NEXT_ALLOC_BLOCK(inode) = 0;
@@ -148,7 +148,7 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
 		UDF_I_CRTIME(inode) = current_fs_time(inode->i_sb);
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
-	up(&sbi->s_alloc_sem);
+	mutex_unlock(&sbi->s_alloc_mutex);
 
 	if (DQUOT_ALLOC_INODE(inode))
 	{
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 368d8f81fe54..9303c50c5d55 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1515,7 +1515,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sb->s_fs_info = sbi;
 	memset(UDF_SB(sb), 0x00, sizeof(struct udf_sb_info));
 
-	init_MUTEX(&sbi->s_alloc_sem);
+	mutex_init(&sbi->s_alloc_mutex);
 
 	if (!udf_parse_options((char *)options, &uopt))
 		goto error_out;
diff --git a/include/linux/udf_fs_sb.h b/include/linux/udf_fs_sb.h
index b15ff2e99c91..80ae9ef940dc 100644
--- a/include/linux/udf_fs_sb.h
+++ b/include/linux/udf_fs_sb.h
@@ -13,7 +13,7 @@
 #ifndef _UDF_FS_SB_H
 #define _UDF_FS_SB_H 1
 
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #pragma pack(1)
 
@@ -111,7 +111,7 @@ struct udf_sb_info
 	/* VAT inode */
 	struct inode		*s_vat;
 
-	struct semaphore	s_alloc_sem;
+	struct mutex		s_alloc_mutex;
 };
 
 #endif /* _UDF_FS_SB_H */
-- 
cgit v1.2.3


From 81861d78c9edf9a9b03a9ba1f5b242d658f16832 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:44 -0800
Subject: [PATCH] sem2mutex: serial ->port_write_mutex

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/generic_serial.c  | 14 +++++++-------
 drivers/char/ser_a2232.c       |  4 ++--
 drivers/char/sx.c              |  2 +-
 drivers/char/vme_scc.c         |  2 +-
 include/linux/generic_serial.h |  4 +++-
 5 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/generic_serial.c b/drivers/char/generic_serial.c
index e38a5f0e07bb..5e59c0b42731 100644
--- a/drivers/char/generic_serial.c
+++ b/drivers/char/generic_serial.c
@@ -48,8 +48,8 @@ static int gs_debug;
 #define NEW_WRITE_LOCKING 1
 #if NEW_WRITE_LOCKING
 #define DECL      /* Nothing */
-#define LOCKIT    down (& port->port_write_sem);
-#define RELEASEIT up (&port->port_write_sem);
+#define LOCKIT    mutex_lock(& port->port_write_mutex);
+#define RELEASEIT mutex_unlock(&port->port_write_mutex);
 #else
 #define DECL      unsigned long flags;
 #define LOCKIT    save_flags (flags);cli ()
@@ -124,14 +124,14 @@ int gs_write(struct tty_struct * tty,
 	/* get exclusive "write" access to this port (problem 3) */
 	/* This is not a spinlock because we can have a disk access (page 
 		 fault) in copy_from_user */
-	down (& port->port_write_sem);
+	mutex_lock(& port->port_write_mutex);
 
 	while (1) {
 
 		c = count;
  
 		/* This is safe because we "OWN" the "head". Noone else can 
-		   change the "head": we own the port_write_sem. */
+		   change the "head": we own the port_write_mutex. */
 		/* Don't overrun the end of the buffer */
 		t = SERIAL_XMIT_SIZE - port->xmit_head;
 		if (t < c) c = t;
@@ -153,7 +153,7 @@ int gs_write(struct tty_struct * tty,
 		count -= c;
 		total += c;
 	}
-	up (& port->port_write_sem);
+	mutex_unlock(& port->port_write_mutex);
 
 	gs_dprintk (GS_DEBUG_WRITE, "write: interrupts are %s\n", 
 	            (port->flags & GS_TX_INTEN)?"enabled": "disabled"); 
@@ -214,7 +214,7 @@ int gs_write(struct tty_struct * tty,
 		c = count;
 
 		/* This is safe because we "OWN" the "head". Noone else can 
-		   change the "head": we own the port_write_sem. */
+		   change the "head": we own the port_write_mutex. */
 		/* Don't overrun the end of the buffer */
 		t = SERIAL_XMIT_SIZE - port->xmit_head;
 		if (t < c) c = t;
@@ -888,7 +888,7 @@ int gs_init_port(struct gs_port *port)
 	spin_lock_irqsave (&port->driver_lock, flags);
 	if (port->tty) 
 		clear_bit(TTY_IO_ERROR, &port->tty->flags);
-	init_MUTEX(&port->port_write_sem);
+	mutex_init(&port->port_write_mutex);
 	port->xmit_cnt = port->xmit_head = port->xmit_tail = 0;
 	spin_unlock_irqrestore(&port->driver_lock, flags);
 	gs_set_termios(port->tty, NULL);
diff --git a/drivers/char/ser_a2232.c b/drivers/char/ser_a2232.c
index fee68cc895f8..510bd3e0e88b 100644
--- a/drivers/char/ser_a2232.c
+++ b/drivers/char/ser_a2232.c
@@ -97,7 +97,7 @@
 #include <asm/amigahw.h>
 #include <linux/zorro.h>
 #include <asm/irq.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #include <linux/delay.h>
 
@@ -654,7 +654,7 @@ static void a2232_init_portstructs(void)
 		port->gs.closing_wait = 30 * HZ;
 		port->gs.rd = &a2232_real_driver;
 #ifdef NEW_WRITE_LOCKING
-		init_MUTEX(&(port->gs.port_write_sem));
+		init_MUTEX(&(port->gs.port_write_mutex));
 #endif
 		init_waitqueue_head(&port->gs.open_wait);
 		init_waitqueue_head(&port->gs.close_wait);
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index a6b4f02bdceb..3b4747230270 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -2318,7 +2318,7 @@ static int sx_init_portstructs (int nboards, int nports)
 			port->board = board;
 			port->gs.rd = &sx_real_driver;
 #ifdef NEW_WRITE_LOCKING
-			port->gs.port_write_sem = MUTEX;
+			port->gs.port_write_mutex = MUTEX;
 #endif
 			port->gs.driver_lock = SPIN_LOCK_UNLOCKED;
 			/*
diff --git a/drivers/char/vme_scc.c b/drivers/char/vme_scc.c
index d9325281e482..fd00822ac145 100644
--- a/drivers/char/vme_scc.c
+++ b/drivers/char/vme_scc.c
@@ -184,7 +184,7 @@ static void scc_init_portstructs(void)
 		port->gs.closing_wait = 30 * HZ;
 		port->gs.rd = &scc_real_driver;
 #ifdef NEW_WRITE_LOCKING
-		port->gs.port_write_sem = MUTEX;
+		port->gs.port_write_mutex = MUTEX;
 #endif
 		init_waitqueue_head(&port->gs.open_wait);
 		init_waitqueue_head(&port->gs.close_wait);
diff --git a/include/linux/generic_serial.h b/include/linux/generic_serial.h
index 0abe9d9a0069..652611a4bdcd 100644
--- a/include/linux/generic_serial.h
+++ b/include/linux/generic_serial.h
@@ -12,6 +12,8 @@
 #ifndef GENERIC_SERIAL_H
 #define GENERIC_SERIAL_H
 
+#include <linux/mutex.h>
+
 struct real_driver {
   void                    (*disable_tx_interrupts) (void *);
   void                    (*enable_tx_interrupts) (void *);
@@ -34,7 +36,7 @@ struct gs_port {
   int                     xmit_head;
   int                     xmit_tail;
   int                     xmit_cnt;
-  struct semaphore        port_write_sem;
+  struct mutex            port_write_mutex;
   int                     flags;
   wait_queue_head_t       open_wait;
   wait_queue_head_t       close_wait;
-- 
cgit v1.2.3


From 6b9438e1323a2be10dcc039f6321e7ca18b9459e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Thu, 23 Mar 2006 03:00:48 -0800
Subject: [PATCH] fat_lock is used as a mutex, convert it to using the new
 mutex primitive

The fat code uses the fat_lock always in a mutex way (taking and releasing
the lock in the same function), the patch below converts it into the new
mutex primitive.  Please consider this patch for the code.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fat/fatent.c          | 6 +++---
 include/linux/msdos_fs.h | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index a1a9e0451217..ab171ea8e869 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -267,19 +267,19 @@ static struct fatent_operations fat32_ops = {
 
 static inline void lock_fat(struct msdos_sb_info *sbi)
 {
-	down(&sbi->fat_lock);
+	mutex_lock(&sbi->fat_lock);
 }
 
 static inline void unlock_fat(struct msdos_sb_info *sbi)
 {
-	up(&sbi->fat_lock);
+	mutex_unlock(&sbi->fat_lock);
 }
 
 void fat_ent_access_init(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
-	init_MUTEX(&sbi->fat_lock);
+	mutex_init(&sbi->fat_lock);
 
 	switch (sbi->fat_bits) {
 	case 32:
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 8bcd9450d926..779e6a5744c7 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -184,6 +184,7 @@ struct fat_slot_info {
 #include <linux/string.h>
 #include <linux/nls.h>
 #include <linux/fs.h>
+#include <linux/mutex.h>
 
 struct fat_mount_options {
 	uid_t fs_uid;
@@ -226,7 +227,7 @@ struct msdos_sb_info {
 	unsigned long max_cluster;   /* maximum cluster number */
 	unsigned long root_cluster;  /* first cluster of the root directory */
 	unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
-	struct semaphore fat_lock;
+	struct mutex fat_lock;
 	unsigned int prev_free;      /* previously allocated cluster number */
 	unsigned int free_clusters;  /* -1 if undefined */
 	struct fat_mount_options options;
-- 
cgit v1.2.3


From 66d2173053f64121f8dc69a1cf85972e37e1ee75 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 23 Mar 2006 03:00:51 -0800
Subject: [PATCH] Extract inode_inc_link_count(), inode_dec_link_count()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fs.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8fd8d9b90b00..f9c9dea636d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1115,6 +1115,18 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+static inline void inode_inc_link_count(struct inode *inode)
+{
+	inode->i_nlink++;
+	mark_inode_dirty(inode);
+}
+
+static inline void inode_dec_link_count(struct inode *inode)
+{
+	inode->i_nlink--;
+	mark_inode_dirty(inode);
+}
+
 extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
 static inline void file_accessed(struct file *file)
 {
-- 
cgit v1.2.3


From 772a0dc5d2103baff2f15c2668930bcd37add777 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:00:55 -0800
Subject: [PATCH] notifier: profile.h forward decl fix

Declarations use struct notifier_block on both legs of the ifdef, so move the
notifier_block forward declaration outside the ifdef.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/profile.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/profile.h b/include/linux/profile.h
index 026969a5595c..1f2fea6640a4 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -14,6 +14,7 @@
 
 struct proc_dir_entry;
 struct pt_regs;
+struct notifier_block;
 
 /* init basic kernel profiler */
 void __init profile_init(void);
@@ -32,7 +33,6 @@ enum profile_type {
 
 #ifdef CONFIG_PROFILING
 
-struct notifier_block;
 struct task_struct;
 struct mm_struct;
 
-- 
cgit v1.2.3


From 41c28ff1635e71af072c4711ff5fadd5855d48e7 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Mar 2006 03:00:56 -0800
Subject: [PATCH] kill _INLINE_

This patch removes all occurances of _INLINE_ in the kernel.

With the exception of tty_flip.h, I've simply removed the inline's since
gcc should know best which functions to be inlined.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/hp/sim/simserial.c       |  7 +---
 arch/xtensa/platform-iss/console.c |  4 ---
 drivers/char/amiserial.c           | 18 ++++------
 drivers/isdn/hisax/config.c        |  1 -
 drivers/isdn/hisax/elsa.c          |  1 -
 drivers/serial/68328serial.c       |  9 +++--
 drivers/serial/au1x00_uart.c       | 11 +++---
 drivers/serial/crisv10.c           | 68 ++++++++++++--------------------------
 drivers/serial/m32r_sio.c          | 15 ++++-----
 drivers/serial/sunsu.c             | 13 +++-----
 drivers/tc/zs.c                    |  9 ++---
 include/linux/tty_flip.h           | 12 ++-----
 12 files changed, 56 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index 626cdc83668b..0e5c6ae50228 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -46,11 +46,6 @@
 #define KEYBOARD_INTR	3	/* must match with simulator! */
 
 #define NR_PORTS	1	/* only one port for now */
-#define SERIAL_INLINE	1
-
-#ifdef SERIAL_INLINE
-#define _INLINE_ inline
-#endif
 
 #define IRQ_T(info) ((info->flags & ASYNC_SHARE_IRQ) ? SA_SHIRQ : SA_INTERRUPT)
 
@@ -237,7 +232,7 @@ static void rs_put_char(struct tty_struct *tty, unsigned char ch)
 	local_irq_restore(flags);
 }
 
-static _INLINE_ void transmit_chars(struct async_struct *info, int *intr_done)
+static void transmit_chars(struct async_struct *info, int *intr_done)
 {
 	int count;
 	unsigned long flags;
diff --git a/arch/xtensa/platform-iss/console.c b/arch/xtensa/platform-iss/console.c
index 94fdfe474ac1..2a580efb58ec 100644
--- a/arch/xtensa/platform-iss/console.c
+++ b/arch/xtensa/platform-iss/console.c
@@ -31,10 +31,6 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 
-#ifdef SERIAL_INLINE
-#define _INLINE_ inline
-#endif
-
 #define SERIAL_MAX_NUM_LINES 1
 #define SERIAL_TIMER_VALUE (20 * HZ)
 
diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c
index 7ac365b5d9ec..6602b3156df5 100644
--- a/drivers/char/amiserial.c
+++ b/drivers/char/amiserial.c
@@ -46,8 +46,6 @@
 
 /* Sanity checks */
 
-#define SERIAL_INLINE
-  
 #if defined(MODULE) && defined(SERIAL_DEBUG_MCOUNT)
 #define DBG_CNT(s) printk("(%s): [%x] refc=%d, serc=%d, ttyc=%d -> %s\n", \
  tty->name, (info->flags), serial_driver->refcount,info->count,tty->count,s)
@@ -95,10 +93,6 @@ static char *serial_version = "4.30";
 #include <asm/amigahw.h>
 #include <asm/amigaints.h>
 
-#ifdef SERIAL_INLINE
-#define _INLINE_ inline
-#endif
-
 #define custom amiga_custom
 static char *serial_name = "Amiga-builtin serial driver";
 
@@ -253,14 +247,14 @@ static void rs_start(struct tty_struct *tty)
  * This routine is used by the interrupt handler to schedule
  * processing in the software interrupt portion of the driver.
  */
-static _INLINE_ void rs_sched_event(struct async_struct *info,
-				  int event)
+static void rs_sched_event(struct async_struct *info,
+			   int event)
 {
 	info->event |= 1 << event;
 	tasklet_schedule(&info->tlet);
 }
 
-static _INLINE_ void receive_chars(struct async_struct *info)
+static void receive_chars(struct async_struct *info)
 {
         int status;
 	int serdatr;
@@ -349,7 +343,7 @@ out:
 	return;
 }
 
-static _INLINE_ void transmit_chars(struct async_struct *info)
+static void transmit_chars(struct async_struct *info)
 {
 	custom.intreq = IF_TBE;
 	mb();
@@ -389,7 +383,7 @@ static _INLINE_ void transmit_chars(struct async_struct *info)
 	}
 }
 
-static _INLINE_ void check_modem_status(struct async_struct *info)
+static void check_modem_status(struct async_struct *info)
 {
 	unsigned char status = ciab.pra & (SER_DCD | SER_CTS | SER_DSR);
 	unsigned char dstatus;
@@ -1959,7 +1953,7 @@ done:
  * number, and identifies which options were configured into this
  * driver.
  */
-static _INLINE_ void show_serial_version(void)
+static void show_serial_version(void)
 {
  	printk(KERN_INFO "%s version %s\n", serial_name, serial_version);
 }
diff --git a/drivers/isdn/hisax/config.c b/drivers/isdn/hisax/config.c
index df9d65201819..27332506f9f7 100644
--- a/drivers/isdn/hisax/config.c
+++ b/drivers/isdn/hisax/config.c
@@ -25,7 +25,6 @@
 #include <linux/workqueue.h>
 #include <linux/interrupt.h>
 #define HISAX_STATUS_BUFSIZE 4096
-#define INCLUDE_INLINE_FUNCS
 
 /*
  * This structure array contains one entry per card. An entry looks
diff --git a/drivers/isdn/hisax/elsa.c b/drivers/isdn/hisax/elsa.c
index 110e9fd669c5..f8ca4b323331 100644
--- a/drivers/isdn/hisax/elsa.c
+++ b/drivers/isdn/hisax/elsa.c
@@ -108,7 +108,6 @@ static const char *ITACVer[] =
 #define ELSA_ASSIGN      4
 
 #define RS_ISR_PASS_LIMIT 256
-#define _INLINE_ inline
 #define FLG_MODEM_ACTIVE 1
 /* IPAC AUX */
 #define ELSA_IPAC_LINE_LED	0x40	/* Bit 6 Gelbe LED */
diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c
index 7f0f35a05dca..b88a7c1158af 100644
--- a/drivers/serial/68328serial.c
+++ b/drivers/serial/68328serial.c
@@ -101,8 +101,6 @@ struct tty_driver *serial_driver;
 
 #define RS_ISR_PASS_LIMIT 256
 
-#define _INLINE_ inline
-
 static void change_speed(struct m68k_serial *info);
 
 /*
@@ -262,7 +260,7 @@ static void batten_down_hatches(void)
 	/* Drop into the debugger */
 }
 
-static _INLINE_ void status_handle(struct m68k_serial *info, unsigned short status)
+static void status_handle(struct m68k_serial *info, unsigned short status)
 {
 #if 0
 	if(status & DCD) {
@@ -289,7 +287,8 @@ static _INLINE_ void status_handle(struct m68k_serial *info, unsigned short stat
 	return;
 }
 
-static _INLINE_ void receive_chars(struct m68k_serial *info, struct pt_regs *regs, unsigned short rx)
+static void receive_chars(struct m68k_serial *info, struct pt_regs *regs,
+			  unsigned short rx)
 {
 	struct tty_struct *tty = info->tty;
 	m68328_uart *uart = &uart_addr[info->line];
@@ -359,7 +358,7 @@ clear_and_exit:
 	return;
 }
 
-static _INLINE_ void transmit_chars(struct m68k_serial *info)
+static void transmit_chars(struct m68k_serial *info)
 {
 	m68328_uart *uart = &uart_addr[info->line];
 
diff --git a/drivers/serial/au1x00_uart.c b/drivers/serial/au1x00_uart.c
index 29f94bbb79be..948880ac5878 100644
--- a/drivers/serial/au1x00_uart.c
+++ b/drivers/serial/au1x00_uart.c
@@ -133,13 +133,12 @@ static const struct serial_uart_config uart_config[PORT_MAX_8250+1] = {
 	{ "AU1X00_UART",16,	UART_CLEAR_FIFO | UART_USE_FIFO },
 };
 
-static _INLINE_ unsigned int serial_in(struct uart_8250_port *up, int offset)
+static unsigned int serial_in(struct uart_8250_port *up, int offset)
 {
 	return au_readl((unsigned long)up->port.membase + offset);
 }
 
-static _INLINE_ void
-serial_out(struct uart_8250_port *up, int offset, int value)
+static void serial_out(struct uart_8250_port *up, int offset, int value)
 {
 	au_writel(value, (unsigned long)up->port.membase + offset);
 }
@@ -237,7 +236,7 @@ static void serial8250_enable_ms(struct uart_port *port)
 	serial_out(up, UART_IER, up->ier);
 }
 
-static _INLINE_ void
+static void
 receive_chars(struct uart_8250_port *up, int *status, struct pt_regs *regs)
 {
 	struct tty_struct *tty = up->port.info->tty;
@@ -312,7 +311,7 @@ receive_chars(struct uart_8250_port *up, int *status, struct pt_regs *regs)
 	spin_lock(&up->port.lock);
 }
 
-static _INLINE_ void transmit_chars(struct uart_8250_port *up)
+static void transmit_chars(struct uart_8250_port *up)
 {
 	struct circ_buf *xmit = &up->port.info->xmit;
 	int count;
@@ -346,7 +345,7 @@ static _INLINE_ void transmit_chars(struct uart_8250_port *up)
 		serial8250_stop_tx(&up->port);
 }
 
-static _INLINE_ void check_modem_status(struct uart_8250_port *up)
+static void check_modem_status(struct uart_8250_port *up)
 {
 	int status;
 
diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c
index be12623d8544..89700141f87e 100644
--- a/drivers/serial/crisv10.c
+++ b/drivers/serial/crisv10.c
@@ -481,8 +481,6 @@ static char *serial_version = "$Revision: 1.25 $";
 #include "serial_compat.h"
 #endif
 
-#define _INLINE_ inline
-
 struct tty_driver *serial_driver;
 
 /* serial subtype definitions */
@@ -591,8 +589,6 @@ static void rs_throttle(struct tty_struct * tty);
 static void rs_wait_until_sent(struct tty_struct *tty, int timeout);
 static int rs_write(struct tty_struct * tty, int from_user,
                     const unsigned char *buf, int count);
-extern _INLINE_ int rs_raw_write(struct tty_struct * tty, int from_user,
-                            const unsigned char *buf, int count);
 #ifdef CONFIG_ETRAX_RS485
 static int e100_write_rs485(struct tty_struct * tty, int from_user,
                             const unsigned char *buf, int count);
@@ -1538,8 +1534,7 @@ e100_enable_rxdma_irq(struct e100_serial *info)
 
 /* the tx DMA uses only dma_descr interrupt */
 
-static _INLINE_ void
-e100_disable_txdma_irq(struct e100_serial *info)
+static void e100_disable_txdma_irq(struct e100_serial *info)
 {
 #ifdef SERIAL_DEBUG_INTR
 	printk("txdma_irq(%d): 0\n",info->line);
@@ -1548,8 +1543,7 @@ e100_disable_txdma_irq(struct e100_serial *info)
 	*R_IRQ_MASK2_CLR = info->irq;
 }
 
-static _INLINE_ void
-e100_enable_txdma_irq(struct e100_serial *info)
+static void e100_enable_txdma_irq(struct e100_serial *info)
 {
 #ifdef SERIAL_DEBUG_INTR
 	printk("txdma_irq(%d): 1\n",info->line);
@@ -1558,8 +1552,7 @@ e100_enable_txdma_irq(struct e100_serial *info)
 	*R_IRQ_MASK2_SET = info->irq;
 }
 
-static _INLINE_ void
-e100_disable_txdma_channel(struct e100_serial *info)
+static void e100_disable_txdma_channel(struct e100_serial *info)
 {
 	unsigned long flags;
 
@@ -1599,8 +1592,7 @@ e100_disable_txdma_channel(struct e100_serial *info)
 }
 
 
-static _INLINE_ void
-e100_enable_txdma_channel(struct e100_serial *info)
+static void e100_enable_txdma_channel(struct e100_serial *info)
 {
 	unsigned long flags;
 
@@ -1625,8 +1617,7 @@ e100_enable_txdma_channel(struct e100_serial *info)
 	restore_flags(flags);
 }
 
-static _INLINE_ void
-e100_disable_rxdma_channel(struct e100_serial *info)
+static void e100_disable_rxdma_channel(struct e100_serial *info)
 {
 	unsigned long flags;
 
@@ -1665,8 +1656,7 @@ e100_disable_rxdma_channel(struct e100_serial *info)
 }
 
 
-static _INLINE_ void
-e100_enable_rxdma_channel(struct e100_serial *info)
+static void e100_enable_rxdma_channel(struct e100_serial *info)
 {
 	unsigned long flags;
 
@@ -1913,9 +1903,7 @@ rs_start(struct tty_struct *tty)
  * This routine is used by the interrupt handler to schedule
  * processing in the software interrupt portion of the driver.
  */
-static _INLINE_ void
-rs_sched_event(struct e100_serial *info,
-				    int event)
+static void rs_sched_event(struct e100_serial *info, int event)
 {
 	if (info->event & (1 << event))
 		return;
@@ -2155,8 +2143,9 @@ add_char_and_flag(struct e100_serial *info, unsigned char data, unsigned char fl
 	return 1;
 }
 
-extern _INLINE_ unsigned int
-handle_descr_data(struct e100_serial *info, struct etrax_dma_descr *descr, unsigned int recvl)
+static unsigned int handle_descr_data(struct e100_serial *info,
+				      struct etrax_dma_descr *descr,
+				      unsigned int recvl)
 {
 	struct etrax_recv_buffer *buffer = phys_to_virt(descr->buf) - sizeof *buffer;
 
@@ -2182,8 +2171,7 @@ handle_descr_data(struct e100_serial *info, struct etrax_dma_descr *descr, unsig
 	return recvl;
 }
 
-static _INLINE_ unsigned int
-handle_all_descr_data(struct e100_serial *info)
+static unsigned int handle_all_descr_data(struct e100_serial *info)
 {
 	struct etrax_dma_descr *descr;
 	unsigned int recvl;
@@ -2230,8 +2218,7 @@ handle_all_descr_data(struct e100_serial *info)
 	return ret;
 }
 
-static _INLINE_ void
-receive_chars_dma(struct e100_serial *info)
+static void receive_chars_dma(struct e100_serial *info)
 {
 	struct tty_struct *tty;
 	unsigned char rstat;
@@ -2292,8 +2279,7 @@ receive_chars_dma(struct e100_serial *info)
 	*info->icmdadr = IO_STATE(R_DMA_CH6_CMD, cmd, restart);
 }
 
-static _INLINE_ int
-start_recv_dma(struct e100_serial *info)
+static int start_recv_dma(struct e100_serial *info)
 {
 	struct etrax_dma_descr *descr = info->rec_descr;
 	struct etrax_recv_buffer *buffer;
@@ -2348,11 +2334,6 @@ start_receive(struct e100_serial *info)
 }
 
 
-static _INLINE_ void
-status_handle(struct e100_serial *info, unsigned short status)
-{
-}
-
 /* the bits in the MASK2 register are laid out like this:
    DMAI_EOP DMAI_DESCR DMAO_EOP DMAO_DESCR
    where I is the input channel and O is the output channel for the port.
@@ -2454,8 +2435,7 @@ rec_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 	return IRQ_RETVAL(handled);
 } /* rec_interrupt */
 
-static _INLINE_ int
-force_eop_if_needed(struct e100_serial *info)
+static int force_eop_if_needed(struct e100_serial *info)
 {
 	/* We check data_avail bit to determine if data has
 	 * arrived since last time
@@ -2499,8 +2479,7 @@ force_eop_if_needed(struct e100_serial *info)
 	return 1;
 }
 
-extern _INLINE_ void
-flush_to_flip_buffer(struct e100_serial *info)
+static void flush_to_flip_buffer(struct e100_serial *info)
 {
 	struct tty_struct *tty;
 	struct etrax_recv_buffer *buffer;
@@ -2611,8 +2590,7 @@ flush_to_flip_buffer(struct e100_serial *info)
 	tty_flip_buffer_push(tty);
 }
 
-static _INLINE_ void
-check_flush_timeout(struct e100_serial *info)
+static void check_flush_timeout(struct e100_serial *info)
 {
 	/* Flip what we've got (if we can) */
 	flush_to_flip_buffer(info);
@@ -2741,7 +2719,7 @@ TODO: The break will be delayed until an F or V character is received.
 
 */
 
-extern _INLINE_
+static
 struct e100_serial * handle_ser_rx_interrupt_no_dma(struct e100_serial *info)
 {
 	unsigned long data_read;
@@ -2875,8 +2853,7 @@ more_data:
 	return info;
 }
 
-extern _INLINE_
-struct e100_serial* handle_ser_rx_interrupt(struct e100_serial *info)
+static struct e100_serial* handle_ser_rx_interrupt(struct e100_serial *info)
 {
 	unsigned char rstat;
 
@@ -2995,7 +2972,7 @@ struct e100_serial* handle_ser_rx_interrupt(struct e100_serial *info)
 	return info;
 } /* handle_ser_rx_interrupt */
 
-extern _INLINE_ void handle_ser_tx_interrupt(struct e100_serial *info)
+static void handle_ser_tx_interrupt(struct e100_serial *info)
 {
 	unsigned long flags;
 
@@ -3621,9 +3598,8 @@ rs_flush_chars(struct tty_struct *tty)
 	restore_flags(flags);
 }
 
-extern _INLINE_ int
-rs_raw_write(struct tty_struct * tty, int from_user,
-	  const unsigned char *buf, int count)
+static int rs_raw_write(struct tty_struct * tty, int from_user,
+			const unsigned char *buf, int count)
 {
 	int	c, ret = 0;
 	struct e100_serial *info = (struct e100_serial *)tty->driver_data;
@@ -4710,7 +4686,7 @@ rs_open(struct tty_struct *tty, struct file * filp)
  * /proc fs routines....
  */
 
-extern _INLINE_ int line_info(char *buf, struct e100_serial *info)
+static int line_info(char *buf, struct e100_serial *info)
 {
 	char	stat_buf[30];
 	int	ret;
diff --git a/drivers/serial/m32r_sio.c b/drivers/serial/m32r_sio.c
index 876bc5e027bb..e9c10c0a30fc 100644
--- a/drivers/serial/m32r_sio.c
+++ b/drivers/serial/m32r_sio.c
@@ -248,17 +248,17 @@ static void sio_error(int *status)
 
 #endif /* CONFIG_SERIAL_M32R_PLDSIO */
 
-static _INLINE_ unsigned int sio_in(struct uart_sio_port *up, int offset)
+static unsigned int sio_in(struct uart_sio_port *up, int offset)
 {
 	return __sio_in(up->port.iobase + offset);
 }
 
-static _INLINE_ void sio_out(struct uart_sio_port *up, int offset, int value)
+static void sio_out(struct uart_sio_port *up, int offset, int value)
 {
 	__sio_out(value, up->port.iobase + offset);
 }
 
-static _INLINE_ unsigned int serial_in(struct uart_sio_port *up, int offset)
+static unsigned int serial_in(struct uart_sio_port *up, int offset)
 {
 	if (!offset)
 		return 0;
@@ -266,8 +266,7 @@ static _INLINE_ unsigned int serial_in(struct uart_sio_port *up, int offset)
 	return __sio_in(offset);
 }
 
-static _INLINE_ void
-serial_out(struct uart_sio_port *up, int offset, int value)
+static void serial_out(struct uart_sio_port *up, int offset, int value)
 {
 	if (!offset)
 		return;
@@ -326,8 +325,8 @@ static void m32r_sio_enable_ms(struct uart_port *port)
 	serial_out(up, UART_IER, up->ier);
 }
 
-static _INLINE_ void receive_chars(struct uart_sio_port *up, int *status,
-	struct pt_regs *regs)
+static void receive_chars(struct uart_sio_port *up, int *status,
+			  struct pt_regs *regs)
 {
 	struct tty_struct *tty = up->port.info->tty;
 	unsigned char ch;
@@ -400,7 +399,7 @@ static _INLINE_ void receive_chars(struct uart_sio_port *up, int *status,
 	tty_flip_buffer_push(tty);
 }
 
-static _INLINE_ void transmit_chars(struct uart_sio_port *up)
+static void transmit_chars(struct uart_sio_port *up)
 {
 	struct circ_buf *xmit = &up->port.info->xmit;
 	int count;
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 7fc3d3b41d18..9fe2283d91e5 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -102,9 +102,7 @@ struct uart_sunsu_port {
 #endif
 };
 
-#define _INLINE_
-
-static _INLINE_ unsigned int serial_in(struct uart_sunsu_port *up, int offset)
+static unsigned int serial_in(struct uart_sunsu_port *up, int offset)
 {
 	offset <<= up->port.regshift;
 
@@ -121,8 +119,7 @@ static _INLINE_ unsigned int serial_in(struct uart_sunsu_port *up, int offset)
 	}
 }
 
-static _INLINE_ void
-serial_out(struct uart_sunsu_port *up, int offset, int value)
+static void serial_out(struct uart_sunsu_port *up, int offset, int value)
 {
 #ifndef CONFIG_SPARC64
 	/*
@@ -316,7 +313,7 @@ static void sunsu_enable_ms(struct uart_port *port)
 	spin_unlock_irqrestore(&up->port.lock, flags);
 }
 
-static _INLINE_ struct tty_struct *
+static struct tty_struct *
 receive_chars(struct uart_sunsu_port *up, unsigned char *status, struct pt_regs *regs)
 {
 	struct tty_struct *tty = up->port.info->tty;
@@ -395,7 +392,7 @@ receive_chars(struct uart_sunsu_port *up, unsigned char *status, struct pt_regs
 	return tty;
 }
 
-static _INLINE_ void transmit_chars(struct uart_sunsu_port *up)
+static void transmit_chars(struct uart_sunsu_port *up)
 {
 	struct circ_buf *xmit = &up->port.info->xmit;
 	int count;
@@ -431,7 +428,7 @@ static _INLINE_ void transmit_chars(struct uart_sunsu_port *up)
 		__stop_tx(up);
 }
 
-static _INLINE_ void check_modem_status(struct uart_sunsu_port *up)
+static void check_modem_status(struct uart_sunsu_port *up)
 {
 	int status;
 
diff --git a/drivers/tc/zs.c b/drivers/tc/zs.c
index 6756d0fab6fe..2dffa8e303b2 100644
--- a/drivers/tc/zs.c
+++ b/drivers/tc/zs.c
@@ -186,8 +186,6 @@ static struct tty_driver *serial_driver;
 #define RS_STROBE_TIME 10
 #define RS_ISR_PASS_LIMIT 256
 
-#define _INLINE_ inline
-
 static void probe_sccs(void);
 static void change_speed(struct dec_serial *info);
 static void rs_wait_until_sent(struct tty_struct *tty, int timeout);
@@ -344,14 +342,13 @@ static inline void rs_recv_clear(struct dec_zschannel *zsc)
  * This routine is used by the interrupt handler to schedule
  * processing in the software interrupt portion of the driver.
  */
-static _INLINE_ void rs_sched_event(struct dec_serial *info, int event)
+static void rs_sched_event(struct dec_serial *info, int event)
 {
 	info->event |= 1 << event;
 	tasklet_schedule(&info->tlet);
 }
 
-static _INLINE_ void receive_chars(struct dec_serial *info,
-				   struct pt_regs *regs)
+static void receive_chars(struct dec_serial *info, struct pt_regs *regs)
 {
 	struct tty_struct *tty = info->tty;
 	unsigned char ch, stat, flag;
@@ -441,7 +438,7 @@ static void transmit_chars(struct dec_serial *info)
 		rs_sched_event(info, RS_EVENT_WRITE_WAKEUP);
 }
 
-static _INLINE_ void status_handle(struct dec_serial *info)
+static void status_handle(struct dec_serial *info)
 {
 	unsigned char stat;
 
diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index 222faf97d5f9..0c6169fff366 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -7,14 +7,8 @@ extern int tty_insert_flip_string_flags(struct tty_struct *tty, unsigned char *c
 extern int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_t size);
 extern int tty_prepare_flip_string_flags(struct tty_struct *tty, unsigned char **chars, char **flags, size_t size);
 
-#ifdef INCLUDE_INLINE_FUNCS
-#define _INLINE_ extern
-#else
-#define _INLINE_ static __inline__
-#endif
-
-_INLINE_ int tty_insert_flip_char(struct tty_struct *tty,
-				   unsigned char ch, char flag)
+static inline int tty_insert_flip_char(struct tty_struct *tty,
+				       unsigned char ch, char flag)
 {
 	struct tty_buffer *tb = tty->buf.tail;
 	if (tb && tb->active && tb->used < tb->size) {
@@ -25,7 +19,7 @@ _INLINE_ int tty_insert_flip_char(struct tty_struct *tty,
 	return tty_insert_flip_string_flags(tty, &ch, &flag, 1);
 }
 
-_INLINE_ void tty_schedule_flip(struct tty_struct *tty)
+static inline void tty_schedule_flip(struct tty_struct *tty)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&tty->buf.lock, flags);
-- 
cgit v1.2.3


From dd287796d608fcdc3fe5e8fdb5bf762a8f1bc32a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:00:57 -0800
Subject: [PATCH] pause_on_oops command line option

Attempt to fix the problem wherein people's oops reports scroll off the screen
due to repeated oopsing or to oopses on other CPUs.

If this happens the user can reboot with the `pause_on_oops=<seconds>' option.
It will allow the first oopsing CPU to print an oops record just a single
time.  Second oopsing attempts, or oopses on other CPUs will cause those CPUs
to enter a tight loop until the specified number of seconds have elapsed.

The patch implements the infrastructure generically in the expectation that
architectures other than x86 will find it useful.

Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |  5 ++
 arch/i386/kernel/traps.c            |  3 ++
 arch/i386/mm/fault.c                | 39 +++++++++------
 include/linux/kernel.h              |  3 ++
 kernel/panic.c                      | 97 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 130 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 880be3a30d8d..7b7382d0f758 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1126,6 +1126,11 @@ running once the system is up.
 	pas16=		[HW,SCSI]
 			See header of drivers/scsi/pas16.c.
 
+	pause_on_oops=
+			Halt all CPUs after the first oops has been printed for
+			the specified number of seconds.  This is to be used if
+			your oopses keep scrolling off the screen.
+
 	pcbit=		[HW,ISDN]
 
 	pcd.		[PARIDE]
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 1b7ad4115d81..de5386b01d38 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -352,6 +352,8 @@ void die(const char * str, struct pt_regs * regs, long err)
 	static int die_counter;
 	unsigned long flags;
 
+	oops_enter();
+
 	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
 		spin_lock_irqsave(&die.lock, flags);
@@ -404,6 +406,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 		ssleep(5);
 		panic("Fatal exception");
 	}
+	oops_exit();
 	do_exit(SIGSEGV);
 }
 
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 47a3b72ec7b6..7f0fcf219a26 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -509,24 +509,31 @@ no_context:
 
 	bust_spinlocks(1);
 
-#ifdef CONFIG_X86_PAE
-	if (error_code & 16) {
-		pte_t *pte = lookup_address(address);
-
-		if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-			printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
+	if (oops_may_print()) {
+	#ifdef CONFIG_X86_PAE
+		if (error_code & 16) {
+			pte_t *pte = lookup_address(address);
+
+			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+				printk(KERN_CRIT "kernel tried to execute "
+					"NX-protected page - exploit attempt? "
+					"(uid: %d)\n", current->uid);
+		}
+	#endif
+		if (address < PAGE_SIZE)
+			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
+					"pointer dereference");
+		else
+			printk(KERN_ALERT "BUG: unable to handle kernel paging"
+					" request");
+		printk(" at virtual address %08lx\n",address);
+		printk(KERN_ALERT " printing eip:\n");
+		printk("%08lx\n", regs->eip);
 	}
-#endif
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "BUG: unable to handle kernel NULL pointer dereference");
-	else
-		printk(KERN_ALERT "BUG: unable to handle kernel paging request");
-	printk(" at virtual address %08lx\n",address);
-	printk(KERN_ALERT " printing eip:\n");
-	printk("%08lx\n", regs->eip);
 	page = read_cr3();
 	page = ((unsigned long *) __va(page))[address >> 22];
-	printk(KERN_ALERT "*pde = %08lx\n", page);
+	if (oops_may_print())
+		printk(KERN_ALERT "*pde = %08lx\n", page);
 	/*
 	 * We must not directly access the pte in the highpte
 	 * case, the page table might be allocated in highmem.
@@ -534,7 +541,7 @@ no_context:
 	 * it's allocated already.
 	 */
 #ifndef CONFIG_HIGHPTE
-	if (page & 1) {
+	if ((page & 1) && oops_may_print()) {
 		page &= PAGE_MASK;
 		address &= 0x003ff000;
 		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3b507bf05d09..bb6e7ddee2fd 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -91,6 +91,9 @@ extern struct notifier_block *panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
 	__attribute__ ((NORET_AND format (printf, 1, 2)));
+extern void oops_enter(void);
+extern void oops_exit(void);
+extern int oops_may_print(void);
 fastcall NORET_TYPE void do_exit(long error_code)
 	ATTRIB_NORET;
 NORET_TYPE void complete_and_exit(struct completion *, long)
diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c74..acd95adddb93 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,10 +20,13 @@
 #include <linux/nmi.h>
 #include <linux/kexec.h>
 
-int panic_timeout;
 int panic_on_oops;
 int tainted;
+static int pause_on_oops;
+static int pause_on_oops_flag;
+static DEFINE_SPINLOCK(pause_on_oops_lock);
 
+int panic_timeout;
 EXPORT_SYMBOL(panic_timeout);
 
 struct notifier_block *panic_notifier_list;
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
 	tainted |= flag;
 }
 EXPORT_SYMBOL(add_taint);
+
+static int __init pause_on_oops_setup(char *str)
+{
+	pause_on_oops = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("pause_on_oops=", pause_on_oops_setup);
+
+static void spin_msec(int msecs)
+{
+	int i;
+
+	for (i = 0; i < msecs; i++) {
+		touch_nmi_watchdog();
+		mdelay(1);
+	}
+}
+
+/*
+ * It just happens that oops_enter() and oops_exit() are identically
+ * implemented...
+ */
+static void do_oops_enter_exit(void)
+{
+	unsigned long flags;
+	static int spin_counter;
+
+	if (!pause_on_oops)
+		return;
+
+	spin_lock_irqsave(&pause_on_oops_lock, flags);
+	if (pause_on_oops_flag == 0) {
+		/* This CPU may now print the oops message */
+		pause_on_oops_flag = 1;
+	} else {
+		/* We need to stall this CPU */
+		if (!spin_counter) {
+			/* This CPU gets to do the counting */
+			spin_counter = pause_on_oops;
+			do {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(MSEC_PER_SEC);
+				spin_lock(&pause_on_oops_lock);
+			} while (--spin_counter);
+			pause_on_oops_flag = 0;
+		} else {
+			/* This CPU waits for a different one */
+			while (spin_counter) {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(1);
+				spin_lock(&pause_on_oops_lock);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&pause_on_oops_lock, flags);
+}
+
+/*
+ * Return true if the calling CPU is allowed to print oops-related info.  This
+ * is a bit racy..
+ */
+int oops_may_print(void)
+{
+	return pause_on_oops_flag == 0;
+}
+
+/*
+ * Called when the architecture enters its oops handler, before it prints
+ * anything.  If this is the first CPU to oops, and it's oopsing the first time
+ * then let it proceed.
+ *
+ * This is all enabled by the pause_on_oops kernel boot option.  We do all this
+ * to ensure that oopses don't scroll off the screen.  It has the side-effect
+ * of preventing later-oopsing CPUs from mucking up the display, too.
+ *
+ * It turns out that the CPU which is allowed to print ends up pausing for the
+ * right duration, whereas all the other CPUs pause for twice as long: once in
+ * oops_enter(), once in oops_exit().
+ */
+void oops_enter(void)
+{
+	do_oops_enter_exit();
+}
+
+/*
+ * Called when the architecture exits its oops handler, after printing
+ * everything.
+ */
+void oops_exit(void)
+{
+	do_oops_enter_exit();
+}
-- 
cgit v1.2.3


From 2178426d26661ed6e18a8d6ea0bc05c98d73600d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Mar 2006 03:01:00 -0800
Subject: [PATCH] kernel/rcupdate.c: make two structs static

This patch makes two needlessly global structs static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rcupdate.h | 2 --
 kernel/rcupdate.c        | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c2ec6c77874e..5673008b61e1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -113,8 +113,6 @@ struct rcu_data {
 
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-extern struct rcu_ctrlblk rcu_ctrlblk;
-extern struct rcu_ctrlblk rcu_bh_ctrlblk;
 
 /*
  * Increment the quiescent state counter.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index af8a2a57e17d..6df1559b1c02 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -50,13 +50,13 @@
 #include <linux/mutex.h>
 
 /* Definition for rcupdate control block. */
-struct rcu_ctrlblk rcu_ctrlblk = {
+static struct rcu_ctrlblk rcu_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.lock = SPIN_LOCK_UNLOCKED,
 	.cpumask = CPU_MASK_NONE,
 };
-struct rcu_ctrlblk rcu_bh_ctrlblk = {
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.lock = SPIN_LOCK_UNLOCKED,
-- 
cgit v1.2.3


From 5a6b7951bfcca7f45f44269ea87417c74558daf8 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@linux.intel.com>
Date: Thu, 23 Mar 2006 03:01:03 -0800
Subject: [PATCH] get_empty_filp tweaks, inline epoll_init_file()

Eliminate a handful of cache references by keeping current in a register
instead of reloading (helps x86) and avoiding the overhead of a function
call.  Inlining eventpoll_init_file() saves 24 bytes.  Also reorder file
initialization to make writes occur more sequentially.

Signed-off-by: Benjamin LaHaise <bcrl@linux.intel.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/eventpoll.c            |  9 ---------
 fs/file_table.c           | 10 ++++++----
 include/linux/eventpoll.h |  8 ++++++--
 3 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f5d69f46ba9b..1c2b16fda13a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -452,15 +452,6 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 }
 
 
-/* Used to initialize the epoll bits inside the "struct file" */
-void eventpoll_init_file(struct file *file)
-{
-
-	INIT_LIST_HEAD(&file->f_ep_links);
-	spin_lock_init(&file->f_ep_lock);
-}
-
-
 /*
  * This is called from eventpoll_release() to unlink files from the eventpoll
  * interface. We need to have this facility to cleanup correctly files that are
diff --git a/fs/file_table.c b/fs/file_table.c
index 44fabeaa9415..bcea1998b4de 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -88,6 +88,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *filp,
  */
 struct file *get_empty_filp(void)
 {
+	struct task_struct *tsk;
 	static int old_max;
 	struct file * f;
 
@@ -112,13 +113,14 @@ struct file *get_empty_filp(void)
 	if (security_file_alloc(f))
 		goto fail_sec;
 
-	eventpoll_init_file(f);
+	tsk = current;
+	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_set(&f->f_count, 1);
-	f->f_uid = current->fsuid;
-	f->f_gid = current->fsgid;
 	rwlock_init(&f->f_owner.lock);
+	f->f_uid = tsk->fsuid;
+	f->f_gid = tsk->fsgid;
+	eventpoll_init_file(f);
 	/* f->f_version: 0 */
-	INIT_LIST_HEAD(&f->f_u.fu_list);
 	return f;
 
 over:
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 1289f0ec4c00..1e4bdfcf83a2 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -52,7 +52,12 @@ struct file;
 #ifdef CONFIG_EPOLL
 
 /* Used to initialize the epoll bits inside the "struct file" */
-void eventpoll_init_file(struct file *file);
+static inline void eventpoll_init_file(struct file *file)
+{
+	INIT_LIST_HEAD(&file->f_ep_links);
+	spin_lock_init(&file->f_ep_lock);
+}
+
 
 /* Used to release the epoll bits inside the "struct file" */
 void eventpoll_release_file(struct file *file);
@@ -85,7 +90,6 @@ static inline void eventpoll_release(struct file *file)
 	eventpoll_release_file(file);
 }
 
-
 #else
 
 static inline void eventpoll_init_file(struct file *file) {}
-- 
cgit v1.2.3


From 394e3902c55e667945f6f1c2bdbc59842cce70f7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:01:05 -0800
Subject: [PATCH] more for_each_cpu() conversions

When we stop allocating percpu memory for not-possible CPUs we must not touch
the percpu data for not-possible CPUs at all.  The correct way of doing this
is to test cpu_possible() or to use for_each_cpu().

This patch is a kernel-wide sweep of all instances of NR_CPUS.  I found very
few instances of this bug, if any.  But the patch converts lots of open-coded
test to use the preferred helper macros.

Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Acked-by: Kyle McMartin <kyle@parisc-linux.org>
Cc: Anton Blanchard <anton@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Christian Zankel <chris@zankel.net>
Cc: Philippe Elie <phil.el@wanadoo.fr>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Jens Axboe <axboe@suse.de>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/kernel/irq.c                     | 10 ++++------
 arch/frv/kernel/irq.c                      | 10 ++++------
 arch/i386/kernel/cpu/cpufreq/powernow-k8.c |  4 +---
 arch/i386/kernel/io_apic.c                 | 22 +++++++++-------------
 arch/i386/kernel/nmi.c                     |  4 ++--
 arch/i386/oprofile/nmi_int.c               |  7 ++-----
 arch/m32r/kernel/irq.c                     | 10 ++++------
 arch/mips/kernel/irq.c                     | 10 ++++------
 arch/mips/kernel/smp.c                     |  4 ++--
 arch/mips/sgi-ip27/ip27-irq.c              |  5 +----
 arch/parisc/kernel/smp.c                   | 25 ++++++++++---------------
 arch/powerpc/kernel/irq.c                  |  5 ++---
 arch/powerpc/kernel/setup-common.c         |  5 ++---
 arch/powerpc/kernel/setup_32.c             |  5 ++---
 arch/powerpc/platforms/powermac/smp.c      |  4 +---
 arch/ppc/kernel/setup.c                    | 10 ++++------
 arch/s390/kernel/smp.c                     |  4 +---
 arch/sh/kernel/irq.c                       |  5 ++---
 arch/sh/kernel/setup.c                     |  5 ++---
 arch/sh64/kernel/irq.c                     |  5 ++---
 arch/sparc/kernel/irq.c                    |  5 ++---
 arch/sparc/kernel/smp.c                    | 24 ++++++++++--------------
 arch/sparc/kernel/sun4d_irq.c              |  8 +++-----
 arch/sparc/kernel/sun4d_smp.c              |  8 +++-----
 arch/sparc/kernel/sun4m_smp.c              |  6 ++----
 arch/sparc64/kernel/irq.c                  |  4 +---
 arch/sparc64/kernel/smp.c                  | 30 ++++++++++++------------------
 arch/x86_64/kernel/irq.c                   | 21 ++++++++-------------
 arch/x86_64/kernel/nmi.c                   |  4 +---
 arch/xtensa/kernel/irq.c                   | 15 ++++++---------
 drivers/net/loopback.c                     |  4 +---
 drivers/oprofile/cpu_buffer.c              |  3 +--
 fs/xfs/linux-2.6/xfs_stats.c               |  7 ++-----
 fs/xfs/linux-2.6/xfs_sysctl.c              |  3 +--
 include/asm-alpha/mmu_context.h            |  5 ++---
 include/asm-alpha/topology.h               |  4 ++--
 include/asm-generic/percpu.h               |  7 +++----
 include/asm-powerpc/percpu.h               |  7 +++----
 include/asm-s390/percpu.h                  |  7 +++----
 include/asm-sparc64/percpu.h               |  7 +++----
 include/asm-x86_64/percpu.h                |  7 +++----
 include/linux/genhd.h                      | 14 ++++----------
 42 files changed, 137 insertions(+), 222 deletions(-)

(limited to 'include/linux')

diff --git a/arch/cris/kernel/irq.c b/arch/cris/kernel/irq.c
index 30deaf1b728a..b504def3e346 100644
--- a/arch/cris/kernel/irq.c
+++ b/arch/cris/kernel/irq.c
@@ -52,9 +52,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_printf(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
@@ -67,9 +66,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
 		seq_printf(p, " %14s", irq_desc[i].handler->typename);
 		seq_printf(p, "  %s", action->name);
diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c
index 27ab4c30aac6..11fa326a8f62 100644
--- a/arch/frv/kernel/irq.c
+++ b/arch/frv/kernel/irq.c
@@ -75,9 +75,8 @@ int show_interrupts(struct seq_file *p, void *v)
 	switch (i) {
 	case 0:
 		seq_printf(p, "           ");
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 
 		seq_putc(p, '\n');
 		break;
@@ -100,9 +99,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i - 1]);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i - 1]);
 #endif
 
 		level = group->sources[ix]->level - frv_irq_levels;
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index e11a09207ec8..3d5110b65cc3 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -1145,9 +1145,7 @@ static int __cpuinit powernowk8_init(void)
 {
 	unsigned int i, supported_cpus = 0;
 
-	for (i=0; i<NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	for_each_cpu(i) {
 		if (check_supported_cpu(i))
 			supported_cpus++;
 	}
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index fd1c60cfd294..311b4e7266f1 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -351,8 +351,8 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
 {
 	int i, j;
 	Dprintk("Rotating IRQs among CPUs.\n");
-	for (i = 0; i < NR_CPUS; i++) {
-		for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
+	for_each_online_cpu(i) {
+		for (j = 0; j < NR_IRQS; j++) {
 			if (!irq_desc[j].action)
 				continue;
 			/* Is it a significant load ?  */
@@ -381,7 +381,7 @@ static void do_irq_balance(void)
 	unsigned long imbalance = 0;
 	cpumask_t allowed_mask, target_cpu_mask, tmp;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		int package_index;
 		CPU_IRQ(i) = 0;
 		if (!cpu_online(i))
@@ -422,9 +422,7 @@ static void do_irq_balance(void)
 		}
 	}
 	/* Find the least loaded processor package */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	for_each_online_cpu(i) {
 		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (min_cpu_irq > CPU_IRQ(i)) {
@@ -441,9 +439,7 @@ tryanothercpu:
 	 */
 	tmp_cpu_irq = 0;
 	tmp_loaded = -1;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	for_each_online_cpu(i) {
 		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (max_cpu_irq <= CPU_IRQ(i)) 
@@ -619,9 +615,7 @@ static int __init balanced_irq_init(void)
 	if (smp_num_siblings > 1 && !cpus_empty(tmp))
 		physical_balance = 1;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	for_each_online_cpu(i) {
 		irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
 		irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
 		if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
@@ -638,9 +632,11 @@ static int __init balanced_irq_init(void)
 	else 
 		printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
 failed:
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		kfree(irq_cpu_data[i].irq_delta);
+		irq_cpu_data[i].irq_delta = NULL;
 		kfree(irq_cpu_data[i].last_irq);
+		irq_cpu_data[i].last_irq = NULL;
 	}
 	return 0;
 }
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 1db34effdd8d..9074818b9473 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -143,7 +143,7 @@ static int __init check_nmi_watchdog(void)
 	local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_cpu(cpu) {
 #ifdef CONFIG_SMP
 		/* Check cpu_callin_map here because that is set
 		   after the timer is started. */
@@ -510,7 +510,7 @@ void touch_nmi_watchdog (void)
 	 * Just reset the alert counters, (other CPUs might be
 	 * spinning on locks we hold):
 	 */
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		alert_counter[i] = 0;
 
 	/*
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 0493e8b8ec49..1accce50c2c7 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -122,7 +122,7 @@ static void nmi_save_registers(void * dummy)
 static void free_msrs(void)
 {
 	int i;
-	for (i = 0; i < NR_CPUS; ++i) {
+	for_each_cpu(i) {
 		kfree(cpu_msrs[i].counters);
 		cpu_msrs[i].counters = NULL;
 		kfree(cpu_msrs[i].controls);
@@ -138,10 +138,7 @@ static int allocate_msrs(void)
 	size_t counters_size = sizeof(struct op_msr) * model->num_counters;
 
 	int i;
-	for (i = 0; i < NR_CPUS; ++i) {
-		if (!cpu_online(i))
-			continue;
-
+	for_each_online_cpu(i) {
 		cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL);
 		if (!cpu_msrs[i].counters) {
 			success = 0;
diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c
index 1ce63926a3c0..a4634b06f675 100644
--- a/arch/m32r/kernel/irq.c
+++ b/arch/m32r/kernel/irq.c
@@ -37,9 +37,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_printf(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
@@ -52,9 +51,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
 		seq_printf(p, " %14s", irq_desc[i].handler->typename);
 		seq_printf(p, "  %s", action->name);
diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c
index 7d93992e462c..3dd76b3d2967 100644
--- a/arch/mips/kernel/irq.c
+++ b/arch/mips/kernel/irq.c
@@ -68,9 +68,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_printf(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
@@ -83,9 +82,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
 		seq_printf(p, " %14s", irq_desc[i].handler->typename);
 		seq_printf(p, "  %s", action->name);
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 06ed90752424..78d171bfa331 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -167,8 +167,8 @@ int smp_call_function (void (*func) (void *info), void *info, int retry,
 	mb();
 
 	/* Send a message to all other CPUs and wait for them to respond */
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_online(i) && i != cpu)
+	for_each_online_cpu(i)
+		if (i != cpu)
 			core_send_ipi(i, SMP_CALL_FUNCTION);
 
 	/* Wait for response */
diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c
index 73e5e52781d8..2854ac4c9be1 100644
--- a/arch/mips/sgi-ip27/ip27-irq.c
+++ b/arch/mips/sgi-ip27/ip27-irq.c
@@ -88,12 +88,9 @@ static inline int find_level(cpuid_t *cpunum, int irq)
 {
 	int cpu, i;
 
-	for (cpu = 0; cpu <= NR_CPUS; cpu++) {
+	for_each_online_cpu(cpu) {
 		struct slice_data *si = cpu_data[cpu].data;
 
-		if (!cpu_online(cpu))
-			continue;
-
 		for (i = BASE_PCI_IRQ; i < LEVELS_PER_SLICE; i++)
 			if (si->level_to_irq[i] == irq) {
 				*cpunum = cpu;
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 25564b7ca6bb..d6ac1c60a471 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -298,8 +298,8 @@ send_IPI_allbutself(enum ipi_message_type op)
 {
 	int i;
 	
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i) && i != smp_processor_id())
+	for_each_online_cpu(i) {
+		if (i != smp_processor_id())
 			send_IPI_single(i, op);
 	}
 }
@@ -643,14 +643,13 @@ int sys_cpus(int argc, char **argv)
 	if ( argc == 1 ){
 	
 #ifdef DUMP_MORE_STATE
-		for(i=0; i<NR_CPUS; i++) {
+		for_each_online_cpu(i) {
 			int cpus_per_line = 4;
-			if(cpu_online(i)) {
-				if (j++ % cpus_per_line)
-					printk(" %3d",i);
-				else
-					printk("\n %3d",i);
-			}
+
+			if (j++ % cpus_per_line)
+				printk(" %3d",i);
+			else
+				printk("\n %3d",i);
 		}
 		printk("\n"); 
 #else
@@ -659,9 +658,7 @@ int sys_cpus(int argc, char **argv)
 	} else if((argc==2) && !(strcmp(argv[1],"-l"))) {
 		printk("\nCPUSTATE  TASK CPUNUM CPUID HARDCPU(HPA)\n");
 #ifdef DUMP_MORE_STATE
-		for(i=0;i<NR_CPUS;i++) {
-			if (!cpu_online(i))
-				continue;
+		for_each_online_cpu(i) {
 			if (cpu_data[i].cpuid != NO_PROC_ID) {
 				switch(cpu_data[i].state) {
 					case STATE_RENDEZVOUS:
@@ -695,9 +692,7 @@ int sys_cpus(int argc, char **argv)
 	} else if ((argc==2) && !(strcmp(argv[1],"-s"))) { 
 #ifdef DUMP_MORE_STATE
      		printk("\nCPUSTATE   CPUID\n");
-		for (i=0;i<NR_CPUS;i++) {
-			if (!cpu_online(i))
-				continue;
+		for_each_online_cpu(i) {
 			if (cpu_data[i].cpuid != NO_PROC_ID) {
 				switch(cpu_data[i].state) {
 					case STATE_RENDEZVOUS:
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 24dc8117b822..771a59cbd213 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,9 +135,8 @@ skip:
 #ifdef CONFIG_TAU_INT
 		if (tau_initialized){
 			seq_puts(p, "TAU: ");
-			for (j = 0; j < NR_CPUS; j++)
-				if (cpu_online(j))
-					seq_printf(p, "%10u ", tau_interrupts(j));
+			for_each_online_cpu(j)
+				seq_printf(p, "%10u ", tau_interrupts(j));
 			seq_puts(p, "  PowerPC             Thermal Assist (cpu temp)\n");
 		}
 #endif
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index be12041c0fc5..c1d62bf11f29 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -162,9 +162,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 #if defined(CONFIG_SMP) && defined(CONFIG_PPC32)
 		unsigned long bogosum = 0;
 		int i;
-		for (i = 0; i < NR_CPUS; ++i)
-			if (cpu_online(i))
-				bogosum += loops_per_jiffy;
+		for_each_online_cpu(i)
+			bogosum += loops_per_jiffy;
 		seq_printf(m, "total bogomips\t: %lu.%02lu\n",
 			   bogosum/(500000/HZ), bogosum/(5000/HZ) % 100);
 #endif /* CONFIG_SMP && CONFIG_PPC32 */
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index db72a92943bf..dc2770df25b3 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -272,9 +272,8 @@ int __init ppc_init(void)
 	if ( ppc_md.progress ) ppc_md.progress("             ", 0xffff);
 
 	/* register CPU devices */
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_possible(i))
-			register_cpu(&cpu_devices[i], i, NULL);
+	for_each_cpu(i)
+		register_cpu(&cpu_devices[i], i, NULL);
 
 	/* call platform init */
 	if (ppc_md.init != NULL) {
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 6d64a9bf3474..1065d87fc279 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -191,9 +191,7 @@ static void smp_psurge_message_pass(int target, int msg)
 	if (num_online_cpus() < 2)
 		return;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	for_each_online_cpu(i) {
 		if (target == MSG_ALL
 		    || (target == MSG_ALL_BUT_SELF && i != smp_processor_id())
 		    || target == i) {
diff --git a/arch/ppc/kernel/setup.c b/arch/ppc/kernel/setup.c
index c08ab432e958..53e9deacee82 100644
--- a/arch/ppc/kernel/setup.c
+++ b/arch/ppc/kernel/setup.c
@@ -168,9 +168,8 @@ int show_cpuinfo(struct seq_file *m, void *v)
 		/* Show summary information */
 #ifdef CONFIG_SMP
 		unsigned long bogosum = 0;
-		for (i = 0; i < NR_CPUS; ++i)
-			if (cpu_online(i))
-				bogosum += cpu_data[i].loops_per_jiffy;
+		for_each_online_cpu(i)
+			bogosum += cpu_data[i].loops_per_jiffy;
 		seq_printf(m, "total bogomips\t: %lu.%02lu\n",
 			   bogosum/(500000/HZ), bogosum/(5000/HZ) % 100);
 #endif /* CONFIG_SMP */
@@ -712,9 +711,8 @@ int __init ppc_init(void)
 	if ( ppc_md.progress ) ppc_md.progress("             ", 0xffff);
 
 	/* register CPU devices */
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_possible(i))
-			register_cpu(&cpu_devices[i], i, NULL);
+	for_each_cpu(i)
+		register_cpu(&cpu_devices[i], i, NULL);
 
 	/* call platform init */
 	if (ppc_md.init != NULL) {
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 7dbe00c76c6b..d52d6d211d9f 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -799,9 +799,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
          */
 	print_cpu_info(&S390_lowcore.cpu_data);
 
-        for(i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
+        for_each_cpu(i) {
 		lowcore_ptr[i] = (struct _lowcore *)
 			__get_free_pages(GFP_KERNEL|GFP_DMA, 
 					sizeof(void*) == 8 ? 1 : 0);
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 6883c00728cb..b56e79632f24 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -35,9 +35,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_puts(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index a067a34e0b64..c0e79843f580 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -404,9 +404,8 @@ static int __init topology_init(void)
 {
 	int cpu_id;
 
-	for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++)
-		if (cpu_possible(cpu_id))
-			register_cpu(&cpu[cpu_id], cpu_id, NULL);
+	for_each_cpu(cpu_id)
+		register_cpu(&cpu[cpu_id], cpu_id, NULL);
 
 	return 0;
 }
diff --git a/arch/sh64/kernel/irq.c b/arch/sh64/kernel/irq.c
index 9fc2b71dbd84..d69879c0e063 100644
--- a/arch/sh64/kernel/irq.c
+++ b/arch/sh64/kernel/irq.c
@@ -53,9 +53,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_puts(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
diff --git a/arch/sparc/kernel/irq.c b/arch/sparc/kernel/irq.c
index 410b9a72aba9..4c60a6ef54a9 100644
--- a/arch/sparc/kernel/irq.c
+++ b/arch/sparc/kernel/irq.c
@@ -184,9 +184,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++) {
-			if (cpu_online(j))
-				seq_printf(p, "%10u ",
+		for_each_online_cpu(j) {
+			seq_printf(p, "%10u ",
 				    kstat_cpu(cpu_logical_map(j)).irqs[i]);
 		}
 #endif
diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c
index c6e721d8f477..ea5682ce7031 100644
--- a/arch/sparc/kernel/smp.c
+++ b/arch/sparc/kernel/smp.c
@@ -243,9 +243,8 @@ int setup_profiling_timer(unsigned int multiplier)
 		return -EINVAL;
 
 	spin_lock_irqsave(&prof_setup_lock, flags);
-	for(i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i))
-			load_profile_irq(i, lvl14_resolution / multiplier);
+	for_each_cpu(i) {
+		load_profile_irq(i, lvl14_resolution / multiplier);
 		prof_multiplier(i) = multiplier;
 	}
 	spin_unlock_irqrestore(&prof_setup_lock, flags);
@@ -273,13 +272,12 @@ void smp_bogo(struct seq_file *m)
 {
 	int i;
 	
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i))
-			seq_printf(m,
-				   "Cpu%dBogo\t: %lu.%02lu\n", 
-				   i,
-				   cpu_data(i).udelay_val/(500000/HZ),
-				   (cpu_data(i).udelay_val/(5000/HZ))%100);
+	for_each_online_cpu(i) {
+		seq_printf(m,
+			   "Cpu%dBogo\t: %lu.%02lu\n",
+			   i,
+			   cpu_data(i).udelay_val/(500000/HZ),
+			   (cpu_data(i).udelay_val/(5000/HZ))%100);
 	}
 }
 
@@ -288,8 +286,6 @@ void smp_info(struct seq_file *m)
 	int i;
 
 	seq_printf(m, "State:\n");
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i))
-			seq_printf(m, "CPU%d\t\t: online\n", i);
-	}
+	for_each_online_cpu(i)
+		seq_printf(m, "CPU%d\t\t: online\n", i);
 }
diff --git a/arch/sparc/kernel/sun4d_irq.c b/arch/sparc/kernel/sun4d_irq.c
index 52621348a56c..cea7fc6fc6e5 100644
--- a/arch/sparc/kernel/sun4d_irq.c
+++ b/arch/sparc/kernel/sun4d_irq.c
@@ -103,11 +103,9 @@ found_it:	seq_printf(p, "%3d: ", i);
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (x = 0; x < NR_CPUS; x++) {
-			if (cpu_online(x))
-				seq_printf(p, "%10u ",
-				       kstat_cpu(cpu_logical_map(x)).irqs[i]);
-		}
+		for_each_online_cpu(x)
+			seq_printf(p, "%10u ",
+			       kstat_cpu(cpu_logical_map(x)).irqs[i]);
 #endif
 		seq_printf(p, "%c %s",
 			(action->flags & SA_INTERRUPT) ? '+' : ' ',
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 4219dd2ce3a2..41bb9596be48 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -249,11 +249,9 @@ void __init smp4d_boot_cpus(void)
 	} else {
 		unsigned long bogosum = 0;
 		
-		for(i = 0; i < NR_CPUS; i++) {
-			if (cpu_isset(i, cpu_present_map)) {
-				bogosum += cpu_data(i).udelay_val;
-				smp_highest_cpu = i;
-			}
+		for_each_present_cpu(i) {
+			bogosum += cpu_data(i).udelay_val;
+			smp_highest_cpu = i;
 		}
 		SMP_PRINTK(("Total of %d Processors activated (%lu.%02lu BogoMIPS).\n", cpucount + 1, bogosum/(500000/HZ), (bogosum/(5000/HZ))%100));
 		printk("Total of %d Processors activated (%lu.%02lu BogoMIPS).\n",
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index fbbd8a474c4c..1dde312eebda 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -218,10 +218,8 @@ void __init smp4m_boot_cpus(void)
 		cpu_present_map = cpumask_of_cpu(smp_processor_id());
 	} else {
 		unsigned long bogosum = 0;
-		for(i = 0; i < NR_CPUS; i++) {
-			if (cpu_isset(i, cpu_present_map))
-				bogosum += cpu_data(i).udelay_val;
-		}
+		for_each_present_cpu(i)
+			bogosum += cpu_data(i).udelay_val;
 		printk("Total of %d Processors activated (%lu.%02lu BogoMIPS).\n",
 		       cpucount + 1,
 		       bogosum/(500000/HZ),
diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index 8c93ba655b33..e505a4125e35 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -117,9 +117,7 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++) {
-			if (!cpu_online(j))
-				continue;
+		for_each_online_cpu(j) {
 			seq_printf(p, "%10u ",
 				   kstat_cpu(j).irqs[i]);
 		}
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 373a701c90a5..1b6e2ade1008 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -57,25 +57,21 @@ void smp_info(struct seq_file *m)
 	int i;
 	
 	seq_printf(m, "State:\n");
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i))
-			seq_printf(m,
-				   "CPU%d:\t\tonline\n", i);
-	}
+	for_each_online_cpu(i)
+		seq_printf(m, "CPU%d:\t\tonline\n", i);
 }
 
 void smp_bogo(struct seq_file *m)
 {
 	int i;
 	
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_online(i))
-			seq_printf(m,
-				   "Cpu%dBogo\t: %lu.%02lu\n"
-				   "Cpu%dClkTck\t: %016lx\n",
-				   i, cpu_data(i).udelay_val / (500000/HZ),
-				   (cpu_data(i).udelay_val / (5000/HZ)) % 100,
-				   i, cpu_data(i).clock_tick);
+	for_each_online_cpu(i)
+		seq_printf(m,
+			   "Cpu%dBogo\t: %lu.%02lu\n"
+			   "Cpu%dClkTck\t: %016lx\n",
+			   i, cpu_data(i).udelay_val / (500000/HZ),
+			   (cpu_data(i).udelay_val / (5000/HZ)) % 100,
+			   i, cpu_data(i).clock_tick);
 }
 
 void __init smp_store_cpu_info(int id)
@@ -1282,7 +1278,7 @@ int setup_profiling_timer(unsigned int multiplier)
 		return -EINVAL;
 
 	spin_lock_irqsave(&prof_setup_lock, flags);
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		prof_multiplier(i) = multiplier;
 	current_tick_offset = (timer_tick_offset / multiplier);
 	spin_unlock_irqrestore(&prof_setup_lock, flags);
@@ -1384,10 +1380,8 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	unsigned long bogosum = 0;
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i))
-			bogosum += cpu_data(i).udelay_val;
-	}
+	for_each_online_cpu(i)
+		bogosum += cpu_data(i).udelay_val;
 	printk("Total of %ld processors activated "
 	       "(%lu.%02lu BogoMIPS).\n",
 	       (long) num_online_cpus(),
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 30d2a1e545fe..d8bd0b345b1e 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -38,9 +38,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_printf(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
@@ -53,10 +52,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-			seq_printf(p, "%10u ",
-				kstat_cpu(j).irqs[i]);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
 		seq_printf(p, " %14s", irq_desc[i].handler->typename);
 
@@ -68,15 +65,13 @@ skip:
 		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
 	} else if (i == NR_IRQS) {
 		seq_printf(p, "NMI: ");
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
 		seq_putc(p, '\n');
 #ifdef CONFIG_X86_LOCAL_APIC
 		seq_printf(p, "LOC: ");
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
 		seq_putc(p, '\n');
 #endif
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 5bf17e41cd2d..66c009e10bac 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -162,9 +162,7 @@ int __init check_nmi_watchdog (void)
 	local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		if (!cpu_online(cpu))
-			continue;
+	for_each_online_cpu(cpu) {
 		if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
 			endflag = 1;
 			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index 4cbf6d91571f..51f9bed455fa 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -83,9 +83,8 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (i == 0) {
 		seq_printf(p, "           ");
-		for (j=0; j<NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%d       ",j);
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%d       ",j);
 		seq_putc(p, '\n');
 	}
 
@@ -98,9 +97,8 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
 		seq_printf(p, " %14s", irq_desc[i].handler->typename);
 		seq_printf(p, "  %s", action->name);
@@ -113,9 +111,8 @@ skip:
 		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
 	} else if (i == NR_IRQS) {
 		seq_printf(p, "NMI: ");
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "%10u ", nmi_count(j));
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", nmi_count(j));
 		seq_putc(p, '\n');
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
 	}
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 690a1aae0b34..0c13795dca38 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -172,11 +172,9 @@ static struct net_device_stats *get_stats(struct net_device *dev)
 
 	memset(stats, 0, sizeof(struct net_device_stats));
 
-	for (i=0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct net_device_stats *lb_stats;
 
-		if (!cpu_possible(i)) 
-			continue;
 		lb_stats = &per_cpu(loopback_stats, i);
 		stats->rx_bytes   += lb_stats->rx_bytes;
 		stats->tx_bytes   += lb_stats->tx_bytes;
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 78193e4bbdb5..330d3869b41e 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -38,9 +38,8 @@ void free_cpu_buffers(void)
 {
 	int i;
  
-	for_each_online_cpu(i) {
+	for_each_online_cpu(i)
 		vfree(cpu_buffer[i].buffer);
-	}
 }
 
 int alloc_cpu_buffers(void)
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 8955720a2c6b..713e6a7505d0 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -62,18 +62,15 @@ xfs_read_xfsstats(
 		while (j < xstats[i].endpoint) {
 			val = 0;
 			/* sum over all cpus */
-			for (c = 0; c < NR_CPUS; c++) {
-				if (!cpu_possible(c)) continue;
+			for_each_cpu(c)
 				val += *(((__u32*)&per_cpu(xfsstats, c) + j));
-			}
 			len += sprintf(buffer + len, " %u", val);
 			j++;
 		}
 		buffer[len++] = '\n';
 	}
 	/* extra precision counters */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i)) continue;
+	for_each_cpu(i) {
 		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
 		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
 		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index a02564972420..7079cc837210 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -38,8 +38,7 @@ xfs_stats_clear_proc_handler(
 
 	if (!ret && write && *valp) {
 		printk("XFS Clearing xfsstats\n");
-		for (c = 0; c < NR_CPUS; c++) {
-			if (!cpu_possible(c)) continue;
+		for_each_cpu(c) {
 			preempt_disable();
 			/* save vn_active, it's a universal truth! */
 			vn_active = per_cpu(xfsstats, c).vn_active;
diff --git a/include/asm-alpha/mmu_context.h b/include/asm-alpha/mmu_context.h
index 6f92482cc96c..0c017fc181c1 100644
--- a/include/asm-alpha/mmu_context.h
+++ b/include/asm-alpha/mmu_context.h
@@ -231,9 +231,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_online(i))
-			mm->context[i] = 0;
+	for_each_online_cpu(i)
+		mm->context[i] = 0;
 	if (tsk != current)
 		task_thread_info(tsk)->pcb.ptbr
 		  = ((unsigned long)mm->pgd - IDENT_ADDR) >> PAGE_SHIFT;
diff --git a/include/asm-alpha/topology.h b/include/asm-alpha/topology.h
index eb740e280d9c..420ccde6b916 100644
--- a/include/asm-alpha/topology.h
+++ b/include/asm-alpha/topology.h
@@ -27,8 +27,8 @@ static inline cpumask_t node_to_cpumask(int node)
 	cpumask_t node_cpu_mask = CPU_MASK_NONE;
 	int cpu;
 
-	for(cpu = 0; cpu < NR_CPUS; cpu++) {
-		if (cpu_online(cpu) && (cpu_to_node(cpu) == node))
+	for_each_online_cpu(cpu) {
+		if (cpu_to_node(cpu) == node)
 			cpu_set(cpu, node_cpu_mask);
 	}
 
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 9044aeb37828..78cf45547e31 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -19,10 +19,9 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for (__i = 0; __i < NR_CPUS; __i++)			\
-		if (cpu_possible(__i))				\
-			memcpy((pcpudst)+__per_cpu_offset[__i],	\
-			       (src), (size));			\
+	for_each_cpu(__i)					\
+		memcpy((pcpudst)+__per_cpu_offset[__i],		\
+		       (src), (size));				\
 } while (0)
 #else /* ! SMP */
 
diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h
index e31922c50e53..464301cd0d03 100644
--- a/include/asm-powerpc/percpu.h
+++ b/include/asm-powerpc/percpu.h
@@ -27,10 +27,9 @@
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for (__i = 0; __i < NR_CPUS; __i++)			\
-		if (cpu_possible(__i))				\
-			memcpy((pcpudst)+__per_cpu_offset(__i),	\
-			       (src), (size));			\
+	for_each_cpu(__i)					\
+		memcpy((pcpudst)+__per_cpu_offset(__i),		\
+		       (src), (size));				\
 } while (0)
 
 extern void setup_per_cpu_areas(void);
diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h
index 123fcaca295e..e10ed87094f0 100644
--- a/include/asm-s390/percpu.h
+++ b/include/asm-s390/percpu.h
@@ -46,10 +46,9 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for (__i = 0; __i < NR_CPUS; __i++)			\
-		if (cpu_possible(__i))				\
-			memcpy((pcpudst)+__per_cpu_offset[__i],	\
-			       (src), (size));			\
+	for_each_cpu(__i)					\
+		memcpy((pcpudst)+__per_cpu_offset[__i],		\
+		       (src), (size));				\
 } while (0)
 
 #else /* ! SMP */
diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h
index aea4e51e7cd1..82032e159a76 100644
--- a/include/asm-sparc64/percpu.h
+++ b/include/asm-sparc64/percpu.h
@@ -26,10 +26,9 @@ register unsigned long __local_per_cpu_offset asm("g5");
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for (__i = 0; __i < NR_CPUS; __i++)			\
-		if (cpu_possible(__i))				\
-			memcpy((pcpudst)+__per_cpu_offset(__i),	\
-			       (src), (size));			\
+	for_each_cpu(__i)					\
+		memcpy((pcpudst)+__per_cpu_offset(__i),		\
+		       (src), (size));				\
 } while (0)
 #else /* ! SMP */
 
diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h
index 29a6b0408f75..4405b4adeaba 100644
--- a/include/asm-x86_64/percpu.h
+++ b/include/asm-x86_64/percpu.h
@@ -26,10 +26,9 @@
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for (__i = 0; __i < NR_CPUS; __i++)			\
-		if (cpu_possible(__i))				\
-			memcpy((pcpudst)+__per_cpu_offset(__i),	\
-			       (src), (size));			\
+	for_each_cpu(__i)					\
+		memcpy((pcpudst)+__per_cpu_offset(__i),		\
+		       (src), (size));				\
 } while (0)
 
 extern void setup_per_cpu_areas(void);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index eef5ccdcd731..fd647fde5ec1 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -149,22 +149,16 @@ struct disk_attribute {
 ({									\
 	typeof(gendiskp->dkstats->field) res = 0;			\
 	int i;								\
-	for (i=0; i < NR_CPUS; i++) {					\
-		if (!cpu_possible(i))					\
-			continue;					\
+	for_each_cpu(i)							\
 		res += per_cpu_ptr(gendiskp->dkstats, i)->field;	\
-	}								\
 	res;								\
 })
 
 static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)	{
 	int i;
-	for (i=0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(gendiskp->dkstats, i), value,	
-					sizeof (struct disk_stats));
-		}
-	}
+	for_each_cpu(i)
+		memset(per_cpu_ptr(gendiskp->dkstats, i), value,
+				sizeof (struct disk_stats));
 }		
 				
 #else
-- 
cgit v1.2.3


From b86ff981a8252d83d6a7719ae09f3a05307e3592 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 19:56:55 +0100
Subject: [PATCH] relay: migrate from relayfs to a generic relay API

Original patch from Paul Mundt, sysfs parts removed by me since they
were broken.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/Kconfig            |  12 -
 fs/Makefile           |   1 -
 fs/relayfs/Makefile   |   4 -
 fs/relayfs/buffers.c  | 190 -----------
 fs/relayfs/buffers.h  |  12 -
 fs/relayfs/inode.c    | 581 -------------------------------
 fs/relayfs/relay.c    | 482 --------------------------
 fs/relayfs/relay.h    |   8 -
 include/linux/relay.h | 281 +++++++++++++++
 init/Kconfig          |  11 +
 kernel/Makefile       |   1 +
 kernel/relay.c        | 919 ++++++++++++++++++++++++++++++++++++++++++++++++++
 12 files changed, 1212 insertions(+), 1290 deletions(-)
 delete mode 100644 fs/relayfs/Makefile
 delete mode 100644 fs/relayfs/buffers.c
 delete mode 100644 fs/relayfs/buffers.h
 delete mode 100644 fs/relayfs/inode.c
 delete mode 100644 fs/relayfs/relay.c
 delete mode 100644 fs/relayfs/relay.h
 create mode 100644 include/linux/relay.h
 create mode 100644 kernel/relay.c

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index e9749b0eecd8..c8d0a209120c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -859,18 +859,6 @@ config RAMFS
 	  To compile this as a module, choose M here: the module will be called
 	  ramfs.
 
-config RELAYFS_FS
-	tristate "Relayfs file system support"
-	---help---
-	  Relayfs is a high-speed data relay filesystem designed to provide
-	  an efficient mechanism for tools and facilities to relay large
-	  amounts of data from kernel space to user space.
-
-	  To compile this code as a module, choose M here: the module will be
-	  called relayfs.
-
-	  If unsure, say N.
-
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 1db711319c80..080b3867be4d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -91,7 +91,6 @@ obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_UDF_FS)		+= udf/
-obj-$(CONFIG_RELAYFS_FS)	+= relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
deleted file mode 100644
index e76e182cdb38..000000000000
--- a/fs/relayfs/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-obj-$(CONFIG_RELAYFS_FS) += relayfs.o
-
-relayfs-y := relay.o inode.o buffers.o
-
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
deleted file mode 100644
index 10187812771e..000000000000
--- a/fs/relayfs/buffers.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * RelayFS buffer management code.
- *
- * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
- * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
- *
- * This file is released under the GPL.
- */
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-/*
- * close() vm_op implementation for relayfs file mapping.
- */
-static void relay_file_mmap_close(struct vm_area_struct *vma)
-{
-	struct rchan_buf *buf = vma->vm_private_data;
-	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
-}
-
-/*
- * nopage() vm_op implementation for relayfs file mapping.
- */
-static struct page *relay_buf_nopage(struct vm_area_struct *vma,
-				     unsigned long address,
-				     int *type)
-{
-	struct page *page;
-	struct rchan_buf *buf = vma->vm_private_data;
-	unsigned long offset = address - vma->vm_start;
-
-	if (address > vma->vm_end)
-		return NOPAGE_SIGBUS; /* Disallow mremap */
-	if (!buf)
-		return NOPAGE_OOM;
-
-	page = vmalloc_to_page(buf->start + offset);
-	if (!page)
-		return NOPAGE_OOM;
-	get_page(page);
-
-	if (type)
-		*type = VM_FAULT_MINOR;
-
-	return page;
-}
-
-/*
- * vm_ops for relay file mappings.
- */
-static struct vm_operations_struct relay_file_mmap_ops = {
-	.nopage = relay_buf_nopage,
-	.close = relay_file_mmap_close,
-};
-
-/**
- *	relay_mmap_buf: - mmap channel buffer to process address space
- *	@buf: relay channel buffer
- *	@vma: vm_area_struct describing memory to be mapped
- *
- *	Returns 0 if ok, negative on error
- *
- *	Caller should already have grabbed mmap_sem.
- */
-int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
-{
-	unsigned long length = vma->vm_end - vma->vm_start;
-	struct file *filp = vma->vm_file;
-
-	if (!buf)
-		return -EBADF;
-
-	if (length != (unsigned long)buf->chan->alloc_size)
-		return -EINVAL;
-
-	vma->vm_ops = &relay_file_mmap_ops;
-	vma->vm_private_data = buf;
-	buf->chan->cb->buf_mapped(buf, filp);
-
-	return 0;
-}
-
-/**
- *	relay_alloc_buf - allocate a channel buffer
- *	@buf: the buffer struct
- *	@size: total size of the buffer
- *
- *	Returns a pointer to the resulting buffer, NULL if unsuccessful
- */
-static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
-{
-	void *mem;
-	unsigned int i, j, n_pages;
-
-	size = PAGE_ALIGN(size);
-	n_pages = size >> PAGE_SHIFT;
-
-	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
-	if (!buf->page_array)
-		return NULL;
-
-	for (i = 0; i < n_pages; i++) {
-		buf->page_array[i] = alloc_page(GFP_KERNEL);
-		if (unlikely(!buf->page_array[i]))
-			goto depopulate;
-	}
-	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
-	if (!mem)
-		goto depopulate;
-
-	memset(mem, 0, size);
-	buf->page_count = n_pages;
-	return mem;
-
-depopulate:
-	for (j = 0; j < i; j++)
-		__free_page(buf->page_array[j]);
-	kfree(buf->page_array);
-	return NULL;
-}
-
-/**
- *	relay_create_buf - allocate and initialize a channel buffer
- *	@alloc_size: size of the buffer to allocate
- *	@n_subbufs: number of sub-buffers in the channel
- *
- *	Returns channel buffer if successful, NULL otherwise
- */
-struct rchan_buf *relay_create_buf(struct rchan *chan)
-{
-	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
-	if (!buf)
-		return NULL;
-
-	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
-	if (!buf->padding)
-		goto free_buf;
-
-	buf->start = relay_alloc_buf(buf, chan->alloc_size);
-	if (!buf->start)
-		goto free_buf;
-
-	buf->chan = chan;
-	kref_get(&buf->chan->kref);
-	return buf;
-
-free_buf:
-	kfree(buf->padding);
-	kfree(buf);
-	return NULL;
-}
-
-/**
- *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
- *	@buf: the buffer struct
- */
-void relay_destroy_buf(struct rchan_buf *buf)
-{
-	struct rchan *chan = buf->chan;
-	unsigned int i;
-
-	if (likely(buf->start)) {
-		vunmap(buf->start);
-		for (i = 0; i < buf->page_count; i++)
-			__free_page(buf->page_array[i]);
-		kfree(buf->page_array);
-	}
-	kfree(buf->padding);
-	kfree(buf);
-	kref_put(&chan->kref, relay_destroy_channel);
-}
-
-/**
- *	relay_remove_buf - remove a channel buffer
- *
- *	Removes the file from the relayfs fileystem, which also frees the
- *	rchan_buf_struct and the channel buffer.  Should only be called from
- *	kref_put().
- */
-void relay_remove_buf(struct kref *kref)
-{
-	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-	buf->chan->cb->remove_buf_file(buf->dentry);
-	relay_destroy_buf(buf);
-}
diff --git a/fs/relayfs/buffers.h b/fs/relayfs/buffers.h
deleted file mode 100644
index 37a12493f641..000000000000
--- a/fs/relayfs/buffers.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _BUFFERS_H
-#define _BUFFERS_H
-
-/* This inspired by rtai/shmem */
-#define FIX_SIZE(x) (((x) - 1) & PAGE_MASK) + PAGE_SIZE
-
-extern int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma);
-extern struct rchan_buf *relay_create_buf(struct rchan *chan);
-extern void relay_destroy_buf(struct rchan_buf *buf);
-extern void relay_remove_buf(struct kref *kref);
-
-#endif/* _BUFFERS_H */
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
deleted file mode 100644
index 383523011aad..000000000000
--- a/fs/relayfs/inode.c
+++ /dev/null
@@ -1,581 +0,0 @@
-/*
- * VFS-related code for RelayFS, a high-speed data relay filesystem.
- *
- * Copyright (C) 2003-2005 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
- * Copyright (C) 2003-2005 - Karim Yaghmour <karim@opersys.com>
- *
- * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
- *
- * This file is released under the GPL.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/backing-dev.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-#define RELAYFS_MAGIC			0xF0B4A981
-
-static struct vfsmount *		relayfs_mount;
-static int				relayfs_mount_count;
-
-static struct backing_dev_info		relayfs_backing_dev_info = {
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
-};
-
-static struct inode *relayfs_get_inode(struct super_block *sb,
-				       int mode,
- 				       struct file_operations *fops,
-				       void *data)
-{
-	struct inode *inode;
-
-	inode = new_inode(sb);
-	if (!inode)
-		return NULL;
-
-	inode->i_mode = mode;
-	inode->i_uid = 0;
-	inode->i_gid = 0;
-	inode->i_blksize = PAGE_CACHE_SIZE;
-	inode->i_blocks = 0;
-	inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-		inode->i_fop = fops;
-		if (data)
-			inode->u.generic_ip = data;
-		break;
-	case S_IFDIR:
-		inode->i_op = &simple_dir_inode_operations;
-		inode->i_fop = &simple_dir_operations;
-
-		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
-		break;
-	default:
-		break;
-	}
-
-	return inode;
-}
-
-/**
- *	relayfs_create_entry - create a relayfs directory or file
- *	@name: the name of the file to create
- *	@parent: parent directory
- *	@mode: mode
- *	@fops: file operations to use for the file
- *	@data: user-associated data for this file
- *
- *	Returns the new dentry, NULL on failure
- *
- *	Creates a file or directory with the specifed permissions.
- */
-static struct dentry *relayfs_create_entry(const char *name,
-					   struct dentry *parent,
-					   int mode,
-					   struct file_operations *fops,
-					   void *data)
-{
-	struct dentry *d;
-	struct inode *inode;
-	int error = 0;
-
-	BUG_ON(!name || !(S_ISREG(mode) || S_ISDIR(mode)));
-
-	error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
-	if (error) {
-		printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
-		return NULL;
-	}
-
-	if (!parent && relayfs_mount && relayfs_mount->mnt_sb)
-		parent = relayfs_mount->mnt_sb->s_root;
-
-	if (!parent) {
-		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-		return NULL;
-	}
-
-	parent = dget(parent);
-	mutex_lock(&parent->d_inode->i_mutex);
-	d = lookup_one_len(name, parent, strlen(name));
-	if (IS_ERR(d)) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	if (d->d_inode) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data);
-	if (!inode) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	d_instantiate(d, inode);
-	dget(d);	/* Extra count - pin the dentry in core */
-
-	if (S_ISDIR(mode))
-		parent->d_inode->i_nlink++;
-
-	goto exit;
-
-release_mount:
-	simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-
-exit:
-	mutex_unlock(&parent->d_inode->i_mutex);
-	dput(parent);
-	return d;
-}
-
-/**
- *	relayfs_create_file - create a file in the relay filesystem
- *	@name: the name of the file to create
- *	@parent: parent directory
- *	@mode: mode, if not specied the default perms are used
- *	@fops: file operations to use for the file
- *	@data: user-associated data for this file
- *
- *	Returns file dentry if successful, NULL otherwise.
- *
- *	The file will be created user r on behalf of current user.
- */
-struct dentry *relayfs_create_file(const char *name,
-				   struct dentry *parent,
-				   int mode,
-				   struct file_operations *fops,
-				   void *data)
-{
-	BUG_ON(!fops);
-
-	if (!mode)
-		mode = S_IRUSR;
-	mode = (mode & S_IALLUGO) | S_IFREG;
-
-	return relayfs_create_entry(name, parent, mode, fops, data);
-}
-
-/**
- *	relayfs_create_dir - create a directory in the relay filesystem
- *	@name: the name of the directory to create
- *	@parent: parent directory, NULL if parent should be fs root
- *
- *	Returns directory dentry if successful, NULL otherwise.
- *
- *	The directory will be created world rwx on behalf of current user.
- */
-struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
-{
-	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	return relayfs_create_entry(name, parent, mode, NULL, NULL);
-}
-
-/**
- *	relayfs_remove - remove a file or directory in the relay filesystem
- *	@dentry: file or directory dentry
- *
- *	Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove(struct dentry *dentry)
-{
-	struct dentry *parent;
-	int error = 0;
-
-	if (!dentry)
-		return -EINVAL;
-	parent = dentry->d_parent;
-	if (!parent)
-		return -EINVAL;
-
-	parent = dget(parent);
-	mutex_lock(&parent->d_inode->i_mutex);
-	if (dentry->d_inode) {
-		if (S_ISDIR(dentry->d_inode->i_mode))
-			error = simple_rmdir(parent->d_inode, dentry);
-		else
-			error = simple_unlink(parent->d_inode, dentry);
-		if (!error)
-			d_delete(dentry);
-	}
-	if (!error)
-		dput(dentry);
-	mutex_unlock(&parent->d_inode->i_mutex);
-	dput(parent);
-
-	if (!error)
-		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-
-	return error;
-}
-
-/**
- *	relayfs_remove_file - remove a file from relay filesystem
- *	@dentry: directory dentry
- *
- *	Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove_file(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/**
- *	relayfs_remove_dir - remove a directory in the relay filesystem
- *	@dentry: directory dentry
- *
- *	Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove_dir(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/**
- *	relay_file_open - open file op for relay files
- *	@inode: the inode
- *	@filp: the file
- *
- *	Increments the channel buffer refcount.
- */
-static int relay_file_open(struct inode *inode, struct file *filp)
-{
-	struct rchan_buf *buf = inode->u.generic_ip;
-	kref_get(&buf->kref);
-	filp->private_data = buf;
-
-	return 0;
-}
-
-/**
- *	relay_file_mmap - mmap file op for relay files
- *	@filp: the file
- *	@vma: the vma describing what to map
- *
- *	Calls upon relay_mmap_buf to map the file into user space.
- */
-static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-	struct rchan_buf *buf = filp->private_data;
-	return relay_mmap_buf(buf, vma);
-}
-
-/**
- *	relay_file_poll - poll file op for relay files
- *	@filp: the file
- *	@wait: poll table
- *
- *	Poll implemention.
- */
-static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
-{
-	unsigned int mask = 0;
-	struct rchan_buf *buf = filp->private_data;
-
-	if (buf->finalized)
-		return POLLERR;
-
-	if (filp->f_mode & FMODE_READ) {
-		poll_wait(filp, &buf->read_wait, wait);
-		if (!relay_buf_empty(buf))
-			mask |= POLLIN | POLLRDNORM;
-	}
-
-	return mask;
-}
-
-/**
- *	relay_file_release - release file op for relay files
- *	@inode: the inode
- *	@filp: the file
- *
- *	Decrements the channel refcount, as the filesystem is
- *	no longer using it.
- */
-static int relay_file_release(struct inode *inode, struct file *filp)
-{
-	struct rchan_buf *buf = filp->private_data;
-	kref_put(&buf->kref, relay_remove_buf);
-
-	return 0;
-}
-
-/**
- *	relay_file_read_consume - update the consumed count for the buffer
- */
-static void relay_file_read_consume(struct rchan_buf *buf,
-				    size_t read_pos,
-				    size_t bytes_consumed)
-{
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t read_subbuf;
-
-	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
-		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
-		buf->bytes_consumed = 0;
-	}
-
-	buf->bytes_consumed += bytes_consumed;
-	read_subbuf = read_pos / buf->chan->subbuf_size;
-	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
-		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
-		    (buf->offset == subbuf_size))
-			return;
-		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
-		buf->bytes_consumed = 0;
-	}
-}
-
-/**
- *	relay_file_read_avail - boolean, are there unconsumed bytes available?
- */
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
-{
-	size_t bytes_produced, bytes_consumed, write_offset;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t produced = buf->subbufs_produced % n_subbufs;
-	size_t consumed = buf->subbufs_consumed % n_subbufs;
-
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
-
-	if (consumed > produced) {
-		if ((produced > n_subbufs) &&
-		    (produced + n_subbufs - consumed <= n_subbufs))
-			produced += n_subbufs;
-	} else if (consumed == produced) {
-		if (buf->offset > subbuf_size) {
-			produced += n_subbufs;
-			if (buf->subbufs_produced == buf->subbufs_consumed)
-				consumed += n_subbufs;
-		}
-	}
-
-	if (buf->offset > subbuf_size)
-		bytes_produced = (produced - 1) * subbuf_size + write_offset;
-	else
-		bytes_produced = produced * subbuf_size + write_offset;
-	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
-
-	if (bytes_produced == bytes_consumed)
-		return 0;
-
-	relay_file_read_consume(buf, read_pos, 0);
-
-	return 1;
-}
-
-/**
- *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
- */
-static size_t relay_file_read_subbuf_avail(size_t read_pos,
-					   struct rchan_buf *buf)
-{
-	size_t padding, avail = 0;
-	size_t read_subbuf, read_offset, write_subbuf, write_offset;
-	size_t subbuf_size = buf->chan->subbuf_size;
-
-	write_subbuf = (buf->data - buf->start) / subbuf_size;
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
-	read_subbuf = read_pos / subbuf_size;
-	read_offset = read_pos % subbuf_size;
-	padding = buf->padding[read_subbuf];
-
-	if (read_subbuf == write_subbuf) {
-		if (read_offset + padding < write_offset)
-			avail = write_offset - (read_offset + padding);
-	} else
-		avail = (subbuf_size - padding) - read_offset;
-
-	return avail;
-}
-
-/**
- *	relay_file_read_start_pos - find the first available byte to read
- *
- *	If the read_pos is in the middle of padding, return the
- *	position of the first actually available byte, otherwise
- *	return the original value.
- */
-static size_t relay_file_read_start_pos(size_t read_pos,
-					struct rchan_buf *buf)
-{
-	size_t read_subbuf, padding, padding_start, padding_end;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-
-	read_subbuf = read_pos / subbuf_size;
-	padding = buf->padding[read_subbuf];
-	padding_start = (read_subbuf + 1) * subbuf_size - padding;
-	padding_end = (read_subbuf + 1) * subbuf_size;
-	if (read_pos >= padding_start && read_pos < padding_end) {
-		read_subbuf = (read_subbuf + 1) % n_subbufs;
-		read_pos = read_subbuf * subbuf_size;
-	}
-
-	return read_pos;
-}
-
-/**
- *	relay_file_read_end_pos - return the new read position
- */
-static size_t relay_file_read_end_pos(struct rchan_buf *buf,
-				      size_t read_pos,
-				      size_t count)
-{
-	size_t read_subbuf, padding, end_pos;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-
-	read_subbuf = read_pos / subbuf_size;
-	padding = buf->padding[read_subbuf];
-	if (read_pos % subbuf_size + count + padding == subbuf_size)
-		end_pos = (read_subbuf + 1) * subbuf_size;
-	else
-		end_pos = read_pos + count;
-	if (end_pos >= subbuf_size * n_subbufs)
-		end_pos = 0;
-
-	return end_pos;
-}
-
-/**
- *	relay_file_read - read file op for relay files
- *	@filp: the file
- *	@buffer: the userspace buffer
- *	@count: number of bytes to read
- *	@ppos: position to read from
- *
- *	Reads count bytes or the number of bytes available in the
- *	current sub-buffer being read, whichever is smaller.
- */
-static ssize_t relay_file_read(struct file *filp,
-			       char __user *buffer,
-			       size_t count,
-			       loff_t *ppos)
-{
-	struct rchan_buf *buf = filp->private_data;
-	struct inode *inode = filp->f_dentry->d_inode;
-	size_t read_start, avail;
-	ssize_t ret = 0;
-	void *from;
-
-	mutex_lock(&inode->i_mutex);
-	if(!relay_file_read_avail(buf, *ppos))
-		goto out;
-
-	read_start = relay_file_read_start_pos(*ppos, buf);
-	avail = relay_file_read_subbuf_avail(read_start, buf);
-	if (!avail)
-		goto out;
-
-	from = buf->start + read_start;
-	ret = count = min(count, avail);
-	if (copy_to_user(buffer, from, count)) {
-		ret = -EFAULT;
-		goto out;
-	}
-	relay_file_read_consume(buf, read_start, count);
-	*ppos = relay_file_read_end_pos(buf, read_start, count);
-out:
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-struct file_operations relay_file_operations = {
-	.open		= relay_file_open,
-	.poll		= relay_file_poll,
-	.mmap		= relay_file_mmap,
-	.read		= relay_file_read,
-	.llseek		= no_llseek,
-	.release	= relay_file_release,
-};
-
-static struct super_operations relayfs_ops = {
-	.statfs		= simple_statfs,
-	.drop_inode	= generic_delete_inode,
-};
-
-static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
-{
-	struct inode *inode;
-	struct dentry *root;
-	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = RELAYFS_MAGIC;
-	sb->s_op = &relayfs_ops;
-	inode = relayfs_get_inode(sb, mode, NULL, NULL);
-
-	if (!inode)
-		return -ENOMEM;
-
-	root = d_alloc_root(inode);
-	if (!root) {
-		iput(inode);
-		return -ENOMEM;
-	}
-	sb->s_root = root;
-
-	return 0;
-}
-
-static struct super_block * relayfs_get_sb(struct file_system_type *fs_type,
-					   int flags, const char *dev_name,
-					   void *data)
-{
-	return get_sb_single(fs_type, flags, data, relayfs_fill_super);
-}
-
-static struct file_system_type relayfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "relayfs",
-	.get_sb		= relayfs_get_sb,
-	.kill_sb	= kill_litter_super,
-};
-
-static int __init init_relayfs_fs(void)
-{
-	return register_filesystem(&relayfs_fs_type);
-}
-
-static void __exit exit_relayfs_fs(void)
-{
-
-
-
-
-
-	unregister_filesystem(&relayfs_fs_type);
-}
-
-module_init(init_relayfs_fs)
-module_exit(exit_relayfs_fs)
-
-EXPORT_SYMBOL_GPL(relay_file_operations);
-EXPORT_SYMBOL_GPL(relayfs_create_dir);
-EXPORT_SYMBOL_GPL(relayfs_remove_dir);
-EXPORT_SYMBOL_GPL(relayfs_create_file);
-EXPORT_SYMBOL_GPL(relayfs_remove_file);
-
-MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
-MODULE_DESCRIPTION("Relay Filesystem");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
deleted file mode 100644
index abf3ceaace49..000000000000
--- a/fs/relayfs/relay.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * Public API and common code for RelayFS.
- *
- * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
- *
- * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
- * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
- *
- * This file is released under the GPL.
- */
-
-#include <linux/errno.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-/**
- *	relay_buf_empty - boolean, is the channel buffer empty?
- *	@buf: channel buffer
- *
- *	Returns 1 if the buffer is empty, 0 otherwise.
- */
-int relay_buf_empty(struct rchan_buf *buf)
-{
-	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
-}
-
-/**
- *	relay_buf_full - boolean, is the channel buffer full?
- *	@buf: channel buffer
- *
- *	Returns 1 if the buffer is full, 0 otherwise.
- */
-int relay_buf_full(struct rchan_buf *buf)
-{
-	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
-	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
-}
-
-/*
- * High-level relayfs kernel API and associated functions.
- */
-
-/*
- * rchan_callback implementations defining default channel behavior.  Used
- * in place of corresponding NULL values in client callback struct.
- */
-
-/*
- * subbuf_start() default callback.  Does nothing.
- */
-static int subbuf_start_default_callback (struct rchan_buf *buf,
-					  void *subbuf,
-					  void *prev_subbuf,
-					  size_t prev_padding)
-{
-	if (relay_buf_full(buf))
-		return 0;
-
-	return 1;
-}
-
-/*
- * buf_mapped() default callback.  Does nothing.
- */
-static void buf_mapped_default_callback(struct rchan_buf *buf,
-					struct file *filp)
-{
-}
-
-/*
- * buf_unmapped() default callback.  Does nothing.
- */
-static void buf_unmapped_default_callback(struct rchan_buf *buf,
-					  struct file *filp)
-{
-}
-
-/*
- * create_buf_file_create() default callback.  Creates file to represent buf.
- */
-static struct dentry *create_buf_file_default_callback(const char *filename,
-						       struct dentry *parent,
-						       int mode,
-						       struct rchan_buf *buf,
-						       int *is_global)
-{
-	return relayfs_create_file(filename, parent, mode,
-				   &relay_file_operations, buf);
-}
-
-/*
- * remove_buf_file() default callback.  Removes file representing relay buffer.
- */
-static int remove_buf_file_default_callback(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/* relay channel default callbacks */
-static struct rchan_callbacks default_channel_callbacks = {
-	.subbuf_start = subbuf_start_default_callback,
-	.buf_mapped = buf_mapped_default_callback,
-	.buf_unmapped = buf_unmapped_default_callback,
-	.create_buf_file = create_buf_file_default_callback,
-	.remove_buf_file = remove_buf_file_default_callback,
-};
-
-/**
- *	wakeup_readers - wake up readers waiting on a channel
- *	@private: the channel buffer
- *
- *	This is the work function used to defer reader waking.  The
- *	reason waking is deferred is that calling directly from write
- *	causes problems if you're writing from say the scheduler.
- */
-static void wakeup_readers(void *private)
-{
-	struct rchan_buf *buf = private;
-	wake_up_interruptible(&buf->read_wait);
-}
-
-/**
- *	__relay_reset - reset a channel buffer
- *	@buf: the channel buffer
- *	@init: 1 if this is a first-time initialization
- *
- *	See relay_reset for description of effect.
- */
-static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
-{
-	size_t i;
-
-	if (init) {
-		init_waitqueue_head(&buf->read_wait);
-		kref_init(&buf->kref);
-		INIT_WORK(&buf->wake_readers, NULL, NULL);
-	} else {
-		cancel_delayed_work(&buf->wake_readers);
-		flush_scheduled_work();
-	}
-
-	buf->subbufs_produced = 0;
-	buf->subbufs_consumed = 0;
-	buf->bytes_consumed = 0;
-	buf->finalized = 0;
-	buf->data = buf->start;
-	buf->offset = 0;
-
-	for (i = 0; i < buf->chan->n_subbufs; i++)
-		buf->padding[i] = 0;
-
-	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
-}
-
-/**
- *	relay_reset - reset the channel
- *	@chan: the channel
- *
- *	This has the effect of erasing all data from all channel buffers
- *	and restarting the channel in its initial state.  The buffers
- *	are not freed, so any mappings are still in effect.
- *
- *	NOTE: Care should be taken that the channel isn't actually
- *	being used by anything when this call is made.
- */
-void relay_reset(struct rchan *chan)
-{
-	unsigned int i;
-	struct rchan_buf *prev = NULL;
-
-	if (!chan)
-		return;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i] || chan->buf[i] == prev)
-			break;
-		__relay_reset(chan->buf[i], 0);
-		prev = chan->buf[i];
-	}
-}
-
-/**
- *	relay_open_buf - create a new channel buffer in relayfs
- *
- *	Internal - used by relay_open().
- */
-static struct rchan_buf *relay_open_buf(struct rchan *chan,
-					const char *filename,
-					struct dentry *parent,
-					int *is_global)
-{
-	struct rchan_buf *buf;
-	struct dentry *dentry;
-
-	if (*is_global)
-		return chan->buf[0];
-
- 	buf = relay_create_buf(chan);
- 	if (!buf)
- 		return NULL;
-
-	/* Create file in fs */
- 	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
- 					   buf, is_global);
- 	if (!dentry) {
- 		relay_destroy_buf(buf);
-		return NULL;
- 	}
-
-	buf->dentry = dentry;
-	__relay_reset(buf, 1);
-
-	return buf;
-}
-
-/**
- *	relay_close_buf - close a channel buffer
- *	@buf: channel buffer
- *
- *	Marks the buffer finalized and restores the default callbacks.
- *	The channel buffer and channel buffer data structure are then freed
- *	automatically when the last reference is given up.
- */
-static inline void relay_close_buf(struct rchan_buf *buf)
-{
-	buf->finalized = 1;
-	buf->chan->cb = &default_channel_callbacks;
-	cancel_delayed_work(&buf->wake_readers);
-	flush_scheduled_work();
-	kref_put(&buf->kref, relay_remove_buf);
-}
-
-static inline void setup_callbacks(struct rchan *chan,
-				   struct rchan_callbacks *cb)
-{
-	if (!cb) {
-		chan->cb = &default_channel_callbacks;
-		return;
-	}
-
-	if (!cb->subbuf_start)
-		cb->subbuf_start = subbuf_start_default_callback;
-	if (!cb->buf_mapped)
-		cb->buf_mapped = buf_mapped_default_callback;
-	if (!cb->buf_unmapped)
-		cb->buf_unmapped = buf_unmapped_default_callback;
-	if (!cb->create_buf_file)
-		cb->create_buf_file = create_buf_file_default_callback;
-	if (!cb->remove_buf_file)
-		cb->remove_buf_file = remove_buf_file_default_callback;
-	chan->cb = cb;
-}
-
-/**
- *	relay_open - create a new relayfs channel
- *	@base_filename: base name of files to create
- *	@parent: dentry of parent directory, NULL for root directory
- *	@subbuf_size: size of sub-buffers
- *	@n_subbufs: number of sub-buffers
- *	@cb: client callback functions
- *
- *	Returns channel pointer if successful, NULL otherwise.
- *
- *	Creates a channel buffer for each cpu using the sizes and
- *	attributes specified.  The created channel buffer files
- *	will be named base_filename0...base_filenameN-1.  File
- *	permissions will be S_IRUSR.
- */
-struct rchan *relay_open(const char *base_filename,
-			 struct dentry *parent,
-			 size_t subbuf_size,
-			 size_t n_subbufs,
-			 struct rchan_callbacks *cb)
-{
-	unsigned int i;
-	struct rchan *chan;
-	char *tmpname;
-	int is_global = 0;
-
-	if (!base_filename)
-		return NULL;
-
-	if (!(subbuf_size && n_subbufs))
-		return NULL;
-
-	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
-	if (!chan)
-		return NULL;
-
-	chan->version = RELAYFS_CHANNEL_VERSION;
-	chan->n_subbufs = n_subbufs;
-	chan->subbuf_size = subbuf_size;
-	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
-	setup_callbacks(chan, cb);
-	kref_init(&chan->kref);
-
-	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!tmpname)
-		goto free_chan;
-
-	for_each_online_cpu(i) {
-		sprintf(tmpname, "%s%d", base_filename, i);
-		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
-					      &is_global);
-		chan->buf[i]->cpu = i;
-		if (!chan->buf[i])
-			goto free_bufs;
-	}
-
-	kfree(tmpname);
-	return chan;
-
-free_bufs:
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			break;
-		relay_close_buf(chan->buf[i]);
-		if (is_global)
-			break;
-	}
-	kfree(tmpname);
-
-free_chan:
-	kref_put(&chan->kref, relay_destroy_channel);
-	return NULL;
-}
-
-/**
- *	relay_switch_subbuf - switch to a new sub-buffer
- *	@buf: channel buffer
- *	@length: size of current event
- *
- *	Returns either the length passed in or 0 if full.
-
- *	Performs sub-buffer-switch tasks such as invoking callbacks,
- *	updating padding counts, waking up readers, etc.
- */
-size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
-{
-	void *old, *new;
-	size_t old_subbuf, new_subbuf;
-
-	if (unlikely(length > buf->chan->subbuf_size))
-		goto toobig;
-
-	if (buf->offset != buf->chan->subbuf_size + 1) {
-		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
-		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
-		buf->padding[old_subbuf] = buf->prev_padding;
-		buf->subbufs_produced++;
-		if (waitqueue_active(&buf->read_wait)) {
-			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
-			schedule_delayed_work(&buf->wake_readers, 1);
-		}
-	}
-
-	old = buf->data;
-	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
-	new = buf->start + new_subbuf * buf->chan->subbuf_size;
-	buf->offset = 0;
-	if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
-		buf->offset = buf->chan->subbuf_size + 1;
-		return 0;
-	}
-	buf->data = new;
-	buf->padding[new_subbuf] = 0;
-
-	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
-		goto toobig;
-
-	return length;
-
-toobig:
-	buf->chan->last_toobig = length;
-	return 0;
-}
-
-/**
- *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
- *	@chan: the channel
- *	@cpu: the cpu associated with the channel buffer to update
- *	@subbufs_consumed: number of sub-buffers to add to current buf's count
- *
- *	Adds to the channel buffer's consumed sub-buffer count.
- *	subbufs_consumed should be the number of sub-buffers newly consumed,
- *	not the total consumed.
- *
- *	NOTE: kernel clients don't need to call this function if the channel
- *	mode is 'overwrite'.
- */
-void relay_subbufs_consumed(struct rchan *chan,
-			    unsigned int cpu,
-			    size_t subbufs_consumed)
-{
-	struct rchan_buf *buf;
-
-	if (!chan)
-		return;
-
-	if (cpu >= NR_CPUS || !chan->buf[cpu])
-		return;
-
-	buf = chan->buf[cpu];
-	buf->subbufs_consumed += subbufs_consumed;
-	if (buf->subbufs_consumed > buf->subbufs_produced)
-		buf->subbufs_consumed = buf->subbufs_produced;
-}
-
-/**
- *	relay_destroy_channel - free the channel struct
- *
- *	Should only be called from kref_put().
- */
-void relay_destroy_channel(struct kref *kref)
-{
-	struct rchan *chan = container_of(kref, struct rchan, kref);
-	kfree(chan);
-}
-
-/**
- *	relay_close - close the channel
- *	@chan: the channel
- *
- *	Closes all channel buffers and frees the channel.
- */
-void relay_close(struct rchan *chan)
-{
-	unsigned int i;
-	struct rchan_buf *prev = NULL;
-
-	if (!chan)
-		return;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i] || chan->buf[i] == prev)
-			break;
-		relay_close_buf(chan->buf[i]);
-		prev = chan->buf[i];
-	}
-
-	if (chan->last_toobig)
-		printk(KERN_WARNING "relayfs: one or more items not logged "
-		       "[item size (%Zd) > sub-buffer size (%Zd)]\n",
-		       chan->last_toobig, chan->subbuf_size);
-
-	kref_put(&chan->kref, relay_destroy_channel);
-}
-
-/**
- *	relay_flush - close the channel
- *	@chan: the channel
- *
- *	Flushes all channel buffers i.e. forces buffer switch.
- */
-void relay_flush(struct rchan *chan)
-{
-	unsigned int i;
-	struct rchan_buf *prev = NULL;
-
-	if (!chan)
-		return;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i] || chan->buf[i] == prev)
-			break;
-		relay_switch_subbuf(chan->buf[i], 0);
-		prev = chan->buf[i];
-	}
-}
-
-EXPORT_SYMBOL_GPL(relay_open);
-EXPORT_SYMBOL_GPL(relay_close);
-EXPORT_SYMBOL_GPL(relay_flush);
-EXPORT_SYMBOL_GPL(relay_reset);
-EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
-EXPORT_SYMBOL_GPL(relay_switch_subbuf);
-EXPORT_SYMBOL_GPL(relay_buf_full);
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
deleted file mode 100644
index 0993d3e5753b..000000000000
--- a/fs/relayfs/relay.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _RELAY_H
-#define _RELAY_H
-
-extern int relayfs_remove(struct dentry *dentry);
-extern int relay_buf_empty(struct rchan_buf *buf);
-extern void relay_destroy_channel(struct kref *kref);
-
-#endif /* _RELAY_H */
diff --git a/include/linux/relay.h b/include/linux/relay.h
new file mode 100644
index 000000000000..4bcc1531d6a9
--- /dev/null
+++ b/include/linux/relay.h
@@ -0,0 +1,281 @@
+/*
+ * linux/include/linux/relay.h
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * CONFIG_RELAY definitions and declarations
+ */
+
+#ifndef _LINUX_RELAY_H
+#define _LINUX_RELAY_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/kref.h>
+
+/* Needs a _much_ better name... */
+#define FIX_SIZE(x) ((((x) - 1) & PAGE_MASK) + PAGE_SIZE)
+
+/*
+ * Tracks changes to rchan/rchan_buf structs
+ */
+#define RELAYFS_CHANNEL_VERSION		6
+
+/*
+ * Per-cpu relay channel buffer
+ */
+struct rchan_buf
+{
+	void *start;			/* start of channel buffer */
+	void *data;			/* start of current sub-buffer */
+	size_t offset;			/* current offset into sub-buffer */
+	size_t subbufs_produced;	/* count of sub-buffers produced */
+	size_t subbufs_consumed;	/* count of sub-buffers consumed */
+	struct rchan *chan;		/* associated channel */
+	wait_queue_head_t read_wait;	/* reader wait queue */
+	struct work_struct wake_readers; /* reader wake-up work struct */
+	struct dentry *dentry;		/* channel file dentry */
+	struct kref kref;		/* channel buffer refcount */
+	struct page **page_array;	/* array of current buffer pages */
+	unsigned int page_count;	/* number of current buffer pages */
+	unsigned int finalized;		/* buffer has been finalized */
+	size_t *padding;		/* padding counts per sub-buffer */
+	size_t prev_padding;		/* temporary variable */
+	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
+	unsigned int cpu;		/* this buf's cpu */
+} ____cacheline_aligned;
+
+/*
+ * Relay channel data structure
+ */
+struct rchan
+{
+	u32 version;			/* the version of this struct */
+	size_t subbuf_size;		/* sub-buffer size */
+	size_t n_subbufs;		/* number of sub-buffers per buffer */
+	size_t alloc_size;		/* total buffer size allocated */
+	struct rchan_callbacks *cb;	/* client callbacks */
+	struct kref kref;		/* channel refcount */
+	void *private_data;		/* for user-defined data */
+	size_t last_toobig;		/* tried to log event > subbuf size */
+	struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
+};
+
+/*
+ * Relay channel client callbacks
+ */
+struct rchan_callbacks
+{
+	/*
+	 * subbuf_start - called on buffer-switch to a new sub-buffer
+	 * @buf: the channel buffer containing the new sub-buffer
+	 * @subbuf: the start of the new sub-buffer
+	 * @prev_subbuf: the start of the previous sub-buffer
+	 * @prev_padding: unused space at the end of previous sub-buffer
+	 *
+	 * The client should return 1 to continue logging, 0 to stop
+	 * logging.
+	 *
+	 * NOTE: subbuf_start will also be invoked when the buffer is
+	 *       created, so that the first sub-buffer can be initialized
+	 *       if necessary.  In this case, prev_subbuf will be NULL.
+	 *
+	 * NOTE: the client can reserve bytes at the beginning of the new
+	 *       sub-buffer by calling subbuf_start_reserve() in this callback.
+	 */
+	int (*subbuf_start) (struct rchan_buf *buf,
+			     void *subbuf,
+			     void *prev_subbuf,
+			     size_t prev_padding);
+
+	/*
+	 * buf_mapped - relay buffer mmap notification
+	 * @buf: the channel buffer
+	 * @filp: relay file pointer
+	 *
+	 * Called when a relay file is successfully mmapped
+	 */
+        void (*buf_mapped)(struct rchan_buf *buf,
+			   struct file *filp);
+
+	/*
+	 * buf_unmapped - relay buffer unmap notification
+	 * @buf: the channel buffer
+	 * @filp: relay file pointer
+	 *
+	 * Called when a relay file is successfully unmapped
+	 */
+        void (*buf_unmapped)(struct rchan_buf *buf,
+			     struct file *filp);
+	/*
+	 * create_buf_file - create file to represent a relay channel buffer
+	 * @filename: the name of the file to create
+	 * @parent: the parent of the file to create
+	 * @mode: the mode of the file to create
+	 * @buf: the channel buffer
+	 * @is_global: outparam - set non-zero if the buffer should be global
+	 *
+	 * Called during relay_open(), once for each per-cpu buffer,
+	 * to allow the client to create a file to be used to
+	 * represent the corresponding channel buffer.  If the file is
+	 * created outside of relay, the parent must also exist in
+	 * that filesystem.
+	 *
+	 * The callback should return the dentry of the file created
+	 * to represent the relay buffer.
+	 *
+	 * Setting the is_global outparam to a non-zero value will
+	 * cause relay_open() to create a single global buffer rather
+	 * than the default set of per-cpu buffers.
+	 *
+	 * See Documentation/filesystems/relayfs.txt for more info.
+	 */
+	struct dentry *(*create_buf_file)(const char *filename,
+					  struct dentry *parent,
+					  int mode,
+					  struct rchan_buf *buf,
+					  int *is_global);
+
+	/*
+	 * remove_buf_file - remove file representing a relay channel buffer
+	 * @dentry: the dentry of the file to remove
+	 *
+	 * Called during relay_close(), once for each per-cpu buffer,
+	 * to allow the client to remove a file used to represent a
+	 * channel buffer.
+	 *
+	 * The callback should return 0 if successful, negative if not.
+	 */
+	int (*remove_buf_file)(struct dentry *dentry);
+};
+
+/*
+ * CONFIG_RELAY kernel API, kernel/relay.c
+ */
+
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb);
+extern void relay_close(struct rchan *chan);
+extern void relay_flush(struct rchan *chan);
+extern void relay_subbufs_consumed(struct rchan *chan,
+				   unsigned int cpu,
+				   size_t consumed);
+extern void relay_reset(struct rchan *chan);
+extern int relay_buf_full(struct rchan_buf *buf);
+
+extern size_t relay_switch_subbuf(struct rchan_buf *buf,
+				  size_t length);
+
+/**
+ *	relay_write - write data into the channel
+ *	@chan: relay channel
+ *	@data: data to be written
+ *	@length: number of bytes to write
+ *
+ *	Writes data into the current cpu's channel buffer.
+ *
+ *	Protects the buffer by disabling interrupts.  Use this
+ *	if you might be logging from interrupt context.  Try
+ *	__relay_write() if you know you	won't be logging from
+ *	interrupt context.
+ */
+static inline void relay_write(struct rchan *chan,
+			       const void *data,
+			       size_t length)
+{
+	unsigned long flags;
+	struct rchan_buf *buf;
+
+	local_irq_save(flags);
+	buf = chan->buf[smp_processor_id()];
+	if (unlikely(buf->offset + length > chan->subbuf_size))
+		length = relay_switch_subbuf(buf, length);
+	memcpy(buf->data + buf->offset, data, length);
+	buf->offset += length;
+	local_irq_restore(flags);
+}
+
+/**
+ *	__relay_write - write data into the channel
+ *	@chan: relay channel
+ *	@data: data to be written
+ *	@length: number of bytes to write
+ *
+ *	Writes data into the current cpu's channel buffer.
+ *
+ *	Protects the buffer by disabling preemption.  Use
+ *	relay_write() if you might be logging from interrupt
+ *	context.
+ */
+static inline void __relay_write(struct rchan *chan,
+				 const void *data,
+				 size_t length)
+{
+	struct rchan_buf *buf;
+
+	buf = chan->buf[get_cpu()];
+	if (unlikely(buf->offset + length > buf->chan->subbuf_size))
+		length = relay_switch_subbuf(buf, length);
+	memcpy(buf->data + buf->offset, data, length);
+	buf->offset += length;
+	put_cpu();
+}
+
+/**
+ *	relay_reserve - reserve slot in channel buffer
+ *	@chan: relay channel
+ *	@length: number of bytes to reserve
+ *
+ *	Returns pointer to reserved slot, NULL if full.
+ *
+ *	Reserves a slot in the current cpu's channel buffer.
+ *	Does not protect the buffer at all - caller must provide
+ *	appropriate synchronization.
+ */
+static inline void *relay_reserve(struct rchan *chan, size_t length)
+{
+	void *reserved;
+	struct rchan_buf *buf = chan->buf[smp_processor_id()];
+
+	if (unlikely(buf->offset + length > buf->chan->subbuf_size)) {
+		length = relay_switch_subbuf(buf, length);
+		if (!length)
+			return NULL;
+	}
+	reserved = buf->data + buf->offset;
+	buf->offset += length;
+
+	return reserved;
+}
+
+/**
+ *	subbuf_start_reserve - reserve bytes at the start of a sub-buffer
+ *	@buf: relay channel buffer
+ *	@length: number of bytes to reserve
+ *
+ *	Helper function used to reserve bytes at the beginning of
+ *	a sub-buffer in the subbuf_start() callback.
+ */
+static inline void subbuf_start_reserve(struct rchan_buf *buf,
+					size_t length)
+{
+	BUG_ON(length >= buf->chan->subbuf_size - 1);
+	buf->offset = length;
+}
+
+/*
+ * exported relay file operations, kernel/relay.c
+ */
+extern struct file_operations relay_file_operations;
+
+#endif /* _LINUX_RELAY_H */
+
diff --git a/init/Kconfig b/init/Kconfig
index 38416a199def..1d19fd25204b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -214,6 +214,17 @@ config CPUSETS
 
 	  Say N if unsure.
 
+config RELAY
+	bool "Kernel->user space relay support (formerly relayfs)"
+	help
+	  This option enables support for relay interface support in
+	  certain file systems (such as debugfs).
+	  It is designed to provide an efficient mechanism for tools and
+	  facilities to relay large amounts of data from kernel space to
+	  user space.
+
+	  If unsure, say N.
+
 source "usr/Kconfig"
 
 config UID16
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde815d..aebd7a78984e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RELAY) += relay.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 000000000000..9358e8eb8476
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,919 @@
+/*
+ * Public API and common code for kernel->userspace relay file support.
+ *
+ * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * Moved to kernel/relay.c by Paul Mundt, 2006.
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relay.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+/*
+ * close() vm_op implementation for relay file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = vma->vm_private_data;
+	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * nopage() vm_op implementation for relay file mapping.
+ */
+static struct page *relay_buf_nopage(struct vm_area_struct *vma,
+				     unsigned long address,
+				     int *type)
+{
+	struct page *page;
+	struct rchan_buf *buf = vma->vm_private_data;
+	unsigned long offset = address - vma->vm_start;
+
+	if (address > vma->vm_end)
+		return NOPAGE_SIGBUS; /* Disallow mremap */
+	if (!buf)
+		return NOPAGE_OOM;
+
+	page = vmalloc_to_page(buf->start + offset);
+	if (!page)
+		return NOPAGE_OOM;
+	get_page(page);
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	return page;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+	.nopage = relay_buf_nopage,
+	.close = relay_file_mmap_close,
+};
+
+/**
+ *	relay_mmap_buf: - mmap channel buffer to process address space
+ *	@buf: relay channel buffer
+ *	@vma: vm_area_struct describing memory to be mapped
+ *
+ *	Returns 0 if ok, negative on error
+ *
+ *	Caller should already have grabbed mmap_sem.
+ */
+int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+	unsigned long length = vma->vm_end - vma->vm_start;
+	struct file *filp = vma->vm_file;
+
+	if (!buf)
+		return -EBADF;
+
+	if (length != (unsigned long)buf->chan->alloc_size)
+		return -EINVAL;
+
+	vma->vm_ops = &relay_file_mmap_ops;
+	vma->vm_private_data = buf;
+	buf->chan->cb->buf_mapped(buf, filp);
+
+	return 0;
+}
+
+/**
+ *	relay_alloc_buf - allocate a channel buffer
+ *	@buf: the buffer struct
+ *	@size: total size of the buffer
+ *
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+{
+	void *mem;
+	unsigned int i, j, n_pages;
+
+	size = PAGE_ALIGN(size);
+	n_pages = size >> PAGE_SHIFT;
+
+	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!buf->page_array)
+		return NULL;
+
+	for (i = 0; i < n_pages; i++) {
+		buf->page_array[i] = alloc_page(GFP_KERNEL);
+		if (unlikely(!buf->page_array[i]))
+			goto depopulate;
+	}
+	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
+	if (!mem)
+		goto depopulate;
+
+	memset(mem, 0, size);
+	buf->page_count = n_pages;
+	return mem;
+
+depopulate:
+	for (j = 0; j < i; j++)
+		__free_page(buf->page_array[j]);
+	kfree(buf->page_array);
+	return NULL;
+}
+
+/**
+ *	relay_create_buf - allocate and initialize a channel buffer
+ *	@alloc_size: size of the buffer to allocate
+ *	@n_subbufs: number of sub-buffers in the channel
+ *
+ *	Returns channel buffer if successful, NULL otherwise
+ */
+struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+	if (!buf->padding)
+		goto free_buf;
+
+	buf->start = relay_alloc_buf(buf, chan->alloc_size);
+	if (!buf->start)
+		goto free_buf;
+
+	buf->chan = chan;
+	kref_get(&buf->chan->kref);
+	return buf;
+
+free_buf:
+	kfree(buf->padding);
+	kfree(buf);
+	return NULL;
+}
+
+/**
+ *	relay_destroy_channel - free the channel struct
+ *
+ *	Should only be called from kref_put().
+ */
+void relay_destroy_channel(struct kref *kref)
+{
+	struct rchan *chan = container_of(kref, struct rchan, kref);
+	kfree(chan);
+}
+
+/**
+ *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ *	@buf: the buffer struct
+ */
+void relay_destroy_buf(struct rchan_buf *buf)
+{
+	struct rchan *chan = buf->chan;
+	unsigned int i;
+
+	if (likely(buf->start)) {
+		vunmap(buf->start);
+		for (i = 0; i < buf->page_count; i++)
+			__free_page(buf->page_array[i]);
+		kfree(buf->page_array);
+	}
+	kfree(buf->padding);
+	kfree(buf);
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_remove_buf - remove a channel buffer
+ *
+ *	Removes the file from the fileystem, which also frees the
+ *	rchan_buf_struct and the channel buffer.  Should only be called from
+ *	kref_put().
+ */
+void relay_remove_buf(struct kref *kref)
+{
+	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+	buf->chan->cb->remove_buf_file(buf->dentry);
+	relay_destroy_buf(buf);
+}
+
+/**
+ *	relay_buf_empty - boolean, is the channel buffer empty?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is empty, 0 otherwise.
+ */
+int relay_buf_empty(struct rchan_buf *buf)
+{
+	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+EXPORT_SYMBOL_GPL(relay_buf_empty);
+
+/**
+ *	relay_buf_full - boolean, is the channel buffer full?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(relay_buf_full);
+
+/*
+ * High-level relay kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback.  Does nothing.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+					  void *subbuf,
+					  void *prev_subbuf,
+					  size_t prev_padding)
+{
+	if (relay_buf_full(buf))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * buf_mapped() default callback.  Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+					struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback.  Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+					  struct file *filp)
+{
+}
+
+/*
+ * create_buf_file_create() default callback.  Does nothing.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+						       struct dentry *parent,
+						       int mode,
+						       struct rchan_buf *buf,
+						       int *is_global)
+{
+	return NULL;
+}
+
+/*
+ * remove_buf_file() default callback.  Does nothing.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+	return -EINVAL;
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+	.subbuf_start = subbuf_start_default_callback,
+	.buf_mapped = buf_mapped_default_callback,
+	.buf_unmapped = buf_unmapped_default_callback,
+	.create_buf_file = create_buf_file_default_callback,
+	.remove_buf_file = remove_buf_file_default_callback,
+};
+
+/**
+ *	wakeup_readers - wake up readers waiting on a channel
+ *	@private: the channel buffer
+ *
+ *	This is the work function used to defer reader waking.  The
+ *	reason waking is deferred is that calling directly from write
+ *	causes problems if you're writing from say the scheduler.
+ */
+static void wakeup_readers(void *private)
+{
+	struct rchan_buf *buf = private;
+	wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ *	__relay_reset - reset a channel buffer
+ *	@buf: the channel buffer
+ *	@init: 1 if this is a first-time initialization
+ *
+ *	See relay_reset for description of effect.
+ */
+static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+	size_t i;
+
+	if (init) {
+		init_waitqueue_head(&buf->read_wait);
+		kref_init(&buf->kref);
+		INIT_WORK(&buf->wake_readers, NULL, NULL);
+	} else {
+		cancel_delayed_work(&buf->wake_readers);
+		flush_scheduled_work();
+	}
+
+	buf->subbufs_produced = 0;
+	buf->subbufs_consumed = 0;
+	buf->bytes_consumed = 0;
+	buf->finalized = 0;
+	buf->data = buf->start;
+	buf->offset = 0;
+
+	for (i = 0; i < buf->chan->n_subbufs; i++)
+		buf->padding[i] = 0;
+
+	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ *	relay_reset - reset the channel
+ *	@chan: the channel
+ *
+ *	This has the effect of erasing all data from all channel buffers
+ *	and restarting the channel in its initial state.  The buffers
+ *	are not freed, so any mappings are still in effect.
+ *
+ *	NOTE: Care should be taken that the channel isn't actually
+ *	being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		__relay_reset(chan->buf[i], 0);
+		prev = chan->buf[i];
+	}
+}
+EXPORT_SYMBOL_GPL(relay_reset);
+
+/**
+ *	relay_open_buf - create a new relay channel buffer
+ *
+ *	Internal - used by relay_open().
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan,
+					const char *filename,
+					struct dentry *parent,
+					int *is_global)
+{
+	struct rchan_buf *buf;
+	struct dentry *dentry;
+
+	if (*is_global)
+		return chan->buf[0];
+
+	buf = relay_create_buf(chan);
+	if (!buf)
+		return NULL;
+
+	/* Create file in fs */
+	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
+					   buf, is_global);
+	if (!dentry) {
+		relay_destroy_buf(buf);
+		return NULL;
+	}
+
+	buf->dentry = dentry;
+	__relay_reset(buf, 1);
+
+	return buf;
+}
+
+/**
+ *	relay_close_buf - close a channel buffer
+ *	@buf: channel buffer
+ *
+ *	Marks the buffer finalized and restores the default callbacks.
+ *	The channel buffer and channel buffer data structure are then freed
+ *	automatically when the last reference is given up.
+ */
+static inline void relay_close_buf(struct rchan_buf *buf)
+{
+	buf->finalized = 1;
+	cancel_delayed_work(&buf->wake_readers);
+	flush_scheduled_work();
+	kref_put(&buf->kref, relay_remove_buf);
+}
+
+static inline void setup_callbacks(struct rchan *chan,
+				   struct rchan_callbacks *cb)
+{
+	if (!cb) {
+		chan->cb = &default_channel_callbacks;
+		return;
+	}
+
+	if (!cb->subbuf_start)
+		cb->subbuf_start = subbuf_start_default_callback;
+	if (!cb->buf_mapped)
+		cb->buf_mapped = buf_mapped_default_callback;
+	if (!cb->buf_unmapped)
+		cb->buf_unmapped = buf_unmapped_default_callback;
+	if (!cb->create_buf_file)
+		cb->create_buf_file = create_buf_file_default_callback;
+	if (!cb->remove_buf_file)
+		cb->remove_buf_file = remove_buf_file_default_callback;
+	chan->cb = cb;
+}
+
+/**
+ *	relay_open - create a new relay channel
+ *	@base_filename: base name of files to create
+ *	@parent: dentry of parent directory, NULL for root directory
+ *	@subbuf_size: size of sub-buffers
+ *	@n_subbufs: number of sub-buffers
+ *	@cb: client callback functions
+ *
+ *	Returns channel pointer if successful, NULL otherwise.
+ *
+ *	Creates a channel buffer for each cpu using the sizes and
+ *	attributes specified.  The created channel buffer files
+ *	will be named base_filename0...base_filenameN-1.  File
+ *	permissions will be S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb)
+{
+	unsigned int i;
+	struct rchan *chan;
+	char *tmpname;
+	int is_global = 0;
+
+	if (!base_filename)
+		return NULL;
+
+	if (!(subbuf_size && n_subbufs))
+		return NULL;
+
+	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+	if (!chan)
+		return NULL;
+
+	chan->version = RELAYFS_CHANNEL_VERSION;
+	chan->n_subbufs = n_subbufs;
+	chan->subbuf_size = subbuf_size;
+	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+	setup_callbacks(chan, cb);
+	kref_init(&chan->kref);
+
+	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!tmpname)
+		goto free_chan;
+
+	for_each_online_cpu(i) {
+		sprintf(tmpname, "%s%d", base_filename, i);
+		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
+					      &is_global);
+		if (!chan->buf[i])
+			goto free_bufs;
+
+		chan->buf[i]->cpu = i;
+	}
+
+	kfree(tmpname);
+	return chan;
+
+free_bufs:
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			break;
+		relay_close_buf(chan->buf[i]);
+		if (is_global)
+			break;
+	}
+	kfree(tmpname);
+
+free_chan:
+	kref_put(&chan->kref, relay_destroy_channel);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(relay_open);
+
+/**
+ *	relay_switch_subbuf - switch to a new sub-buffer
+ *	@buf: channel buffer
+ *	@length: size of current event
+ *
+ *	Returns either the length passed in or 0 if full.
+ *
+ *	Performs sub-buffer-switch tasks such as invoking callbacks,
+ *	updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+	void *old, *new;
+	size_t old_subbuf, new_subbuf;
+
+	if (unlikely(length > buf->chan->subbuf_size))
+		goto toobig;
+
+	if (buf->offset != buf->chan->subbuf_size + 1) {
+		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+		buf->padding[old_subbuf] = buf->prev_padding;
+		buf->subbufs_produced++;
+		if (waitqueue_active(&buf->read_wait)) {
+			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+			schedule_delayed_work(&buf->wake_readers, 1);
+		}
+	}
+
+	old = buf->data;
+	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+	new = buf->start + new_subbuf * buf->chan->subbuf_size;
+	buf->offset = 0;
+	if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+		buf->offset = buf->chan->subbuf_size + 1;
+		return 0;
+	}
+	buf->data = new;
+	buf->padding[new_subbuf] = 0;
+
+	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+		goto toobig;
+
+	return length;
+
+toobig:
+	buf->chan->last_toobig = length;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+
+/**
+ *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ *	@chan: the channel
+ *	@cpu: the cpu associated with the channel buffer to update
+ *	@subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ *	Adds to the channel buffer's consumed sub-buffer count.
+ *	subbufs_consumed should be the number of sub-buffers newly consumed,
+ *	not the total consumed.
+ *
+ *	NOTE: kernel clients don't need to call this function if the channel
+ *	mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+			    unsigned int cpu,
+			    size_t subbufs_consumed)
+{
+	struct rchan_buf *buf;
+
+	if (!chan)
+		return;
+
+	if (cpu >= NR_CPUS || !chan->buf[cpu])
+		return;
+
+	buf = chan->buf[cpu];
+	buf->subbufs_consumed += subbufs_consumed;
+	if (buf->subbufs_consumed > buf->subbufs_produced)
+		buf->subbufs_consumed = buf->subbufs_produced;
+}
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+
+/**
+ *	relay_close - close the channel
+ *	@chan: the channel
+ *
+ *	Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		relay_close_buf(chan->buf[i]);
+		prev = chan->buf[i];
+	}
+
+	if (chan->last_toobig)
+		printk(KERN_WARNING "relay: one or more items not logged "
+		       "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+		       chan->last_toobig, chan->subbuf_size);
+
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+EXPORT_SYMBOL_GPL(relay_close);
+
+/**
+ *	relay_flush - close the channel
+ *	@chan: the channel
+ *
+ *	Flushes all channel buffers i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		relay_switch_subbuf(chan->buf[i], 0);
+		prev = chan->buf[i];
+	}
+}
+EXPORT_SYMBOL_GPL(relay_flush);
+
+/**
+ *	relay_file_open - open file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Increments the channel buffer refcount.
+ */
+static int relay_file_open(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = inode->u.generic_ip;
+	kref_get(&buf->kref);
+	filp->private_data = buf;
+
+	return 0;
+}
+
+/**
+ *	relay_file_mmap - mmap file op for relay files
+ *	@filp: the file
+ *	@vma: the vma describing what to map
+ *
+ *	Calls upon relay_mmap_buf to map the file into user space.
+ */
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = filp->private_data;
+	return relay_mmap_buf(buf, vma);
+}
+
+/**
+ *	relay_file_poll - poll file op for relay files
+ *	@filp: the file
+ *	@wait: poll table
+ *
+ *	Poll implemention.
+ */
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask = 0;
+	struct rchan_buf *buf = filp->private_data;
+
+	if (buf->finalized)
+		return POLLERR;
+
+	if (filp->f_mode & FMODE_READ) {
+		poll_wait(filp, &buf->read_wait, wait);
+		if (!relay_buf_empty(buf))
+			mask |= POLLIN | POLLRDNORM;
+	}
+
+	return mask;
+}
+
+/**
+ *	relay_file_release - release file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Decrements the channel refcount, as the filesystem is
+ *	no longer using it.
+ */
+static int relay_file_release(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = filp->private_data;
+	kref_put(&buf->kref, relay_remove_buf);
+
+	return 0;
+}
+
+/**
+ *	relay_file_read_consume - update the consumed count for the buffer
+ */
+static void relay_file_read_consume(struct rchan_buf *buf,
+				    size_t read_pos,
+				    size_t bytes_consumed)
+{
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t read_subbuf;
+
+	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+
+	buf->bytes_consumed += bytes_consumed;
+	read_subbuf = read_pos / buf->chan->subbuf_size;
+	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+		    (buf->offset == subbuf_size))
+			return;
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+}
+
+/**
+ *	relay_file_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+	size_t bytes_produced, bytes_consumed, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t produced = buf->subbufs_produced % n_subbufs;
+	size_t consumed = buf->subbufs_consumed % n_subbufs;
+
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+
+	if (consumed > produced) {
+		if ((produced > n_subbufs) &&
+		    (produced + n_subbufs - consumed <= n_subbufs))
+			produced += n_subbufs;
+	} else if (consumed == produced) {
+		if (buf->offset > subbuf_size) {
+			produced += n_subbufs;
+			if (buf->subbufs_produced == buf->subbufs_consumed)
+				consumed += n_subbufs;
+		}
+	}
+
+	if (buf->offset > subbuf_size)
+		bytes_produced = (produced - 1) * subbuf_size + write_offset;
+	else
+		bytes_produced = produced * subbuf_size + write_offset;
+	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
+
+	if (bytes_produced == bytes_consumed)
+		return 0;
+
+	relay_file_read_consume(buf, read_pos, 0);
+
+	return 1;
+}
+
+/**
+ *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ */
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+					   struct rchan_buf *buf)
+{
+	size_t padding, avail = 0;
+	size_t read_subbuf, read_offset, write_subbuf, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+
+	write_subbuf = (buf->data - buf->start) / subbuf_size;
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+	read_subbuf = read_pos / subbuf_size;
+	read_offset = read_pos % subbuf_size;
+	padding = buf->padding[read_subbuf];
+
+	if (read_subbuf == write_subbuf) {
+		if (read_offset + padding < write_offset)
+			avail = write_offset - (read_offset + padding);
+	} else
+		avail = (subbuf_size - padding) - read_offset;
+
+	return avail;
+}
+
+/**
+ *	relay_file_read_start_pos - find the first available byte to read
+ *
+ *	If the read_pos is in the middle of padding, return the
+ *	position of the first actually available byte, otherwise
+ *	return the original value.
+ */
+static size_t relay_file_read_start_pos(size_t read_pos,
+					struct rchan_buf *buf)
+{
+	size_t read_subbuf, padding, padding_start, padding_end;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	padding_start = (read_subbuf + 1) * subbuf_size - padding;
+	padding_end = (read_subbuf + 1) * subbuf_size;
+	if (read_pos >= padding_start && read_pos < padding_end) {
+		read_subbuf = (read_subbuf + 1) % n_subbufs;
+		read_pos = read_subbuf * subbuf_size;
+	}
+
+	return read_pos;
+}
+
+/**
+ *	relay_file_read_end_pos - return the new read position
+ */
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+				      size_t read_pos,
+				      size_t count)
+{
+	size_t read_subbuf, padding, end_pos;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	if (read_pos % subbuf_size + count + padding == subbuf_size)
+		end_pos = (read_subbuf + 1) * subbuf_size;
+	else
+		end_pos = read_pos + count;
+	if (end_pos >= subbuf_size * n_subbufs)
+		end_pos = 0;
+
+	return end_pos;
+}
+
+/**
+ *	relay_file_read - read file op for relay files
+ *	@filp: the file
+ *	@buffer: the userspace buffer
+ *	@count: number of bytes to read
+ *	@ppos: position to read from
+ *
+ *	Reads count bytes or the number of bytes available in the
+ *	current sub-buffer being read, whichever is smaller.
+ */
+static ssize_t relay_file_read(struct file *filp,
+			       char __user *buffer,
+			       size_t count,
+			       loff_t *ppos)
+{
+	struct rchan_buf *buf = filp->private_data;
+	struct inode *inode = filp->f_dentry->d_inode;
+	size_t read_start, avail;
+	ssize_t ret = 0;
+	void *from;
+
+	mutex_lock(&inode->i_mutex);
+	if(!relay_file_read_avail(buf, *ppos))
+		goto out;
+
+	read_start = relay_file_read_start_pos(*ppos, buf);
+	avail = relay_file_read_subbuf_avail(read_start, buf);
+	if (!avail)
+		goto out;
+
+	from = buf->start + read_start;
+	ret = count = min(count, avail);
+	if (copy_to_user(buffer, from, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	relay_file_read_consume(buf, read_start, count);
+	*ppos = relay_file_read_end_pos(buf, read_start, count);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+struct file_operations relay_file_operations = {
+	.open		= relay_file_open,
+	.poll		= relay_file_poll,
+	.mmap		= relay_file_mmap,
+	.read		= relay_file_read,
+	.llseek		= no_llseek,
+	.release	= relay_file_release,
+};
+EXPORT_SYMBOL_GPL(relay_file_operations);
-- 
cgit v1.2.3


From 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 20:00:26 +0100
Subject: [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/Kconfig                |  12 +
 block/Makefile               |   2 +
 block/blktrace.c             | 538 +++++++++++++++++++++++++++++++++++++++++++
 block/elevator.c             |   4 +
 block/ioctl.c                |   6 +
 block/ll_rw_blk.c            |  44 +++-
 drivers/block/cciss.c        |   2 +
 drivers/md/dm.c              |  13 +-
 fs/bio.c                     |   4 +
 fs/compat_ioctl.c            |   1 +
 include/linux/blkdev.h       |   3 +
 include/linux/blktrace_api.h | 277 ++++++++++++++++++++++
 include/linux/compat_ioctl.h |   4 +
 include/linux/fs.h           |   4 +
 include/linux/sched.h        |   1 +
 kernel/fork.c                |   1 +
 mm/highmem.c                 |   3 +
 17 files changed, 916 insertions(+), 3 deletions(-)
 create mode 100644 block/blktrace.c
 create mode 100644 include/linux/blktrace_api.h

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd20e17..96783645092d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,16 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB.  Otherwise say N.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	select RELAY
+	select DEBUG_FS
+	help
+	  Say Y here, if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the user space
+	  support tools needed), fetch the blktrace app from:
+
+	  git://brick.kernel.dk/data/git/blktrace.git
+
 source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e2b44e..c05de0e0037f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 000000000000..36f3a172275f
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <asm/uaccess.h>
+
+static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
+static unsigned int blktrace_seq __read_mostly = 1;
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	struct blk_io_trace *t;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
+	if (t) {
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->device = bt->dev;
+		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
+		t->pid = tsk->pid;
+		t->cpu = smp_processor_id();
+		t->pdu_len = sizeof(tsk->comm);
+		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
+		tsk->btrace_seq = blktrace_seq;
+	}
+}
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
+
+/*
+ * Bio action bits of interest
+ */
+static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
+
+/*
+ * More could be added as needed, taking care to increment the decrementer
+ * to get correct indexing
+ */
+#define trace_barrier_bit(rw)	\
+	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
+#define trace_sync_bit(rw)	\
+	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct blk_io_trace *t;
+	unsigned long flags;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu;
+
+	if (unlikely(bt->trace_state != Blktrace_running))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= bio_act[trace_barrier_bit(rw)];
+	what |= bio_act[trace_sync_bit(rw)];
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes. Once reserved, it's
+	 * enough to get preemption disabled to prevent read of this data
+	 * before we are through filling it. get_cpu()/put_cpu() does this
+	 * for us
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		cpu = smp_processor_id();
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->pid = pid;
+		t->device = bt->dev;
+		t->cpu = cpu;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(__blk_add_trace);
+
+static struct dentry *blk_tree_root;
+static struct mutex blk_tree_mutex;
+static unsigned int root_users;
+
+static inline void blk_remove_root(void)
+{
+	if (blk_tree_root) {
+		debugfs_remove(blk_tree_root);
+		blk_tree_root = NULL;
+	}
+}
+
+static void blk_remove_tree(struct dentry *dir)
+{
+	mutex_lock(&blk_tree_mutex);
+	debugfs_remove(dir);
+	if (--root_users == 0)
+		blk_remove_root();
+	mutex_unlock(&blk_tree_mutex);
+}
+
+static struct dentry *blk_create_tree(const char *blk_name)
+{
+	struct dentry *dir = NULL;
+
+	mutex_lock(&blk_tree_mutex);
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			goto err;
+	}
+
+	dir = debugfs_create_dir(blk_name, blk_tree_root);
+	if (dir)
+		root_users++;
+	else
+		blk_remove_root();
+
+err:
+	mutex_unlock(&blk_tree_mutex);
+	return dir;
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	relay_close(bt->rchan);
+	debugfs_remove(bt->dropped_file);
+	blk_remove_tree(bt->dir);
+	free_percpu(bt->sequence);
+	kfree(bt);
+}
+
+static int blk_trace_remove(request_queue_t *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->u.generic_ip;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static struct file_operations blk_dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_dropped_open,
+	.read =		blk_dropped_read,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start		= blk_subbuf_start_callback,
+	.create_buf_file	= blk_create_buf_file_callback,
+	.remove_buf_file	= blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
+			   char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	char b[BDEVNAME_SIZE];
+	int ret, i;
+
+	if (copy_from_user(&buts, arg, sizeof(buts)))
+		return -EFAULT;
+
+	if (!buts.buf_size || !buts.buf_nr)
+		return -EINVAL;
+
+	strcpy(buts.name, bdevname(bdev, b));
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts.name); i++)
+		if (buts.name[i] == '/')
+			buts.name[i] = '_';
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	ret = -ENOENT;
+	dir = blk_create_tree(buts.name);
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = bdev->bd_dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
+	if (!bt->rchan)
+		goto err;
+	bt->rchan->private_data = bt;
+
+	bt->act_mask = buts.act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts.start_lba;
+	bt->end_lba = buts.end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts.pid;
+	bt->trace_state = Blktrace_setup;
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+err:
+	if (dir)
+		blk_remove_tree(dir);
+	if (bt) {
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		if (bt->sequence)
+			free_percpu(bt->sequence);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+static int blk_trace_startstop(request_queue_t *q, int start)
+{
+	struct blk_trace *bt;
+	int ret;
+
+	if ((bt = q->blk_trace) == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev:	the block device
+ * @cmd: 	the ioctl cmd
+ * @arg:	the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	request_queue_t *q;
+	int ret, start = 0;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		ret = blk_trace_setup(q, bdev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q:    the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(request_queue_t *q)
+{
+	blk_trace_startstop(q, 0);
+	blk_trace_remove(q);
+}
+
+/*
+ * Average offset over two calls to sched_clock() with a gettimeofday()
+ * in the middle
+ */
+static void blk_check_time(unsigned long long *t)
+{
+	unsigned long long a, b;
+	struct timeval tv;
+
+	a = sched_clock();
+	do_gettimeofday(&tv);
+	b = sched_clock();
+
+	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+	*t -= (a + b) / 2;
+}
+
+static void blk_trace_check_cpu_time(void *data)
+{
+	unsigned long long *t;
+	int cpu = get_cpu();
+
+	t = &per_cpu(blk_trace_cpu_offset, cpu);
+
+	/*
+	 * Just call it twice, hopefully the second call will be cache hot
+	 * and a little more precise
+	 */
+	blk_check_time(t);
+	blk_check_time(t);
+
+	put_cpu();
+}
+
+/*
+ * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
+ * timings
+ */
+static void blk_trace_calibrate_offsets(void)
+{
+	unsigned long flags;
+
+	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
+	local_irq_save(flags);
+	blk_trace_check_cpu_time(NULL);
+	local_irq_restore(flags);
+}
+
+static void blk_trace_set_ht_offsets(void)
+{
+#if defined(CONFIG_SCHED_SMT)
+	int cpu, i;
+
+	/*
+	 * now make sure HT siblings have the same time offset
+	 */
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		unsigned long long *cpu_off, *sibling_off;
+
+		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
+			if (i == cpu)
+				continue;
+
+			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
+			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
+			*sibling_off = *cpu_off;
+		}
+	}
+	preempt_enable();
+#endif
+}
+
+static __init int blk_trace_init(void)
+{
+	mutex_init(&blk_tree_mutex);
+	blk_trace_calibrate_offsets();
+	blk_trace_set_ht_offsets();
+
+	return 0;
+}
+
+module_init(blk_trace_init);
+
diff --git a/block/elevator.c b/block/elevator.c
index db3d0d8296a0..5e558c4689a4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/blktrace_api.h>
 
 #include <asm/uaccess.h>
 
@@ -333,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
 	struct list_head *pos;
 	unsigned ordseq;
 
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+
 	rq->q = q;
 
 	switch (where) {
@@ -499,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->flags |= REQ_STARTED;
+			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
diff --git a/block/ioctl.c b/block/ioctl.c
index 35fdb7dc6512..9cfa2e1ecb24 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	case BLKTRACESTART:
+	case BLKTRACESTOP:
+	case BLKTRACESETUP:
+	case BLKTRACETEARDOWN:
+		return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
 	}
 	return -ENOIOCTLCMD;
 }
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 6c793b196aa9..062067fa7ead 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/blktrace_api.h>
 
 /*
  * for max sense size
@@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
 	/*
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
-	if (q->unplug_fn)
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+					q->rq.count[READ] + q->rq.count[WRITE]);
+
 		q->unplug_fn(q);
+	}
 }
 
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	q->unplug_fn(q);
 }
 
@@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
 {
 	request_queue_t *q = (request_queue_t *)data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	kblockd_schedule_work(&q->unplug_work);
 }
 
@@ -1753,6 +1766,9 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	if (q->blk_trace)
+		blk_trace_shutdown(q);
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2129,6 +2145,8 @@ rq_starved:
 	
 	rq_init(q, rq);
 	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
 	return rq;
 }
@@ -2157,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
 		if (!rq) {
 			struct io_context *ioc;
 
+			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
 			io_schedule();
@@ -2210,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
@@ -2844,6 +2866,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->back_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2859,6 +2883,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->front_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
 			bio->bi_next = req->bio;
 			req->bio = bio;
 
@@ -2976,6 +3002,7 @@ void generic_make_request(struct bio *bio)
 	request_queue_t *q;
 	sector_t maxsector;
 	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
 
 	might_sleep();
 	/* Test device or partition size, when known. */
@@ -3002,6 +3029,8 @@ void generic_make_request(struct bio *bio)
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
+	maxsector = -1;
+	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
 
@@ -3034,6 +3063,15 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
@@ -3153,6 +3191,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
 	 */
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e29b8926f80e..1f2890989b56 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -38,6 +38,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2331,6 +2332,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 
 	cmd->rq->completion_data = cmd;
 	cmd->rq->errors = status;
+	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
 	blk_complete_request(cmd->rq);
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 26b08ee425c7..8c82373f7ff3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -17,6 +17,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
 
@@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io, int error)
 			/* nudge anyone waiting on suspend queue */
 			wake_up(&io->md->wait);
 
+		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+
 		bio_endio(io->bio, io->bio->bi_size, io->error);
 		free_io(io->md, io);
 	}
@@ -392,6 +395,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		      struct target_io *tio)
 {
 	int r;
+	sector_t sector;
 
 	/*
 	 * Sanity checks.
@@ -407,10 +411,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	 * this io.
 	 */
 	atomic_inc(&tio->io->io_count);
+	sector = clone->bi_sector;
 	r = ti->type->map(ti, clone, &tio->info);
-	if (r > 0)
+	if (r > 0) {
 		/* the bio has been remapped so dispatch it */
+
+		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 
+				    tio->io->bio->bi_bdev->bd_dev, sector, 
+				    clone->bi_sector);
+
 		generic_make_request(clone);
+	}
 
 	else if (r < 0) {
 		/* error the io and bail out */
diff --git a/fs/bio.c b/fs/bio.c
index 8f1d2e815c96..0a8c59cb68f5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
@@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	if (!bp)
 		return bp;
 
+	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+				bi->bi_sector + first_sectors);
+
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c666769a875d..7c031f00fd79 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -72,6 +72,7 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
+#include <linux/blktrace_api.h>
 
 #include <net/sock.h>          /* siocdevprivate_ioctl */
 #include <net/bluetooth/bluetooth.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56bb6a4e15f3..c179966f1a2f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,6 +22,7 @@ typedef struct request_queue request_queue_t;
 struct elevator_queue;
 typedef struct elevator_queue elevator_t;
 struct request_pm_state;
+struct blk_trace;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -416,6 +417,8 @@ struct request_queue
 	unsigned int		sg_reserved_size;
 	int			node;
 
+	struct blk_trace	*blk_trace;
+
 	/*
 	 * reserved for flush operations
 	 */
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
new file mode 100644
index 000000000000..b34d3e73d5ea
--- /dev/null
+++ b/include/linux/blktrace_api.h
@@ -0,0 +1,277 @@
+#ifndef BLKTRACE_H
+#define BLKTRACE_H
+
+#include <linux/config.h>
+#include <linux/blkdev.h>
+#include <linux/relay.h>
+
+/*
+ * Trace categories
+ */
+enum blktrace_cat {
+	BLK_TC_READ	= 1 << 0,	/* reads */
+	BLK_TC_WRITE	= 1 << 1,	/* writes */
+	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
+	BLK_TC_SYNC	= 1 << 3,	/* barrier */
+	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
+	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
+	BLK_TC_ISSUE	= 1 << 6,	/* issue */
+	BLK_TC_COMPLETE	= 1 << 7,	/* completions */
+	BLK_TC_FS	= 1 << 8,	/* fs requests */
+	BLK_TC_PC	= 1 << 9,	/* pc requests */
+	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
+
+	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
+};
+
+#define BLK_TC_SHIFT		(16)
+#define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
+
+/*
+ * Basic trace actions
+ */
+enum blktrace_act {
+	__BLK_TA_QUEUE = 1,		/* queued */
+	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
+	__BLK_TA_FRONTMERGE,		/* front merge to existing rq */
+	__BLK_TA_GETRQ,			/* allocated new request */
+	__BLK_TA_SLEEPRQ,		/* sleeping on rq allocation */
+	__BLK_TA_REQUEUE,		/* request requeued */
+	__BLK_TA_ISSUE,			/* sent to driver */
+	__BLK_TA_COMPLETE,		/* completed by driver */
+	__BLK_TA_PLUG,			/* queue was plugged */
+	__BLK_TA_UNPLUG_IO,		/* queue was unplugged by io */
+	__BLK_TA_UNPLUG_TIMER,		/* queue was unplugged by timer */
+	__BLK_TA_INSERT,		/* insert request */
+	__BLK_TA_SPLIT,			/* bio was split */
+	__BLK_TA_BOUNCE,		/* bio was bounced */
+	__BLK_TA_REMAP,			/* bio was remapped */
+};
+
+/*
+ * Trace actions in full. Additionally, read or write is masked
+ */
+#define BLK_TA_QUEUE		(__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_BACKMERGE	(__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_FRONTMERGE	(__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_GETRQ		(__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_SLEEPRQ		(__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_REQUEUE		(__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
+#define BLK_TA_ISSUE		(__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
+#define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
+#define BLK_TA_PLUG		(__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_IO	(__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_TIMER	(__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_INSERT		(__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
+#define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
+#define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
+
+#define BLK_IO_TRACE_MAGIC	0x65617400
+#define BLK_IO_TRACE_VERSION	0x07
+
+/*
+ * The trace itself
+ */
+struct blk_io_trace {
+	u32 magic;		/* MAGIC << 8 | version */
+	u32 sequence;		/* event number */
+	u64 time;		/* in microseconds */
+	u64 sector;		/* disk offset */
+	u32 bytes;		/* transfer length */
+	u32 action;		/* what happened */
+	u32 pid;		/* who did it */
+	u32 device;		/* device number */
+	u32 cpu;		/* on what cpu did it happen */
+	u16 error;		/* completion error */
+	u16 pdu_len;		/* length of data after this trace */
+};
+
+/*
+ * The remap event
+ */
+struct blk_io_trace_remap {
+	u32 device;
+	u32 __pad;
+	u64 sector;
+};
+
+enum {
+	Blktrace_setup = 1,
+	Blktrace_running,
+	Blktrace_stopped,
+};
+
+struct blk_trace {
+	int trace_state;
+	struct rchan *rchan;
+	unsigned long *sequence;
+	u16 act_mask;
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+	u32 dev;
+	struct dentry *dir;
+	struct dentry *dropped_file;
+	atomic_t dropped;
+};
+
+/*
+ * User setup structure passed with BLKTRACESTART
+ */
+struct blk_user_trace_setup {
+	char name[BDEVNAME_SIZE];	/* output */
+	u16 act_mask;			/* input */
+	u32 buf_size;			/* input */
+	u32 buf_nr;			/* input */
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+};
+
+#if defined(CONFIG_BLK_DEV_IO_TRACE)
+extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
+extern void blk_trace_shutdown(request_queue_t *);
+extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->flags & 0x07;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ * @what:	the action
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static inline void blk_add_trace_generic(struct request_queue *q,
+					 struct bio *bio, int rw, u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(q, bio, what);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * @q:		queue the io is for
+ * @what:	the action
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
+					 struct bio *bio, unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+	u64 rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#else /* !CONFIG_BLK_DEV_IO_TRACE */
+#define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
+#define blk_trace_shutdown(q)			do { } while (0)
+#define blk_add_trace_rq(q, rq, what)		do { } while (0)
+#define blk_add_trace_bio(q, rq, what)		do { } while (0)
+#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
+#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
+#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#endif
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index ae7dfb790df3..efb518f16bb3 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART)
 COMPATIBLE_IOCTL(BLKFLSBUF)
 COMPATIBLE_IOCTL(BLKSECTSET)
 COMPATIBLE_IOCTL(BLKSSZGET)
+COMPATIBLE_IOCTL(BLKTRACESTART)
+COMPATIBLE_IOCTL(BLKTRACESTOP)
+COMPATIBLE_IOCTL(BLKTRACESETUP)
+COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
 ULONG_IOCTL(BLKRASET)
 ULONG_IOCTL(BLKFRASET)
 /* RAID */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f9c9dea636d0..9b34a1b03455 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -197,6 +197,10 @@ extern int dir_notify_enable;
 #define BLKBSZGET  _IOR(0x12,112,size_t)
 #define BLKBSZSET  _IOW(0x12,113,size_t)
 #define BLKGETSIZE64 _IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
+#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
+#define BLKTRACESTART _IO(0x12,116)
+#define BLKTRACESTOP _IO(0x12,117)
+#define BLKTRACETEARDOWN _IO(0x12,118)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62e6314382f0..e60a91d5b369 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -706,6 +706,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned short ioprio;
+	unsigned int btrace_seq;
 
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
diff --git a/kernel/fork.c b/kernel/fork.c
index c79ae0b19a49..c21bae8c93b9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+	tsk->btrace_seq = 0;
 	return tsk;
 }
 
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa7..d0ea1eec6a9a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 static mempool_t *page_pool, *isa_page_pool;
@@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * slow path
 	 */
-- 
cgit v1.2.3


From 42d3b83fecb7b576d477244347982baa02fa4f44 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Mon, 9 Jan 2006 23:22:24 +0100
Subject: [PATCH] hwmon: Allow sensor attributes arrays

This patch refactors SENSOR_DEVICE_ATTR macro.  First it creates a new
macro SENSOR_ATTR() which expands to an initialization expression, then
it uses that in SENSOR_DEVICE_ATTR, which declares and initializes a
struct sensor_device_attribute.

IOW, SENSOR_ATTR() imitates __ATTR() in include/linux/device.h.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/hwmon-sysfs.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hwmon-sysfs.h b/include/linux/hwmon-sysfs.h
index 7eb4004b3601..dedca6a516b4 100644
--- a/include/linux/hwmon-sysfs.h
+++ b/include/linux/hwmon-sysfs.h
@@ -27,11 +27,13 @@ struct sensor_device_attribute{
 #define to_sensor_dev_attr(_dev_attr) \
 	container_of(_dev_attr, struct sensor_device_attribute, dev_attr)
 
-#define SENSOR_DEVICE_ATTR(_name,_mode,_show,_store,_index)	\
-struct sensor_device_attribute sensor_dev_attr_##_name = {	\
-	.dev_attr =	__ATTR(_name,_mode,_show,_store),	\
-	.index =	_index,					\
-}
+#define SENSOR_ATTR(_name, _mode, _show, _store, _index)	\
+	{ .dev_attr = __ATTR(_name, _mode, _show, _store),	\
+	  .index = _index }
+
+#define SENSOR_DEVICE_ATTR(_name, _mode, _show, _store, _index)	\
+struct sensor_device_attribute sensor_dev_attr_##_name		\
+	= SENSOR_ATTR(_name, _mode, _show, _store, _index)
 
 struct sensor_device_attribute_2 {
 	struct device_attribute dev_attr;
-- 
cgit v1.2.3


From 70adca5a9f506942ec88fb2d9d6e71ef271888b7 Mon Sep 17 00:00:00 2001
From: Jim Cromie <jim.cromie@gmail.com>
Date: Wed, 18 Jan 2006 23:10:49 +0100
Subject: [PATCH] hwmon: Refactor SENSOR_DEVICE_ATTR_2

This patch refactors SENSOR_DEVICE_ATTR_2 macro, following pattern set by
SENSOR_ATTR.  First it creates a new macro SENSOR_ATTR_2() which expands
to an initialization expression, then it uses that in SENSOR_DEVICE_ATTR_2,
which declares and initializes a struct sensor_device_attribute_2.

Signed-off-by: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/hwmon-sysfs.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hwmon-sysfs.h b/include/linux/hwmon-sysfs.h
index dedca6a516b4..a90c09d331c1 100644
--- a/include/linux/hwmon-sysfs.h
+++ b/include/linux/hwmon-sysfs.h
@@ -43,11 +43,13 @@ struct sensor_device_attribute_2 {
 #define to_sensor_dev_attr_2(_dev_attr) \
 	container_of(_dev_attr, struct sensor_device_attribute_2, dev_attr)
 
+#define SENSOR_ATTR_2(_name, _mode, _show, _store, _nr, _index)	\
+	{ .dev_attr = __ATTR(_name, _mode, _show, _store),	\
+	  .index = _index,					\
+	  .nr = _nr }
+
 #define SENSOR_DEVICE_ATTR_2(_name,_mode,_show,_store,_nr,_index)	\
-struct sensor_device_attribute_2 sensor_dev_attr_##_name = {	\
-	.dev_attr =	__ATTR(_name,_mode,_show,_store),	\
-	.index =	_index,					\
-	.nr =		_nr,					\
-}
+struct sensor_device_attribute_2 sensor_dev_attr_##_name		\
+	= SENSOR_ATTR_2(_name, _mode, _show, _store, _nr, _index)
 
 #endif /* _LINUX_HWMON_SYSFS_H */
-- 
cgit v1.2.3


From 5c085d369c2c4f18942ec8951466e186366d5c78 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Jan 2006 23:16:04 +0100
Subject: [PATCH] i2c: Semaphore to mutex conversions, part 2

semaphore to mutex conversion.

the conversion was generated via scripts, and the result was validated
automatically via a script as well.

build tested.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/busses/i2c-amd756-s4882.c | 13 +++++-----
 drivers/i2c/busses/i2c-isa.c          |  2 +-
 drivers/i2c/chips/eeprom.c            |  9 +++----
 drivers/i2c/chips/max6875.c           | 10 ++++----
 drivers/i2c/chips/pcf8591.c           | 13 +++++-----
 drivers/i2c/chips/tps65010.c          | 45 ++++++++++++++++++-----------------
 drivers/i2c/i2c-core.c                | 34 +++++++++++++-------------
 include/linux/i2c.h                   |  6 ++---
 8 files changed, 68 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-amd756-s4882.c b/drivers/i2c/busses/i2c-amd756-s4882.c
index 56c7d987590f..f7b4cb4e9c75 100644
--- a/drivers/i2c/busses/i2c-amd756-s4882.c
+++ b/drivers/i2c/busses/i2c-amd756-s4882.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/i2c.h>
+#include <linux/mutex.h>
 
 extern struct i2c_adapter amd756_smbus;
 
@@ -45,7 +46,7 @@ static struct i2c_adapter *s4882_adapter;
 static struct i2c_algorithm *s4882_algo;
 
 /* Wrapper access functions for multiplexed SMBus */
-static struct semaphore amd756_lock;
+static struct mutex amd756_lock;
 
 static s32 amd756_access_virt0(struct i2c_adapter * adap, u16 addr,
 			       unsigned short flags, char read_write,
@@ -59,12 +60,12 @@ static s32 amd756_access_virt0(struct i2c_adapter * adap, u16 addr,
 	 || addr == 0x18)
 		return -1;
 
-	down(&amd756_lock);
+	mutex_lock(&amd756_lock);
 
 	error = amd756_smbus.algo->smbus_xfer(adap, addr, flags, read_write,
 					      command, size, data);
 
-	up(&amd756_lock);
+	mutex_unlock(&amd756_lock);
 
 	return error;
 }
@@ -87,7 +88,7 @@ static inline s32 amd756_access_channel(struct i2c_adapter * adap, u16 addr,
 	if (addr != 0x4c && (addr & 0xfc) != 0x50 && (addr & 0xfc) != 0x30)
 		return -1;
 
-	down(&amd756_lock);
+	mutex_lock(&amd756_lock);
 
 	if (last_channels != channels) {
 		union i2c_smbus_data mplxdata;
@@ -105,7 +106,7 @@ static inline s32 amd756_access_channel(struct i2c_adapter * adap, u16 addr,
 					      command, size, data);
 
 UNLOCK:
-	up(&amd756_lock);
+	mutex_unlock(&amd756_lock);
 	return error;
 }
 
@@ -166,7 +167,7 @@ static int __init amd756_s4882_init(void)
 	}
 
 	printk(KERN_INFO "Enabling SMBus multiplexing for Tyan S4882\n");
-	init_MUTEX(&amd756_lock);
+	mutex_init(&amd756_lock);
 
 	/* Define the 5 virtual adapters and algorithms structures */
 	if (!(s4882_adapter = kzalloc(5 * sizeof(struct i2c_adapter),
diff --git a/drivers/i2c/busses/i2c-isa.c b/drivers/i2c/busses/i2c-isa.c
index 4344ae6b1fcb..c3e1d3e888d7 100644
--- a/drivers/i2c/busses/i2c-isa.c
+++ b/drivers/i2c/busses/i2c-isa.c
@@ -125,7 +125,7 @@ int i2c_isa_del_driver(struct i2c_driver *driver)
 
 static int __init i2c_isa_init(void)
 {
-	init_MUTEX(&isa_adapter.clist_lock);
+	mutex_init(&isa_adapter.clist_lock);
 	INIT_LIST_HEAD(&isa_adapter.clients);
 
 	isa_adapter.nr = ANY_I2C_ISA_BUS;
diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c
index 41116b7947f6..13c108269a6d 100644
--- a/drivers/i2c/chips/eeprom.c
+++ b/drivers/i2c/chips/eeprom.c
@@ -33,6 +33,7 @@
 #include <linux/sched.h>
 #include <linux/jiffies.h>
 #include <linux/i2c.h>
+#include <linux/mutex.h>
 
 /* Addresses to scan */
 static unsigned short normal_i2c[] = { 0x50, 0x51, 0x52, 0x53, 0x54,
@@ -54,7 +55,7 @@ enum eeprom_nature {
 /* Each client has this additional data */
 struct eeprom_data {
 	struct i2c_client client;
-	struct semaphore update_lock;
+	struct mutex update_lock;
 	u8 valid;			/* bitfield, bit!=0 if slice is valid */
 	unsigned long last_updated[8];	/* In jiffies, 8 slices */
 	u8 data[EEPROM_SIZE];		/* Register values */
@@ -81,7 +82,7 @@ static void eeprom_update_client(struct i2c_client *client, u8 slice)
 	struct eeprom_data *data = i2c_get_clientdata(client);
 	int i, j;
 
-	down(&data->update_lock);
+	mutex_lock(&data->update_lock);
 
 	if (!(data->valid & (1 << slice)) ||
 	    time_after(jiffies, data->last_updated[slice] + 300 * HZ)) {
@@ -107,7 +108,7 @@ static void eeprom_update_client(struct i2c_client *client, u8 slice)
 		data->valid |= (1 << slice);
 	}
 exit:
-	up(&data->update_lock);
+	mutex_unlock(&data->update_lock);
 }
 
 static ssize_t eeprom_read(struct kobject *kobj, char *buf, loff_t off, size_t count)
@@ -187,7 +188,7 @@ static int eeprom_detect(struct i2c_adapter *adapter, int address, int kind)
 	/* Fill in the remaining client fields */
 	strlcpy(new_client->name, "eeprom", I2C_NAME_SIZE);
 	data->valid = 0;
-	init_MUTEX(&data->update_lock);
+	mutex_init(&data->update_lock);
 	data->nature = UNKNOWN;
 
 	/* Tell the I2C layer a new client has arrived */
diff --git a/drivers/i2c/chips/max6875.c b/drivers/i2c/chips/max6875.c
index 6d3ff584155e..88d2ddee4490 100644
--- a/drivers/i2c/chips/max6875.c
+++ b/drivers/i2c/chips/max6875.c
@@ -31,7 +31,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 /* Do not scan - the MAX6875 access method will write to some EEPROM chips */
 static unsigned short normal_i2c[] = {I2C_CLIENT_END};
@@ -54,7 +54,7 @@ I2C_CLIENT_INSMOD_1(max6875);
 /* Each client has this additional data */
 struct max6875_data {
 	struct i2c_client	client;
-	struct semaphore	update_lock;
+	struct mutex		update_lock;
 
 	u32			valid;
 	u8			data[USER_EEPROM_SIZE];
@@ -83,7 +83,7 @@ static void max6875_update_slice(struct i2c_client *client, int slice)
 	if (slice >= USER_EEPROM_SLICES)
 		return;
 
-	down(&data->update_lock);
+	mutex_lock(&data->update_lock);
 
 	buf = &data->data[slice << SLICE_BITS];
 
@@ -122,7 +122,7 @@ static void max6875_update_slice(struct i2c_client *client, int slice)
 		data->valid |= (1 << slice);
 	}
 exit_up:
-	up(&data->update_lock);
+	mutex_unlock(&data->update_lock);
 }
 
 static ssize_t max6875_read(struct kobject *kobj, char *buf, loff_t off,
@@ -196,7 +196,7 @@ static int max6875_detect(struct i2c_adapter *adapter, int address, int kind)
 	real_client->driver = &max6875_driver;
 	real_client->flags = 0;
 	strlcpy(real_client->name, "max6875", I2C_NAME_SIZE);
-	init_MUTEX(&data->update_lock);
+	mutex_init(&data->update_lock);
 
 	/* Init fake client data */
 	/* set the client data to the i2c_client so that it will get freed */
diff --git a/drivers/i2c/chips/pcf8591.c b/drivers/i2c/chips/pcf8591.c
index 36cff09c678d..925a6b371fd2 100644
--- a/drivers/i2c/chips/pcf8591.c
+++ b/drivers/i2c/chips/pcf8591.c
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
+#include <linux/mutex.h>
 
 /* Addresses to scan */
 static unsigned short normal_i2c[] = { 0x48, 0x49, 0x4a, 0x4b, 0x4c,
@@ -74,7 +75,7 @@ MODULE_PARM_DESC(input_mode,
 
 struct pcf8591_data {
 	struct i2c_client client;
-	struct semaphore update_lock;
+	struct mutex update_lock;
 
 	u8 control;
 	u8 aout;
@@ -144,13 +145,13 @@ static ssize_t set_out0_enable(struct device *dev, struct device_attribute *attr
 	struct pcf8591_data *data = i2c_get_clientdata(client);
 	unsigned long val = simple_strtoul(buf, NULL, 10);
 
-	down(&data->update_lock);
+	mutex_lock(&data->update_lock);
 	if (val)
 		data->control |= PCF8591_CONTROL_AOEF;
 	else
 		data->control &= ~PCF8591_CONTROL_AOEF;
 	i2c_smbus_write_byte(client, data->control);
-	up(&data->update_lock);
+	mutex_unlock(&data->update_lock);
 	return count;
 }
 
@@ -200,7 +201,7 @@ static int pcf8591_detect(struct i2c_adapter *adapter, int address, int kind)
 	/* Fill in the remaining client fields and put it into the global 
 	   list */
 	strlcpy(new_client->name, "pcf8591", I2C_NAME_SIZE);
-	init_MUTEX(&data->update_lock);
+	mutex_init(&data->update_lock);
 
 	/* Tell the I2C layer a new client has arrived */
 	if ((err = i2c_attach_client(new_client)))
@@ -265,7 +266,7 @@ static int pcf8591_read_channel(struct device *dev, int channel)
 	struct i2c_client *client = to_i2c_client(dev);
 	struct pcf8591_data *data = i2c_get_clientdata(client);
 
-	down(&data->update_lock);
+	mutex_lock(&data->update_lock);
 
 	if ((data->control & PCF8591_CONTROL_AICH_MASK) != channel) {
 		data->control = (data->control & ~PCF8591_CONTROL_AICH_MASK)
@@ -278,7 +279,7 @@ static int pcf8591_read_channel(struct device *dev, int channel)
 	}
 	value = i2c_smbus_read_byte(client);
 
-	up(&data->update_lock);
+	mutex_unlock(&data->update_lock);
 
 	if ((channel == 2 && input_mode == 2) ||
 	    (channel != 3 && (input_mode == 1 || input_mode == 3)))
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index 1af3dfbb8086..179b1e022d80 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -32,6 +32,7 @@
 #include <linux/suspend.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/mutex.h>
 
 #include <asm/irq.h>
 #include <asm/mach-types.h>
@@ -81,7 +82,7 @@ enum tps_model {
 
 struct tps65010 {
 	struct i2c_client	client;
-	struct semaphore	lock;
+	struct mutex		lock;
 	int			irq;
 	struct work_struct	work;
 	struct dentry		*file;
@@ -218,7 +219,7 @@ static int dbg_show(struct seq_file *s, void *_)
 	seq_printf(s, "driver  %s\nversion %s\nchip    %s\n\n",
 			DRIVER_NAME, DRIVER_VERSION, chip);
 
-	down(&tps->lock);
+	mutex_lock(&tps->lock);
 
 	/* FIXME how can we tell whether a battery is present?
 	 * likely involves a charge gauging chip (like BQ26501).
@@ -300,7 +301,7 @@ static int dbg_show(struct seq_file *s, void *_)
 				(v2 & (1 << (4 + i))) ? "rising" : "falling");
 	}
 
-	up(&tps->lock);
+	mutex_unlock(&tps->lock);
 	return 0;
 }
 
@@ -416,7 +417,7 @@ static void tps65010_work(void *_tps)
 {
 	struct tps65010		*tps = _tps;
 
-	down(&tps->lock);
+	mutex_lock(&tps->lock);
 
 	tps65010_interrupt(tps);
 
@@ -444,7 +445,7 @@ static void tps65010_work(void *_tps)
 	if (test_and_clear_bit(FLAG_IRQ_ENABLE, &tps->flags))
 		enable_irq(tps->irq);
 
-	up(&tps->lock);
+	mutex_unlock(&tps->lock);
 }
 
 static irqreturn_t tps65010_irq(int irq, void *_tps, struct pt_regs *regs)
@@ -505,7 +506,7 @@ tps65010_probe(struct i2c_adapter *bus, int address, int kind)
 	if (!tps)
 		return 0;
 
-	init_MUTEX(&tps->lock);
+	mutex_init(&tps->lock);
 	INIT_WORK(&tps->work, tps65010_work, tps);
 	tps->irq = -1;
 	tps->client.addr = address;
@@ -695,7 +696,7 @@ int tps65010_set_gpio_out_value(unsigned gpio, unsigned value)
 	if ((gpio < GPIO1) || (gpio > GPIO4))
 		return -EINVAL;
 
-	down(&the_tps->lock);
+	mutex_lock(&the_tps->lock);
 
 	defgpio = i2c_smbus_read_byte_data(&the_tps->client, TPS_DEFGPIO);
 
@@ -720,7 +721,7 @@ int tps65010_set_gpio_out_value(unsigned gpio, unsigned value)
 		gpio, value ? "high" : "low",
 		i2c_smbus_read_byte_data(&the_tps->client, TPS_DEFGPIO));
 
-	up(&the_tps->lock);
+	mutex_unlock(&the_tps->lock);
 	return status;
 }
 EXPORT_SYMBOL(tps65010_set_gpio_out_value);
@@ -745,7 +746,7 @@ int tps65010_set_led(unsigned led, unsigned mode)
 		led = LED2;
 	}
 
-	down(&the_tps->lock);
+	mutex_lock(&the_tps->lock);
 
 	pr_debug("%s: led%i_on   0x%02x\n", DRIVER_NAME, led,
 		i2c_smbus_read_byte_data(&the_tps->client,
@@ -771,7 +772,7 @@ int tps65010_set_led(unsigned led, unsigned mode)
 	default:
 		printk(KERN_ERR "%s: Wrong mode parameter for set_led()\n",
 		       DRIVER_NAME);
-		up(&the_tps->lock);
+		mutex_unlock(&the_tps->lock);
 		return -EINVAL;
 	}
 
@@ -781,7 +782,7 @@ int tps65010_set_led(unsigned led, unsigned mode)
 	if (status != 0) {
 		printk(KERN_ERR "%s: Failed to write led%i_on register\n",
 		       DRIVER_NAME, led);
-		up(&the_tps->lock);
+		mutex_unlock(&the_tps->lock);
 		return status;
 	}
 
@@ -794,7 +795,7 @@ int tps65010_set_led(unsigned led, unsigned mode)
 	if (status != 0) {
 		printk(KERN_ERR "%s: Failed to write led%i_per register\n",
 		       DRIVER_NAME, led);
-		up(&the_tps->lock);
+		mutex_unlock(&the_tps->lock);
 		return status;
 	}
 
@@ -802,7 +803,7 @@ int tps65010_set_led(unsigned led, unsigned mode)
 		i2c_smbus_read_byte_data(&the_tps->client,
 				TPS_LED1_PER + offs));
 
-	up(&the_tps->lock);
+	mutex_unlock(&the_tps->lock);
 
 	return status;
 }
@@ -820,7 +821,7 @@ int tps65010_set_vib(unsigned value)
 	if (!the_tps)
 		return -ENODEV;
 
-	down(&the_tps->lock);
+	mutex_lock(&the_tps->lock);
 
 	vdcdc2 = i2c_smbus_read_byte_data(&the_tps->client, TPS_VDCDC2);
 	vdcdc2 &= ~(1 << 1);
@@ -831,7 +832,7 @@ int tps65010_set_vib(unsigned value)
 
 	pr_debug("%s: vibrator %s\n", DRIVER_NAME, value ? "on" : "off");
 
-	up(&the_tps->lock);
+	mutex_unlock(&the_tps->lock);
 	return status;
 }
 EXPORT_SYMBOL(tps65010_set_vib);
@@ -848,7 +849,7 @@ int tps65010_set_low_pwr(unsigned mode)
 	if (!the_tps)
 		return -ENODEV;
 
-	down(&the_tps->lock);
+	mutex_lock(&the_tps->lock);
 
 	pr_debug("%s: %s low_pwr, vdcdc1 0x%02x\n", DRIVER_NAME,
 		mode ? "enable" : "disable",
@@ -876,7 +877,7 @@ int tps65010_set_low_pwr(unsigned mode)
 		pr_debug("%s: vdcdc1 0x%02x\n", DRIVER_NAME,
 			i2c_smbus_read_byte_data(&the_tps->client, TPS_VDCDC1));
 
-	up(&the_tps->lock);
+	mutex_unlock(&the_tps->lock);
 
 	return status;
 }
@@ -894,7 +895,7 @@ int tps65010_config_vregs1(unsigned value)
 	if (!the_tps)
 		return -ENODEV;
 
-	down(&the_tps->lock);
+	mutex_lock(&the_tps->lock);
 
 	pr_debug("%s: vregs1 0x%02x\n", DRIVER_NAME,
 			i2c_smbus_read_byte_data(&the_tps->client, TPS_VREGS1));
@@ -909,7 +910,7 @@ int tps65010_config_vregs1(unsigned value)
 		pr_debug("%s: vregs1 0x%02x\n", DRIVER_NAME,
 			i2c_smbus_read_byte_data(&the_tps->client, TPS_VREGS1));
 
-	up(&the_tps->lock);
+	mutex_unlock(&the_tps->lock);
 
 	return status;
 }
@@ -931,7 +932,7 @@ int tps65013_set_low_pwr(unsigned mode)
 	if (!the_tps || the_tps->por)
 		return -ENODEV;
 
-	down(&the_tps->lock);
+	mutex_lock(&the_tps->lock);
 
 	pr_debug("%s: %s low_pwr, chgconfig 0x%02x vdcdc1 0x%02x\n",
 		DRIVER_NAME,
@@ -959,7 +960,7 @@ int tps65013_set_low_pwr(unsigned mode)
 	if (status != 0) {
 		printk(KERN_ERR "%s: Failed to write chconfig register\n",
 	 DRIVER_NAME);
-		up(&the_tps->lock);
+		mutex_unlock(&the_tps->lock);
 		return status;
 	}
 
@@ -977,7 +978,7 @@ int tps65013_set_low_pwr(unsigned mode)
 		pr_debug("%s: vdcdc1 0x%02x\n", DRIVER_NAME,
 			i2c_smbus_read_byte_data(&the_tps->client, TPS_VDCDC1));
 
-	up(&the_tps->lock);
+	mutex_unlock(&the_tps->lock);
 
 	return status;
 }
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 2b0c555aa011..975cb647da10 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -169,8 +169,8 @@ int i2c_add_adapter(struct i2c_adapter *adap)
 	}
 
 	adap->nr =  id & MAX_ID_MASK;
-	init_MUTEX(&adap->bus_lock);
-	init_MUTEX(&adap->clist_lock);
+	mutex_init(&adap->bus_lock);
+	mutex_init(&adap->clist_lock);
 	list_add_tail(&adap->list,&adapters);
 	INIT_LIST_HEAD(&adap->clients);
 
@@ -385,9 +385,9 @@ int i2c_check_addr(struct i2c_adapter *adapter, int addr)
 {
 	int rval;
 
-	down(&adapter->clist_lock);
+	mutex_lock(&adapter->clist_lock);
 	rval = __i2c_check_addr(adapter, addr);
-	up(&adapter->clist_lock);
+	mutex_unlock(&adapter->clist_lock);
 
 	return rval;
 }
@@ -396,13 +396,13 @@ int i2c_attach_client(struct i2c_client *client)
 {
 	struct i2c_adapter *adapter = client->adapter;
 
-	down(&adapter->clist_lock);
+	mutex_lock(&adapter->clist_lock);
 	if (__i2c_check_addr(client->adapter, client->addr)) {
-		up(&adapter->clist_lock);
+		mutex_unlock(&adapter->clist_lock);
 		return -EBUSY;
 	}
 	list_add_tail(&client->list,&adapter->clients);
-	up(&adapter->clist_lock);
+	mutex_unlock(&adapter->clist_lock);
 	
 	if (adapter->client_register)  {
 		if (adapter->client_register(client))  {
@@ -451,12 +451,12 @@ int i2c_detach_client(struct i2c_client *client)
 		}
 	}
 
-	down(&adapter->clist_lock);
+	mutex_lock(&adapter->clist_lock);
 	list_del(&client->list);
 	init_completion(&client->released);
 	device_remove_file(&client->dev, &dev_attr_client_name);
 	device_unregister(&client->dev);
-	up(&adapter->clist_lock);
+	mutex_unlock(&adapter->clist_lock);
 	wait_for_completion(&client->released);
 
  out:
@@ -514,19 +514,19 @@ void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg)
 	struct list_head  *item;
 	struct i2c_client *client;
 
-	down(&adap->clist_lock);
+	mutex_lock(&adap->clist_lock);
 	list_for_each(item,&adap->clients) {
 		client = list_entry(item, struct i2c_client, list);
 		if (!try_module_get(client->driver->driver.owner))
 			continue;
 		if (NULL != client->driver->command) {
-			up(&adap->clist_lock);
+			mutex_unlock(&adap->clist_lock);
 			client->driver->command(client,cmd,arg);
-			down(&adap->clist_lock);
+			mutex_lock(&adap->clist_lock);
 		}
 		module_put(client->driver->driver.owner);
        }
-       up(&adap->clist_lock);
+       mutex_unlock(&adap->clist_lock);
 }
 
 static int __init i2c_init(void)
@@ -570,9 +570,9 @@ int i2c_transfer(struct i2c_adapter * adap, struct i2c_msg *msgs, int num)
 		}
 #endif
 
-		down(&adap->bus_lock);
+		mutex_lock(&adap->bus_lock);
 		ret = adap->algo->master_xfer(adap,msgs,num);
-		up(&adap->bus_lock);
+		mutex_unlock(&adap->bus_lock);
 
 		return ret;
 	} else {
@@ -1116,10 +1116,10 @@ s32 i2c_smbus_xfer(struct i2c_adapter * adapter, u16 addr, unsigned short flags,
 	flags &= I2C_M_TEN | I2C_CLIENT_PEC;
 
 	if (adapter->algo->smbus_xfer) {
-		down(&adapter->bus_lock);
+		mutex_lock(&adapter->bus_lock);
 		res = adapter->algo->smbus_xfer(adapter,addr,flags,read_write,
 		                                command,size,data);
-		up(&adapter->bus_lock);
+		mutex_unlock(&adapter->bus_lock);
 	} else
 		res = i2c_smbus_xfer_emulated(adapter,addr,flags,read_write,
 	                                      command,size,data);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 63f1d63cc1d8..1635ee25918f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -32,7 +32,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
 #include <linux/sched.h>	/* for completion */
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 /* --- For i2c-isa ---------------------------------------------------- */
 
@@ -225,8 +225,8 @@ struct i2c_adapter {
 	int (*client_unregister)(struct i2c_client *);
 
 	/* data fields that are valid for all devices	*/
-	struct semaphore bus_lock;
-	struct semaphore clist_lock;
+	struct mutex bus_lock;
+	struct mutex clist_lock;
 
 	int timeout;
 	int retries;
-- 
cgit v1.2.3


From 2f27f46c49c096d86e7e63acd43cad3bc42f2383 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sun, 5 Feb 2006 23:29:18 +0100
Subject: [PATCH] i2c: Drop the i2c-frodo bus driver

Drop the i2c-frodo bus driver. It isn't referenced by the build
system, and depends on code which was never included in 2.6 kernels.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/busses/i2c-frodo.c | 85 ------------------------------------------
 include/linux/i2c-id.h         |  1 -
 2 files changed, 86 deletions(-)
 delete mode 100644 drivers/i2c/busses/i2c-frodo.c

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-frodo.c b/drivers/i2c/busses/i2c-frodo.c
deleted file mode 100644
index b6f52f5a4138..000000000000
--- a/drivers/i2c/busses/i2c-frodo.c
+++ /dev/null
@@ -1,85 +0,0 @@
-
-/*
- * linux/drivers/i2c/i2c-frodo.c
- *
- * Author: Abraham van der Merwe <abraham@2d3d.co.za>
- *
- * An I2C adapter driver for the 2d3D, Inc. StrongARM SA-1110
- * Development board (Frodo).
- *
- * This source code is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/i2c.h>
-#include <linux/i2c-algo-bit.h>
-#include <asm/hardware.h>
-
-
-static void frodo_setsda (void *data,int state)
-{
-	if (state)
-		FRODO_CPLD_I2C |= FRODO_I2C_SDA_OUT;
-	else
-		FRODO_CPLD_I2C &= ~FRODO_I2C_SDA_OUT;
-}
-
-static void frodo_setscl (void *data,int state)
-{
-	if (state)
-		FRODO_CPLD_I2C |= FRODO_I2C_SCL_OUT;
-	else
-		FRODO_CPLD_I2C &= ~FRODO_I2C_SCL_OUT;
-}
-
-static int frodo_getsda (void *data)
-{
-	return ((FRODO_CPLD_I2C & FRODO_I2C_SDA_IN) != 0);
-}
-
-static int frodo_getscl (void *data)
-{
-	return ((FRODO_CPLD_I2C & FRODO_I2C_SCL_IN) != 0);
-}
-
-static struct i2c_algo_bit_data bit_frodo_data = {
-	.setsda		= frodo_setsda,
-	.setscl		= frodo_setscl,
-	.getsda		= frodo_getsda,
-	.getscl		= frodo_getscl,
-	.udelay		= 80,
-	.mdelay		= 80,
-	.timeout	= HZ
-};
-
-static struct i2c_adapter frodo_ops = {
-	.owner			= THIS_MODULE,
-	.id			= I2C_HW_B_FRODO,
-	.algo_data		= &bit_frodo_data,
-	.dev			= {
-		.name		= "Frodo adapter driver",
-	},
-};
-
-static int __init i2c_frodo_init (void)
-{
-	return i2c_bit_add_bus(&frodo_ops);
-}
-
-static void __exit i2c_frodo_exit (void)
-{
-	i2c_bit_del_bus(&frodo_ops);
-}
-
-MODULE_AUTHOR ("Abraham van der Merwe <abraham@2d3d.co.za>");
-MODULE_DESCRIPTION ("I2C-Bus adapter routines for Frodo");
-MODULE_LICENSE ("GPL");
-
-module_init (i2c_frodo_init);
-module_exit (i2c_frodo_exit);
-
diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 474c8f4f5d4f..ec311bc89439 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -172,7 +172,6 @@
 #define I2C_HW_B_RIVA		0x010010 /* Riva based graphics cards */
 #define I2C_HW_B_IOC		0x010011 /* IOC bit-wiggling */
 #define I2C_HW_B_TSUNA		0x010012 /* DEC Tsunami chipset */
-#define I2C_HW_B_FRODO		0x010013 /* 2d3D SA-1110 Development Board */
 #define I2C_HW_B_OMAHA		0x010014 /* Omaha I2C interface (ARM) */
 #define I2C_HW_B_GUIDE		0x010015 /* Guide bit-basher */
 #define I2C_HW_B_IXP2000	0x010016 /* GPIO on IXP2000 systems */
-- 
cgit v1.2.3


From 5f7ea3c58c9aa571617a9d77dd2fbd4bd81cc50a Mon Sep 17 00:00:00 2001
From: Martin Devera <devik@cdi.cz>
Date: Mon, 27 Feb 2006 23:11:45 +0100
Subject: [PATCH] I2C: i2c-piix4: Add Broadcom HT-1000 support

Add Broadcom HT-1000 south bridge's PCI ID to i2c-piix driver. Note
that at least on Supermicro H8SSL it uses non-standard SMBHSTCFG = 3
and standard values like 0 or 9 causes hangup.

Signed-off-by: Martin Devera <devik@cdi.cz>
Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/i2c/busses/i2c-piix4 | 2 +-
 drivers/i2c/busses/Kconfig         | 4 +++-
 drivers/i2c/busses/i2c-piix4.c     | 4 +++-
 include/linux/pci_ids.h            | 1 +
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4
index 856b4b8b962c..a1c8f581afed 100644
--- a/Documentation/i2c/busses/i2c-piix4
+++ b/Documentation/i2c/busses/i2c-piix4
@@ -4,7 +4,7 @@ Supported adapters:
   * Intel 82371AB PIIX4 and PIIX4E
   * Intel 82443MX (440MX)
     Datasheet: Publicly available at the Intel website
-  * ServerWorks OSB4, CSB5 and CSB6 southbridges
+  * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges
     Datasheet: Only available via NDA from ServerWorks
   * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge
     Datasheet: Publicly available at the SMSC website http://www.smsc.com
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 7d2705a68e09..089c6f5b24de 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -168,12 +168,14 @@ config I2C_PIIX4
 	help
 	  If you say yes to this option, support will be included for the Intel
 	  PIIX4 family of mainboard I2C interfaces.  Specifically, the following
-	  versions of the chipset are supported:
+	  versions of the chipset are supported (note that Serverworks is part
+	  of Broadcom):
 	    Intel PIIX4
 	    Intel 440MX
 	    Serverworks OSB4
 	    Serverworks CSB5
 	    Serverworks CSB6
+	    Serverworks HT-1000
 	    SMSC Victory66
 
 	  This driver can also be built as a module.  If so, the module
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 692f47345481..d9c7c00e71f9 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -22,7 +22,7 @@
 /*
    Supports:
 	Intel PIIX4, 440MX
-	Serverworks OSB4, CSB5, CSB6
+	Serverworks OSB4, CSB5, CSB6, HT-1000
 	SMSC Victory66
 
    Note: we assume there can only be one device, with one SMBus interface.
@@ -419,6 +419,8 @@ static struct pci_device_id piix4_ids[] = {
 	  .driver_data = 0 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6),
 	  .driver_data = 0 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT1000SB),
+	  .driver_data = 0 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443MX_3),
 	  .driver_data = 3 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_EFAR, PCI_DEVICE_ID_EFAR_SLC90E66_3),
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index ec3c32932620..f5a724fbf094 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1371,6 +1371,7 @@
 #define PCI_DEVICE_ID_SERVERWORKS_OSB4	  0x0200
 #define PCI_DEVICE_ID_SERVERWORKS_CSB5	  0x0201
 #define PCI_DEVICE_ID_SERVERWORKS_CSB6    0x0203
+#define PCI_DEVICE_ID_SERVERWORKS_HT1000SB 0x0205
 #define PCI_DEVICE_ID_SERVERWORKS_OSB4IDE 0x0211
 #define PCI_DEVICE_ID_SERVERWORKS_CSB5IDE 0x0212
 #define PCI_DEVICE_ID_SERVERWORKS_CSB6IDE 0x0213
-- 
cgit v1.2.3


From b82db5cedf78bfeb4a1c8a28ae284dc671d26eb3 Mon Sep 17 00:00:00 2001
From: Kristen Accardi <kristen.c.accardi@intel.com>
Date: Tue, 17 Jan 2006 16:56:56 -0800
Subject: [PATCH] PCI: return max reserved busnr

Change the semantics of this call to return the max reserved
bus number instead of just the max assigned bus number.

Signed-off-by: Kristen Carlson Accardi <kristen.c.accardi@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/pci.c   | 5 +++--
 include/linux/pci.h | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e17cd49d6244..0bf6d254426b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -19,7 +19,6 @@
 #include <asm/dma.h>	/* isa_dma_bridge_buggy */
 #include "pci.h"
 
-#if 0
 
 /**
  * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children
@@ -34,7 +33,7 @@ pci_bus_max_busnr(struct pci_bus* bus)
 	struct list_head *tmp;
 	unsigned char max, n;
 
-	max = bus->number;
+	max = bus->subordinate;
 	list_for_each(tmp, &bus->children) {
 		n = pci_bus_max_busnr(pci_bus_b(tmp));
 		if(n > max)
@@ -42,7 +41,9 @@ pci_bus_max_busnr(struct pci_bus* bus)
 	}
 	return max;
 }
+EXPORT_SYMBOL_GPL(pci_bus_max_busnr);
 
+#if 0
 /**
  * pci_max_busnr - returns maximum PCI bus number
  *
diff --git a/include/linux/pci.h b/include/linux/pci.h
index fe1a2b02fc55..2039da1f3672 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -516,6 +516,7 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass
 void pci_walk_bus(struct pci_bus *top, void (*cb)(struct pci_dev *, void *),
 		  void *userdata);
 int pci_cfg_space_size(struct pci_dev *dev);
+unsigned char pci_bus_max_busnr(struct pci_bus* bus);
 
 /* kmem_cache style wrapper around pci_alloc_consistent() */
 
-- 
cgit v1.2.3


From 6e325a62a0a228cd0222783802b53cce04551776 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@mellanox.co.il>
Date: Tue, 14 Feb 2006 18:52:22 +0200
Subject: [PATCH] PCI: make MSI quirk inheritable from the pci bus

It turns out AMD 8131 quirk only affects MSI for devices behind the 8131 bridge.
Handle this by adding a flags field in pci_bus, inherited from parent to child.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/msi.c    | 3 +++
 drivers/pci/probe.c  | 1 +
 drivers/pci/quirks.c | 7 +++++--
 include/linux/pci.h  | 7 ++++++-
 4 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index d5a67c1bcb98..4de1c17ee573 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -703,6 +703,9 @@ int pci_enable_msi(struct pci_dev* dev)
 	if (dev->no_msi)
 		return status;
 
+	if (dev->bus->bus_flags & PCI_BUS_FLAGS_NO_MSI)
+		return -EINVAL;
+
 	temp = dev->irq;
 
 	status = msi_init();
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 3bc0fcd71d03..542e7dfb371b 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -347,6 +347,7 @@ pci_alloc_child_bus(struct pci_bus *parent, struct pci_dev *bridge, int busnr)
 	child->parent = parent;
 	child->ops = parent->ops;
 	child->sysdata = parent->sysdata;
+	child->bus_flags = parent->bus_flags;
 	child->bridge = get_device(&bridge->dev);
 
 	child->class_dev.class = &pcibus_class;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index d71c31df7fdf..4970f47be72c 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -575,8 +575,11 @@ static void __init quirk_amd_8131_ioapic(struct pci_dev *dev)
 { 
         unsigned char revid, tmp;
         
-	pci_msi_quirk = 1;
-	printk(KERN_WARNING "PCI: MSI quirk detected. pci_msi_quirk set.\n");
+	if (dev->subordinate) {
+		printk(KERN_WARNING "PCI: MSI quirk detected. "
+		       "PCI_BUS_FLAGS_NO_MSI set for subordinate bus.\n");
+		dev->subordinate->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+	}
 
         if (nr_ioapics == 0) 
                 return;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 2039da1f3672..d4d533fa5d30 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -95,6 +95,11 @@ enum pci_channel_state {
 	pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
 };
 
+typedef unsigned short __bitwise pci_bus_flags_t;
+enum pci_bus_flags {
+	PCI_BUS_FLAGS_NO_MSI = (pci_bus_flags_t) 1,
+};
+
 /*
  * The pci_dev structure is used to describe PCI devices.
  */
@@ -203,7 +208,7 @@ struct pci_bus {
 	char		name[48];
 
 	unsigned short  bridge_ctl;	/* manage NO_ISA/FBB/et al behaviors */
-	unsigned short  pad2;
+	pci_bus_flags_t bus_flags;	/* Inherited by child busses */
 	struct device		*bridge;
 	struct class_device	class_dev;
 	struct bin_attribute	*legacy_io; /* legacy I/O for this bus */
-- 
cgit v1.2.3


From 3c990e9219ea0b0aee588473ce6c8a66cdee3ff5 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Sat, 4 Mar 2006 21:52:42 -0500
Subject: [PATCH] PCI: fix pci_request_region[s] arg

    Add missing 'const' to pci_request_region[s] 'res_name' arg,
    since we pass it directly to __request_region(), whose 'name' arg
    is also const.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/pci.c   | 4 ++--
 include/linux/pci.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 03af23238939..bea1ad1ad5ba 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -639,7 +639,7 @@ void pci_release_region(struct pci_dev *pdev, int bar)
  *	Returns 0 on success, or %EBUSY on error.  A warning
  *	message is also printed on failure.
  */
-int pci_request_region(struct pci_dev *pdev, int bar, char *res_name)
+int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name)
 {
 	if (pci_resource_len(pdev, bar) == 0)
 		return 0;
@@ -697,7 +697,7 @@ void pci_release_regions(struct pci_dev *pdev)
  *	Returns 0 on success, or %EBUSY on error.  A warning
  *	message is also printed on failure.
  */
-int pci_request_regions(struct pci_dev *pdev, char *res_name)
+int pci_request_regions(struct pci_dev *pdev, const char *res_name)
 {
 	int i;
 	
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d4d533fa5d30..0aad5a378e95 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -490,9 +490,9 @@ void pdev_sort_resources(struct pci_dev *, struct resource_list *);
 void pci_fixup_irqs(u8 (*)(struct pci_dev *, u8 *),
 		    int (*)(struct pci_dev *, u8, u8));
 #define HAVE_PCI_REQ_REGIONS	2
-int pci_request_regions(struct pci_dev *, char *);
+int pci_request_regions(struct pci_dev *, const char *);
 void pci_release_regions(struct pci_dev *);
-int pci_request_region(struct pci_dev *, int, char *);
+int pci_request_region(struct pci_dev *, int, const char *);
 void pci_release_region(struct pci_dev *, int);
 
 /* drivers/pci/bus.c */
-- 
cgit v1.2.3


From acf356b12d13c8b43c486e53e8ee12f1f435ecc8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 24 Mar 2006 14:07:50 +0900
Subject: [PATCH] libata: add per-dev pio/mwdma/udma_mask

Add per-dev pio/mwdma/udma_mask.  All transfer mode limits used to be
applied to ap->*_mask which unnecessarily restricted other devices
sharing the port.  This change will also benefit later EH speed down
and hotplug.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 43 +++++++++++++++++++++++++------------------
 include/linux/libata.h     |  5 +++++
 2 files changed, 30 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 1514cb5e35f7..a87748ba579c 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -65,8 +65,7 @@ static unsigned int ata_dev_init_params(struct ata_port *ap,
 					struct ata_device *dev);
 static void ata_set_mode(struct ata_port *ap);
 static void ata_dev_set_xfermode(struct ata_port *ap, struct ata_device *dev);
-static unsigned int ata_dev_xfermask(struct ata_port *ap,
-				     struct ata_device *dev);
+static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev);
 
 static unsigned int ata_unique_id = 1;
 static struct workqueue_struct *ata_wq;
@@ -1801,16 +1800,19 @@ static void ata_set_mode(struct ata_port *ap)
 	/* step 1: calculate xfer_mask */
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *dev = &ap->device[i];
-		unsigned int xfer_mask;
+		unsigned int pio_mask, dma_mask;
 
 		if (!ata_dev_present(dev))
 			continue;
 
-		xfer_mask = ata_dev_xfermask(ap, dev);
+		ata_dev_xfermask(ap, dev);
 
-		dev->pio_mode = ata_xfer_mask2mode(xfer_mask & ATA_MASK_PIO);
-		dev->dma_mode = ata_xfer_mask2mode(xfer_mask & (ATA_MASK_MWDMA |
-								ATA_MASK_UDMA));
+		/* TODO: let LLDD filter dev->*_mask here */
+
+		pio_mask = ata_pack_xfermask(dev->pio_mask, 0, 0);
+		dma_mask = ata_pack_xfermask(0, dev->mwdma_mask, dev->udma_mask);
+		dev->pio_mode = ata_xfer_mask2mode(pio_mask);
+		dev->dma_mode = ata_xfer_mask2mode(dma_mask);
 	}
 
 	/* step 2: always set host PIO timings */
@@ -2653,18 +2655,15 @@ static int ata_dma_blacklisted(const struct ata_device *dev)
  *	@ap: Port on which the device to compute xfermask for resides
  *	@dev: Device to compute xfermask for
  *
- *	Compute supported xfermask of @dev.  This function is
- *	responsible for applying all known limits including host
- *	controller limits, device blacklist, etc...
+ *	Compute supported xfermask of @dev and store it in
+ *	dev->*_mask.  This function is responsible for applying all
+ *	known limits including host controller limits, device
+ *	blacklist, etc...
  *
  *	LOCKING:
  *	None.
- *
- *	RETURNS:
- *	Computed xfermask.
  */
-static unsigned int ata_dev_xfermask(struct ata_port *ap,
-				     struct ata_device *dev)
+static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev)
 {
 	unsigned long xfer_mask;
 	int i;
@@ -2677,6 +2676,8 @@ static unsigned int ata_dev_xfermask(struct ata_port *ap,
 		struct ata_device *d = &ap->device[i];
 		if (!ata_dev_present(d))
 			continue;
+		xfer_mask &= ata_pack_xfermask(d->pio_mask, d->mwdma_mask,
+					       d->udma_mask);
 		xfer_mask &= ata_id_xfermask(d->id);
 		if (ata_dma_blacklisted(d))
 			xfer_mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA);
@@ -2686,7 +2687,8 @@ static unsigned int ata_dev_xfermask(struct ata_port *ap,
 		printk(KERN_WARNING "ata%u: dev %u is on DMA blacklist, "
 		       "disabling DMA\n", ap->id, dev->devno);
 
-	return xfer_mask;
+	ata_unpack_xfermask(xfer_mask, &dev->pio_mask, &dev->mwdma_mask,
+			    &dev->udma_mask);
 }
 
 /**
@@ -4436,8 +4438,13 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	INIT_WORK(&ap->port_task, NULL, NULL);
 	INIT_LIST_HEAD(&ap->eh_done_q);
 
-	for (i = 0; i < ATA_MAX_DEVICES; i++)
-		ap->device[i].devno = i;
+	for (i = 0; i < ATA_MAX_DEVICES; i++) {
+		struct ata_device *dev = &ap->device[i];
+		dev->devno = i;
+		dev->pio_mask = UINT_MAX;
+		dev->mwdma_mask = UINT_MAX;
+		dev->udma_mask = UINT_MAX;
+	}
 
 #ifdef ATA_IRQ_TRAP
 	ap->stats.unhandled_irq = 1;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 7a54244d30aa..fbe8ba212598 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -358,6 +358,11 @@ struct ata_device {
 	unsigned int		max_sectors;	/* per-device max sectors */
 	unsigned int		cdb_len;
 
+	/* per-dev xfer mask */
+	unsigned int		pio_mask;
+	unsigned int		mwdma_mask;
+	unsigned int		udma_mask;
+
 	/* for CHS addressing */
 	u16			cylinders;	/* Number of cylinders */
 	u16			heads;		/* Number of heads */
-- 
cgit v1.2.3


From 082776e4be791736c32baf818e50f501a7f83819 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@cyclades.com>
Date: Thu, 23 Mar 2006 23:22:16 +1000
Subject: [PATCH] Make libata not powerdown drivers on PM_EVENT_FREEZE.

At the moment libata doesn't pass pm_message_t down ata_device_suspend.
This causes drives to be powered down when we just want a freeze,
causing unnecessary wear and tear. This patch gets pm_message_t passed
down so that it can be used to determine whether to power down the
drive.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>

 drivers/scsi/libata-core.c |    5 +++--
 drivers/scsi/libata-scsi.c |    4 ++--
 drivers/scsi/scsi_sysfs.c  |    2 +-
 include/linux/libata.h     |    4 ++--
 include/scsi/scsi_host.h   |    2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 5 +++--
 drivers/scsi/libata-scsi.c | 4 ++--
 drivers/scsi/scsi_sysfs.c  | 2 +-
 include/linux/libata.h     | 4 ++--
 include/scsi/scsi_host.h   | 2 +-
 5 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 6970f6867334..d8f77f2a5af1 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -4336,14 +4336,15 @@ int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
  *	Flush the cache on the drive, if appropriate, then issue a
  *	standbynow command.
  */
-int ata_device_suspend(struct ata_port *ap, struct ata_device *dev)
+int ata_device_suspend(struct ata_port *ap, struct ata_device *dev, pm_message_t state)
 {
 	if (!ata_dev_present(dev))
 		return 0;
 	if (dev->class == ATA_DEV_ATA)
 		ata_flush_cache(ap, dev);
 
-	ata_standby_drive(ap, dev);
+	if (state.event != PM_EVENT_FREEZE)
+		ata_standby_drive(ap, dev);
 	ap->flags |= ATA_FLAG_SUSPENDED;
 	return 0;
 }
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index a1259b242b8e..1fd3826da97e 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -414,12 +414,12 @@ int ata_scsi_device_resume(struct scsi_device *sdev)
 	return ata_device_resume(ap, dev);
 }
 
-int ata_scsi_device_suspend(struct scsi_device *sdev)
+int ata_scsi_device_suspend(struct scsi_device *sdev, pm_message_t state)
 {
 	struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
 	struct ata_device *dev = &ap->device[sdev->id];
 
-	return ata_device_suspend(ap, dev);
+	return ata_device_suspend(ap, dev, state);
 }
 
 /**
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 89055494dfee..a6fde52946d6 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -286,7 +286,7 @@ static int scsi_bus_suspend(struct device * dev, pm_message_t state)
 		return err;
 
 	if (sht->suspend)
-		err = sht->suspend(sdev);
+		err = sht->suspend(sdev, state);
 
 	return err;
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index fbe8ba212598..c52f13498556 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -520,9 +520,9 @@ extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 extern int ata_scsi_release(struct Scsi_Host *host);
 extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
 extern int ata_scsi_device_resume(struct scsi_device *);
-extern int ata_scsi_device_suspend(struct scsi_device *);
+extern int ata_scsi_device_suspend(struct scsi_device *, pm_message_t state);
 extern int ata_device_resume(struct ata_port *, struct ata_device *);
-extern int ata_device_suspend(struct ata_port *, struct ata_device *);
+extern int ata_device_suspend(struct ata_port *, struct ata_device *, pm_message_t state);
 extern int ata_ratelimit(void);
 extern unsigned int ata_busy_sleep(struct ata_port *ap,
 				   unsigned long timeout_pat,
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index a6cf3e535c0b..dc6862d09e53 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -286,7 +286,7 @@ struct scsi_host_template {
 	 * suspend support
 	 */
 	int (*resume)(struct scsi_device *);
-	int (*suspend)(struct scsi_device *);
+	int (*suspend)(struct scsi_device *, pm_message_t state);
 
 	/*
 	 * Name of proc directory
-- 
cgit v1.2.3


From ebdfca6eb1b755d3bfe9a81339ecdafd92038c1a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 23 Mar 2006 15:38:34 +0000
Subject: [PATCH] libata: add ata_dev_pair helper

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 17 +++++++++++++++++
 include/linux/libata.h     |  2 ++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index d8f77f2a5af1..909568f7a72a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1575,6 +1575,23 @@ void sata_phy_reset(struct ata_port *ap)
 	ata_bus_reset(ap);
 }
 
+/**
+ *	ata_dev_pair		-	return other device on cable
+ *	@ap: port
+ *	@adev: device
+ *
+ *	Obtain the other device on the same cable, or if none is
+ *	present NULL is returned
+ */
+ 
+struct ata_device *ata_dev_pair(struct ata_port *ap, struct ata_device *adev)
+{
+	struct ata_device *pair = &ap->device[1 - adev->devno];
+	if (!ata_dev_present(pair))
+		return NULL;
+	return pair;
+}
+
 /**
  *	ata_port_disable - Disable port.
  *	@ap: Port to be disabled.
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c52f13498556..80c2339da67d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -573,6 +573,8 @@ extern int ata_std_bios_param(struct scsi_device *sdev,
 			      struct block_device *bdev,
 			      sector_t capacity, int geom[]);
 extern int ata_scsi_slave_config(struct scsi_device *sdev);
+extern struct ata_device *ata_dev_pair(struct ata_port *ap, 
+				       struct ata_device *adev);
 
 /*
  * Timing helpers
-- 
cgit v1.2.3


From 2f1f610b62bce36d6d50857859091b8989c70267 Mon Sep 17 00:00:00 2001
From: Brian King <brking@us.ibm.com>
Date: Thu, 23 Mar 2006 17:30:15 -0600
Subject: [PATCH] libata: Remove dependence on host_set->dev for SAS

Remove some of the dependence on the host_set struct
in preparation for supporting SAS HBAs. Adds a struct device
pointer to the ata_port struct.

Signed-off-by: Brian King <brking@us.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 13 +++++++------
 include/linux/libata.h     |  1 +
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 94ad4acf8a59..d279666dcb38 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2856,7 +2856,7 @@ static void ata_sg_clean(struct ata_queued_cmd *qc)
 
 	if (qc->flags & ATA_QCFLAG_SG) {
 		if (qc->n_elem)
-			dma_unmap_sg(ap->host_set->dev, sg, qc->n_elem, dir);
+			dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
 		/* restore last sg */
 		sg[qc->orig_n_elem - 1].length += qc->pad_len;
 		if (pad_buf) {
@@ -2867,7 +2867,7 @@ static void ata_sg_clean(struct ata_queued_cmd *qc)
 		}
 	} else {
 		if (qc->n_elem)
-			dma_unmap_single(ap->host_set->dev,
+			dma_unmap_single(ap->dev,
 				sg_dma_address(&sg[0]), sg_dma_len(&sg[0]),
 				dir);
 		/* restore sg */
@@ -3078,7 +3078,7 @@ static int ata_sg_setup_one(struct ata_queued_cmd *qc)
 		goto skip_map;
 	}
 
-	dma_address = dma_map_single(ap->host_set->dev, qc->buf_virt,
+	dma_address = dma_map_single(ap->dev, qc->buf_virt,
 				     sg->length, dir);
 	if (dma_mapping_error(dma_address)) {
 		/* restore sg */
@@ -3166,7 +3166,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc)
 	}
 
 	dir = qc->dma_dir;
-	n_elem = dma_map_sg(ap->host_set->dev, sg, pre_n_elem, dir);
+	n_elem = dma_map_sg(ap->dev, sg, pre_n_elem, dir);
 	if (n_elem < 1) {
 		/* restore last sg */
 		lsg->length += qc->pad_len;
@@ -4381,7 +4381,7 @@ int ata_device_suspend(struct ata_port *ap, struct ata_device *dev, pm_message_t
 
 int ata_port_start (struct ata_port *ap)
 {
-	struct device *dev = ap->host_set->dev;
+	struct device *dev = ap->dev;
 	int rc;
 
 	ap->prd = dma_alloc_coherent(dev, ATA_PRD_TBL_SZ, &ap->prd_dma, GFP_KERNEL);
@@ -4414,7 +4414,7 @@ int ata_port_start (struct ata_port *ap)
 
 void ata_port_stop (struct ata_port *ap)
 {
-	struct device *dev = ap->host_set->dev;
+	struct device *dev = ap->dev;
 
 	dma_free_coherent(dev, ATA_PRD_TBL_SZ, ap->prd, ap->prd_dma);
 	ata_pad_free(ap, dev);
@@ -4480,6 +4480,7 @@ static void ata_host_init(struct ata_port *ap, struct Scsi_Host *host,
 	ap->host = host;
 	ap->ctl = ATA_DEVCTL_OBS;
 	ap->host_set = host_set;
+	ap->dev = ent->dev;
 	ap->port_no = port_no;
 	ap->hard_port_no =
 		ent->legacy_mode ? ent->hard_port_no : port_no;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 80c2339da67d..047192253c3a 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -400,6 +400,7 @@ struct ata_port {
 
 	struct ata_host_stats	stats;
 	struct ata_host_set	*host_set;
+	struct device 		*dev;
 
 	struct work_struct	port_task;
 
-- 
cgit v1.2.3


From 6961ec8267d08e21011457b05d2263ec06bdcfe1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <abergman@de.ibm.com>
Date: Fri, 24 Mar 2006 03:15:08 -0800
Subject: [PATCH] add sys_unshare to syscalls.h

All architecture independent system calls should be declared
in syscalls.h, add the one that is missing.

Signed-off-by: Arnd Bergmann <arnd.bergmann@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/syscalls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b9ea44ac0ddb..e487e3b60f60 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -568,5 +568,6 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
 				      int flag);
 asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
 				   int flags, int mode);
+asmlinkage long sys_unshare(unsigned long unshare_flags);
 
 #endif
-- 
cgit v1.2.3


From 9b04c997b1120feefa1e6ee8e2902270bc055cd2 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 24 Mar 2006 03:15:10 -0800
Subject: [PATCH] vfs: MS_VERBOSE should be MS_SILENT

The meaning of MS_VERBOSE is backwards; if the bit is set, it really means,
"don't be verbose".  This is confusing and counter-intuitive.

In addition, there is also no way to set the MS_VERBOSE flag in the
mount(8) program in util-linux, but interesting, it does define options
which would do the right thing if MS_SILENT were defined, which
unfortunately we do not:

#ifdef MS_SILENT
  { "quiet",    0, 0, MS_SILENT    },   /* be quiet  */
  { "loud",     0, 1, MS_SILENT    },   /* print out messages. */
#endif

So the obvious fix is to deprecate the use of MS_VERBOSE and replace it
with MS_SILENT.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/afs/super.c     | 2 +-
 fs/cifs/cifsfs.c   | 2 +-
 fs/jffs2/super.c   | 4 ++--
 fs/nfs/inode.c     | 4 ++--
 fs/super.c         | 6 +++---
 include/linux/fs.h | 4 +++-
 init/do_mounts.c   | 2 +-
 7 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/super.c b/fs/afs/super.c
index d6fa8e5999df..53c56e7231ab 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -341,7 +341,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 
 	sb->s_flags = flags;
 
-	ret = afs_fill_super(sb, &params, flags & MS_VERBOSE ? 1 : 0);
+	ret = afs_fill_super(sb, &params, flags & MS_SILENT ? 1 : 0);
 	if (ret < 0) {
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 79eeccd0437f..1cd044ce82a6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -479,7 +479,7 @@ cifs_get_sb(struct file_system_type *fs_type,
 
 	sb->s_flags = flags;
 
-	rc = cifs_read_super(sb, data, dev_name, flags & MS_VERBOSE ? 1 : 0);
+	rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0);
 	if (rc) {
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 93883817cbd0..c8fac352a4cf 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -152,7 +152,7 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 	sb->s_op = &jffs2_super_operations;
 	sb->s_flags = flags | MS_NOATIME;
 
-	ret = jffs2_do_fill_super(sb, data, (flags&MS_VERBOSE)?1:0);
+	ret = jffs2_do_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 
 	if (ret) {
 		/* Failure case... */
@@ -257,7 +257,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 	}
 
 	if (imajor(nd.dentry->d_inode) != MTD_BLOCK_MAJOR) {
-		if (!(flags & MS_VERBOSE)) /* Yes I mean this. Strangely */
+		if (!(flags & MS_SILENT))
 			printk(KERN_NOTICE "Attempt to mount non-MTD device \"%s\" as JFFS2\n",
 			       dev_name);
 		goto out;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a77ee95b7efb..37e55c328ebc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1679,7 +1679,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 
 	s->s_flags = flags;
 
-	error = nfs_fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
+	error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
@@ -1996,7 +1996,7 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 
 	s->s_flags = flags;
 
-	error = nfs4_fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
+	error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
diff --git a/fs/super.c b/fs/super.c
index 425861cb1caa..37554b876182 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -712,7 +712,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
@@ -756,7 +756,7 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type,
 
 	s->s_flags = flags;
 
-	error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
+	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
@@ -785,7 +785,7 @@ struct super_block *get_sb_single(struct file_system_type *fs_type,
 		return s;
 	if (!s->s_root) {
 		s->s_flags = flags;
-		error = fill_super(s, data, flags & MS_VERBOSE ? 1 : 0);
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9b34a1b03455..65e6df247ea5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -103,7 +103,9 @@ extern int dir_notify_enable;
 #define MS_BIND		4096
 #define MS_MOVE		8192
 #define MS_REC		16384
-#define MS_VERBOSE	32768
+#define MS_VERBOSE	32768	/* War is peace. Verbosity is silence.
+				   MS_VERBOSE is deprecated. */
+#define MS_SILENT	32768
 #define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
 #define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
 #define MS_PRIVATE	(1<<18)	/* change to private */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index b27c11064409..8b671fe68afa 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -19,7 +19,7 @@ extern int get_filesystem_list(char * buf);
 
 int __initdata rd_doload;	/* 1 = load RAM disk, 0 = don't load */
 
-int root_mountflags = MS_RDONLY | MS_VERBOSE;
+int root_mountflags = MS_RDONLY | MS_SILENT;
 char * __initdata root_device_name;
 static char __initdata saved_root_name[64];
 
-- 
cgit v1.2.3


From 87e24802586333fa861861f6493c76039872755b Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:15:44 -0800
Subject: [PATCH] bitmap: region cleanup

Paul Mundt <lethal@linux-sh.org> says:

This patch set implements a number of patches to clean up and restructure the
bitmap region code, in addition to extending the interface to support
multiword spanning allocations.

The current implementation (before this patch set) is limited by only being
able to allocate pages <= BITS_PER_LONG, as noted by the strategically
positioned BUG_ON() at lib/bitmap.c:752:

        /* We don't do regions of pages > BITS_PER_LONG.  The
	 * algorithm would be a simple look for multiple zeros in the
	 * array, but there's no driver today that needs this.  If you
	 * trip this BUG(), you get to code it... */
        BUG_ON(pages > BITS_PER_LONG);

As I seem to have been the first person to trigger this, the result ends up
being the following patch set with the help of Paul Jackson.

The final patch in the series eliminates quite a bit of code duplication, so
the bitmap code size ends up being smaller than the current implementation as
an added bonus.

After these are applied, it should already be possible to do multiword
allocations with dma_alloc_coherent() out of ranges established by
dma_declare_coherent_memory() on x86 without having to change any of the code,
and the SH store queue API will follow up on this as the other user that needs
support for this.

This patch:

Some code cleanup on the lib/bitmap.c bitmap_*_region() routines:

 * spacing
 * variable names
 * comments

Has no change to code function.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bitmap.h |  3 +++
 lib/bitmap.c           | 64 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 41 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 7d8ff97b3e92..d9ed27969855 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -46,6 +46,9 @@
  * bitmap_parse(ubuf, ulen, dst, nbits)		Parse bitmap dst from user buf
  * bitmap_scnlistprintf(buf, len, src, nbits)	Print bitmap src as list to buf
  * bitmap_parselist(buf, dst, nbits)		Parse bitmap dst from list
+ * bitmap_find_free_region(bitmap, bits, order)	Find and allocate bit region
+ * bitmap_release_region(bitmap, pos, order)	Free specified bit region
+ * bitmap_allocate_region(bitmap, pos, order)	Allocate specified bit region
  */
 
 /*
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 48e708381d44..3fab1ce9ac65 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -677,39 +677,38 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
 EXPORT_SYMBOL(bitmap_bitremap);
 
 /**
- *	bitmap_find_free_region - find a contiguous aligned mem region
+ * bitmap_find_free_region - find a contiguous aligned mem region
  *	@bitmap: an array of unsigned longs corresponding to the bitmap
  *	@bits: number of bits in the bitmap
  *	@order: region size to find (size is actually 1<<order)
  *
- * This is used to allocate a memory region from a bitmap.  The idea is
- * that the region has to be 1<<order sized and 1<<order aligned (this
- * makes the search algorithm much faster).
+ * Find a sequence of free (zero) bits in a bitmap and allocate
+ * them (set them to one).  Only consider sequences of length a
+ * power ('order') of two, aligned to that power of two, which
+ * makes the search algorithm much faster.
  *
- * The region is marked as set bits in the bitmap if a free one is
- * found.
- *
- * Returns either beginning of region or negative error
+ * Return the bit offset in bitmap of the allocated sequence,
+ * or -errno on failure.
  */
 int bitmap_find_free_region(unsigned long *bitmap, int bits, int order)
 {
 	unsigned long mask;
-	int pages = 1 << order;
+	int nbits = 1 << order;
 	int i;
 
-	if(pages > BITS_PER_LONG)
+	if (nbits > BITS_PER_LONG)
 		return -EINVAL;
 
 	/* make a mask of the order */
-	mask = (1ul << (pages - 1));
+	mask = (1UL << (nbits - 1));
 	mask += mask - 1;
 
-	/* run up the bitmap pages bits at a time */
-	for (i = 0; i < bits; i += pages) {
-		int index = i/BITS_PER_LONG;
+	/* run up the bitmap nbits at a time */
+	for (i = 0; i < bits; i += nbits) {
+		int index = i / BITS_PER_LONG;
 		int offset = i - (index * BITS_PER_LONG);
-		if((bitmap[index] & (mask << offset)) == 0) {
-			/* set region in bimap */
+		if ((bitmap[index] & (mask << offset)) == 0) {
+			/* set region in bitmap */
 			bitmap[index] |= (mask << offset);
 			return i;
 		}
@@ -719,7 +718,7 @@ int bitmap_find_free_region(unsigned long *bitmap, int bits, int order)
 EXPORT_SYMBOL(bitmap_find_free_region);
 
 /**
- *	bitmap_release_region - release allocated bitmap region
+ * bitmap_release_region - release allocated bitmap region
  *	@bitmap: a pointer to the bitmap
  *	@pos: the beginning of the region
  *	@order: the order of the bits to release (number is 1<<order)
@@ -729,27 +728,40 @@ EXPORT_SYMBOL(bitmap_find_free_region);
  */
 void bitmap_release_region(unsigned long *bitmap, int pos, int order)
 {
-	int pages = 1 << order;
-	unsigned long mask = (1ul << (pages - 1));
-	int index = pos/BITS_PER_LONG;
+	int nbits = 1 << order;
+	unsigned long mask = (1UL << (nbits - 1));
+	int index = pos / BITS_PER_LONG;
 	int offset = pos - (index * BITS_PER_LONG);
+
 	mask += mask - 1;
 	bitmap[index] &= ~(mask << offset);
 }
 EXPORT_SYMBOL(bitmap_release_region);
 
+/**
+ * bitmap_allocate_region - allocate bitmap region
+ *	@bitmap: a pointer to the bitmap
+ *	@pos: the beginning of the region
+ *	@order: the order of the bits to allocate (number is 1<<order)
+ *
+ * Allocate (set bits in) a specified region of a bitmap.
+ * Return 0 on success, or -EBUSY if specified region wasn't
+ * free (not all bits were zero).
+ */
 int bitmap_allocate_region(unsigned long *bitmap, int pos, int order)
 {
-	int pages = 1 << order;
-	unsigned long mask = (1ul << (pages - 1));
-	int index = pos/BITS_PER_LONG;
+	int nbits = 1 << order;
+	unsigned long mask = (1UL << (nbits - 1));
+	int index = pos / BITS_PER_LONG;
 	int offset = pos - (index * BITS_PER_LONG);
 
-	/* We don't do regions of pages > BITS_PER_LONG.  The
+	/*
+	 * We don't do regions of nbits > BITS_PER_LONG.  The
 	 * algorithm would be a simple look for multiple zeros in the
 	 * array, but there's no driver today that needs this.  If you
-	 * trip this BUG(), you get to code it... */
-	BUG_ON(pages > BITS_PER_LONG);
+	 * trip this BUG(), you get to code it...
+	 */
+	BUG_ON(nbits > BITS_PER_LONG);
 	mask += mask - 1;
 	if (bitmap[index] & (mask << offset))
 		return -EBUSY;
-- 
cgit v1.2.3


From f6ef943813ac3085ece7252ea101d663581219f6 Mon Sep 17 00:00:00 2001
From: Bart Samwel <bart@samwel.tk>
Date: Fri, 24 Mar 2006 03:15:48 -0800
Subject: [PATCH] Represent dirty_*_centisecs as jiffies internally

Make that the internal values for:

/proc/sys/vm/dirty_writeback_centisecs
/proc/sys/vm/dirty_expire_centisecs

are stored as jiffies instead of centiseconds.  Let the sysctl interface do
the conversions with full precision using clock_t_to_jiffies, instead of
doing overflow-sensitive on-the-fly conversions every time the values are
used.

Cons: apparent precision loss if HZ is not a multiple of 100, because of
conversion back and forth.  This is a common problem for all sysctl values
that use proc_dointvec_userhz_jiffies.  (There is only one other in-tree
use, in net/core/neighbour.c.)

Signed-off-by: Bart Samwel <bart@samwel.tk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h |  4 ++--
 kernel/sysctl.c           | 10 +++++-----
 mm/page-writeback.c       | 24 ++++++++++++------------
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index beaef5c7a0ea..609565961494 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -88,8 +88,8 @@ void throttle_vm_writeout(void);
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
-extern int dirty_writeback_centisecs;
-extern int dirty_expire_centisecs;
+extern int dirty_writeback_interval;
+extern int dirty_expire_interval;
 extern int block_dump;
 extern int laptop_mode;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 32b48e8ee36e..817ba25517eb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -742,18 +742,18 @@ static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_DIRTY_WB_CS,
 		.procname	= "dirty_writeback_centisecs",
-		.data		= &dirty_writeback_centisecs,
-		.maxlen		= sizeof(dirty_writeback_centisecs),
+		.data		= &dirty_writeback_interval,
+		.maxlen		= sizeof(dirty_writeback_interval),
 		.mode		= 0644,
 		.proc_handler	= &dirty_writeback_centisecs_handler,
 	},
 	{
 		.ctl_name	= VM_DIRTY_EXPIRE_CS,
 		.procname	= "dirty_expire_centisecs",
-		.data		= &dirty_expire_centisecs,
-		.maxlen		= sizeof(dirty_expire_centisecs),
+		.data		= &dirty_expire_interval,
+		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_userhz_jiffies,
 	},
 	{
 		.ctl_name	= VM_NR_PDFLUSH_THREADS,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..e79107991d20 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
  * The interval between `kupdate'-style writebacks, in centiseconds
  * (hundredths of a second)
  */
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
 
 /*
  * The longest number of centiseconds for which data is allowed to remain dirty
  */
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -380,8 +380,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
  * just walks the superblock inode list, writing back any inodes which are
  * older than a specific point in time.
  *
- * Try to run once per dirty_writeback_centisecs.  But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
  * one-second gap.
  *
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
@@ -406,9 +406,9 @@ static void wb_kupdate(unsigned long arg)
 	sync_supers();
 
 	get_writeback_state(&wbs);
-	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+	oldest_jif = jiffies - dirty_expire_interval;
 	start_jif = jiffies;
-	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+	next_jif = start_jif + dirty_writeback_interval;
 	nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 	while (nr_to_write > 0) {
@@ -425,7 +425,7 @@ static void wb_kupdate(unsigned long arg)
 	}
 	if (time_before(next_jif, jiffies + HZ))
 		next_jif = jiffies + HZ;
-	if (dirty_writeback_centisecs)
+	if (dirty_writeback_interval)
 		mod_timer(&wb_timer, next_jif);
 }
 
@@ -435,11 +435,11 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_centisecs) {
+	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+	if (dirty_writeback_interval) {
 		mod_timer(&wb_timer,
-			jiffies + (dirty_writeback_centisecs * HZ) / 100);
-	} else {
+			jiffies + dirty_writeback_interval);
+		} else {
 		del_timer(&wb_timer);
 	}
 	return 0;
@@ -544,7 +544,7 @@ void __init page_writeback_init(void)
 		if (vm_dirty_ratio <= 0)
 			vm_dirty_ratio = 1;
 	}
-	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 }
-- 
cgit v1.2.3


From cdb0452789d365695b5b173542af9c7e3d24f185 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 24 Mar 2006 03:15:57 -0800
Subject: [PATCH] kill include/linux/platform.h, default_idle() cleanup

include/linux/platform.h contained nothing that was actually used except
the default_idle() prototype, and is therefore removed by this patch.

This patch does the following with the platform specific default_idle()
functions on different architectures:
- remove the unused function:
  - parisc
  - sparc64
- make the needlessly global function static:
  - arm
  - h8300
  - m68k
  - m68knommu
  - s390
  - v850
  - x86_64
- add a prototype in asm/system.h:
  - cris
  - i386
  - ia64

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Patrick Mochel <mochel@digitalimplant.org>
Acked-by: Kyle McMartin <kyle@parisc-linux.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/kernel/process.c      |  3 +--
 arch/h8300/kernel/process.c     |  4 ++--
 arch/i386/kernel/apm.c          |  2 --
 arch/i386/mach-visws/reboot.c   |  1 -
 arch/ia64/kernel/setup.c        |  1 -
 arch/m68k/kernel/process.c      |  2 +-
 arch/m68knommu/kernel/process.c |  2 +-
 arch/parisc/kernel/process.c    |  5 -----
 arch/s390/kernel/process.c      |  2 +-
 arch/sh/kernel/process.c        |  1 -
 arch/v850/kernel/process.c      |  2 +-
 arch/x86_64/kernel/process.c    |  2 +-
 include/asm-cris/system.h       |  2 ++
 include/asm-i386/system.h       |  2 ++
 include/asm-ia64/system.h       |  2 ++
 include/linux/platform.h        | 43 -----------------------------------------
 16 files changed, 14 insertions(+), 62 deletions(-)
 delete mode 100644 include/linux/platform.h

(limited to 'include/linux')

diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 4ab3e87115b6..123451c44154 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -116,6 +116,7 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/irq.h>
+#include <asm/system.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
@@ -194,8 +195,6 @@ EXPORT_SYMBOL(enable_hlt);
  */
 void (*pm_idle)(void);
 
-extern void default_idle(void);
-
 /*
  * The idle thread. There's no useful work to be
  * done, so just try to conserve power and have a
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index dd344f112cfe..16ccddc69c2b 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -54,7 +54,7 @@ asmlinkage void ret_from_fork(void);
  * The idle loop on an H8/300..
  */
 #if !defined(CONFIG_H8300H_SIM) && !defined(CONFIG_H8S_SIM)
-void default_idle(void)
+static void default_idle(void)
 {
 	local_irq_disable();
 	if (!need_resched()) {
@@ -65,7 +65,7 @@ void default_idle(void)
 		local_irq_enable();
 }
 #else
-void default_idle(void)
+static void default_idle(void)
 {
 	cpu_relax();
 }
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 05312a8abb8b..da30a374dd4e 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -824,8 +824,6 @@ static void apm_do_busy(void)
 
 static void (*original_pm_idle)(void);
 
-extern void default_idle(void);
-
 /**
  * apm_cpu_idle		-	cpu idling for APM capable Linux
  *
diff --git a/arch/i386/mach-visws/reboot.c b/arch/i386/mach-visws/reboot.c
index 5d73e042ed0a..99332abfad42 100644
--- a/arch/i386/mach-visws/reboot.c
+++ b/arch/i386/mach-visws/reboot.c
@@ -1,7 +1,6 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/delay.h>
-#include <linux/platform.h>
 
 #include <asm/io.h>
 #include "piix4.h"
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 3258e09278d0..958c1508036f 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -41,7 +41,6 @@
 #include <linux/serial_core.h>
 #include <linux/efi.h>
 #include <linux/initrd.h>
-#include <linux/platform.h>
 #include <linux/pm.h>
 #include <linux/cpufreq.h>
 
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 2d8ad0727b6b..33648efb772e 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -77,7 +77,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * The idle loop on an m68k..
  */
-void default_idle(void)
+static void default_idle(void)
 {
 	if (!need_resched())
 #if defined(MACH_ATARI_ONLY) && !defined(CONFIG_HADES)
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index 63c117dae0c3..f861755ec88b 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(pm_power_off);
 /*
  * The idle loop on an m68knommu..
  */
-void default_idle(void)
+static void default_idle(void)
 {
 	local_irq_disable();
  	while (!need_resched()) {
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index e8dea4177113..0b485ef4be89 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -54,11 +54,6 @@
 #include <asm/uaccess.h>
 #include <asm/unwind.h>
 
-void default_idle(void)
-{
-	barrier();
-}
-
 /*
  * The idle thread. There's no useful work to be
  * done, so just try to conserve power and have a
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index da6fbae8df91..99182a415fe7 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -103,7 +103,7 @@ extern void s390_handle_mcck(void);
 /*
  * The idle loop on a S390...
  */
-void default_idle(void)
+static void default_idle(void)
 {
 	int cpu, rc;
 
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 9fd1723e6219..22dc9c21201d 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -19,7 +19,6 @@
 #include <linux/slab.h>
 #include <linux/pm.h>
 #include <linux/ptrace.h>
-#include <linux/platform.h>
 #include <linux/kallsyms.h>
 #include <linux/kexec.h>
 
diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c
index 621111ddf907..57218c76925c 100644
--- a/arch/v850/kernel/process.c
+++ b/arch/v850/kernel/process.c
@@ -37,7 +37,7 @@ extern void ret_from_fork (void);
 
 
 /* The idle loop.  */
-void default_idle (void)
+static void default_idle (void)
 {
 	while (! need_resched ())
 		asm ("halt; nop; nop; nop; nop; nop" ::: "cc");
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 22a05dec81a2..80a8f3079178 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -114,7 +114,7 @@ void exit_idle(void)
  * We use this if we don't have any better
  * idle routine..
  */
-void default_idle(void)
+static void default_idle(void)
 {
 	local_irq_enable();
 
diff --git a/include/asm-cris/system.h b/include/asm-cris/system.h
index d48670107a85..1d63c2aa8ec2 100644
--- a/include/asm-cris/system.h
+++ b/include/asm-cris/system.h
@@ -71,4 +71,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 
 #define arch_align_stack(x) (x)
 
+void default_idle(void);
+
 #endif
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index d0d8d7448d88..19cc79c9a35d 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -499,4 +499,6 @@ static inline void sched_cacheflush(void)
 extern unsigned long arch_align_stack(unsigned long sp);
 extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
 
+void default_idle(void);
+
 #endif
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index cd4233d66f15..2f3620593687 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -265,6 +265,8 @@ void sched_cacheflush(void);
 
 #define arch_align_stack(x) (x)
 
+void default_idle(void);
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/linux/platform.h b/include/linux/platform.h
deleted file mode 100644
index 3c33084a6ec2..000000000000
--- a/include/linux/platform.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * include/linux/platform.h - platform driver definitions
- *
- * Because of the prolific consumerism of the average American,
- * and the dominant marketing budgets of PC OEMs, we have been
- * blessed with frequent updates of the PC architecture. 
- *
- * While most of these calls are singular per architecture, they 
- * require an extra layer of abstraction on the x86 so the right
- * subsystem gets the right call. 
- *
- * Basically, this consolidates the power off and reboot callbacks 
- * into one structure, as well as adding power management hooks.
- *
- * When adding a platform driver, please make sure all callbacks are 
- * filled. There are defaults defined below that do nothing; use those
- * if you do not support that callback.
- */ 
-
-#ifndef _PLATFORM_H_
-#define _PLATFORM_H_
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-
-struct platform_t {
-	char	* name;
-	u32	suspend_states;
-	void	(*reboot)(char * cmd);
-	void	(*halt)(void);
-	void	(*power_off)(void);
-	int	(*suspend)(int state, int flags);
-	void	(*idle)(void);
-};
-
-extern struct platform_t * platform;
-extern void default_reboot(char * cmd);
-extern void default_halt(void);
-extern int default_suspend(int state, int flags);
-extern void default_idle(void);
-
-#endif /* __KERNEL__ */
-#endif /* _PLATFORM_H */
-- 
cgit v1.2.3


From 825a46af5ac171f9f41f794a0a00165588ba1589 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:03 -0800
Subject: [PATCH] cpuset memory spread basic implementation

This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).

The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.

All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.

There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures.  They are called 'memory_spread_page' and 'memory_spread_slab'.

If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.

If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.

The implementation is simple.  Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset.  In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.

The cpuset_mem_spread_node() routine is also simple.  It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.

This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit.  Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.

A couple of Copyright year ranges are updated as well.  And a couple of email
addresses that can be found in the MAINTAINERS file are removed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt |  76 ++++++++++++++++++++++++++++++++-
 include/linux/cpuset.h    |  29 ++++++++++++-
 include/linux/sched.h     |   3 ++
 kernel/cpuset.c           | 104 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 203 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 30c41459953c..159e2a0c3e80 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -18,7 +18,8 @@ CONTENTS:
   1.4 What are exclusive cpusets ?
   1.5 What does notify_on_release do ?
   1.6 What is memory_pressure ?
-  1.7 How do I use cpusets ?
+  1.7 What is memory spread ?
+  1.8 How do I use cpusets ?
 2. Usage Examples and Syntax
   2.1 Basic Usage
   2.2 Adding/removing cpus
@@ -317,7 +318,78 @@ the tasks in the cpuset, in units of reclaims attempted per second,
 times 1000.
 
 
-1.7 How do I use cpusets ?
+1.7 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related in
+kernel data structures.  They are called 'memory_spread_page' and
+'memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the tasks NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the tasks NUMA mempolicy and be spread
+instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing tasks memory spread settings.  If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag
+files.  By default they contain "0", meaning that the feature is off
+for that cpuset.  If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'memory_spread_page' turns on a per-process flag
+PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset.  The page allocation calls for the page cache
+is modified to perform an inline check for this PF_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'memory_spread_cache' turns on the flag
+PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple.  It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current tasks mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the jobs cpuset in order to fit.  Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the jobs cpuset
+can become very uneven.
+
+
+1.8 How do I use cpusets ?
 --------------------------
 
 In order to minimize the impact of cpusets on critical kernel
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3bc606927116..9354722a9217 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -4,7 +4,7 @@
  *  cpuset interface
  *
  *  Copyright (C) 2003 BULL SA
- *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  */
 
@@ -51,6 +51,18 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
 
+extern int cpuset_mem_spread_node(void);
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_PAGE;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_SLAB;
+}
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -99,6 +111,21 @@ static inline char *cpuset_task_status_allowed(struct task_struct *task,
 static inline void cpuset_lock(void) {}
 static inline void cpuset_unlock(void) {}
 
+static inline int cpuset_mem_spread_node(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e60a91d5b369..b0e37cfa09f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -869,6 +869,7 @@ struct task_struct {
 	struct cpuset *cpuset;
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
+	int cpuset_mem_spread_rotor;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
@@ -929,6 +930,8 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 44d13c246e5c..38f18b33de6c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
  *  Processor and Memory placement constraints for sets of tasks.
  *
  *  Copyright (C) 2003 BULL SA.
- *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  *  Portions derived from Patrick Mochel's sysfs code.
  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
- *  Portions Copyright (c) 2004 Silicon Graphics, Inc.
  *
- *  2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ *  2003-10-10 Written by Simon Derr.
  *  2003-10-22 Updates by Stephen Hemminger.
- *  2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ *  2004 May-July Rework by Paul Jackson.
  *
  *  This file is subject to the terms and conditions of the GNU General Public
  *  License.  See the file COPYING in the main directory of the Linux
@@ -108,7 +107,9 @@ typedef enum {
 	CS_MEM_EXCLUSIVE,
 	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
-	CS_NOTIFY_ON_RELEASE
+	CS_NOTIFY_ON_RELEASE,
+	CS_SPREAD_PAGE,
+	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -137,6 +138,16 @@ static inline int is_memory_migrate(const struct cpuset *cs)
 	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 }
 
+static inline int is_spread_page(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
 /*
  * Increment this atomic integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -657,6 +668,14 @@ void cpuset_update_task_memory_state(void)
 		cs = tsk->cpuset;	/* Maybe changed when task not locked */
 		guarantee_online_mems(cs, &tsk->mems_allowed);
 		tsk->cpuset_mems_generation = cs->mems_generation;
+		if (is_spread_page(cs))
+			tsk->flags |= PF_SPREAD_PAGE;
+		else
+			tsk->flags &= ~PF_SPREAD_PAGE;
+		if (is_spread_slab(cs))
+			tsk->flags |= PF_SPREAD_SLAB;
+		else
+			tsk->flags &= ~PF_SPREAD_SLAB;
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		mpol_rebind_task(tsk, &tsk->mems_allowed);
@@ -956,7 +975,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
@@ -1187,6 +1207,8 @@ typedef enum {
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
+	FILE_SPREAD_PAGE,
+	FILE_SPREAD_SLAB,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -1246,6 +1268,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
+	case FILE_SPREAD_PAGE:
+		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+		cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+		break;
+	case FILE_SPREAD_SLAB:
+		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+		cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1355,6 +1385,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_PRESSURE:
 		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
 		break;
+	case FILE_SPREAD_PAGE:
+		*s++ = is_spread_page(cs) ? '1' : '0';
+		break;
+	case FILE_SPREAD_SLAB:
+		*s++ = is_spread_slab(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1718,6 +1754,16 @@ static struct cftype cft_memory_pressure = {
 	.private = FILE_MEMORY_PRESSURE,
 };
 
+static struct cftype cft_spread_page = {
+	.name = "memory_spread_page",
+	.private = FILE_SPREAD_PAGE,
+};
+
+static struct cftype cft_spread_slab = {
+	.name = "memory_spread_slab",
+	.private = FILE_SPREAD_SLAB,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1736,6 +1782,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1764,6 +1814,10 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	if (is_spread_page(parent))
+		set_bit(CS_SPREAD_PAGE, &cs->flags);
+	if (is_spread_slab(parent))
+		set_bit(CS_SPREAD_SLAB, &cs->flags);
 	cs->cpus_allowed = CPU_MASK_NONE;
 	cs->mems_allowed = NODE_MASK_NONE;
 	atomic_set(&cs->count, 0);
@@ -2200,6 +2254,44 @@ void cpuset_unlock(void)
 	mutex_unlock(&callback_mutex);
 }
 
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online.  So it
+ * should not be possible for the following code to return an
+ * offline node.  But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start.  The zonelist passed to
+ * __alloc_pages() will include all nodes.  If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+	int node;
+
+	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+	if (node == MAX_NUMNODES)
+		node = first_node(current->mems_allowed);
+	current->cpuset_mem_spread_rotor = node;
+	return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
 /**
  * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
  * @p: pointer to task_struct of some other task.
-- 
cgit v1.2.3


From 44110fe385af23ca5eee8a6ad4ff55d50339097a Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:04 -0800
Subject: [PATCH] cpuset memory spread page cache implementation and hooks

Change the page cache allocation calls to support cpuset memory spreading.

See the previous patch, cpuset_mem_spread, for an explanation of cpuset memory
spreading.

On systems without cpusets configured in the kernel, this is no change.

On systems with cpusets configured in the kernel, but the "memory_spread"
cpuset option not enabled for the current tasks cpuset, this adds a call to a
cpuset routine and failed bit test of the processor state flag PF_SPREAD_PAGE.

On tasks in cpusets with "memory_spread" enabled, this adds a call to a cpuset
routine that computes which of the tasks mems_allowed nodes should be
preferred for this allocation.

If memory spreading applies to a particular allocation, then any other NUMA
mempolicy does not apply.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pagemap.h |  5 +++++
 mm/filemap.c            | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ee700c6eb442..839f0b3c23aa 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -51,6 +51,10 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+#ifdef CONFIG_NUMA
+extern struct page *page_cache_alloc(struct address_space *x);
+extern struct page *page_cache_alloc_cold(struct address_space *x);
+#else
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
 	return alloc_pages(mapping_gfp_mask(x), 0);
@@ -60,6 +64,7 @@ static inline struct page *page_cache_alloc_cold(struct address_space *x)
 {
 	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
 }
+#endif
 
 typedef int filler_t(void *, struct page *);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index e8f58f7dd7a5..d4ff48ec269e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,6 +29,7 @@
 #include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/cpuset.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -427,6 +428,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 	return ret;
 }
 
+#ifdef CONFIG_NUMA
+struct page *page_cache_alloc(struct address_space *x)
+{
+	if (cpuset_do_page_mem_spread()) {
+		int n = cpuset_mem_spread_node();
+		return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+	}
+	return alloc_pages(mapping_gfp_mask(x), 0);
+}
+EXPORT_SYMBOL(page_cache_alloc);
+
+struct page *page_cache_alloc_cold(struct address_space *x)
+{
+	if (cpuset_do_page_mem_spread()) {
+		int n = cpuset_mem_spread_node();
+		return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
+	}
+	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+}
+EXPORT_SYMBOL(page_cache_alloc_cold);
+#endif
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
-- 
cgit v1.2.3


From 101a50019ae5e370d73984ee05d56dd3b08f330a Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:07 -0800
Subject: [PATCH] cpuset memory spread slab cache implementation

Provide the slab cache infrastructure to support cpuset memory spreading.

See the previous patches, cpuset_mem_spread, for an explanation of cpuset
memory spreading.

This patch provides a slab cache SLAB_MEM_SPREAD flag.  If set in the
kmem_cache_create() call defining a slab cache, then any task marked with the
process state flag PF_MEMSPREAD will spread memory page allocations for that
cache over all the allowed nodes, instead of preferring the local (faulting)
node.

On systems not configured with CONFIG_NUMA, this results in no change to the
page allocation code path for slab caches.

On systems with cpusets configured in the kernel, but the "memory_spread"
cpuset option not enabled for the current tasks cpuset, this adds a call to a
cpuset routine and failed bit test of the processor state flag PF_SPREAD_SLAB.

For tasks so marked, a second inline test is done for the slab cache flag
SLAB_MEM_SPREAD, and if that is set and if the allocation is not
in_interrupt(), this adds a call to to a cpuset routine that computes which of
the tasks mems_allowed nodes should be preferred for this allocation.

==> This patch adds another hook into the performance critical
    code path to allocating objects from the slab cache, in the
    ____cache_alloc() chunk, below.  The next patch optimizes this
    hook, reducing the impact of the combined mempolicy plus memory
    spreading hooks on this critical code path to a single check
    against the tasks task_struct flags word.

This patch provides the generic slab flags and logic needed to apply memory
spreading to a particular slab.

A subsequent patch will mark a few specific slab caches for this placement
policy.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h |  1 +
 mm/slab.c            | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2b28c849d75a..e2ee5b268797 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -46,6 +46,7 @@ typedef struct kmem_cache kmem_cache_t;
 						   what is reclaimable later*/
 #define SLAB_PANIC		0x00040000UL	/* panic if kmem_cache_create() fails */
 #define SLAB_DESTROY_BY_RCU	0x00080000UL	/* defer freeing pages to RCU */
+#define SLAB_MEM_SPREAD		0x00100000UL	/* Spread some memory over cpuset */
 
 /* flags passed to a constructor func */
 #define	SLAB_CTOR_CONSTRUCTOR	0x001UL		/* if not set, then deconstructor */
diff --git a/mm/slab.c b/mm/slab.c
index 1c8f5ee230d5..de516658d3d8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -94,6 +94,7 @@
 #include	<linux/interrupt.h>
 #include	<linux/init.h>
 #include	<linux/compiler.h>
+#include	<linux/cpuset.h>
 #include	<linux/seq_file.h>
 #include	<linux/notifier.h>
 #include	<linux/kallsyms.h>
@@ -173,12 +174,12 @@
 			 SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
-			 SLAB_DESTROY_BY_RCU)
+			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
-			 SLAB_DESTROY_BY_RCU)
+			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #endif
 
 /*
@@ -2810,6 +2811,14 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	if (unlikely(current->mempolicy && !in_interrupt())) {
 		int nid = slab_node(current->mempolicy);
 
+		if (nid != numa_node_id())
+			return __cache_alloc_node(cachep, flags, nid);
+	}
+	if (unlikely(cpuset_do_slab_mem_spread() &&
+					(cachep->flags & SLAB_MEM_SPREAD) &&
+					!in_interrupt())) {
+		int nid = cpuset_mem_spread_node();
+
 		if (nid != numa_node_id())
 			return __cache_alloc_node(cachep, flags, nid);
 	}
-- 
cgit v1.2.3


From c61afb181c649754ea221f104e268cbacfc993e3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:08 -0800
Subject: [PATCH] cpuset memory spread slab cache optimizations

The hooks in the slab cache allocator code path for support of NUMA
mempolicies and cpuset memory spreading are in an important code path.  Many
systems will use neither feature.

This patch optimizes those hooks down to a single check of some bits in the
current tasks task_struct flags.  For non NUMA systems, this hook and related
code is already ifdef'd out.

The optimization is done by using another task flag, set if the task is using
a non-default NUMA mempolicy.  Taking this flag bit along with the
PF_SPREAD_PAGE and PF_SPREAD_SLAB flag bits added earlier in this 'cpuset
memory spreading' patch set, one can check for the combination of any of these
special case memory placement mechanisms with a single test of the current
tasks task_struct flags.

This patch also tightens up the code, to save a few bytes of kernel text
space, and moves some of it out of line.  Due to the nested inlines called
from multiple places, we were ending up with three copies of this code, which
once we get off the main code path (for local node allocation) seems a bit
wasteful of instruction memory.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  5 +++++
 include/linux/sched.h     |  1 +
 kernel/fork.c             |  1 +
 mm/mempolicy.c            | 32 ++++++++++++++++++++++++++++++++
 mm/slab.c                 | 41 ++++++++++++++++++++++++++++-------------
 5 files changed, 67 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index bbd2221923c3..6a7621b2b12b 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -147,6 +147,7 @@ extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_fix_fork_child_flag(struct task_struct *p);
 #define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
 
 #ifdef CONFIG_CPUSET
@@ -248,6 +249,10 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
+static inline void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+}
+
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0e37cfa09f5..2cda439ece43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -932,6 +932,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/fork.c b/kernel/fork.c
index c21bae8c93b9..a02063903aaa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,6 +1021,7 @@ static task_t *copy_process(unsigned long clone_flags,
  		p->mempolicy = NULL;
  		goto bad_fork_cleanup_cpuset;
  	}
+	mpol_fix_fork_child_flag(p);
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22b..4f71cfd29c6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+	if (p->mempolicy)
+		p->flags |= PF_MEMPOLICY;
+	else
+		p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+	mpol_fix_fork_child_flag(current);
+}
+
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
+	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
diff --git a/mm/slab.c b/mm/slab.c
index de516658d3d8..f80b52388a12 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -899,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -2808,19 +2809,11 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(current->mempolicy && !in_interrupt())) {
-		int nid = slab_node(current->mempolicy);
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
-	}
-	if (unlikely(cpuset_do_slab_mem_spread() &&
-					(cachep->flags & SLAB_MEM_SPREAD) &&
-					!in_interrupt())) {
-		int nid = cpuset_mem_spread_node();
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
+	if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
+							PF_MEMPOLICY))) {
+		objp = alternate_node_alloc(cachep, flags);
+		if (objp != NULL)
+			return objp;
 	}
 #endif
 
@@ -2855,6 +2848,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 }
 
 #ifdef CONFIG_NUMA
+/*
+ * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	int nid_alloc, nid_here;
+
+	if (in_interrupt())
+		return NULL;
+	nid_alloc = nid_here = numa_node_id();
+	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+		nid_alloc = cpuset_mem_spread_node();
+	else if (current->mempolicy)
+		nid_alloc = slab_node(current->mempolicy);
+	if (nid_alloc != nid_here)
+		return __cache_alloc_node(cachep, flags, nid_alloc);
+	return NULL;
+}
+
 /*
  * A interface to enable slab creation on nodeid
  */
-- 
cgit v1.2.3


From ab7efcc97ebc92e03c0474dfd38f9c7b84b84115 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 24 Mar 2006 03:16:17 -0800
Subject: [PATCH] abstract type/size specification for assembly

Provide abstraction for generating type and size information of assembly
routines and data, while permitting architectures to override these
defaults.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: "Russell King" <rmk@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: "Andi Kleen" <ak@suse.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/linkage.h |  8 ++++++++
 include/asm-mips/linkage.h |  4 +++-
 include/asm-v850/linkage.h |  4 +++-
 include/linux/linkage.h    | 16 +++++++++++++---
 4 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-ia64/linkage.h b/include/asm-ia64/linkage.h
index 14cd72cd8007..ef22a45c1890 100644
--- a/include/asm-ia64/linkage.h
+++ b/include/asm-ia64/linkage.h
@@ -1,6 +1,14 @@
 #ifndef __ASM_LINKAGE_H
 #define __ASM_LINKAGE_H
 
+#ifndef __ASSEMBLY__
+
 #define asmlinkage CPP_ASMLINKAGE __attribute__((syscall_linkage))
 
+#else
+
+#include <asm/asmmacro.h>
+
+#endif
+
 #endif
diff --git a/include/asm-mips/linkage.h b/include/asm-mips/linkage.h
index 291c2d01c44f..b6185d3cfe68 100644
--- a/include/asm-mips/linkage.h
+++ b/include/asm-mips/linkage.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_LINKAGE_H
 #define __ASM_LINKAGE_H
 
-/* Nothing to see here... */
+#ifdef __ASSEMBLY__
+#include <asm/asm.h>
+#endif
 
 #endif
diff --git a/include/asm-v850/linkage.h b/include/asm-v850/linkage.h
index 291c2d01c44f..b6185d3cfe68 100644
--- a/include/asm-v850/linkage.h
+++ b/include/asm-v850/linkage.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_LINKAGE_H
 #define __ASM_LINKAGE_H
 
-/* Nothing to see here... */
+#ifdef __ASSEMBLY__
+#include <asm/asm.h>
+#endif
 
 #endif
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 147eb01e0d4b..c08c9983e840 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -28,17 +28,27 @@
 #define ALIGN __ALIGN
 #define ALIGN_STR __ALIGN_STR
 
+#ifndef ENTRY
 #define ENTRY(name) \
   .globl name; \
   ALIGN; \
   name:
+#endif
 
 #define KPROBE_ENTRY(name) \
   .section .kprobes.text, "ax"; \
-  .globl name; \
-  ALIGN; \
-  name:
+  ENTRY(name)
 
+#ifndef END
+#define END(name) \
+  .size name, .-name
+#endif
+
+#ifndef ENDPROC
+#define ENDPROC(name) \
+  .type name, @function; \
+  END(name)
+#endif
 
 #endif
 
-- 
cgit v1.2.3


From ebcf28e1c7a295f3321249dd235ad2e45938fdd9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:04 -0800
Subject: [PATCH] fadvise(): write commands

Add two new linux-specific fadvise extensions():

LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
offsets `offset' and `offset+len'.  Any pages which are currently under
writeout are skipped, whether or not they are dirty.

LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
offsets `offset' and `offset+len'.

By combining these two operations the application may do several things:

LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.

LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty
pages at the disk.

LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all
of the currently dirty pages at the disk, wait until they have been written.

It should be noted that none of these operations write out the file's
metadata.  So unless the application is strictly performing overwrites of
already-instantiated disk blocks, there are no guarantees here that the data
will be available after a crash.

To complete this suite of operations I guess we should have a "sync file
metadata only" operation.  This gives applications access to all the building
blocks needed for all sorts of sync operations.  But sync-metadata doesn't fit
well with the fadvise() interface.  Probably it should be a new syscall:
sys_fmetadatasync().

The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64().
It is made to represent that last affected byte in the file (ie: it is
inclusive).  Generally, all these byterange and pagerange functions are
inclusive so we can easily represent EOF with -1.

As Ulrich notes, these two functions are somewhat abusive of the fadvise()
concept, which appears to be "set the future policy for this fd".

But these commands are a perfect fit with the fadvise() impementation, and
several of the existing fadvise() commands are synchronous and don't affect
future policy either.   I think we can live with the slight incongruity.

Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fadvise.h |  6 ++++++
 include/linux/fs.h      |  5 +++++
 mm/fadvise.c            | 46 +++++++++++++++++++++++++++++++++++++++++-----
 mm/filemap.c            | 10 +++++-----
 4 files changed, 57 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h
index e8e747139b9a..b2913bba35d8 100644
--- a/include/linux/fadvise.h
+++ b/include/linux/fadvise.h
@@ -18,4 +18,10 @@
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
 #endif
 
+/*
+ * Linux-specific fadvise() extensions:
+ */
+#define LINUX_FADV_ASYNC_WRITE	32	/* Start writeout on range */
+#define LINUX_FADV_WRITE_WAIT	33	/* Wait upon writeout to range */
+
 #endif	/* FADVISE_H_INCLUDED */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65e6df247ea5..0ad70c1e5e55 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1473,6 +1473,11 @@ extern int filemap_fdatawait(struct address_space *);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
 				        loff_t lstart, loff_t lend);
+extern int wait_on_page_writeback_range(struct address_space *mapping,
+				pgoff_t start, pgoff_t end);
+extern int __filemap_fdatawrite_range(struct address_space *mapping,
+				loff_t start, loff_t end, int sync_mode);
+
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/fadvise.h>
+#include <linux/writeback.h>
 #include <linux/syscalls.h>
 
 #include <asm/unistd.h>
@@ -22,13 +23,36 @@
 /*
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
+ *
+ * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
+ * offsets `offset' and `offset+len' inclusive.  Any pages which are currently
+ * under writeout are skipped, whether or not they are dirty.
+ *
+ * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
+ * offsets `offset' and `offset+len'.
+ *
+ * By combining these two operations the application may do several things:
+ *
+ * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
+ * dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
+ * all of the currently dirty pages at the disk, wait until they have been
+ * written.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata.  So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
  */
 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 {
 	struct file *file = fget(fd);
 	struct address_space *mapping;
 	struct backing_dev_info *bdi;
-	loff_t endbyte;
+	loff_t endbyte;			/* inclusive */
 	pgoff_t start_index;
 	pgoff_t end_index;
 	unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	endbyte = offset + len;
 	if (!len || endbyte < len)
 		endbyte = -1;
+	else
+		endbyte--;		/* inclusive */
 
 	bdi = mapping->backing_dev_info;
 
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 
 		/* First and last PARTIAL page! */
 		start_index = offset >> PAGE_CACHE_SHIFT;
-		end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
+		end_index = endbyte >> PAGE_CACHE_SHIFT;
 
 		/* Careful about overflow on the "+1" */
 		nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 			filemap_flush(mapping);
 
 		/* First and last FULL page! */
-		start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
 		end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
-		if (end_index > start_index)
-			invalidate_mapping_pages(mapping, start_index, end_index-1);
+		if (end_index >= start_index)
+			invalidate_mapping_pages(mapping, start_index,
+						end_index);
+		break;
+	case LINUX_FADV_ASYNC_WRITE:
+		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+						WB_SYNC_NONE);
+		break;
+	case LINUX_FADV_WRITE_WAIT:
+		ret = wait_on_page_writeback_range(mapping,
+					offset >> PAGE_CACHE_SHIFT,
+					endbyte >> PAGE_CACHE_SHIFT);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index c1b1708cc95d..3ef20739e725 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -183,8 +183,8 @@ static int sync_page(void *word)
  * these two operations is that if a dirty page/buffer is encountered, it must
  * be waited upon, and not just skipped over.
  */
-static int __filemap_fdatawrite_range(struct address_space *mapping,
-	loff_t start, loff_t end, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+				loff_t end, int sync_mode)
 {
 	int ret;
 	struct writeback_control wbc = {
@@ -213,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
-static int filemap_fdatawrite_range(struct address_space *mapping,
-	loff_t start, loff_t end)
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+				loff_t end)
 {
 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
@@ -233,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
  * Wait for writeback to complete against pages indexed by start->end
  * inclusive
  */
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
 				pgoff_t start, pgoff_t end)
 {
 	struct pagevec pvec;
-- 
cgit v1.2.3


From fa5a734e406b53761fcc5ee22366006f71112c2d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:10 -0800
Subject: [PATCH] balance_dirty_pages_ratelimited: take nr_pages arg

Modify balance_dirty_pages_ratelimited() so that it can take a
number-of-pages-which-I-just-dirtied argument.  For msync().

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h | 10 +++++++++-
 mm/page-writeback.c       | 24 +++++++++++++++---------
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 609565961494..56f92fcbe94a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -99,7 +99,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
 				      void __user *, size_t *, loff_t *);
 
 void page_writeback_init(void);
-void balance_dirty_pages_ratelimited(struct address_space *mapping);
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+					unsigned long nr_pages_dirtied);
+
+static inline void
+balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+	balance_dirty_pages_ratelimited_nr(mapping, 1);
+}
+
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
 int sync_page_range(struct inode *inode, struct address_space *mapping,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c1052ee79f01..c67ddc464721 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -256,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 }
 
 /**
- * balance_dirty_pages_ratelimited - balance dirty memory state
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
  * @mapping: address_space which was dirtied
+ * @nr_pages: number of pages which the caller has just dirtied
  *
  * Processes which are dirtying memory should call in here once for each page
  * which was newly dirtied.  The function will periodically check the system's
@@ -268,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
  * limit we decrease the ratelimiting by a lot, to prevent individual processes
  * from overshooting the limit by (ratelimit_pages) each.
  */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+					unsigned long nr_pages_dirtied)
 {
-	static DEFINE_PER_CPU(int, ratelimits) = 0;
-	long ratelimit;
+	static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
+	unsigned long ratelimit;
+	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
 	if (dirty_exceeded)
@@ -281,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	 * Check the rate limiting. Also, we do not want to throttle real-time
 	 * tasks in balance_dirty_pages(). Period.
 	 */
-	if (get_cpu_var(ratelimits)++ >= ratelimit) {
-		__get_cpu_var(ratelimits) = 0;
-		put_cpu_var(ratelimits);
+	preempt_disable();
+	p =  &__get_cpu_var(ratelimits);
+	*p += nr_pages_dirtied;
+	if (unlikely(*p >= ratelimit)) {
+		*p = 0;
+		preempt_enable();
 		balance_dirty_pages(mapping);
 		return;
 	}
-	put_cpu_var(ratelimits);
+	preempt_enable();
 }
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
 void throttle_vm_writeout(void)
 {
-- 
cgit v1.2.3


From 4741c9fd36b3bcadd37238321c469049da94a4b9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:11 -0800
Subject: [PATCH] set_page_dirty() return value fixes

We need set_page_dirty() to return true if it actually transitioned the page
from a clean to dirty state.  This wasn't right in a couple of places.  Do a
kernel-wide audit, fix things up.

This leaves open the possibility of returning a negative errno from
set_page_dirty() sometime in the future.  But we don't do that at present.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/arch-v32/drivers/cryptocop.c |  2 +-
 drivers/block/rd.c                     |  3 ++-
 fs/buffer.c                            |  2 +-
 include/linux/fs.h                     |  2 +-
 mm/page-writeback.c                    | 11 ++++++-----
 5 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index 501fa52d8d3a..c59ee28a35f4 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -2944,7 +2944,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
 		int spdl_err;
 		/* Mark output pages dirty. */
 		spdl_err = set_page_dirty_lock(outpages[i]);
-		DEBUG(if (spdl_err)printk("cryptocop_ioctl_process: set_page_dirty_lock returned %d\n", spdl_err));
+		DEBUG(if (spdl_err < 0)printk("cryptocop_ioctl_process: set_page_dirty_lock returned %d\n", spdl_err));
 	}
 	for (i = 0; i < nooutpages; i++){
 		put_page(outpages[i]);
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 1c54f46d3f70..940bfd7951e5 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -186,7 +186,8 @@ static int ramdisk_writepages(struct address_space *mapping,
  */
 static int ramdisk_set_page_dirty(struct page *page)
 {
-	SetPageDirty(page);
+	if (!TestSetPageDirty(page))
+		return 1;
 	return 0;
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 11ca6eb46a33..24262ea8cc50 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -865,8 +865,8 @@ int __set_page_dirty_buffers(struct page *page)
 		}
 		write_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+		return 1;
 	}
-	
 	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0ad70c1e5e55..092cfaee0cd2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -350,7 +350,7 @@ struct address_space_operations {
 	/* Write back some dirty pages from this mapping. */
 	int (*writepages)(struct address_space *, struct writeback_control *);
 
-	/* Set a page dirty */
+	/* Set a page dirty.  Return true if this dirtied it */
 	int (*set_page_dirty)(struct page *page);
 
 	int (*readpages)(struct file *filp, struct address_space *mapping,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c67ddc464721..893d7677579e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -628,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
-	int ret = 0;
-
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
@@ -651,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
 							I_DIRTY_PAGES);
 			}
 		}
+		return 1;
 	}
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
@@ -682,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
 			return (*spd)(page);
 		return __set_page_dirty_buffers(page);
 	}
-	if (!PageDirty(page))
-		SetPageDirty(page);
+	if (!PageDirty(page)) {
+		if (!TestSetPageDirty(page))
+			return 1;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(set_page_dirty);
-- 
cgit v1.2.3


From 18e79b40ed9c5223b88771f805c69f5993fc131b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:14 -0800
Subject: [PATCH] fsync: extract internal code

Pull the guts out of do_fsync() - we can use it elsewhere.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c        | 43 +++++++++++++++++++++++--------------------
 include/linux/fs.h |  1 +
 2 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 24262ea8cc50..6d77ce9f54e5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -327,31 +327,24 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	return ret;
 }
 
-static long do_fsync(unsigned int fd, int datasync)
+long do_fsync(struct file *file, int datasync)
 {
-	struct file * file;
-	struct address_space *mapping;
-	int ret, err;
-
-	ret = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
+	int ret;
+	int err;
+	struct address_space *mapping = file->f_mapping;
 
-	ret = -EINVAL;
 	if (!file->f_op || !file->f_op->fsync) {
 		/* Why?  We can still call filemap_fdatawrite */
-		goto out_putf;
+		ret = -EINVAL;
+		goto out;
 	}
 
-	mapping = file->f_mapping;
-
 	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(mapping);
 
 	/*
-	 * We need to protect against concurrent writers,
-	 * which could cause livelocks in fsync_buffers_list
+	 * We need to protect against concurrent writers, which could cause
+	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
 	err = file->f_op->fsync(file, file->f_dentry, datasync);
@@ -362,21 +355,31 @@ static long do_fsync(unsigned int fd, int datasync)
 	if (!ret)
 		ret = err;
 	current->flags &= ~PF_SYNCWRITE;
-
-out_putf:
-	fput(file);
 out:
 	return ret;
 }
 
+static long __do_fsync(unsigned int fd, int datasync)
+{
+	struct file *file;
+	int ret = -EBADF;
+
+	file = fget(fd);
+	if (file) {
+		ret = do_fsync(file, datasync);
+		fput(file);
+	}
+	return ret;
+}
+
 asmlinkage long sys_fsync(unsigned int fd)
 {
-	return do_fsync(fd, 0);
+	return __do_fsync(fd, 0);
 }
 
 asmlinkage long sys_fdatasync(unsigned int fd)
 {
-	return do_fsync(fd, 1);
+	return __do_fsync(fd, 1);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 092cfaee0cd2..215696a0f16f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1478,6 +1478,7 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
 extern int __filemap_fdatawrite_range(struct address_space *mapping,
 				loff_t start, loff_t end, int sync_mode);
 
+extern long do_fsync(struct file *file, int datasync);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
-- 
cgit v1.2.3


From 97f2478db161714d0e3e864f38f5789c66f98b81 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus@drzeus.cx>
Date: Fri, 24 Mar 2006 03:18:16 -0800
Subject: [PATCH] Secure Digital Host Controller id and regs

Class code and register definitions for the Secure Digital Host Controller
standard.

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index f3dcf89d5232..6f080ae59286 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -69,6 +69,7 @@
 #define PCI_CLASS_SYSTEM_TIMER		0x0802
 #define PCI_CLASS_SYSTEM_RTC		0x0803
 #define PCI_CLASS_SYSTEM_PCI_HOTPLUG	0x0804
+#define PCI_CLASS_SYSTEM_SDHCI		0x0805
 #define PCI_CLASS_SYSTEM_OTHER		0x0880
 
 #define PCI_BASE_CLASS_INPUT		0x09
-- 
cgit v1.2.3


From 208a08f7cc2a8932ed76162d9844f9ae7d7fc015 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Fri, 24 Mar 2006 03:18:21 -0800
Subject: [PATCH] ide: Allow IDE interface to specify its not capable of 32-bit
 operations

In some embedded systems the IDE hardware interface may only support 16-bit
or smaller accesses.  Allow the interface to specify if this is the case
and don't allow the drive or user to override the setting.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/ide-disk.c  | 2 --
 drivers/ide/ide-probe.c | 9 +++++++++
 include/linux/ide.h     | 1 +
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index e238b7da824b..ccf528d733bf 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -978,8 +978,6 @@ static void idedisk_setup (ide_drive_t *drive)
 		ide_dma_verbose(drive);
 	printk("\n");
 
-	drive->no_io_32bit = id->dword_io ? 1 : 0;
-
 	/* write cache enabled? */
 	if ((id->csfo & 1) || (id->cfs_enable_1 & (1 << 5)))
 		drive->wcache = 1;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 427d1c204174..1b7b4c531bc2 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -858,6 +858,15 @@ static void probe_hwif(ide_hwif_t *hwif)
 			}
 		}
 	}
+
+	for (unit = 0; unit < MAX_DRIVES; ++unit) {
+		ide_drive_t *drive = &hwif->drives[unit];
+
+		if (hwif->no_io_32bit)
+			drive->no_io_32bit = 1;
+		else
+			drive->no_io_32bit = drive->id->dword_io ? 1 : 0;
+	}
 }
 
 static int hwif_init(ide_hwif_t *hwif);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a7fc4cc79b23..8d2db412ba9c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -792,6 +792,7 @@ typedef struct hwif_s {
 	unsigned	no_dsc     : 1;	/* 0 default, 1 dsc_overlap disabled */
 	unsigned	auto_poll  : 1; /* supports nop auto-poll */
 	unsigned	sg_mapped  : 1;	/* sg_table and sg_nents are ready */
+	unsigned	no_io_32bit : 1; /* 1 = can not do 32-bit IO ops */
 
 	struct device	gendev;
 	struct completion gendev_rel_comp; /* To deal with device release() */
-- 
cgit v1.2.3


From 6687a97d4041f996f725902d2990e5de6ef5cbe5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 24 Mar 2006 03:18:41 -0800
Subject: [PATCH] timer-irq-driven soft-watchdog, cleanups

Make the softlockup detector purely timer-interrupt driven, removing
softirq-context (timer) dependencies.  This means that if the softlockup
watchdog triggers, it has truly observed a longer than 10 seconds
scheduling delay of a SCHED_FIFO prio 99 task.

(the patch also turns off the softlockup detector during the initial bootup
phase and does small style fixes)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  4 ++--
 kernel/softlockup.c   | 54 ++++++++++++++++++++++++++++-----------------------
 kernel/timer.c        |  2 +-
 3 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cda439ece43..e0054c1b9a09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -206,11 +206,11 @@ extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
-extern void softlockup_tick(struct pt_regs *regs);
+extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 #else
-static inline void softlockup_tick(struct pt_regs *regs)
+static inline void softlockup_tick(void)
 {
 }
 static inline void spawn_softlockup_task(void)
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c67189a25d52..dd9524fa649a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -1,12 +1,11 @@
 /*
  * Detect Soft Lockups
  *
- * started by Ingo Molnar, (C) 2005, Red Hat
+ * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
  *
  * this code detects soft lockups: incidents in where on a CPU
  * the kernel does not reschedule for 10 seconds or more.
  */
-
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/init.h>
@@ -17,13 +16,14 @@
 
 static DEFINE_SPINLOCK(print_lock);
 
-static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
-static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, touch_timestamp);
+static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 static int did_panic = 0;
-static int softlock_panic(struct notifier_block *this, unsigned long event,
-				void *ptr)
+
+static int
+softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	did_panic = 1;
 
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
 
 void touch_softlockup_watchdog(void)
 {
-	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+	per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog);
  * This callback runs from the timer interrupt, and checks
  * whether the watchdog thread has hung or not:
  */
-void softlockup_tick(struct pt_regs *regs)
+void softlockup_tick(void)
 {
 	int this_cpu = smp_processor_id();
-	unsigned long timestamp = per_cpu(timestamp, this_cpu);
+	unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
 
-	if (per_cpu(print_timestamp, this_cpu) == timestamp)
+	/* prevent double reports: */
+	if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
+		did_panic ||
+			!per_cpu(watchdog_task, this_cpu))
 		return;
 
-	/* Do not cause a second panic when there already was one */
-	if (did_panic)
+	/* do not print during early bootup: */
+	if (unlikely(system_state != SYSTEM_RUNNING)) {
+		touch_softlockup_watchdog();
 		return;
+	}
 
-	if (time_after(jiffies, timestamp + 10*HZ)) {
-		per_cpu(print_timestamp, this_cpu) = timestamp;
+	/* Wake up the high-prio watchdog task every second: */
+	if (time_after(jiffies, touch_timestamp + HZ))
+		wake_up_process(per_cpu(watchdog_task, this_cpu));
+
+	/* Warn about unreasonable 10+ seconds delays: */
+	if (time_after(jiffies, touch_timestamp + 10*HZ)) {
+		per_cpu(print_timestamp, this_cpu) = touch_timestamp;
 
 		spin_lock(&print_lock);
 		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
 			this_cpu);
-		show_regs(regs);
+		dump_stack();
 		spin_unlock(&print_lock);
 	}
 }
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu)
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->flags |= PF_NOFREEZE;
 
-	set_current_state(TASK_INTERRUPTIBLE);
-
 	/*
-	 * Run briefly once per second - if this gets delayed for
-	 * more than 10 seconds then the debug-printout triggers
-	 * in softlockup_tick():
+	 * Run briefly once per second to reset the softlockup timestamp.
+	 * If this gets delayed for more than 10 seconds then the
+	 * debug-printout triggers in softlockup_tick().
 	 */
 	while (!kthread_should_stop()) {
-		msleep_interruptible(1000);
+		set_current_state(TASK_INTERRUPTIBLE);
 		touch_softlockup_watchdog();
+		schedule();
 	}
-	__set_current_state(TASK_RUNNING);
 
 	return 0;
 }
@@ -114,7 +122,6 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		kthread_bind(p, hotcpu);
  		break;
 	case CPU_ONLINE:
-
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -146,4 +153,3 @@ __init void spawn_softlockup_task(void)
 
 	notifier_chain_register(&panic_notifier_list, &panic_block);
 }
-
diff --git a/kernel/timer.c b/kernel/timer.c
index 4427e725ccdd..17d956cebcb9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -915,6 +915,7 @@ static void run_timer_softirq(struct softirq_action *h)
 void run_local_timers(void)
 {
 	raise_softirq(TIMER_SOFTIRQ);
+	softlockup_tick();
 }
 
 /*
@@ -945,7 +946,6 @@ void do_timer(struct pt_regs *regs)
 	/* prevent loading jiffies before storing new jiffies_64 value. */
 	barrier();
 	update_times();
-	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
cgit v1.2.3


From 96840aa00a031069a136ec4c55d0bdd09ac6d3a7 Mon Sep 17 00:00:00 2001
From: Davi Arnaut <davi.arnaut@gmail.com>
Date: Fri, 24 Mar 2006 03:18:42 -0800
Subject: [PATCH] strndup_user()

This patch series creates a strndup_user() function to easy copying C strings
from userspace.  Also we avoid common pitfalls like userspace modifying the
final \0 after the strlen_user().

Signed-off-by: Davi Arnaut <davi.arnaut@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/string.h |  2 ++
 mm/util.c              | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index 369be3264a55..dee221429ad0 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -18,6 +18,8 @@ extern char * strsep(char **,const char *);
 extern __kernel_size_t strspn(const char *,const char *);
 extern __kernel_size_t strcspn(const char *,const char *);
 
+extern char *strndup_user(const char __user *, long);
+
 /*
  * Include machine specific inline routines
  */
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59da63c..49e29f751b50 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,6 +1,8 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
+#include <linux/err.h>
+#include <asm/uaccess.h>
 
 /**
  * kzalloc - allocate memory. The memory is set to zero.
@@ -37,3 +39,38 @@ char *kstrdup(const char *s, gfp_t gfp)
 	return buf;
 }
 EXPORT_SYMBOL(kstrdup);
+
+/*
+ * strndup_user - duplicate an existing string from user space
+ *
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ */
+char *strndup_user(const char __user *s, long n)
+{
+	char *p;
+	long length;
+
+	length = strnlen_user(s, n);
+
+	if (!length)
+		return ERR_PTR(-EFAULT);
+
+	if (length > n)
+		return ERR_PTR(-EINVAL);
+
+	p = kmalloc(length, GFP_KERNEL);
+
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(p, s, length)) {
+		kfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+
+	p[length - 1] = '\0';
+
+	return p;
+}
+EXPORT_SYMBOL(strndup_user);
-- 
cgit v1.2.3


From 13fce8062968996da496d4f65cc1c1f845704604 Mon Sep 17 00:00:00 2001
From: Andrzej Zaborowski <balrogg@gmail.com>
Date: Fri, 24 Mar 2006 18:13:37 +0100
Subject: Fix simple typos

This corrects some trivial errors in ARM docs and comments,

Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 Documentation/arm/Booting | 2 +-
 Documentation/arm/README  | 2 +-
 Documentation/arm/Setup   | 2 +-
 include/linux/timer.h     | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/arm/Booting b/Documentation/arm/Booting
index fad566bb02fc..76850295af8f 100644
--- a/Documentation/arm/Booting
+++ b/Documentation/arm/Booting
@@ -118,7 +118,7 @@ to store page tables.  The recommended placement is 32KiB into RAM.
 
 In either case, the following conditions must be met:
 
-- Quiesce all DMA capable devicess so that memory does not get
+- Quiesce all DMA capable devices so that memory does not get
   corrupted by bogus network packets or disk data. This will save
   you many hours of debug.
 
diff --git a/Documentation/arm/README b/Documentation/arm/README
index 5ed6f3530b86..9b9c8226fdc4 100644
--- a/Documentation/arm/README
+++ b/Documentation/arm/README
@@ -89,7 +89,7 @@ Modules
   Although modularisation is supported (and required for the FP emulator),
   each module on an ARM2/ARM250/ARM3 machine when is loaded will take
   memory up to the next 32k boundary due to the size of the pages.
-  Therefore, modularisation on these machines really worth it?
+  Therefore, is modularisation on these machines really worth it?
 
   However, ARM6 and up machines allow modules to take multiples of 4k, and
   as such Acorn RiscPCs and other architectures using these processors can
diff --git a/Documentation/arm/Setup b/Documentation/arm/Setup
index 0abd0720d7ed..0cb1e64bde80 100644
--- a/Documentation/arm/Setup
+++ b/Documentation/arm/Setup
@@ -58,7 +58,7 @@ below:
  video_y
 
    This describes the character position of cursor on VGA console, and
-   is otherwise unused. (should not used for other console types, and
+   is otherwise unused. (should not be used for other console types, and
    should not be used for other purposes).
 
  memc_control_reg
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 9b9877fd2505..ee5a09e806e8 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -69,13 +69,13 @@ extern unsigned long next_timer_interrupt(void);
  * @timer: the timer to be added
  *
  * The kernel will do a ->function(->data) callback from the
- * timer interrupt at the ->expired point in the future. The
+ * timer interrupt at the ->expires point in the future. The
  * current time is 'jiffies'.
  *
- * The timer's ->expired, ->function (and if the handler uses it, ->data)
+ * The timer's ->expires, ->function (and if the handler uses it, ->data)
  * fields must be set prior calling this function.
  *
- * Timers with an ->expired field in the past will be executed in the next
+ * Timers with an ->expires field in the past will be executed in the next
  * timer tick.
  */
 static inline void add_timer(struct timer_list *timer)
-- 
cgit v1.2.3


From aec5c3c1a929d7d79a420e943285cf3ba26a7c0d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Sat, 25 Mar 2006 01:33:34 +0900
Subject: [PATCH] libata: kill E.D.D.

E.D.D. has no user in-tree and mostly useless.  Kill it.  For possible
out-of-tree users, add a nice warning message and error handling if
LLDD doesn't report any useable reset mechanism (and thus tries to use
E.D.D.).

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 98 ++++++----------------------------------------
 include/linux/libata.h     |  1 -
 2 files changed, 13 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index d279666dcb38..0aff888d9ecd 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1081,9 +1081,8 @@ unsigned int ata_pio_need_iordy(const struct ata_device *adev)
  *
  *	Read ID data from the specified device.  ATA_CMD_ID_ATA is
  *	performed on ATA devices and ATA_CMD_ID_ATAPI on ATAPI
- *	devices.  This function also takes care of EDD signature
- *	misreporting (to be removed once EDD support is gone) and
- *	issues ATA_CMD_INIT_DEV_PARAMS for pre-ATA4 drives.
+ *	devices.  This function also issues ATA_CMD_INIT_DEV_PARAMS
+ *	for pre-ATA4 drives.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep)
@@ -1095,7 +1094,6 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 			   unsigned int *p_class, int post_reset, u16 **p_id)
 {
 	unsigned int class = *p_class;
-	unsigned int using_edd;
 	struct ata_taskfile tf;
 	unsigned int err_mask = 0;
 	u16 *id;
@@ -1104,12 +1102,6 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 
 	DPRINTK("ENTER, host %u, dev %u\n", ap->id, dev->devno);
 
-	if (ap->ops->probe_reset ||
-	    ap->flags & (ATA_FLAG_SRST | ATA_FLAG_SATA_RESET))
-		using_edd = 0;
-	else
-		using_edd = 1;
-
 	ata_dev_select(ap, dev->devno, 1, 1); /* select device 0/1 */
 
 	id = kmalloc(sizeof(id[0]) * ATA_ID_WORDS, GFP_KERNEL);
@@ -1139,32 +1131,9 @@ static int ata_dev_read_id(struct ata_port *ap, struct ata_device *dev,
 
 	err_mask = ata_exec_internal(ap, dev, &tf, DMA_FROM_DEVICE,
 				     id, sizeof(id[0]) * ATA_ID_WORDS);
-
 	if (err_mask) {
 		rc = -EIO;
 		reason = "I/O error";
-
-		if (err_mask & ~AC_ERR_DEV)
-			goto err_out;
-
-		/*
-		 * arg!  EDD works for all test cases, but seems to return
-		 * the ATA signature for some ATAPI devices.  Until the
-		 * reason for this is found and fixed, we fix up the mess
-		 * here.  If IDENTIFY DEVICE returns command aborted
-		 * (as ATAPI devices do), then we issue an
-		 * IDENTIFY PACKET DEVICE.
-		 *
-		 * ATA software reset (SRST, the default) does not appear
-		 * to have this problem.
-		 */
-		if ((using_edd) && (class == ATA_DEV_ATA)) {
-			u8 err = tf.feature;
-			if (err & ATA_ABORTED) {
-				class = ATA_DEV_ATAPI;
-				goto retry;
-			}
-		}
 		goto err_out;
 	}
 
@@ -2005,45 +1974,6 @@ static void ata_bus_post_reset(struct ata_port *ap, unsigned int devmask)
 		ap->ops->dev_select(ap, 0);
 }
 
-/**
- *	ata_bus_edd - Issue EXECUTE DEVICE DIAGNOSTIC command.
- *	@ap: Port to reset and probe
- *
- *	Use the EXECUTE DEVICE DIAGNOSTIC command to reset and
- *	probe the bus.  Not often used these days.
- *
- *	LOCKING:
- *	PCI/etc. bus probe sem.
- *	Obtains host_set lock.
- *
- */
-
-static unsigned int ata_bus_edd(struct ata_port *ap)
-{
-	struct ata_taskfile tf;
-	unsigned long flags;
-
-	/* set up execute-device-diag (bus reset) taskfile */
-	/* also, take interrupts to a known state (disabled) */
-	DPRINTK("execute-device-diag\n");
-	ata_tf_init(ap, &tf, 0);
-	tf.ctl |= ATA_NIEN;
-	tf.command = ATA_CMD_EDD;
-	tf.protocol = ATA_PROT_NODATA;
-
-	/* do bus reset */
-	spin_lock_irqsave(&ap->host_set->lock, flags);
-	ata_tf_to_host(ap, &tf);
-	spin_unlock_irqrestore(&ap->host_set->lock, flags);
-
-	/* spec says at least 2ms.  but who knows with those
-	 * crazy ATAPI devices...
-	 */
-	msleep(150);
-
-	return ata_busy_sleep(ap, ATA_TMOUT_BOOT_QUICK, ATA_TMOUT_BOOT);
-}
-
 static unsigned int ata_bus_softreset(struct ata_port *ap,
 				      unsigned int devmask)
 {
@@ -2116,7 +2046,7 @@ void ata_bus_reset(struct ata_port *ap)
 	struct ata_ioports *ioaddr = &ap->ioaddr;
 	unsigned int slave_possible = ap->flags & ATA_FLAG_SLAVE_POSS;
 	u8 err;
-	unsigned int dev0, dev1 = 0, rc = 0, devmask = 0;
+	unsigned int dev0, dev1 = 0, devmask = 0;
 
 	DPRINTK("ENTER, host %u, port %u\n", ap->id, ap->port_no);
 
@@ -2139,18 +2069,8 @@ void ata_bus_reset(struct ata_port *ap)
 
 	/* issue bus reset */
 	if (ap->flags & ATA_FLAG_SRST)
-		rc = ata_bus_softreset(ap, devmask);
-	else if ((ap->flags & ATA_FLAG_SATA_RESET) == 0) {
-		/* set up device control */
-		if (ap->flags & ATA_FLAG_MMIO)
-			writeb(ap->ctl, (void __iomem *) ioaddr->ctl_addr);
-		else
-			outb(ap->ctl, ioaddr->ctl_addr);
-		rc = ata_bus_edd(ap);
-	}
-
-	if (rc)
-		goto err_out;
+		if (ata_bus_softreset(ap, devmask))
+			goto err_out;
 
 	/*
 	 * determine by signature whether we have ATA or ATAPI devices
@@ -4536,6 +4456,14 @@ static struct ata_port * ata_host_add(const struct ata_probe_ent *ent,
 	int rc;
 
 	DPRINTK("ENTER\n");
+
+	if (!ent->port_ops->probe_reset &&
+	    !(ent->host_flags & (ATA_FLAG_SATA_RESET | ATA_FLAG_SRST))) {
+		printk(KERN_ERR "ata%u: no reset mechanism available\n",
+		       port_no);
+		return NULL;
+	}
+
 	host = scsi_host_alloc(ent->sht, sizeof(struct ata_port));
 	if (!host)
 		return NULL;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 047192253c3a..9fcc061e3adf 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -161,7 +161,6 @@ enum {
 	ATA_QCFLAG_EH_SCHEDULED = (1 << 5), /* EH scheduled */
 
 	/* various lengths of time */
-	ATA_TMOUT_EDD		= 5 * HZ,	/* heuristic */
 	ATA_TMOUT_PIO		= 30 * HZ,
 	ATA_TMOUT_BOOT		= 30 * HZ,	/* heuristic */
 	ATA_TMOUT_BOOT_QUICK	= 7 * HZ,	/* heuristic */
-- 
cgit v1.2.3


From 301e22d69140898eddd38a9134da711cb5dfc170 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sat, 18 Mar 2006 17:15:00 -0300
Subject: V4L/DVB (3584): Implement V4L2_TUNER_MODE_LANG1_LANG2 audio mode

Add a new audio mode V4L2_TUNER_MODE_LANG1_LANG2 (used by VIDIOC_G/S_TUNER).
This mode allows the user to select both languages of a bilingual transmission,
one language on the left, one on the right audio channel. If there is no
bilingual transmission, or it is not supported, then this mode should act like
V4L2_TUNER_MODE_STEREO.
This mode is introduced for PVR-like drivers where it is useful to be able to
record both languages of a bilingual broadcast.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 drivers/media/dvb/ttpci/av7110_v4l.c          | 5 +++++
 drivers/media/video/bt8xx/bttv-driver.c       | 3 ++-
 drivers/media/video/cx25840/cx25840-core.c    | 5 +++--
 drivers/media/video/cx88/cx88-tvaudio.c       | 3 +++
 drivers/media/video/msp3400-kthreads.c        | 8 ++++++--
 drivers/media/video/mxb.c                     | 6 ++++++
 drivers/media/video/saa7134/saa7134-tvaudio.c | 2 ++
 drivers/media/video/tvaudio.c                 | 1 +
 include/linux/videodev2.h                     | 1 +
 9 files changed, 29 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/dvb/ttpci/av7110_v4l.c b/drivers/media/dvb/ttpci/av7110_v4l.c
index 2f23ceab8d44..603a22e4bfe2 100644
--- a/drivers/media/dvb/ttpci/av7110_v4l.c
+++ b/drivers/media/dvb/ttpci/av7110_v4l.c
@@ -369,6 +369,11 @@ static int av7110_ioctl(struct saa7146_fh *fh, unsigned int cmd, void *arg)
 			fm_matrix = 0x3001; // stereo
 			src = 0x0020;
 			break;
+		case V4L2_TUNER_MODE_LANG1_LANG2:
+			dprintk(2, "VIDIOC_S_TUNER: V4L2_TUNER_MODE_LANG1_LANG2\n");
+			fm_matrix = 0x3000; // bilingual
+			src = 0x0020;
+			break;
 		case V4L2_TUNER_MODE_LANG1:
 			dprintk(2, "VIDIOC_S_TUNER: V4L2_TUNER_MODE_LANG1\n");
 			fm_matrix = 0x3000; // mono
diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c
index 80e4a7406ac2..74def9c23952 100644
--- a/drivers/media/video/bt8xx/bttv-driver.c
+++ b/drivers/media/video/bt8xx/bttv-driver.c
@@ -1878,7 +1878,8 @@ static int bttv_common_ioctls(struct bttv *btv, unsigned int cmd, void *arg)
 			bttv_call_i2c_clients(btv, VIDIOCGAUDIO, &va);
 			if (t->audmode == V4L2_TUNER_MODE_MONO)
 				va.mode = VIDEO_SOUND_MONO;
-			else if (t->audmode == V4L2_TUNER_MODE_STEREO)
+			else if (t->audmode == V4L2_TUNER_MODE_STEREO ||
+				 t->audmode == V4L2_TUNER_MODE_LANG1_LANG2)
 				va.mode = VIDEO_SOUND_STEREO;
 			else if (t->audmode == V4L2_TUNER_MODE_LANG1)
 				va.mode = VIDEO_SOUND_LANG1;
diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c
index 7aee37645d63..a65b3cc4bf03 100644
--- a/drivers/media/video/cx25840/cx25840-core.c
+++ b/drivers/media/video/cx25840/cx25840-core.c
@@ -810,13 +810,14 @@ static int cx25840_command(struct i2c_client *client, unsigned int cmd,
 			   bilingual -> lang1 */
 			cx25840_and_or(client, 0x809, ~0xf, 0x00);
 			break;
+		case V4L2_TUNER_MODE_STEREO:
 		case V4L2_TUNER_MODE_LANG1:
 			/* mono      -> mono
 			   stereo    -> stereo
 			   bilingual -> lang1 */
 			cx25840_and_or(client, 0x809, ~0xf, 0x04);
 			break;
-		case V4L2_TUNER_MODE_STEREO:
+		case V4L2_TUNER_MODE_LANG1_LANG2:
 			/* mono      -> mono
 			   stereo    -> stereo
 			   bilingual -> lang1/lang2 */
@@ -824,7 +825,7 @@ static int cx25840_command(struct i2c_client *client, unsigned int cmd,
 			break;
 		case V4L2_TUNER_MODE_LANG2:
 			/* mono      -> mono
-			   stereo    ->stereo
+			   stereo    -> stereo
 			   bilingual -> lang2 */
 			cx25840_and_or(client, 0x809, ~0xf, 0x01);
 			break;
diff --git a/drivers/media/video/cx88/cx88-tvaudio.c b/drivers/media/video/cx88/cx88-tvaudio.c
index da8d97ce0c4b..641a0c5a6490 100644
--- a/drivers/media/video/cx88/cx88-tvaudio.c
+++ b/drivers/media/video/cx88/cx88-tvaudio.c
@@ -885,6 +885,7 @@ void cx88_set_stereo(struct cx88_core *core, u32 mode, int manual)
 			set_audio_standard_BTSC(core, 1, EN_BTSC_FORCE_SAP);
 			break;
 		case V4L2_TUNER_MODE_STEREO:
+		case V4L2_TUNER_MODE_LANG1_LANG2:
 			set_audio_standard_BTSC(core, 0, EN_BTSC_FORCE_STEREO);
 			break;
 		}
@@ -905,6 +906,7 @@ void cx88_set_stereo(struct cx88_core *core, u32 mode, int manual)
 							 EN_NICAM_FORCE_MONO2);
 				break;
 			case V4L2_TUNER_MODE_STEREO:
+			case V4L2_TUNER_MODE_LANG1_LANG2:
 				set_audio_standard_NICAM(core,
 							 EN_NICAM_FORCE_STEREO);
 				break;
@@ -926,6 +928,7 @@ void cx88_set_stereo(struct cx88_core *core, u32 mode, int manual)
 							      EN_A2_FORCE_MONO2);
 					break;
 				case V4L2_TUNER_MODE_STEREO:
+				case V4L2_TUNER_MODE_LANG1_LANG2:
 					set_audio_standard_A2(core,
 							      EN_A2_FORCE_STEREO);
 					break;
diff --git a/drivers/media/video/msp3400-kthreads.c b/drivers/media/video/msp3400-kthreads.c
index 1c794c3b9f21..c3984ea9ca07 100644
--- a/drivers/media/video/msp3400-kthreads.c
+++ b/drivers/media/video/msp3400-kthreads.c
@@ -223,9 +223,9 @@ void msp3400c_set_mode(struct i2c_client *client, int mode)
    nor do they support stereo BTSC. */
 static void msp3400c_set_audmode(struct i2c_client *client)
 {
-	static char *strmode[] = { "mono", "stereo", "lang2", "lang1" };
+	static char *strmode[] = { "mono", "stereo", "lang2", "lang1", "lang1+lang2" };
 	struct msp_state *state = i2c_get_clientdata(client);
-	char *modestr = (state->audmode >= 0 && state->audmode < 4) ?
+	char *modestr = (state->audmode >= 0 && state->audmode < 5) ?
 		strmode[state->audmode] : "unknown";
 	int src = 0;	/* channel source: FM/AM, nicam or SCART */
 
@@ -250,6 +250,7 @@ static void msp3400c_set_audmode(struct i2c_client *client)
 		case V4L2_TUNER_MODE_MONO:
 		case V4L2_TUNER_MODE_LANG1:
 		case V4L2_TUNER_MODE_LANG2:
+		case V4L2_TUNER_MODE_LANG1_LANG2:
 			msp_write_dsp(client, 0x000e, 0x3000);
 			break;
 		}
@@ -261,6 +262,7 @@ static void msp3400c_set_audmode(struct i2c_client *client)
 			msp3400c_set_carrier(client, MSP_CARRIER(6.5), MSP_CARRIER(6.5));
 			break;
 		case V4L2_TUNER_MODE_STEREO:
+		case V4L2_TUNER_MODE_LANG1_LANG2:
 			msp3400c_set_carrier(client, MSP_CARRIER(7.2), MSP_CARRIER(7.02));
 			break;
 		case V4L2_TUNER_MODE_LANG1:
@@ -296,6 +298,7 @@ static void msp3400c_set_audmode(struct i2c_client *client)
 	/* switch audio */
 	switch (state->audmode) {
 	case V4L2_TUNER_MODE_STEREO:
+	case V4L2_TUNER_MODE_LANG1_LANG2:
 		src |= 0x0020;
 		break;
 	case V4L2_TUNER_MODE_MONO:
@@ -835,6 +838,7 @@ static void msp34xxg_set_source(struct i2c_client *client, u16 reg, int in)
 		matrix = 0x10;
 		break;
 	case V4L2_TUNER_MODE_STEREO:
+	case V4L2_TUNER_MODE_LANG1_LANG2:
 	default:
 		source = 1; /* stereo or A|B */
 		matrix = 0x20;
diff --git a/drivers/media/video/mxb.c b/drivers/media/video/mxb.c
index 14ca251787d5..b0aea4002d11 100644
--- a/drivers/media/video/mxb.c
+++ b/drivers/media/video/mxb.c
@@ -790,6 +790,12 @@ static int mxb_ioctl(struct saa7146_fh *fh, unsigned int cmd, void *arg)
 				DEB_D(("VIDIOC_S_TUNER: V4L2_TUNER_MODE_STEREO\n"));
 				break;
 			}
+			case V4L2_TUNER_MODE_LANG1_LANG2: {
+				mxb->cur_mode = V4L2_TUNER_MODE_LANG1_LANG2;
+				byte = TDA9840_SET_BOTH;
+				DEB_D(("VIDIOC_S_TUNER: V4L2_TUNER_MODE_LANG1_LANG2\n"));
+				break;
+			}
 			case V4L2_TUNER_MODE_LANG1: {
 				mxb->cur_mode = V4L2_TUNER_MODE_LANG1;
 				byte = TDA9840_SET_LANG1;
diff --git a/drivers/media/video/saa7134/saa7134-tvaudio.c b/drivers/media/video/saa7134/saa7134-tvaudio.c
index 3043233a8b6e..0db53d192b2a 100644
--- a/drivers/media/video/saa7134/saa7134-tvaudio.c
+++ b/drivers/media/video/saa7134/saa7134-tvaudio.c
@@ -482,12 +482,14 @@ static int tvaudio_setstereo(struct saa7134_dev *dev, struct saa7134_tvaudio *au
 		[ V4L2_TUNER_MODE_STEREO ] = "stereo",
 		[ V4L2_TUNER_MODE_LANG1  ] = "lang1",
 		[ V4L2_TUNER_MODE_LANG2  ] = "lang2",
+		[ V4L2_TUNER_MODE_LANG1_LANG2  ] = "lang1+lang2",
 	};
 	static u32 fm[] = {
 		[ V4L2_TUNER_MODE_MONO   ] = 0x00,  /* ch1  */
 		[ V4L2_TUNER_MODE_STEREO ] = 0x80,  /* auto */
 		[ V4L2_TUNER_MODE_LANG1  ] = 0x00,  /* ch1  */
 		[ V4L2_TUNER_MODE_LANG2  ] = 0x01,  /* ch2  */
+		[ V4L2_TUNER_MODE_LANG1_LANG2 ] = 0x80,  /* auto */
 	};
 	u32 reg;
 
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index 4e6d030d83c0..356bff455ad1 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -1733,6 +1733,7 @@ static int chip_command(struct i2c_client *client,
 			mode = VIDEO_SOUND_MONO;
 			break;
 		case V4L2_TUNER_MODE_STEREO:
+		case V4L2_TUNER_MODE_LANG1_LANG2:
 			mode = VIDEO_SOUND_STEREO;
 			break;
 		case V4L2_TUNER_MODE_LANG1:
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 724cfbf54b8a..2275bfec5b68 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -883,6 +883,7 @@ struct v4l2_modulator
 #define V4L2_TUNER_MODE_LANG2		0x0002
 #define V4L2_TUNER_MODE_SAP		0x0002
 #define V4L2_TUNER_MODE_LANG1		0x0003
+#define V4L2_TUNER_MODE_LANG1_LANG2	0x0004
 
 struct v4l2_frequency
 {
-- 
cgit v1.2.3


From a20c522498330ba0f4970a9bcd11890312277ae2 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Thu, 23 Mar 2006 19:37:58 -0300
Subject: V4L/DVB (3598): Add bit algorithm adapter for the Conexant CX2341X
 boards.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 include/linux/i2c-id.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index ec311bc89439..679b46a6a565 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -184,6 +184,7 @@
 #define I2C_HW_B_SAVAGE		0x01001d /* savage framebuffer driver */
 #define I2C_HW_B_RADEON		0x01001e /* radeon framebuffer driver */
 #define I2C_HW_B_EM28XX		0x01001f /* em28xx video capture cards */
+#define I2C_HW_B_CX2341X	0x010020 /* Conexant CX2341X MPEG encoder cards */
 
 /* --- PCF 8584 based algorithms					*/
 #define I2C_HW_P_LP		0x020000 /* Parallel port interface */
-- 
cgit v1.2.3


From a1a8feed1743ec8d2af1dafa7c5321679f0a3e4f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 23 Mar 2006 22:07:34 -0800
Subject: [MODULES]: Don't allow statically declared exports

Add an extern declaration for exported symbols to make the compiler warn
on symbols declared statically.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/module.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 70bd843c71cb..d9569151c182 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -183,6 +183,7 @@ void *__symbol_get_gpl(const char *symbol);
 
 /* For every exported symbol, place a struct in the __ksymtab section */
 #define __EXPORT_SYMBOL(sym, sec)				\
+	extern typeof(sym) sym;					\
 	__CRC_SYMBOL(sym, sec)					\
 	static const char __kstrtab_##sym[]			\
 	__attribute__((section("__ksymtab_strings")))		\
-- 
cgit v1.2.3


From c08b8a49100715b20e6f7c997e992428b5e06078 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Mar 2006 03:06:33 -0800
Subject: [PATCH] sys_alarm() unsigned signed conversion fixup

alarm() calls the kernel with an unsigend int timeout in seconds.  The
value is stored in the tv_sec field of a struct timeval to setup the
itimer.  The tv_sec field of struct timeval is of type long, which causes
the tv_sec value to be negative on 32 bit machines if seconds > INT_MAX.

Before the hrtimer merge (pre 2.6.16) such a negative value was converted
to the maximum jiffies timeout by the timeval_to_jiffies conversion.  It's
not clear whether this was intended or just happened to be done by the
timeval_to_jiffies code.

hrtimers expect a timeval in canonical form and treat a negative timeout as
already expired.  This breaks the legitimate usage of alarm() with a
timeout value > INT_MAX seconds.

For 32 bit machines it is therefor necessary to limit the internal seconds
value to avoid API breakage.  Instead of doing this in all implementations
of sys_alarm the duplicated sys_alarm code is moved into a common function
in itimer.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/ia32/sys_ia32.c   | 14 +-------------
 arch/mips/kernel/sysirix.c  | 22 +---------------------
 arch/x86_64/ia32/sys_ia32.c | 16 ++--------------
 include/linux/time.h        |  1 +
 kernel/itimer.c             | 37 +++++++++++++++++++++++++++++++++++++
 kernel/timer.c              | 14 +-------------
 6 files changed, 43 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 70dba1f0e2ee..13e739e4c84d 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1166,19 +1166,7 @@ put_tv32 (struct compat_timeval __user *o, struct timeval *i)
 asmlinkage unsigned long
 sys32_alarm (unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-	it_new.it_value.tv_sec = seconds;
-	it_new.it_value.tv_usec = 0;
-	do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	oldalarm = it_old.it_value.tv_sec;
-	/* ehhh.. We can't return 0 if we have an alarm pending.. */
-	/* And we'd better return too much than too little anyway */
-	if (it_old.it_value.tv_usec)
-		oldalarm++;
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 /* Translations due to time_t size differences.  Which affects all
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index 0fc3730a294f..5407b784cd01 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -645,27 +645,7 @@ static inline void getitimer_real(struct itimerval *value)
 
 asmlinkage unsigned int irix_alarm(unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	if (!seconds) {
-		getitimer_real(&it_old);
-		del_timer(&current->real_timer);
-	} else {
-		it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-		it_new.it_value.tv_sec = seconds;
-		it_new.it_value.tv_usec = 0;
-		do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	}
-	oldalarm = it_old.it_value.tv_sec;
-	/*
-	 * ehhh.. We can't return 0 if we have an alarm pending ...
-	 * And we'd better return too much than too little anyway
-	 */
-	if (it_old.it_value.tv_usec)
-		oldalarm++;
-
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 asmlinkage int irix_pause(void)
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 2bc55af95419..2b2d029f477c 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -430,24 +430,12 @@ put_tv32(struct compat_timeval __user *o, struct timeval *i)
 	return err; 
 }
 
-extern int do_setitimer(int which, struct itimerval *, struct itimerval *);
+extern unsigned int alarm_setitimer(unsigned int seconds);
 
 asmlinkage long
 sys32_alarm(unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-	it_new.it_value.tv_sec = seconds;
-	it_new.it_value.tv_usec = 0;
-	do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	oldalarm = it_old.it_value.tv_sec;
-	/* ehhh.. We can't return 0 if we have an alarm pending.. */
-	/* And we'd better return too much than too little anyway */
-	if (it_old.it_value.tv_usec)
-		oldalarm++;
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 /* Translations due to time_t size differences.  Which affects all
diff --git a/include/linux/time.h b/include/linux/time.h
index d9cdba54b789..bf0e785e2e03 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -101,6 +101,7 @@ extern long do_utimes(int dfd, char __user *filename, struct timeval *times);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
+extern unsigned int alarm_setitimer(unsigned int seconds);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 379be2f8c84c..a2dc375927d8 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -226,6 +226,43 @@ again:
 	return 0;
 }
 
+/**
+ * alarm_setitimer - set alarm in seconds
+ *
+ * @seconds:	number of seconds until alarm
+ *		0 disables the alarm
+ *
+ * Returns the remaining time in seconds of a pending timer or 0 when
+ * the timer is not active.
+ *
+ * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
+ * negative timeval settings which would cause immediate expiry.
+ */
+unsigned int alarm_setitimer(unsigned int seconds)
+{
+	struct itimerval it_new, it_old;
+
+#if BITS_PER_LONG < 64
+	if (seconds > INT_MAX)
+		seconds = INT_MAX;
+#endif
+	it_new.it_value.tv_sec = seconds;
+	it_new.it_value.tv_usec = 0;
+	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+
+	do_setitimer(ITIMER_REAL, &it_new, &it_old);
+
+	/*
+	 * We can't return 0 if we have an alarm pending ...  And we'd
+	 * better return too much than too little anyway
+	 */
+	if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
+	      it_old.it_value.tv_usec >= 500000)
+		it_old.it_value.tv_sec++;
+
+	return it_old.it_value.tv_sec;
+}
+
 asmlinkage long sys_setitimer(int which,
 			      struct itimerval __user *value,
 			      struct itimerval __user *ovalue)
diff --git a/kernel/timer.c b/kernel/timer.c
index 17d956cebcb9..13fa72cac7d8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -956,19 +956,7 @@ void do_timer(struct pt_regs *regs)
  */
 asmlinkage unsigned long sys_alarm(unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-	it_new.it_value.tv_sec = seconds;
-	it_new.it_value.tv_usec = 0;
-	do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	oldalarm = it_old.it_value.tv_sec;
-	/* ehhh.. We can't return 0 if we have an alarm pending.. */
-	/* And we'd better return too much than too little anyway */
-	if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
-		oldalarm++;
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 #endif
-- 
cgit v1.2.3


From 871751e25d956ad24f129ca972b7851feaa61d53 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 25 Mar 2006 03:06:39 -0800
Subject: [PATCH] slab: implement /proc/slab_allocators

Implement /proc/slab_allocators.   It produces output like:

idr_layer_cache: 80 idr_pre_get+0x33/0x4e
buffer_head: 2555 alloc_buffer_head+0x20/0x75
mm_struct: 9 mm_alloc+0x1e/0x42
mm_struct: 20 dup_mm+0x36/0x370
vm_area_struct: 384 dup_mm+0x18f/0x370
vm_area_struct: 151 do_mmap_pgoff+0x2e0/0x7c3
vm_area_struct: 1 split_vma+0x5a/0x10e
vm_area_struct: 11 do_brk+0x206/0x2e2
vm_area_struct: 2 copy_vma+0xda/0x142
vm_area_struct: 9 setup_arg_pages+0x99/0x214
fs_cache: 8 copy_fs_struct+0x21/0x133
fs_cache: 29 copy_process+0xf38/0x10e3
files_cache: 30 alloc_files+0x1b/0xcf
signal_cache: 81 copy_process+0xbaa/0x10e3
sighand_cache: 77 copy_process+0xe65/0x10e3
sighand_cache: 1 de_thread+0x4d/0x5f8
anon_vma: 241 anon_vma_prepare+0xd9/0xf3
size-2048: 1 add_sect_attrs+0x5f/0x145
size-2048: 2 journal_init_revoke+0x99/0x302
size-2048: 2 journal_init_revoke+0x137/0x302
size-2048: 2 journal_init_inode+0xf9/0x1c4

Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Alexander Nyberg <alexn@telia.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
DESC
slab-leaks3-locking-fix
EDESC
From: Andrew Morton <akpm@osdl.org>

Update for slab-remove-cachep-spinlock.patch

Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Alexander Nyberg <alexn@telia.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c  |  37 +++++++++++
 include/linux/slab.h |   6 +-
 lib/Kconfig.debug    |   4 ++
 mm/slab.c            | 180 +++++++++++++++++++++++++++++++++++++++++++++++++--
 mm/util.c            |   4 +-
 net/core/skbuff.c    |   2 +-
 6 files changed, 222 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 826c131994c3..1e9ea37d457e 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -485,6 +485,40 @@ static struct file_operations proc_slabinfo_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+extern struct seq_operations slabstats_op;
+static int slabstats_open(struct inode *inode, struct file *file)
+{
+	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	int ret = -ENOMEM;
+	if (n) {
+		ret = seq_open(file, &slabstats_op);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
+			m->private = n;
+			n = NULL;
+		}
+		kfree(n);
+	}
+	return ret;
+}
+
+static int slabstats_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	kfree(m->private);
+	return seq_release(inode, file);
+}
+
+static struct file_operations proc_slabstats_operations = {
+	.open		= slabstats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= slabstats_release,
+};
+#endif
 #endif
 
 static int show_stat(struct seq_file *p, void *v)
@@ -744,6 +778,9 @@ void __init proc_misc_init(void)
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 #ifdef CONFIG_SLAB
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+	create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations);
+#endif
 #endif
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e2ee5b268797..f88e08a5802c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -77,11 +77,12 @@ struct cache_sizes {
 };
 extern struct cache_sizes malloc_sizes[];
 
-#ifndef CONFIG_DEBUG_SLAB
 extern void *__kmalloc(size_t, gfp_t);
+#ifndef CONFIG_DEBUG_SLAB
+#define ____kmalloc(size, flags) __kmalloc(size, flags)
 #else
 extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
-#define __kmalloc(size, flags) \
+#define ____kmalloc(size, flags) \
     __kmalloc_track_caller(size, flags, __builtin_return_address(0))
 #endif
 
@@ -173,6 +174,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 #define kmem_ptr_validate(a, b) (0)
 #define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f)
 #define kmalloc_node(s, f, n) kmalloc(s, f)
+#define ____kmalloc kmalloc
 
 #endif /* CONFIG_SLOB */
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f2618e1c2b93..1fe3f897145f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -85,6 +85,10 @@ config DEBUG_SLAB
 	  allocation as well as poisoning memory on free to catch use of freed
 	  memory. This can make kmalloc/kfree-intensive workloads much slower.
 
+config DEBUG_SLAB_LEAK
+	bool "Memory leak debugging"
+	depends on DEBUG_SLAB
+
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT
diff --git a/mm/slab.c b/mm/slab.c
index 26138c9f8f00..a5047161084e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -204,7 +204,8 @@
 typedef unsigned int kmem_bufctl_t;
 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
-#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
+#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
+#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
 
 /* Max number of objs-per-slab for caches which use off-slab slabs.
  * Needed to avoid a possible looping condition in cache_grow().
@@ -2399,7 +2400,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
 	/* Verify that the slab belongs to the intended node */
 	WARN_ON(slabp->nodeid != nodeid);
 
-	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
+	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
 		printk(KERN_ERR "slab: double free detected in cache "
 				"'%s', objp %p\n", cachep->name, objp);
 		BUG();
@@ -2605,6 +2606,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 		 */
 		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
 	}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
+#endif
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2788,6 +2792,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
 	}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+	{
+		struct slab *slabp;
+		unsigned objnr;
+
+		slabp = page_get_slab(virt_to_page(objp));
+		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
+	}
+#endif
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON) {
 		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
@@ -3220,22 +3234,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 	return __cache_alloc(cachep, flags, caller);
 }
 
-#ifndef CONFIG_DEBUG_SLAB
 
 void *__kmalloc(size_t size, gfp_t flags)
 {
+#ifndef CONFIG_DEBUG_SLAB
 	return __do_kmalloc(size, flags, NULL);
+#else
+	return __do_kmalloc(size, flags, __builtin_return_address(0));
+#endif
 }
 EXPORT_SYMBOL(__kmalloc);
 
-#else
-
+#ifdef CONFIG_DEBUG_SLAB
 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
 {
 	return __do_kmalloc(size, flags, caller);
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
-
 #endif
 
 #ifdef CONFIG_SMP
@@ -3899,6 +3914,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 		res = count;
 	return res;
 }
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void *leaks_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t n = *pos;
+	struct list_head *p;
+
+	mutex_lock(&cache_chain_mutex);
+	p = cache_chain.next;
+	while (n--) {
+		p = p->next;
+		if (p == &cache_chain)
+			return NULL;
+	}
+	return list_entry(p, struct kmem_cache, next);
+}
+
+static inline int add_caller(unsigned long *n, unsigned long v)
+{
+	unsigned long *p;
+	int l;
+	if (!v)
+		return 1;
+	l = n[1];
+	p = n + 2;
+	while (l) {
+		int i = l/2;
+		unsigned long *q = p + 2 * i;
+		if (*q == v) {
+			q[1]++;
+			return 1;
+		}
+		if (*q > v) {
+			l = i;
+		} else {
+			p = q + 2;
+			l -= i + 1;
+		}
+	}
+	if (++n[1] == n[0])
+		return 0;
+	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
+	p[0] = v;
+	p[1] = 1;
+	return 1;
+}
+
+static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
+{
+	void *p;
+	int i;
+	if (n[0] == n[1])
+		return;
+	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
+		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
+			continue;
+		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+			return;
+	}
+}
+
+static void show_symbol(struct seq_file *m, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char *modname;
+	const char *name;
+	unsigned long offset, size;
+	char namebuf[KSYM_NAME_LEN+1];
+
+	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+
+	if (name) {
+		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
+		if (modname)
+			seq_printf(m, " [%s]", modname);
+		return;
+	}
+#endif
+	seq_printf(m, "%p", (void *)address);
+}
+
+static int leaks_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *cachep = p;
+	struct list_head *q;
+	struct slab *slabp;
+	struct kmem_list3 *l3;
+	const char *name;
+	unsigned long *n = m->private;
+	int node;
+	int i;
+
+	if (!(cachep->flags & SLAB_STORE_USER))
+		return 0;
+	if (!(cachep->flags & SLAB_RED_ZONE))
+		return 0;
+
+	/* OK, we can do it */
+
+	n[1] = 0;
+
+	for_each_online_node(node) {
+		l3 = cachep->nodelists[node];
+		if (!l3)
+			continue;
+
+		check_irq_on();
+		spin_lock_irq(&l3->list_lock);
+
+		list_for_each(q, &l3->slabs_full) {
+			slabp = list_entry(q, struct slab, list);
+			handle_slab(n, cachep, slabp);
+		}
+		list_for_each(q, &l3->slabs_partial) {
+			slabp = list_entry(q, struct slab, list);
+			handle_slab(n, cachep, slabp);
+		}
+		spin_unlock_irq(&l3->list_lock);
+	}
+	name = cachep->name;
+	if (n[0] == n[1]) {
+		/* Increase the buffer size */
+		mutex_unlock(&cache_chain_mutex);
+		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+		if (!m->private) {
+			/* Too bad, we are really out */
+			m->private = n;
+			mutex_lock(&cache_chain_mutex);
+			return -ENOMEM;
+		}
+		*(unsigned long *)m->private = n[0] * 2;
+		kfree(n);
+		mutex_lock(&cache_chain_mutex);
+		/* Now make sure this entry will be retried */
+		m->count = m->size;
+		return 0;
+	}
+	for (i = 0; i < n[1]; i++) {
+		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
+		show_symbol(m, n[2*i+2]);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations slabstats_op = {
+	.start = leaks_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = leaks_show,
+};
+#endif
 #endif
 
 /**
diff --git a/mm/util.c b/mm/util.c
index 49e29f751b50..b68d3d7d0359 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -11,7 +11,7 @@
  */
 void *kzalloc(size_t size, gfp_t flags)
 {
-	void *ret = kmalloc(size, flags);
+	void *ret = ____kmalloc(size, flags);
 	if (ret)
 		memset(ret, 0, size);
 	return ret;
@@ -33,7 +33,7 @@ char *kstrdup(const char *s, gfp_t gfp)
 		return NULL;
 
 	len = strlen(s) + 1;
-	buf = kmalloc(len, gfp);
+	buf = ____kmalloc(len, gfp);
 	if (buf)
 		memcpy(buf, s, len);
 	return buf;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c9f878454531..09464fa8d72f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -149,7 +149,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
 	/* Get the DATA. Size must match skb_add_mtu(). */
 	size = SKB_DATA_ALIGN(size);
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
 	if (!data)
 		goto nodata;
 
-- 
cgit v1.2.3


From a8c0f9a41f88da703ade33f9c1626a55c786e8bb Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 25 Mar 2006 03:06:42 -0800
Subject: [PATCH] slab: introduce kmem_cache_zalloc allocator

Introduce a memory-zeroing variant of kmem_cache_alloc.  The allocator
already exits in XFS and there are potential users for it so this patch
makes the allocator available for the general public.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h |  2 ++
 mm/slab.c            | 17 +++++++++++++++++
 mm/slob.c            | 10 ++++++++++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index f88e08a5802c..1216b09e07b1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -64,6 +64,7 @@ extern kmem_cache_t *kmem_cache_create(const char *, size_t, size_t, unsigned lo
 extern int kmem_cache_destroy(kmem_cache_t *);
 extern int kmem_cache_shrink(kmem_cache_t *);
 extern void *kmem_cache_alloc(kmem_cache_t *, gfp_t);
+extern void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern unsigned int kmem_cache_size(kmem_cache_t *);
 extern const char *kmem_cache_name(kmem_cache_t *);
@@ -156,6 +157,7 @@ struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t,
 	void (*)(void *, struct kmem_cache *, unsigned long));
 int kmem_cache_destroy(struct kmem_cache *c);
 void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags);
+void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 void kmem_cache_free(struct kmem_cache *c, void *b);
 const char *kmem_cache_name(struct kmem_cache *);
 void *kmalloc(size_t size, gfp_t flags);
diff --git a/mm/slab.c b/mm/slab.c
index a5047161084e..6f5aeebd4306 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3107,6 +3107,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
+/**
+ * kmem_cache_alloc - Allocate an object. The memory is set to zero.
+ * @cache: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache and set the allocated memory to zero.
+ * The flags are only relevant if the cache has no available objects.
+ */
+void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
+{
+	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
+	if (ret)
+		memset(ret, 0, obj_size(cache));
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
+
 /**
  * kmem_ptr_validate - check if an untrusted pointer might
  *	be a slab entry.
diff --git a/mm/slob.c b/mm/slob.c
index a1f42bdc0245..9bcc7e2cabfd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
+void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
+{
+	void *ret = kmem_cache_alloc(c, flags);
+	if (ret)
+		memset(ret, 0, c->size);
+
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
+
 void kmem_cache_free(struct kmem_cache *c, void *b)
 {
 	if (c->dtor)
-- 
cgit v1.2.3


From 40c07ae8daa659b8feb149c84731629386873c16 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 25 Mar 2006 03:06:43 -0800
Subject: [PATCH] slab: optimize constant-size kzalloc calls

As suggested by Eric Dumazet, optimize kzalloc() calls that pass a
compile-time constant size.  Please note that the patch increases kernel
text slightly (~200 bytes for defconfig on x86).

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h | 30 +++++++++++++++++++++++++++---
 mm/util.c            |  6 +++---
 2 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1216b09e07b1..15e1d9736b1b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -110,7 +110,30 @@ found:
 	return __kmalloc(size, flags);
 }
 
-extern void *kzalloc(size_t, gfp_t);
+extern void *__kzalloc(size_t, gfp_t);
+
+static inline void *kzalloc(size_t size, gfp_t flags)
+{
+	if (__builtin_constant_p(size)) {
+		int i = 0;
+#define CACHE(x) \
+		if (size <= x) \
+			goto found; \
+		else \
+			i++;
+#include "kmalloc_sizes.h"
+#undef CACHE
+		{
+			extern void __you_cannot_kzalloc_that_much(void);
+			__you_cannot_kzalloc_that_much();
+		}
+found:
+		return kmem_cache_zalloc((flags & GFP_DMA) ?
+			malloc_sizes[i].cs_dmacachep :
+			malloc_sizes[i].cs_cachep, flags);
+	}
+	return __kzalloc(size, flags);
+}
 
 /**
  * kcalloc - allocate memory for an array. The memory is set to zero.
@@ -161,14 +184,14 @@ void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 void kmem_cache_free(struct kmem_cache *c, void *b);
 const char *kmem_cache_name(struct kmem_cache *);
 void *kmalloc(size_t size, gfp_t flags);
-void *kzalloc(size_t size, gfp_t flags);
+void *__kzalloc(size_t size, gfp_t flags);
 void kfree(const void *m);
 unsigned int ksize(const void *m);
 unsigned int kmem_cache_size(struct kmem_cache *c);
 
 static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
-	return kzalloc(n * size, flags);
+	return __kzalloc(n * size, flags);
 }
 
 #define kmem_cache_shrink(d) (0)
@@ -176,6 +199,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 #define kmem_ptr_validate(a, b) (0)
 #define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f)
 #define kmalloc_node(s, f, n) kmalloc(s, f)
+#define kzalloc(s, f) __kzalloc(s, f)
 #define ____kmalloc kmalloc
 
 #endif /* CONFIG_SLOB */
diff --git a/mm/util.c b/mm/util.c
index b68d3d7d0359..7368479220b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,18 +5,18 @@
 #include <asm/uaccess.h>
 
 /**
- * kzalloc - allocate memory. The memory is set to zero.
+ * __kzalloc - allocate memory. The memory is set to zero.
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kzalloc(size_t size, gfp_t flags)
+void *__kzalloc(size_t size, gfp_t flags)
 {
 	void *ret = ____kmalloc(size, flags);
 	if (ret)
 		memset(ret, 0, size);
 	return ret;
 }
-EXPORT_SYMBOL(kzalloc);
+EXPORT_SYMBOL(__kzalloc);
 
 /*
  * kstrdup - allocate space for and copy an existing string
-- 
cgit v1.2.3


From e3df18983ea090a2e00dd5c2c6167bb431a0e0a2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:06:53 -0800
Subject: [PATCH] jbd: embed j_commit_timer in journal struct

The kjournald timer is currently on the kernel thread's stack and the journal
structure points at it.  Save a pointer hop by moving the timer into the
journal structure.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/journal.c     | 19 +++++++++----------
 fs/jbd/transaction.c |  4 ++--
 include/linux/jbd.h  |  4 +++-
 3 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 95a628d8cac8..6bd4647dc5a1 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -111,18 +111,17 @@ static void commit_timeout(unsigned long __data)
 
 static int kjournald(void *arg)
 {
-	journal_t *journal = (journal_t *) arg;
+	journal_t *journal = arg;
 	transaction_t *transaction;
-	struct timer_list timer;
 
 	daemonize("kjournald");
 
-	/* Set up an interval timer which can be used to trigger a
-           commit wakeup after the commit interval expires */
-	init_timer(&timer);
-	timer.data = (unsigned long) current;
-	timer.function = commit_timeout;
-	journal->j_commit_timer = &timer;
+	/*
+	 * Set up an interval timer which can be used to trigger a commit wakeup
+	 * after the commit interval expires
+	 */
+	setup_timer(&journal->j_commit_timer, commit_timeout,
+			(unsigned long)current);
 
 	/* Record that the journal thread is running */
 	journal->j_task = current;
@@ -146,7 +145,7 @@ loop:
 	if (journal->j_commit_sequence != journal->j_commit_request) {
 		jbd_debug(1, "OK, requests differ\n");
 		spin_unlock(&journal->j_state_lock);
-		del_timer_sync(journal->j_commit_timer);
+		del_timer_sync(&journal->j_commit_timer);
 		journal_commit_transaction(journal);
 		spin_lock(&journal->j_state_lock);
 		goto loop;
@@ -203,7 +202,7 @@ loop:
 
 end_loop:
 	spin_unlock(&journal->j_state_lock);
-	del_timer_sync(journal->j_commit_timer);
+	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
 	jbd_debug(1, "Journal thread exiting.\n");
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5fc40888f4cf..ada31fa272e3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -53,8 +53,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 	spin_lock_init(&transaction->t_handle_lock);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer->expires = transaction->t_expires;
-	add_timer(journal->j_commit_timer);
+	journal->j_commit_timer.expires = transaction->t_expires;
+	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
 	journal->j_running_transaction = transaction;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 2ccbfb6340ba..4fc7dffd66ef 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -29,6 +29,8 @@
 #include <linux/stddef.h>
 #include <linux/bit_spinlock.h>
 #include <linux/mutex.h>
+#include <linux/timer.h>
+
 #include <asm/semaphore.h>
 #endif
 
@@ -787,7 +789,7 @@ struct journal_s
 	unsigned long		j_commit_interval;
 
 	/* The timer used to wakeup the commit thread: */
-	struct timer_list	*j_commit_timer;
+	struct timer_list	j_commit_timer;
 
 	/*
 	 * The revoke table: maintains the list of revoked blocks in the
-- 
cgit v1.2.3


From ca5734db60630f7c5564a61a5b9034c1bb369c3d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 25 Mar 2006 03:06:55 -0800
Subject: [PATCH] Small cleanup in quota.h

Remove unused quota flag.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/quota.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 8dc2d04a103f..2dab71e1c3d1 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -209,7 +209,6 @@ extern struct dqstats dqstats;
 #define DQ_FAKE_B	3	/* no limits only usage */
 #define DQ_READ_B	4	/* dquot was read into memory */
 #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
-#define DQ_WAITFREE_B	6	/* dquot being waited (by invalidate_dquots) */
 
 struct dquot {
 	struct hlist_node dq_hash;	/* Hash list in memory */
-- 
cgit v1.2.3


From bdfc326614b90e7bc47ee4a8fed05988555f0169 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 25 Mar 2006 03:06:56 -0800
Subject: [PATCH] fs/inode.c: make iprune_mutex static

There's no reason for iprune_mutex being global.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c         | 2 +-
 include/linux/fs.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index a51c671c54cf..85da11044adc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -91,7 +91,7 @@ DEFINE_SPINLOCK(inode_lock);
  * from its final dispose_list, the struct super_block they refer to
  * (for inode->i_sb->s_op) may already have been freed and reused.
  */
-DEFINE_MUTEX(iprune_mutex);
+static DEFINE_MUTEX(iprune_mutex);
 
 /*
  * Statistics gathering..
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 215696a0f16f..7c750312261b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1558,7 +1558,6 @@ extern void destroy_inode(struct inode *);
 extern struct inode *new_inode(struct super_block *);
 extern int remove_suid(struct dentry *);
 extern void remove_dquot_ref(struct super_block *, int, struct list_head *);
-extern struct mutex iprune_mutex;
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 extern void remove_inode_hash(struct inode *);
-- 
cgit v1.2.3


From 23f9e0f891c9b159a199629d4426f6ae0c383508 Mon Sep 17 00:00:00 2001
From: Alexander Zarochentzev <zam@namesys.com>
Date: Sat, 25 Mar 2006 03:06:57 -0800
Subject: [PATCH] reiserfs: fix transaction overflowing

This patch fixes a bug in reiserfs truncate.  A transaction might overflow
when truncating long highly fragmented file.  The fix is to split
truncation into several transactions to avoid overflowing.

Signed-off-by: Vladimir V. Saveliev <vs@namesys.com>
Cc; Charles McColgan <cm@chuck.net>
Cc: Alexander Zarochentsev <zam@namesys.com>
Cc: Hans Reiser <reiser@namesys.com>
Cc: Chris Mason <mason@suse.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/stree.c         | 210 +++++++++++++++-----------------------------
 include/linux/reiserfs_fs.h |   5 ++
 2 files changed, 77 insertions(+), 138 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index e2d08d7bcffc..d2b25e1ba6e9 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -981,6 +981,8 @@ static inline int prepare_for_direntry_item(struct path *path,
 	return M_CUT;
 }
 
+#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
+
 /*  If the path points to a directory or direct item, calculate mode and the size cut, for balance.
     If the path points to an indirect item, remove some number of its unformatted nodes.
     In case of file truncate calculate whether this item must be deleted/truncated or last
@@ -1020,148 +1022,79 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
 
 	/* Case of an indirect item. */
 	{
-		int n_unfm_number,	/* Number of the item unformatted nodes. */
-		 n_counter, n_blk_size;
-		__le32 *p_n_unfm_pointer;	/* Pointer to the unformatted node number. */
-		__u32 tmp;
-		struct item_head s_ih;	/* Item header. */
-		char c_mode;	/* Returned mode of the balance. */
-		int need_research;
-
-		n_blk_size = p_s_sb->s_blocksize;
-
-		/* Search for the needed object indirect item until there are no unformatted nodes to be removed. */
-		do {
-			need_research = 0;
-			p_s_bh = PATH_PLAST_BUFFER(p_s_path);
-			/* Copy indirect item header to a temp variable. */
-			copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
-			/* Calculate number of unformatted nodes in this item. */
-			n_unfm_number = I_UNFM_NUM(&s_ih);
-
-			RFALSE(!is_indirect_le_ih(&s_ih) || !n_unfm_number ||
-			       pos_in_item(p_s_path) + 1 != n_unfm_number,
-			       "PAP-5240: invalid item %h "
-			       "n_unfm_number = %d *p_n_pos_in_item = %d",
-			       &s_ih, n_unfm_number, pos_in_item(p_s_path));
-
-			/* Calculate balance mode and position in the item to remove unformatted nodes. */
-			if (n_new_file_length == max_reiserfs_offset(inode)) {	/* Case of delete. */
-				pos_in_item(p_s_path) = 0;
-				*p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
-				c_mode = M_DELETE;
-			} else {	/* Case of truncate. */
-				if (n_new_file_length < le_ih_k_offset(&s_ih)) {
-					pos_in_item(p_s_path) = 0;
-					*p_n_cut_size =
-					    -(IH_SIZE + ih_item_len(&s_ih));
-					c_mode = M_DELETE;	/* Delete this item. */
-				} else {
-					/* indirect item must be truncated starting from *p_n_pos_in_item-th position */
-					pos_in_item(p_s_path) =
-					    (n_new_file_length + n_blk_size -
-					     le_ih_k_offset(&s_ih)) >> p_s_sb->
-					    s_blocksize_bits;
-
-					RFALSE(pos_in_item(p_s_path) >
-					       n_unfm_number,
-					       "PAP-5250: invalid position in the item");
-
-					/* Either convert last unformatted node of indirect item to direct item or increase
-					   its free space.  */
-					if (pos_in_item(p_s_path) ==
-					    n_unfm_number) {
-						*p_n_cut_size = 0;	/* Nothing to cut. */
-						return M_CONVERT;	/* Maybe convert last unformatted node to the direct item. */
-					}
-					/* Calculate size to cut. */
-					*p_n_cut_size =
-					    -(ih_item_len(&s_ih) -
-					      pos_in_item(p_s_path) *
-					      UNFM_P_SIZE);
-
-					c_mode = M_CUT;	/* Cut from this indirect item. */
-				}
-			}
+	    int blk_size = p_s_sb->s_blocksize;
+	    struct item_head s_ih;
+	    int need_re_search;
+	    int delete = 0;
+	    int result = M_CUT;
+	    int pos = 0;
+
+	    if ( n_new_file_length == max_reiserfs_offset (inode) ) {
+		/* prepare_for_delete_or_cut() is called by
+		 * reiserfs_delete_item() */
+		n_new_file_length = 0;
+		delete = 1;
+	    }
+
+	    do {
+		need_re_search = 0;
+		*p_n_cut_size = 0;
+		p_s_bh = PATH_PLAST_BUFFER(p_s_path);
+		copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
+		pos = I_UNFM_NUM(&s_ih);
 
-			RFALSE(n_unfm_number <= pos_in_item(p_s_path),
-			       "PAP-5260: invalid position in the indirect item");
-
-			/* pointers to be cut */
-			n_unfm_number -= pos_in_item(p_s_path);
-			/* Set pointer to the last unformatted node pointer that is to be cut. */
-			p_n_unfm_pointer =
-			    (__le32 *) B_I_PITEM(p_s_bh,
-						 &s_ih) + I_UNFM_NUM(&s_ih) -
-			    1 - *p_n_removed;
-
-			/* We go through the unformatted nodes pointers of the indirect
-			   item and look for the unformatted nodes in the cache. If we
-			   found some of them we free it, zero corresponding indirect item
-			   entry and log buffer containing that indirect item. For this we
-			   need to prepare last path element for logging. If some
-			   unformatted node has b_count > 1 we must not free this
-			   unformatted node since it is in use. */
-			reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1);
-			// note: path could be changed, first line in for loop takes care
-			// of it
+		while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > n_new_file_length) {
+		    __u32 *unfm, block;
 
-			for (n_counter = *p_n_removed;
-			     n_counter < n_unfm_number;
-			     n_counter++, p_n_unfm_pointer--) {
+		    /* Each unformatted block deletion may involve one additional
+		     * bitmap block into the transaction, thereby the initial
+		     * journal space reservation might not be enough. */
+		    if (!delete && (*p_n_cut_size) != 0 &&
+			reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
+			break;
+		    }
 
-				cond_resched();
-				if (item_moved(&s_ih, p_s_path)) {
-					need_research = 1;
-					break;
-				}
-				RFALSE(p_n_unfm_pointer <
-				       (__le32 *) B_I_PITEM(p_s_bh, &s_ih)
-				       || p_n_unfm_pointer >
-				       (__le32 *) B_I_PITEM(p_s_bh,
-							    &s_ih) +
-				       I_UNFM_NUM(&s_ih) - 1,
-				       "vs-5265: pointer out of range");
-
-				/* Hole, nothing to remove. */
-				if (!get_block_num(p_n_unfm_pointer, 0)) {
-					(*p_n_removed)++;
-					continue;
-				}
+		    unfm = (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + pos - 1;
+		    block = get_block_num(unfm, 0);
 
-				(*p_n_removed)++;
+		    if (block != 0) {
+			reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1);
+			put_block_num(unfm, 0, 0);
+			journal_mark_dirty (th, p_s_sb, p_s_bh);
+			reiserfs_free_block(th, inode, block, 1);
+		    }
 
-				tmp = get_block_num(p_n_unfm_pointer, 0);
-				put_block_num(p_n_unfm_pointer, 0, 0);
-				journal_mark_dirty(th, p_s_sb, p_s_bh);
-				reiserfs_free_block(th, inode, tmp, 1);
-				if (item_moved(&s_ih, p_s_path)) {
-					need_research = 1;
-					break;
-				}
-			}
+		    cond_resched();
 
-			/* a trick.  If the buffer has been logged, this
-			 ** will do nothing.  If we've broken the loop without
-			 ** logging it, it will restore the buffer
-			 **
-			 */
-			reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh);
-
-			/* This loop can be optimized. */
-		} while ((*p_n_removed < n_unfm_number || need_research) &&
-			 search_for_position_by_key(p_s_sb, p_s_item_key,
-						    p_s_path) ==
-			 POSITION_FOUND);
-
-		RFALSE(*p_n_removed < n_unfm_number,
-		       "PAP-5310: indirect item is not found");
-		RFALSE(item_moved(&s_ih, p_s_path),
-		       "after while, comp failed, retry");
-
-		if (c_mode == M_CUT)
-			pos_in_item(p_s_path) *= UNFM_P_SIZE;
-		return c_mode;
+		    if (item_moved (&s_ih, p_s_path))  {
+			need_re_search = 1;
+			break;
+		    }
+
+		    pos --;
+		    (*p_n_removed) ++;
+		    (*p_n_cut_size) -= UNFM_P_SIZE;
+
+		    if (pos == 0) {
+			(*p_n_cut_size) -= IH_SIZE;
+			result = M_DELETE;
+			break;
+		    }
+		}
+		/* a trick.  If the buffer has been logged, this will do nothing.  If
+		** we've broken the loop without logging it, it will restore the
+		** buffer */
+		reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh);
+	    } while (need_re_search &&
+		     search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND);
+	    pos_in_item(p_s_path) = pos * UNFM_P_SIZE;
+
+	    if (*p_n_cut_size == 0) {
+		/* Nothing were cut. maybe convert last unformatted node to the
+		 * direct item? */
+		result = M_CONVERT;
+	    }
+	    return result;
 	}
 }
 
@@ -1948,7 +1881,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
 		 ** sure the file is consistent before ending the current trans
 		 ** and starting a new one
 		 */
-		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+		if (journal_transaction_should_end(th, 0) ||
+		    reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
 			int orig_len_alloc = th->t_blocks_allocated;
 			decrement_counters_in_path(&s_search_path);
 
@@ -1962,7 +1896,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p
 			if (err)
 				goto out;
 			err = journal_begin(th, p_s_inode->i_sb,
-					    JOURNAL_PER_BALANCE_CNT * 6);
+					    JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
 			if (err)
 				goto out;
 			reiserfs_update_inode_transaction(p_s_inode);
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index dad78cecfd20..912f1b7cb18f 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1704,6 +1704,11 @@ static inline int reiserfs_transaction_running(struct super_block *s)
 	return 0;
 }
 
+static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
+{
+	return th->t_blocks_allocated - th->t_blocks_logged;
+}
+
 int reiserfs_async_progress_wait(struct super_block *s);
 
 struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
-- 
cgit v1.2.3


From b500531e6f5f234ed267bd7060ee06d144faf0ca Mon Sep 17 00:00:00 2001
From: Oleg Drokin <green@linuxhacker.ru>
Date: Sat, 25 Mar 2006 03:07:01 -0800
Subject: [PATCH] Introduce FMODE_EXEC file flag

Introduce FMODE_EXEC file flag, to indicate that file is being opened for
execution.  This is useful for distributed filesystems to maintain
consistent behavior for returning ETXTBUSY when opening for write and
execution happens on different nodes.

akpm:

  Needed by Lustre at present.  I assume their objective to to work towards
  being able to install Lustre on an unmodified distro kernel, which seems
  sane.  It should have zero runtime cost.

  Trond and Chuck indicate that NFS4 can probably use this too, for the same
  thing.

  Steven says it's also on the GFS todo list.

Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Chuck Lever <cel@citi.umich.edu>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c          | 4 ++--
 include/linux/fs.h | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 0b515ac53134..d8c477a56257 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -127,7 +127,7 @@ asmlinkage long sys_uselib(const char __user * library)
 	struct nameidata nd;
 	int error;
 
-	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ);
+	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 	if (error)
 		goto out;
 
@@ -477,7 +477,7 @@ struct file *open_exec(const char *name)
 	int err;
 	struct file *file;
 
-	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ);
+	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
 	file = ERR_PTR(err);
 
 	if (!err) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7c750312261b..21e8cf795c38 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -65,6 +65,11 @@ extern int dir_notify_enable;
 #define FMODE_PREAD	8
 #define FMODE_PWRITE	FMODE_PREAD	/* These go hand in hand */
 
+/* File is being opened for execution. Primary users of this flag are
+   distributed filesystems that can use it to achieve correct ETXTBUSY
+   behavior for cross-node execution/opening_for_writing of files */
+#define FMODE_EXEC	16
+
 #define RW_MASK		1
 #define RWA_MASK	2
 #define READ 0
-- 
cgit v1.2.3


From 1aef821a6b3aeca8c19d06aee012ed9db617d1e3 Mon Sep 17 00:00:00 2001
From: Thomas Koeller <thomas@koeller.dyndns.org>
Date: Sat, 25 Mar 2006 03:07:03 -0800
Subject: [PATCH] constify tty flip buffer handling

Add a couple of 'const' qualifiers to the TTY flip buffer APIs, where
appropriate.

Signed-off-by: Thomas Koeller <thomas@koeller.dyndns.org>
Acked-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/tty_io.c    | 4 ++--
 include/linux/tty_flip.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 76592ee1fb38..48d795bb8c4b 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -354,7 +354,7 @@ int tty_buffer_request_room(struct tty_struct *tty, size_t size)
 
 EXPORT_SYMBOL_GPL(tty_buffer_request_room);
 
-int tty_insert_flip_string(struct tty_struct *tty, unsigned char *chars, size_t size)
+int tty_insert_flip_string(struct tty_struct *tty, const unsigned char *chars, size_t size)
 {
 	int copied = 0;
 	do {
@@ -378,7 +378,7 @@ int tty_insert_flip_string(struct tty_struct *tty, unsigned char *chars, size_t
 
 EXPORT_SYMBOL_GPL(tty_insert_flip_string);
 
-int tty_insert_flip_string_flags(struct tty_struct *tty, unsigned char *chars, char *flags, size_t size)
+int tty_insert_flip_string_flags(struct tty_struct *tty, const unsigned char *chars, const char *flags, size_t size)
 {
 	int copied = 0;
 	do {
diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h
index 0c6169fff366..0976a163b459 100644
--- a/include/linux/tty_flip.h
+++ b/include/linux/tty_flip.h
@@ -2,8 +2,8 @@
 #define _LINUX_TTY_FLIP_H
 
 extern int tty_buffer_request_room(struct tty_struct *tty, size_t size);
-extern int tty_insert_flip_string(struct tty_struct *tty, unsigned char *chars, size_t size);
-extern int tty_insert_flip_string_flags(struct tty_struct *tty, unsigned char *chars, char *flags, size_t size);
+extern int tty_insert_flip_string(struct tty_struct *tty, const unsigned char *chars, size_t size);
+extern int tty_insert_flip_string_flags(struct tty_struct *tty, const unsigned char *chars, const char *flags, size_t size);
 extern int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_t size);
 extern int tty_prepare_flip_string_flags(struct tty_struct *tty, unsigned char **chars, char **flags, size_t size);
 
-- 
cgit v1.2.3


From 8d3b33f67fdc0fb364a1ef6d8fbbea7c2e4e6c98 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 25 Mar 2006 03:07:05 -0800
Subject: [PATCH] Remove MODULE_PARM

MODULE_PARM was actually breaking: recent gcc version optimize them out as
unused.  It's time to replace the last users, which are generally in the
most unloved drivers anyway.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/networking/ray_cs.txt  |   2 +-
 Documentation/sound/oss/Introduction |   2 +-
 Documentation/sound/oss/cs46xx       |  16 +--
 arch/ppc/8xx_io/cs4218_tdm.c         |  10 +-
 drivers/block/ataflop.c              |   4 +-
 drivers/cdrom/cm206.c                |   8 +-
 drivers/cdrom/sbpcd.c                |  10 +-
 drivers/char/istallion.c             |   8 +-
 drivers/char/mxser.c                 |   8 +-
 drivers/char/riscom8.c               |   8 +-
 drivers/isdn/hardware/avm/b1dma.c    |   2 +-
 drivers/isdn/hardware/avm/b1isa.c    |   4 +-
 drivers/isdn/hardware/avm/c4.c       |   2 +-
 drivers/isdn/hardware/avm/t1isa.c    |   6 +-
 drivers/isdn/hysdn/hycapi.c          |   2 +-
 drivers/isdn/hysdn/hysdn_net.c       |   2 +-
 drivers/isdn/isdnloop/isdnloop.c     |   2 +-
 drivers/media/video/zr36120.c        |   8 +-
 drivers/mmc/au1xmmc.c                |   2 +-
 drivers/mtd/maps/pcmciamtd.c         |  14 +--
 drivers/net/atari_bionet.c           |   2 +-
 drivers/net/atari_pamsnet.c          |   2 +-
 drivers/net/atarilance.c             |   2 +-
 drivers/net/cassini.c                |  11 ++-
 drivers/net/chelsio/cxgb2.c          |   2 +-
 drivers/net/fec_8xx/fec_main.c       |   4 +-
 drivers/net/fs_enet/fs_enet-main.c   |   4 +-
 drivers/net/gt96100eth.c             |   4 +-
 drivers/net/hamradio/dmascc.c        |   2 +-
 drivers/net/hamradio/mkiss.c         |   2 +-
 drivers/net/irda/irport.c            |   4 +-
 drivers/net/lasi_82596.c             |   6 +-
 drivers/net/mac89x0.c                |   2 +-
 drivers/net/mace.c                   |   2 +-
 drivers/net/meth.c                   |   2 +-
 drivers/net/ne-h8300.c               |   6 +-
 drivers/net/ni5010.c                 |   4 +-
 drivers/net/sun3lance.c              |   2 +-
 drivers/s390/block/dasd.c            |   1 -
 drivers/s390/block/dasd_devmap.c     |   3 +
 drivers/scsi/arm/cumana_2.c          |   2 +-
 drivers/scsi/arm/eesox.c             |   2 +-
 drivers/scsi/arm/powertec.c          |   2 +-
 drivers/scsi/atari_scsi.c            |  10 +-
 drivers/video/pm3fb.c                |  18 ++--
 include/linux/module.h               |  19 ----
 include/video/pm3fb.h                |   3 -
 init/Kconfig                         |   9 --
 kernel/module.c                      | 183 +++--------------------------------
 kernel/rcutorture.c                  |  10 +-
 sound/oss/au1000.c                   |   2 +-
 sound/oss/au1550_ac97.c              |   2 +-
 sound/oss/dmasound/dmasound_core.c   |  10 +-
 sound/oss/ite8172.c                  |   4 +-
 sound/oss/swarm_cs4297a.c            |   4 +-
 sound/oss/waveartist.c               |   8 +-
 56 files changed, 146 insertions(+), 329 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ray_cs.txt b/Documentation/networking/ray_cs.txt
index 5427f8c7df95..145d27a52395 100644
--- a/Documentation/networking/ray_cs.txt
+++ b/Documentation/networking/ray_cs.txt
@@ -25,7 +25,7 @@ the essid= string parameter is available via the kernel command line.
 This will change after the method of sorting out parameters for all
 the PCMCIA drivers is agreed upon.  If you must have a built in driver
 with nondefault parameters, they can be edited in
-/usr/src/linux/drivers/net/pcmcia/ray_cs.c.  Searching for MODULE_PARM
+/usr/src/linux/drivers/net/pcmcia/ray_cs.c.  Searching for module_param
 will find them all.
 
 Information on card services is available at:
diff --git a/Documentation/sound/oss/Introduction b/Documentation/sound/oss/Introduction
index 15d4fb975ac0..f04ba6bb7395 100644
--- a/Documentation/sound/oss/Introduction
+++ b/Documentation/sound/oss/Introduction
@@ -69,7 +69,7 @@ are available, for example IRQ, address, DMA.
 
 Warning, the options for different cards sometime use different names 
 for the same or a similar feature (dma1= versus dma16=).  As a last 
-resort, inspect the code (search for MODULE_PARM).
+resort, inspect the code (search for module_param).
 
 Notes:
 
diff --git a/Documentation/sound/oss/cs46xx b/Documentation/sound/oss/cs46xx
index 88d6cf8b39f3..b54432709863 100644
--- a/Documentation/sound/oss/cs46xx
+++ b/Documentation/sound/oss/cs46xx
@@ -88,7 +88,7 @@ parameters.  for a copy email: twoller@crystal.cirrus.com
 
 MODULE_PARMS definitions
 ------------------------
-MODULE_PARM(defaultorder, "i");
+module_param(defaultorder, ulong, 0);
 defaultorder=N
 where N is a value from 1 to 12
 The buffer order determines the size of the dma buffer for the driver.
@@ -98,18 +98,18 @@ to not underrun the dma buffer as easily.  As default, use 32k (order=3)
 rather than 64k as some of the games work more responsively.
 (2^N) * PAGE_SIZE = allocated buffer size
 
-MODULE_PARM(cs_debuglevel, "i");
-MODULE_PARM(cs_debugmask, "i");
+module_param(cs_debuglevel, ulong, 0644);
+module_param(cs_debugmask, ulong, 0644);
 cs_debuglevel=N
 cs_debugmask=0xMMMMMMMM
 where N is a value from 0 (no debug printfs), to 9 (maximum)
 0xMMMMMMMM is a debug mask corresponding to the CS_xxx bits (see driver source).
 
-MODULE_PARM(hercules_egpio_disable, "i");
+module_param(hercules_egpio_disable, ulong, 0);
 hercules_egpio_disable=N
 where N is a 0 (enable egpio), or a 1 (disable egpio support)
 
-MODULE_PARM(initdelay, "i");
+module_param(initdelay, ulong, 0);
 initdelay=N
 This value is used to determine the millescond delay during the initialization
 code prior to powering up the PLL.  On laptops this value can be used to
@@ -118,19 +118,19 @@ system is booted under battery power then the mdelay()/udelay() functions fail t
 properly delay the required time.  Also, if the system is booted under AC power
 and then the power removed, the mdelay()/udelay() functions will not delay properly.
  
-MODULE_PARM(powerdown, "i");
+module_param(powerdown, ulong, 0);
 powerdown=N
 where N is 0 (disable any powerdown of the internal blocks) or 1 (enable powerdown)
 
 
-MODULE_PARM(external_amp, "i");
+module_param(external_amp, bool, 0);
 external_amp=1
 if N is set to 1, then force enabling the EAPD support in the primary AC97 codec.
 override the detection logic and force the external amp bit in the AC97 0x26 register
 to be reset (0).  EAPD should be 0 for powerup, and 1 for powerdown.  The VTB Santa Cruz
 card has inverted logic, so there is a special function for these cards.
 
-MODULE_PARM(thinkpad, "i");
+module_param(thinkpad, bool, 0);
 thinkpad=1
 if N is set to 1, then force enabling the clkrun functionality.
 Currently, when the part is being used, then clkrun is disabled for the entire system,
diff --git a/arch/ppc/8xx_io/cs4218_tdm.c b/arch/ppc/8xx_io/cs4218_tdm.c
index 49eb2a7e65c0..a892356d5c3b 100644
--- a/arch/ppc/8xx_io/cs4218_tdm.c
+++ b/arch/ppc/8xx_io/cs4218_tdm.c
@@ -126,11 +126,11 @@ static int numReadBufs = 4, readbufSize = 32;
 */
 static volatile cbd_t	*rx_base, *rx_cur, *tx_base, *tx_cur;
 
-MODULE_PARM(catchRadius, "i");
-MODULE_PARM(numBufs, "i");
-MODULE_PARM(bufSize, "i");
-MODULE_PARM(numreadBufs, "i");
-MODULE_PARM(readbufSize, "i");
+module_param(catchRadius, int, 0);
+module_param(numBufs, int, 0);
+module_param(bufSize, int, 0);
+module_param(numreadBufs, int, 0);
+module_param(readbufSize, int, 0);
 
 #define arraysize(x)	(sizeof(x)/sizeof(*(x)))
 #define le2be16(x)	(((x)<<8 & 0xff00) | ((x)>>8 & 0x00ff))
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index f8ce235ccfc3..c39650920bdf 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -271,7 +271,7 @@ unsigned char *DMABuffer;			  /* buffer for writes */
 static unsigned long PhysDMABuffer;   /* physical address */
 
 static int UseTrackbuffer = -1;		  /* Do track buffering? */
-MODULE_PARM(UseTrackbuffer, "i");
+module_param(UseTrackbuffer, int, 0);
 
 unsigned char *TrackBuffer;			  /* buffer for reads */
 static unsigned long PhysTrackBuffer; /* physical address */
@@ -296,7 +296,7 @@ static int MotorOn = 0, MotorOffTrys;
 static int IsFormatting = 0, FormatError;
 
 static int UserSteprate[FD_MAX_UNITS] = { -1, -1 };
-MODULE_PARM(UserSteprate, "1-" __MODULE_STRING(FD_MAX_UNITS) "i");
+module_param_array(UserSteprate, int, NULL, 0);
 
 /* Synchronization of FDC access. */
 static volatile int fdc_busy = 0;
diff --git a/drivers/cdrom/cm206.c b/drivers/cdrom/cm206.c
index fad27a87ce35..f43a988dd413 100644
--- a/drivers/cdrom/cm206.c
+++ b/drivers/cdrom/cm206.c
@@ -218,12 +218,12 @@ static int cm206_base = CM206_BASE;
 static int cm206_irq = CM206_IRQ;
 #ifdef MODULE
 static int cm206[2] = { 0, 0 };	/* for compatible `insmod' parameter passing */
+module_param_array(cm206, int, NULL, 0);	/* base,irq or irq,base */
 #endif
 
-MODULE_PARM(cm206_base, "i");	/* base */
-MODULE_PARM(cm206_irq, "i");	/* irq */
-MODULE_PARM(cm206, "1-2i");	/* base,irq or irq,base */
-MODULE_PARM(auto_probe, "i");	/* auto probe base and irq */
+module_param(cm206_base, int, 0);	/* base */
+module_param(cm206_irq, int, 0);	/* irq */
+module_param(auto_probe, bool, 0);	/* auto probe base and irq */
 MODULE_LICENSE("GPL");
 
 #define POLLOOP 100		/* milliseconds */
diff --git a/drivers/cdrom/sbpcd.c b/drivers/cdrom/sbpcd.c
index 4760f515f591..05c9e865ecaf 100644
--- a/drivers/cdrom/sbpcd.c
+++ b/drivers/cdrom/sbpcd.c
@@ -464,8 +464,13 @@ static int sbpcd[] =
 static  __cacheline_aligned DEFINE_SPINLOCK(sbpcd_lock);
 static struct request_queue *sbpcd_queue;
 
-MODULE_PARM(sbpcd, "2i");
-MODULE_PARM(max_drives, "i");
+/* You can only set the first pair, from old MODULE_PARM code.  */
+static int sbpcd_set(const char *val, struct kernel_param *kp)
+{
+	get_options((char *)val, 2, (int *)sbpcd);
+	return 0;
+}
+module_param_call(sbpcd, sbpcd_set, NULL, NULL, 0);
 
 #define NUM_PROBE  (sizeof(sbpcd) / sizeof(int))
 
@@ -553,6 +558,7 @@ static unsigned char msgnum;
 static char msgbuf[80];
 
 static int max_drives = MAX_DRIVES;
+module_param(max_drives, int, 0);
 #ifndef MODULE
 static unsigned char setup_done;
 static const char *str_sb_l = "soundblaster";
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index ede128356af2..e5247f85a446 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -378,13 +378,13 @@ MODULE_DESCRIPTION("Stallion Intelligent Multiport Serial Driver");
 MODULE_LICENSE("GPL");
 
 
-MODULE_PARM(board0, "1-3s");
+module_param_array(board0, charp, NULL, 0);
 MODULE_PARM_DESC(board0, "Board 0 config -> name[,ioaddr[,memaddr]");
-MODULE_PARM(board1, "1-3s");
+module_param_array(board1, charp, NULL, 0);
 MODULE_PARM_DESC(board1, "Board 1 config -> name[,ioaddr[,memaddr]");
-MODULE_PARM(board2, "1-3s");
+module_param_array(board2, charp, NULL, 0);
 MODULE_PARM_DESC(board2, "Board 2 config -> name[,ioaddr[,memaddr]");
-MODULE_PARM(board3, "1-3s");
+module_param_array(board3, charp, NULL, 0);
 MODULE_PARM_DESC(board3, "Board 3 config -> name[,ioaddr[,memaddr]");
 
 #endif
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index ea725a9964e2..0fb2fb9fb024 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -243,10 +243,10 @@ static int verbose = 0;
 
 MODULE_AUTHOR("Casper Yang");
 MODULE_DESCRIPTION("MOXA Smartio/Industio Family Multiport Board Device Driver");
-MODULE_PARM(ioaddr, "1-4i");
-MODULE_PARM(ttymajor, "i");
-MODULE_PARM(calloutmajor, "i");
-MODULE_PARM(verbose, "i");
+module_param_array(ioaddr, int, NULL, 0);
+module_param(ttymajor, int, 0);
+module_param(calloutmajor, int, 0);
+module_param(verbose, bool, 0);
 MODULE_LICENSE("GPL");
 
 struct mxser_log {
diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c
index 119e629656b7..657c0d88f48c 100644
--- a/drivers/char/riscom8.c
+++ b/drivers/char/riscom8.c
@@ -1743,10 +1743,10 @@ static int iobase;
 static int iobase1;
 static int iobase2;
 static int iobase3;
-MODULE_PARM(iobase, "i");
-MODULE_PARM(iobase1, "i");
-MODULE_PARM(iobase2, "i");
-MODULE_PARM(iobase3, "i");
+module_param(iobase, int, 0);
+module_param(iobase1, int, 0);
+module_param(iobase2, int, 0);
+module_param(iobase3, int, 0);
 
 MODULE_LICENSE("GPL");
 #endif /* MODULE */
diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c
index 91dd0551fc7c..4d64e5cbcdbf 100644
--- a/drivers/isdn/hardware/avm/b1dma.c
+++ b/drivers/isdn/hardware/avm/b1dma.c
@@ -39,7 +39,7 @@ MODULE_AUTHOR("Carsten Paeth");
 MODULE_LICENSE("GPL");
 
 static int suppress_pollack = 0;
-MODULE_PARM(suppress_pollack, "0-1i");
+module_param(suppress_pollack, bool, 0);
 
 /* ------------------------------------------------------------- */
 
diff --git a/drivers/isdn/hardware/avm/b1isa.c b/drivers/isdn/hardware/avm/b1isa.c
index 38bd4dfecbd1..80fb488848b8 100644
--- a/drivers/isdn/hardware/avm/b1isa.c
+++ b/drivers/isdn/hardware/avm/b1isa.c
@@ -169,8 +169,8 @@ static struct pci_dev isa_dev[MAX_CARDS];
 static int io[MAX_CARDS];
 static int irq[MAX_CARDS];
 
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_CARDS) "i");
-MODULE_PARM(irq, "1-" __MODULE_STRING(MAX_CARDS) "i");
+module_param_array(io, int, NULL, 0);
+module_param_array(irq, int, NULL, 0);
 MODULE_PARM_DESC(io, "I/O base address(es)");
 MODULE_PARM_DESC(irq, "IRQ number(s) (assigned)");
 
diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c
index 724aac2c1cca..f7253b2136ea 100644
--- a/drivers/isdn/hardware/avm/c4.c
+++ b/drivers/isdn/hardware/avm/c4.c
@@ -50,7 +50,7 @@ MODULE_DEVICE_TABLE(pci, c4_pci_tbl);
 MODULE_DESCRIPTION("CAPI4Linux: Driver for AVM C2/C4 cards");
 MODULE_AUTHOR("Carsten Paeth");
 MODULE_LICENSE("GPL");
-MODULE_PARM(suppress_pollack, "0-1i");
+module_param(suppress_pollack, bool, 0);
 
 /* ------------------------------------------------------------- */
 
diff --git a/drivers/isdn/hardware/avm/t1isa.c b/drivers/isdn/hardware/avm/t1isa.c
index 3b701d97bdf1..5a2f854d55b5 100644
--- a/drivers/isdn/hardware/avm/t1isa.c
+++ b/drivers/isdn/hardware/avm/t1isa.c
@@ -519,9 +519,9 @@ static int io[MAX_CARDS];
 static int irq[MAX_CARDS];
 static int cardnr[MAX_CARDS];
 
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_CARDS) "i");
-MODULE_PARM(irq, "1-" __MODULE_STRING(MAX_CARDS) "i");
-MODULE_PARM(cardnr, "1-" __MODULE_STRING(MAX_CARDS) "i");
+module_param_array(io, int, NULL, 0);
+module_param_array(irq, int, NULL, 0);
+module_param_array(cardnr, int, NULL, 0);
 MODULE_PARM_DESC(io, "I/O base address(es)");
 MODULE_PARM_DESC(irq, "IRQ number(s) (assigned)");
 MODULE_PARM_DESC(cardnr, "Card number(s) (as jumpered)");
diff --git a/drivers/isdn/hysdn/hycapi.c b/drivers/isdn/hysdn/hycapi.c
index 55fbea0b6ac8..6bac43cc91bd 100644
--- a/drivers/isdn/hysdn/hycapi.c
+++ b/drivers/isdn/hysdn/hycapi.c
@@ -31,7 +31,7 @@
 static char hycapi_revision[]="$Revision: 1.8.6.4 $";
 
 unsigned int hycapi_enable = 0xffffffff; 
-MODULE_PARM(hycapi_enable, "i");
+module_param(hycapi_enable, uint, 0);
 
 typedef struct _hycapi_appl {
 	unsigned int ctrl_mask;
diff --git a/drivers/isdn/hysdn/hysdn_net.c b/drivers/isdn/hysdn/hysdn_net.c
index 683647b1f2bc..d205249a1242 100644
--- a/drivers/isdn/hysdn/hysdn_net.c
+++ b/drivers/isdn/hysdn/hysdn_net.c
@@ -24,7 +24,7 @@
 #include "hysdn_defs.h"
 
 unsigned int hynet_enable = 0xffffffff; 
-MODULE_PARM(hynet_enable, "i");
+module_param(hynet_enable, uint, 0);
 
 /* store the actual version for log reporting */
 char *hysdn_net_revision = "$Revision: 1.8.6.4 $";
diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c
index 33d339700411..a67d31af797a 100644
--- a/drivers/isdn/isdnloop/isdnloop.c
+++ b/drivers/isdn/isdnloop/isdnloop.c
@@ -22,7 +22,7 @@ static char *isdnloop_id = "loop0";
 MODULE_DESCRIPTION("ISDN4Linux: Pseudo Driver that simulates an ISDN card");
 MODULE_AUTHOR("Fritz Elfert");
 MODULE_LICENSE("GPL");
-MODULE_PARM(isdnloop_id, "s");
+module_param(isdnloop_id, charp, 0);
 MODULE_PARM_DESC(isdnloop_id, "ID-String of first card");
 
 static int isdnloop_addcard(char *);
diff --git a/drivers/media/video/zr36120.c b/drivers/media/video/zr36120.c
index d4c633b8a7f5..417a999afee3 100644
--- a/drivers/media/video/zr36120.c
+++ b/drivers/media/video/zr36120.c
@@ -70,10 +70,10 @@ MODULE_AUTHOR("Pauline Middelink <middelin@polyware.nl>");
 MODULE_DESCRIPTION("Zoran ZR36120 based framegrabber");
 MODULE_LICENSE("GPL");
 
-MODULE_PARM(triton1,"i");
-MODULE_PARM(cardtype,"1-" __MODULE_STRING(ZORAN_MAX) "i");
-MODULE_PARM(video_nr,"i");
-MODULE_PARM(vbi_nr,"i");
+module_param(triton1, uint, 0);
+module_param_array(cardtype, uint, NULL, 0);
+module_param(video_nr, int, 0);
+module_param(vbi_nr, int, 0);
 
 static int zoran_cards;
 static struct zoran zorans[ZORAN_MAX];
diff --git a/drivers/mmc/au1xmmc.c b/drivers/mmc/au1xmmc.c
index 8d84b045bc83..85e89c77bdea 100644
--- a/drivers/mmc/au1xmmc.c
+++ b/drivers/mmc/au1xmmc.c
@@ -87,7 +87,7 @@ struct au1xmmc_host *au1xmmc_hosts[AU1XMMC_CONTROLLER_COUNT];
 static int dma = 1;
 
 #ifdef MODULE
-MODULE_PARM(dma, "i");
+module_param(dma, bool, 0);
 MODULE_PARM_DESC(dma, "Use DMA engine for data transfers (0 = disabled)");
 #endif
 
diff --git a/drivers/mtd/maps/pcmciamtd.c b/drivers/mtd/maps/pcmciamtd.c
index f0f8916da7ad..f988c817e196 100644
--- a/drivers/mtd/maps/pcmciamtd.c
+++ b/drivers/mtd/maps/pcmciamtd.c
@@ -28,7 +28,7 @@
 
 #ifdef CONFIG_MTD_DEBUG
 static int debug = CONFIG_MTD_DEBUG_VERBOSE;
-MODULE_PARM(debug, "i");
+module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "Set Debug Level 0=quiet, 5=noisy");
 #undef DEBUG
 #define DEBUG(n, format, arg...) \
@@ -89,17 +89,17 @@ static int mem_type;
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Simon Evans <spse@secret.org.uk>");
 MODULE_DESCRIPTION(DRIVER_DESC);
-MODULE_PARM(bankwidth, "i");
+module_param(bankwidth, int, 0);
 MODULE_PARM_DESC(bankwidth, "Set bankwidth (1=8 bit, 2=16 bit, default=2)");
-MODULE_PARM(mem_speed, "i");
+module_param(mem_speed, int, 0);
 MODULE_PARM_DESC(mem_speed, "Set memory access speed in ns");
-MODULE_PARM(force_size, "i");
+module_param(force_size, int, 0);
 MODULE_PARM_DESC(force_size, "Force size of card in MiB (1-64)");
-MODULE_PARM(setvpp, "i");
+module_param(setvpp, int, 0);
 MODULE_PARM_DESC(setvpp, "Set Vpp (0=Never, 1=On writes, 2=Always on, default=0)");
-MODULE_PARM(vpp, "i");
+module_param(vpp, int, 0);
 MODULE_PARM_DESC(vpp, "Vpp value in 1/10ths eg 33=3.3V 120=12V (Dangerous)");
-MODULE_PARM(mem_type, "i");
+module_param(mem_type, int, 0);
 MODULE_PARM_DESC(mem_type, "Set Memory type (0=Flash, 1=RAM, 2=ROM, default=0)");
 
 
diff --git a/drivers/net/atari_bionet.c b/drivers/net/atari_bionet.c
index 0095384ff454..5e5f80b99b9e 100644
--- a/drivers/net/atari_bionet.c
+++ b/drivers/net/atari_bionet.c
@@ -123,7 +123,7 @@ static char version[] =
  * Global variable 'bionet_debug'. Can be set at load time by 'insmod'
  */
 unsigned int bionet_debug = NET_DEBUG;
-MODULE_PARM(bionet_debug, "i");
+module_param(bionet_debug, int, 0);
 MODULE_PARM_DESC(bionet_debug, "bionet debug level (0-2)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/atari_pamsnet.c b/drivers/net/atari_pamsnet.c
index 8b997809f9de..d6039e62d832 100644
--- a/drivers/net/atari_pamsnet.c
+++ b/drivers/net/atari_pamsnet.c
@@ -119,7 +119,7 @@ static char *version =
  * Global variable 'pamsnet_debug'. Can be set at load time by 'insmod'
  */
 unsigned int pamsnet_debug = NET_DEBUG;
-MODULE_PARM(pamsnet_debug, "i");
+module_param(pamsnet_debug, int, 0);
 MODULE_PARM_DESC(pamsnet_debug, "pamsnet debug enable (0-1)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c
index e01b6a78ec63..442b2cbeb58a 100644
--- a/drivers/net/atarilance.c
+++ b/drivers/net/atarilance.c
@@ -78,7 +78,7 @@ static int lance_debug = LANCE_DEBUG;
 #else
 static int lance_debug = 1;
 #endif
-MODULE_PARM(lance_debug, "i");
+module_param(lance_debug, int, 0);
 MODULE_PARM_DESC(lance_debug, "atarilance debug level (0-3)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 8f1573e658a5..ac48f7543500 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -192,12 +192,15 @@
 static char version[] __devinitdata =
 	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
 
+static int cassini_debug = -1;	/* -1 == use CAS_DEF_MSG_ENABLE as value */
+static int link_mode;
+
 MODULE_AUTHOR("Adrian Sun (asun@darksunrising.com)");
 MODULE_DESCRIPTION("Sun Cassini(+) ethernet driver");
 MODULE_LICENSE("GPL");
-MODULE_PARM(cassini_debug, "i");
+module_param(cassini_debug, int, 0);
 MODULE_PARM_DESC(cassini_debug, "Cassini bitmapped debugging message enable value");
-MODULE_PARM(link_mode, "i");
+module_param(link_mode, int, 0);
 MODULE_PARM_DESC(link_mode, "default link mode");
 
 /*
@@ -209,7 +212,7 @@ MODULE_PARM_DESC(link_mode, "default link mode");
  * Value in seconds, for user input.
  */
 static int linkdown_timeout = DEFAULT_LINKDOWN_TIMEOUT;
-MODULE_PARM(linkdown_timeout, "i");
+module_param(linkdown_timeout, int, 0);
 MODULE_PARM_DESC(linkdown_timeout,
 "min reset interval in sec. for PCS linkdown issue; disabled if not positive");
 
@@ -221,8 +224,6 @@ MODULE_PARM_DESC(linkdown_timeout,
 static int link_transition_timeout;
 
 
-static int cassini_debug = -1;	/* -1 == use CAS_DEF_MSG_ENABLE as value */
-static int link_mode;
 
 static u16 link_modes[] __devinitdata = {
 	BMCR_ANENABLE,			 /* 0 : autoneg */
diff --git a/drivers/net/chelsio/cxgb2.c b/drivers/net/chelsio/cxgb2.c
index 349ebe783ed6..7fe2638ae06d 100644
--- a/drivers/net/chelsio/cxgb2.c
+++ b/drivers/net/chelsio/cxgb2.c
@@ -124,7 +124,7 @@ MODULE_LICENSE("GPL");
 
 static int dflt_msg_enable = DFLT_MSG_ENABLE;
 
-MODULE_PARM(dflt_msg_enable, "i");
+module_param(dflt_msg_enable, int, 0);
 MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T1 message enable bitmap");
 
 
diff --git a/drivers/net/fec_8xx/fec_main.c b/drivers/net/fec_8xx/fec_main.c
index b4f3a9f8a535..7e4338097139 100644
--- a/drivers/net/fec_8xx/fec_main.c
+++ b/drivers/net/fec_8xx/fec_main.c
@@ -55,11 +55,11 @@ MODULE_AUTHOR("Pantelis Antoniou <panto@intracom.gr>");
 MODULE_DESCRIPTION("Motorola 8xx FEC ethernet driver");
 MODULE_LICENSE("GPL");
 
-MODULE_PARM(fec_8xx_debug, "i");
+int fec_8xx_debug = -1;		/* -1 == use FEC_8XX_DEF_MSG_ENABLE as value */
+module_param(fec_8xx_debug, int, 0);
 MODULE_PARM_DESC(fec_8xx_debug,
 		 "FEC 8xx bitmapped debugging message enable value");
 
-int fec_8xx_debug = -1;		/* -1 == use FEC_8XX_DEF_MSG_ENABLE as value */
 
 /*************************************************/
 
diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c
index f5d49a110654..196298f33db8 100644
--- a/drivers/net/fs_enet/fs_enet-main.c
+++ b/drivers/net/fs_enet/fs_enet-main.c
@@ -58,11 +58,11 @@ MODULE_DESCRIPTION("Freescale Ethernet Driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
-MODULE_PARM(fs_enet_debug, "i");
+int fs_enet_debug = -1;		/* -1 == use FS_ENET_DEF_MSG_ENABLE as value */
+module_param(fs_enet_debug, int, 0);
 MODULE_PARM_DESC(fs_enet_debug,
 		 "Freescale bitmapped debugging message enable value");
 
-int fs_enet_debug = -1;		/* -1 == use FS_ENET_DEF_MSG_ENABLE as value */
 
 static void fs_set_multicast_list(struct net_device *dev)
 {
diff --git a/drivers/net/gt96100eth.c b/drivers/net/gt96100eth.c
index 5958a6314723..2d2435404614 100644
--- a/drivers/net/gt96100eth.c
+++ b/drivers/net/gt96100eth.c
@@ -114,8 +114,8 @@ static int max_interrupt_work = 32;
 
 static char mac0[18] = "00.02.03.04.05.06";
 static char mac1[18] = "00.01.02.03.04.05";
-MODULE_PARM(mac0, "c18");
-MODULE_PARM(mac1, "c18");
+module_param_string(mac0, mac0, 18, 0);
+module_param_string(mac1, mac0, 18, 0);
 MODULE_PARM_DESC(mac0, "MAC address for GT96100 ethernet port 0");
 MODULE_PARM_DESC(mac1, "MAC address for GT96100 ethernet port 1");
 
diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index c8dc40214a08..79a8fbcf5f93 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -280,7 +280,7 @@ static unsigned long rand;
 
 MODULE_AUTHOR("Klaus Kudielka");
 MODULE_DESCRIPTION("Driver for high-speed SCC boards");
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_NUM_DEVS) "i");
+module_param_array(io, int, NULL, 0);
 MODULE_LICENSE("GPL");
 
 static void __exit dmascc_exit(void)
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index dc5e9d59deed..d81a8e1eeb8d 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -1012,7 +1012,7 @@ static void __exit mkiss_exit_driver(void)
 
 MODULE_AUTHOR("Ralf Baechle DL5RB <ralf@linux-mips.org>");
 MODULE_DESCRIPTION("KISS driver for AX.25 over TTYs");
-MODULE_PARM(crc_force, "i");
+module_param(crc_force, int, 0);
 MODULE_PARM_DESC(crc_force, "crc [0 = auto | 1 = none | 2 = flexnet | 3 = smack]");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_LDISC(N_AX25);
diff --git a/drivers/net/irda/irport.c b/drivers/net/irda/irport.c
index 6070195b87bd..98fa5319e5cc 100644
--- a/drivers/net/irda/irport.c
+++ b/drivers/net/irda/irport.c
@@ -1118,9 +1118,9 @@ static void __exit irport_cleanup(void)
  	}
 }
 
-MODULE_PARM(io, "1-4i");
+module_param_array(io, int, NULL, 0);
 MODULE_PARM_DESC(io, "Base I/O addresses");
-MODULE_PARM(irq, "1-4i");
+module_param_array(irq, int, NULL, 0);
 MODULE_PARM_DESC(irq, "IRQ lines");
 
 MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no>");
diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c
index f7b7238d8352..957888de3d7e 100644
--- a/drivers/net/lasi_82596.c
+++ b/drivers/net/lasi_82596.c
@@ -177,7 +177,7 @@ static int i596_debug = (DEB_SERIOUS|DEB_PROBE);
 MODULE_AUTHOR("Richard Hirst");
 MODULE_DESCRIPTION("i82596 driver");
 MODULE_LICENSE("GPL");
-MODULE_PARM(i596_debug, "i");
+module_param(i596_debug, int, 0);
 MODULE_PARM_DESC(i596_debug, "lasi_82596 debug mask");
 
 /* Copy frames shorter than rx_copybreak, otherwise pass on up in
@@ -1520,9 +1520,9 @@ static void set_multicast_list(struct net_device *dev)
 	}
 }
 
-MODULE_PARM(debug, "i");
-MODULE_PARM_DESC(debug, "lasi_82596 debug mask");
 static int debug = -1;
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "lasi_82596 debug mask");
 
 static int num_drivers;
 static struct net_device *netdevs[MAX_DRIVERS];
diff --git a/drivers/net/mac89x0.c b/drivers/net/mac89x0.c
index f65b0db111b8..cd3c9a5a98b2 100644
--- a/drivers/net/mac89x0.c
+++ b/drivers/net/mac89x0.c
@@ -629,7 +629,7 @@ static int set_mac_address(struct net_device *dev, void *addr)
 static struct net_device *dev_cs89x0;
 static int debug;
 
-MODULE_PARM(debug, "i");
+module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "CS89[02]0 debug level (0-5)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/mace.c b/drivers/net/mace.c
index 2a5add257b8f..77792b286027 100644
--- a/drivers/net/mace.c
+++ b/drivers/net/mace.c
@@ -1042,7 +1042,7 @@ static void __exit mace_cleanup(void)
 
 MODULE_AUTHOR("Paul Mackerras");
 MODULE_DESCRIPTION("PowerMac MACE driver.");
-MODULE_PARM(port_aaui, "i");
+module_param(port_aaui, int, 0);
 MODULE_PARM_DESC(port_aaui, "MACE uses AAUI port (0-1)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/meth.c b/drivers/net/meth.c
index e23655f5049f..d644bf3a933c 100644
--- a/drivers/net/meth.c
+++ b/drivers/net/meth.c
@@ -62,7 +62,7 @@ MODULE_DESCRIPTION("SGI O2 Builtin Fast Ethernet driver");
 
 #ifdef HAVE_TX_TIMEOUT
 static int timeout = TX_TIMEOUT;
-MODULE_PARM(timeout, "i");
+module_param(timeout, int, 0);
 #endif
 
 /*
diff --git a/drivers/net/ne-h8300.c b/drivers/net/ne-h8300.c
index aaebd28a1920..7ea3d596ac3b 100644
--- a/drivers/net/ne-h8300.c
+++ b/drivers/net/ne-h8300.c
@@ -601,9 +601,9 @@ static int io[MAX_NE_CARDS];
 static int irq[MAX_NE_CARDS];
 static int bad[MAX_NE_CARDS];	/* 0xbad = bad sig or no reset ack */
 
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_NE_CARDS) "i");
-MODULE_PARM(irq, "1-" __MODULE_STRING(MAX_NE_CARDS) "i");
-MODULE_PARM(bad, "1-" __MODULE_STRING(MAX_NE_CARDS) "i");
+module_param_array(io, int, NULL, 0);
+module_param_array(irq, int, NULL, 0);
+module_param_array(bad, int, NULL, 0);
 MODULE_PARM_DESC(io, "I/O base address(es)");
 MODULE_PARM_DESC(irq, "IRQ number(s)");
 MODULE_DESCRIPTION("H8/300 NE2000 Ethernet driver");
diff --git a/drivers/net/ni5010.c b/drivers/net/ni5010.c
index 2ab01a5d1d22..a68bf474f6ed 100644
--- a/drivers/net/ni5010.c
+++ b/drivers/net/ni5010.c
@@ -766,8 +766,8 @@ static void ni5010_show_registers(struct net_device *dev)
 #ifdef MODULE
 static struct net_device *dev_ni5010;
 
-MODULE_PARM(io, "i");
-MODULE_PARM(irq, "i");
+module_param(io, int, 0);
+module_param(irq, int, 0);
 MODULE_PARM_DESC(io, "ni5010 I/O base address");
 MODULE_PARM_DESC(irq, "ni5010 IRQ number");
 
diff --git a/drivers/net/sun3lance.c b/drivers/net/sun3lance.c
index 01bdb2334058..d4c0002b43db 100644
--- a/drivers/net/sun3lance.c
+++ b/drivers/net/sun3lance.c
@@ -71,7 +71,7 @@ static int lance_debug = LANCE_DEBUG;
 #else
 static int lance_debug = 1;
 #endif
-MODULE_PARM(lance_debug, "i");
+module_param(lance_debug, int, 0);
 MODULE_PARM_DESC(lance_debug, "SUN3 Lance debug level (0-3)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index dfe542b206cc..7967916bda18 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -43,7 +43,6 @@ MODULE_AUTHOR("Holger Smolinski <Holger.Smolinski@de.ibm.com>");
 MODULE_DESCRIPTION("Linux on S/390 DASD device driver,"
 		   " Copyright 2000 IBM Corporation");
 MODULE_SUPPORTED_DEVICE("dasd");
-MODULE_PARM(dasd, "1-" __MODULE_STRING(256) "s");
 MODULE_LICENSE("GPL");
 
 /*
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index f576f243cd93..2f720108a7e0 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -16,6 +16,7 @@
 #include <linux/config.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
+#include <linux/module.h>
 
 #include <asm/debug.h>
 #include <asm/uaccess.h>
@@ -69,6 +70,8 @@ int dasd_autodetect = 0;	/* is true, when autodetection is active */
  * strings when running as a module.
  */
 static char *dasd[256];
+module_param_array(dasd, charp, NULL, 0);
+
 /*
  * Single spinlock to protect devmap structures and lists.
  */
diff --git a/drivers/scsi/arm/cumana_2.c b/drivers/scsi/arm/cumana_2.c
index 583d2d8c8335..fad2109268bb 100644
--- a/drivers/scsi/arm/cumana_2.c
+++ b/drivers/scsi/arm/cumana_2.c
@@ -550,6 +550,6 @@ module_exit(cumanascsi2_exit);
 
 MODULE_AUTHOR("Russell King");
 MODULE_DESCRIPTION("Cumana SCSI-2 driver for Acorn machines");
-MODULE_PARM(term, "1-8i");
+module_param_array(term, int, NULL, 0);
 MODULE_PARM_DESC(term, "SCSI bus termination");
 MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/arm/eesox.c b/drivers/scsi/arm/eesox.c
index 3ffec7efc9d5..dcbb4b2b3fe0 100644
--- a/drivers/scsi/arm/eesox.c
+++ b/drivers/scsi/arm/eesox.c
@@ -674,6 +674,6 @@ module_exit(eesox_exit);
 
 MODULE_AUTHOR("Russell King");
 MODULE_DESCRIPTION("EESOX 'Fast' SCSI driver for Acorn machines");
-MODULE_PARM(term, "1-8i");
+module_param_array(term, int, NULL, 0);
 MODULE_PARM_DESC(term, "SCSI bus termination");
 MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/arm/powertec.c b/drivers/scsi/arm/powertec.c
index 3113bdcedb13..3d69f6c45a6b 100644
--- a/drivers/scsi/arm/powertec.c
+++ b/drivers/scsi/arm/powertec.c
@@ -466,6 +466,6 @@ module_exit(powertecscsi_exit);
 
 MODULE_AUTHOR("Russell King");
 MODULE_DESCRIPTION("Powertec SCSI driver");
-MODULE_PARM(term, "1-8i");
+module_param_array(term, int, NULL, 0);
 MODULE_PARM_DESC(term, "SCSI bus termination");
 MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/atari_scsi.c b/drivers/scsi/atari_scsi.c
index f4c1ca7c1572..f677c5a32a68 100644
--- a/drivers/scsi/atari_scsi.c
+++ b/drivers/scsi/atari_scsi.c
@@ -239,17 +239,17 @@ static int atari_read_overruns = 0;
 #endif
 
 static int setup_can_queue = -1;
-MODULE_PARM(setup_can_queue, "i");
+module_param(setup_can_queue, int, 0);
 static int setup_cmd_per_lun = -1;
-MODULE_PARM(setup_cmd_per_lun, "i");
+module_param(setup_cmd_per_lun, int, 0);
 static int setup_sg_tablesize = -1;
-MODULE_PARM(setup_sg_tablesize, "i");
+module_param(setup_sg_tablesize, int, 0);
 #ifdef SUPPORT_TAGS
 static int setup_use_tagged_queuing = -1;
-MODULE_PARM(setup_use_tagged_queuing, "i");
+module_param(setup_use_tagged_queuing, int, 0);
 #endif
 static int setup_hostid = -1;
-MODULE_PARM(setup_hostid, "i");
+module_param(setup_hostid, int, 0);
 
 
 #if defined(CONFIG_TT_DMA_EMUL)
diff --git a/drivers/video/pm3fb.c b/drivers/video/pm3fb.c
index 0e78ddc81583..52c18a35fb41 100644
--- a/drivers/video/pm3fb.c
+++ b/drivers/video/pm3fb.c
@@ -3532,26 +3532,26 @@ int __init pm3fb_init(void)
 MODULE_AUTHOR("Romain Dolbeau");
 MODULE_DESCRIPTION("Permedia3 framebuffer device driver");
 static char *mode[PM3_MAX_BOARD];
-MODULE_PARM(mode,PM3_MAX_BOARD_MODULE_ARRAY_STRING);
+module_param_array(mode, charp, NULL, 0);
 MODULE_PARM_DESC(mode,"video mode");
-MODULE_PARM(disable,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param_array(disable, short, NULL, 0);
 MODULE_PARM_DESC(disable,"disable board");
 static short off[PM3_MAX_BOARD];
-MODULE_PARM(off,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param_array(off, short, NULL, 0);
 MODULE_PARM_DESC(off,"disable board");
 static char *pciid[PM3_MAX_BOARD];
-MODULE_PARM(pciid,PM3_MAX_BOARD_MODULE_ARRAY_STRING);
+module_param_array(pciid, charp, NULL, 0);
 MODULE_PARM_DESC(pciid,"board PCI Id");
-MODULE_PARM(noaccel,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param_array(noaccel, short, NULL, 0);
 MODULE_PARM_DESC(noaccel,"disable accel");
 static char *font[PM3_MAX_BOARD];
-MODULE_PARM(font,PM3_MAX_BOARD_MODULE_ARRAY_STRING);
+module_param_array(font, charp, NULL, 0);
 MODULE_PARM_DESC(font,"choose font");
-MODULE_PARM(depth,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param(depth, short, NULL, 0);
 MODULE_PARM_DESC(depth,"boot-time depth");
-MODULE_PARM(printtimings, "h");
+module_param(printtimings, short, NULL, 0);
 MODULE_PARM_DESC(printtimings, "print the memory timings of the card(s)");
-MODULE_PARM(forcesize, PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param(forcesize, short, NULL, 0);
 MODULE_PARM_DESC(forcesize, "force specified memory size");
 /*
 MODULE_SUPPORTED_DEVICE("Permedia3 PCI boards")
diff --git a/include/linux/module.h b/include/linux/module.h
index 70bd843c71cb..e144309836af 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -554,25 +554,6 @@ static inline void module_remove_driver(struct device_driver *driver)
 
 /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */
 
-struct obsolete_modparm {
-	char name[64];
-	char type[64-sizeof(void *)];
-	void *addr;
-};
-
-static inline void MODULE_PARM_(void) { }
-#ifdef MODULE
-/* DEPRECATED: Do not use. */
-#define MODULE_PARM(var,type)						    \
-extern struct obsolete_modparm __parm_##var \
-__attribute__((section("__obsparm"))); \
-struct obsolete_modparm __parm_##var = \
-{ __stringify(var), type, &MODULE_PARM_ }; \
-__MODULE_PARM_TYPE(var, type);
-#else
-#define MODULE_PARM(var,type) static void __attribute__((__unused__)) *__parm_##var = &MODULE_PARM_;
-#endif
-
 #define __MODULE_STRING(x) __stringify(x)
 
 /* Use symbol_get and symbol_put instead.  You'll thank me. */
diff --git a/include/video/pm3fb.h b/include/video/pm3fb.h
index 6f4ea808cf74..ac021379ac40 100644
--- a/include/video/pm3fb.h
+++ b/include/video/pm3fb.h
@@ -1128,10 +1128,7 @@
 #endif
 
 /* max number of simultaneous board */
-/* warning : make sure module array def's are coherent with PM3_MAX_BOARD */
 #define PM3_MAX_BOARD 4
-#define PM3_MAX_BOARD_MODULE_ARRAY_SHORT "1-4h"
-#define PM3_MAX_BOARD_MODULE_ARRAY_STRING "1-4s"
 
 /* max size of options */
 #define PM3_OPTIONS_SIZE 256
diff --git a/init/Kconfig b/init/Kconfig
index 1d19fd25204b..05951c1d654e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -470,15 +470,6 @@ config MODULE_FORCE_UNLOAD
 	  rmmod).  This is mainly for kernel developers and desperate users.
 	  If unsure, say N.
 
-config OBSOLETE_MODPARM
-	bool
-	default y
-	depends on MODULES
-	help
-	  You need this option to use module parameters on modules which
-	  have not been converted to the new module parameter system yet.
-	  If unsure, say Y.
-
 config MODVERSIONS
 	bool "Module versioning support"
 	depends on MODULES
diff --git a/kernel/module.c b/kernel/module.c
index 54623c714bba..ddfe45ac2fd1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -233,24 +233,6 @@ static unsigned long __find_symbol(const char *name,
  	return 0;
 }
 
-/* Find a symbol in this elf symbol table */
-static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
-				       unsigned int symindex,
-				       const char *strtab,
-				       const char *name)
-{
-	unsigned int i;
-	Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
-
-	/* Search (defined) internal symbols first. */
-	for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
-		if (sym[i].st_shndx != SHN_UNDEF
-		    && strcmp(name, strtab + sym[i].st_name) == 0)
-			return sym[i].st_value;
-	}
-	return 0;
-}
-
 /* Search for module by name: must hold module_mutex. */
 static struct module *find_module(const char *name)
 {
@@ -785,139 +767,6 @@ static struct module_attribute *modinfo_attrs[] = {
 	NULL,
 };
 
-#ifdef CONFIG_OBSOLETE_MODPARM
-/* Bounds checking done below */
-static int obsparm_copy_string(const char *val, struct kernel_param *kp)
-{
-	strcpy(kp->arg, val);
-	return 0;
-}
-
-static int set_obsolete(const char *val, struct kernel_param *kp)
-{
-	unsigned int min, max;
-	unsigned int size, maxsize;
-	int dummy;
-	char *endp;
-	const char *p;
-	struct obsolete_modparm *obsparm = kp->arg;
-
-	if (!val) {
-		printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
-		return -EINVAL;
-	}
-
-	/* type is: [min[-max]]{b,h,i,l,s} */
-	p = obsparm->type;
-	min = simple_strtol(p, &endp, 10);
-	if (endp == obsparm->type)
-		min = max = 1;
-	else if (*endp == '-') {
-		p = endp+1;
-		max = simple_strtol(p, &endp, 10);
-	} else
-		max = min;
-	switch (*endp) {
-	case 'b':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   1, param_set_byte, &dummy);
-	case 'h':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(short), param_set_short, &dummy);
-	case 'i':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(int), param_set_int, &dummy);
-	case 'l':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(long), param_set_long, &dummy);
-	case 's':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(char *), param_set_charp, &dummy);
-
-	case 'c':
-		/* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
-		   and the decl is "char xxx[5][50];" */
-		p = endp+1;
-		maxsize = simple_strtol(p, &endp, 10);
-		/* We check lengths here (yes, this is a hack). */
-		p = val;
-		while (p[size = strcspn(p, ",")]) {
-			if (size >= maxsize) 
-				goto oversize;
-			p += size+1;
-		}
-		if (size >= maxsize) 
-			goto oversize;
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   maxsize, obsparm_copy_string, &dummy);
-	}
-	printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
-	return -EINVAL;
- oversize:
-	printk(KERN_ERR
-	       "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
-	return -EINVAL;
-}
-
-static int obsolete_params(const char *name,
-			   char *args,
-			   struct obsolete_modparm obsparm[],
-			   unsigned int num,
-			   Elf_Shdr *sechdrs,
-			   unsigned int symindex,
-			   const char *strtab)
-{
-	struct kernel_param *kp;
-	unsigned int i;
-	int ret;
-
-	kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
-	if (!kp)
-		return -ENOMEM;
-
-	for (i = 0; i < num; i++) {
-		char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
-
-		snprintf(sym_name, sizeof(sym_name), "%s%s",
-			 MODULE_SYMBOL_PREFIX, obsparm[i].name);
-
-		kp[i].name = obsparm[i].name;
-		kp[i].perm = 000;
-		kp[i].set = set_obsolete;
-		kp[i].get = NULL;
-		obsparm[i].addr
-			= (void *)find_local_symbol(sechdrs, symindex, strtab,
-						    sym_name);
-		if (!obsparm[i].addr) {
-			printk("%s: falsely claims to have parameter %s\n",
-			       name, obsparm[i].name);
-			ret = -EINVAL;
-			goto out;
-		}
-		kp[i].arg = &obsparm[i];
-	}
-
-	ret = parse_args(name, args, kp, num, NULL);
- out:
-	kfree(kp);
-	return ret;
-}
-#else
-static int obsolete_params(const char *name,
-			   char *args,
-			   struct obsolete_modparm obsparm[],
-			   unsigned int num,
-			   Elf_Shdr *sechdrs,
-			   unsigned int symindex,
-			   const char *strtab)
-{
-	if (num != 0)
-		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
-		       name);
-	return 0;
-}
-#endif /* CONFIG_OBSOLETE_MODPARM */
-
 static const char vermagic[] = VERMAGIC_STRING;
 
 #ifdef CONFIG_MODVERSIONS
@@ -1874,27 +1723,17 @@ static struct module *load_module(void __user *umod,
 	set_fs(old_fs);
 
 	mod->args = args;
-	if (obsparmindex) {
-		err = obsolete_params(mod->name, mod->args,
-				      (struct obsolete_modparm *)
-				      sechdrs[obsparmindex].sh_addr,
-				      sechdrs[obsparmindex].sh_size
-				      / sizeof(struct obsolete_modparm),
-				      sechdrs, symindex,
-				      (char *)sechdrs[strindex].sh_addr);
-		if (setupindex)
-			printk(KERN_WARNING "%s: Ignoring new-style "
-			       "parameters in presence of obsolete ones\n",
-			       mod->name);
-	} else {
-		/* Size of section 0 is 0, so this works well if no params */
-		err = parse_args(mod->name, mod->args,
-				 (struct kernel_param *)
-				 sechdrs[setupindex].sh_addr,
-				 sechdrs[setupindex].sh_size
-				 / sizeof(struct kernel_param),
-				 NULL);
-	}
+	if (obsparmindex)
+		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+		       mod->name);
+
+	/* Size of section 0 is 0, so this works well if no params */
+	err = parse_args(mod->name, mod->args,
+			 (struct kernel_param *)
+			 sechdrs[setupindex].sh_addr,
+			 sechdrs[setupindex].sh_size
+			 / sizeof(struct kernel_param),
+			 NULL);
 	if (err < 0)
 		goto arch_cleanup;
 
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9a1fa8894b95..b4b362b5baf5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -54,15 +54,15 @@ static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
 
-MODULE_PARM(nreaders, "i");
+module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-MODULE_PARM(stat_interval, "i");
+module_param(stat_interval, int, 0);
 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-MODULE_PARM(verbose, "i");
+module_param(verbose, bool, 0);
 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-MODULE_PARM(test_no_idle_hz, "i");
+module_param(test_no_idle_hz, bool, 0);
 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
-MODULE_PARM(shuffle_interval, "i");
+module_param(shuffle_interval, int, 0);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
 #define TORTURE_FLAG "rcutorture: "
 #define PRINTK_STRING(s) \
diff --git a/sound/oss/au1000.c b/sound/oss/au1000.c
index fe54de25aafc..eacb0aef21e1 100644
--- a/sound/oss/au1000.c
+++ b/sound/oss/au1000.c
@@ -100,7 +100,7 @@
 
 /* Boot options */
 static int      vra = 0;	// 0 = no VRA, 1 = use VRA if codec supports it
-MODULE_PARM(vra, "i");
+module_param(vra, bool, 0);
 MODULE_PARM_DESC(vra, "if 1 use VRA if codec supports it");
 
 
diff --git a/sound/oss/au1550_ac97.c b/sound/oss/au1550_ac97.c
index 6a4956b8025d..c1168fae6be6 100644
--- a/sound/oss/au1550_ac97.c
+++ b/sound/oss/au1550_ac97.c
@@ -79,7 +79,7 @@
  * 0 = no VRA, 1 = use VRA if codec supports it
  */
 static int      vra = 1;
-MODULE_PARM(vra, "i");
+module_param(vra, bool, 0);
 MODULE_PARM_DESC(vra, "if 1 use VRA if codec supports it");
 
 static struct au1550_state {
diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c
index c9302a1e515b..87bd3100aef3 100644
--- a/sound/oss/dmasound/dmasound_core.c
+++ b/sound/oss/dmasound/dmasound_core.c
@@ -195,18 +195,18 @@
      */
 
 int dmasound_catchRadius = 0;
-MODULE_PARM(dmasound_catchRadius, "i");
+module_param(dmasound_catchRadius, int, 0);
 
 static unsigned int numWriteBufs = DEFAULT_N_BUFFERS;
-MODULE_PARM(numWriteBufs, "i");
+module_param(numWriteBufs, int, 0);
 static unsigned int writeBufSize = DEFAULT_BUFF_SIZE ;	/* in bytes */
-MODULE_PARM(writeBufSize, "i");
+module_param(writeBufSize, int, 0);
 
 #ifdef HAS_RECORD
 static unsigned int numReadBufs = DEFAULT_N_BUFFERS;
-MODULE_PARM(numReadBufs, "i");
+module_param(numReadBufs, int, 0);
 static unsigned int readBufSize = DEFAULT_BUFF_SIZE;	/* in bytes */
-MODULE_PARM(readBufSize, "i");
+module_param(readBufSize, int, 0);
 #endif
 
 MODULE_LICENSE("GPL");
diff --git a/sound/oss/ite8172.c b/sound/oss/ite8172.c
index ffcb910f5c3e..00ac1c95a429 100644
--- a/sound/oss/ite8172.c
+++ b/sound/oss/ite8172.c
@@ -1968,9 +1968,9 @@ static int i2s_fmt[NR_DEVICE];
 
 static unsigned int devindex;
 
-MODULE_PARM(spdif, "1-" __MODULE_STRING(NR_DEVICE) "i");
+module_param_array(spdif, int, NULL, 0);
 MODULE_PARM_DESC(spdif, "if 1 the S/PDIF digital output is enabled");
-MODULE_PARM(i2s_fmt, "1-" __MODULE_STRING(NR_DEVICE) "i");
+module_param_array(i2s_fmt, int, NULL, 0);
 MODULE_PARM_DESC(i2s_fmt, "the format of I2S");
 
 MODULE_AUTHOR("Monta Vista Software, stevel@mvista.com");
diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c
index dce9016cbcfd..eb5ea32fd1b0 100644
--- a/sound/oss/swarm_cs4297a.c
+++ b/sound/oss/swarm_cs4297a.c
@@ -154,8 +154,8 @@ static void start_adc(struct cs4297a_state *s);
 #if CSDEBUG
 static unsigned long cs_debuglevel = 4;	// levels range from 1-9
 static unsigned long cs_debugmask = CS_INIT /*| CS_IOCTL*/;
-MODULE_PARM(cs_debuglevel, "i");
-MODULE_PARM(cs_debugmask, "i");
+module_param(cs_debuglevel, int, 0);
+module_param(cs_debugmask, int, 0);
 #endif
 #define CS_TRUE 	1
 #define CS_FALSE 	0
diff --git a/sound/oss/waveartist.c b/sound/oss/waveartist.c
index 99d04ad3ca13..afcb524a40eb 100644
--- a/sound/oss/waveartist.c
+++ b/sound/oss/waveartist.c
@@ -2028,8 +2028,8 @@ __setup("waveartist=", setup_waveartist);
 #endif
 
 MODULE_DESCRIPTION("Rockwell WaveArtist RWA-010 sound driver");
-MODULE_PARM(io, "i");		/* IO base */
-MODULE_PARM(irq, "i");		/* IRQ */
-MODULE_PARM(dma, "i");		/* DMA */
-MODULE_PARM(dma2, "i");		/* DMA2 */
+module_param(io, int, 0);		/* IO base */
+module_param(irq, int, 0);		/* IRQ */
+module_param(dma, int, 0);		/* DMA */
+module_param(dma2, int, 0);		/* DMA2 */
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 9871728b756646e0d758a966ba00f2c0ff812817 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 25 Mar 2006 03:07:06 -0800
Subject: [PATCH] kernel/params.c: make param_array() static

param_array() in kernel/params.c can now become static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/moduleparam.h |  7 -------
 kernel/params.c             | 12 ++++++------
 2 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index b5c98c43779e..7c0c2c198f1f 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -162,13 +162,6 @@ extern int param_array_get(char *buffer, struct kernel_param *kp);
 extern int param_set_copystring(const char *val, struct kernel_param *kp);
 extern int param_get_string(char *buffer, struct kernel_param *kp);
 
-int param_array(const char *name,
-		const char *val,
-		unsigned int min, unsigned int max,
-		void *elem, int elemsize,
-		int (*set)(const char *, struct kernel_param *kp),
-		int *num);
-
 /* for exporting parameters in /sys/parameters */
 
 struct module;
diff --git a/kernel/params.c b/kernel/params.c
index a29150582310..9de637a5c8bc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
 }
 
 /* We cheat here and temporarily mangle the string. */
-int param_array(const char *name,
-		const char *val,
-		unsigned int min, unsigned int max,
-		void *elem, int elemsize,
-		int (*set)(const char *, struct kernel_param *kp),
-		int *num)
+static int param_array(const char *name,
+		       const char *val,
+		       unsigned int min, unsigned int max,
+		       void *elem, int elemsize,
+		       int (*set)(const char *, struct kernel_param *kp),
+		       int *num)
 {
 	int ret;
 	struct kernel_param kp;
-- 
cgit v1.2.3


From c32ccd87bfd1414b0aabfcd8dbc7539ad23bcbaa Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Mar 2006 03:07:09 -0800
Subject: [PATCH] inotify: lock avoidance with parent watch status in dentry

Previous inotify work avoidance is good when inotify is completely unused,
but it breaks down if even a single watch is in place anywhere in the
system.  Robin Holt notices that udev is one such culprit - it slows down a
512-thread application on a 512 CPU system from 6 seconds to 22 minutes.

Solve this by adding a flag in the dentry that tells inotify whether or not
its parent inode has a watch on it.  Event queueing to parent will skip
taking locks if this flag is cleared.  Setting and clearing of this flag on
all child dentries versus event delivery: this is no in terms of race
cases, and that was shown to be equivalent to always performing the check.

The essential behaviour is that activity occuring _after_ a watch has been
added and _before_ it has been removed, will generate events.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c              |  8 +++++
 fs/inotify.c             | 87 ++++++++++++++++++++++++++++++++++++++++++------
 include/linux/dcache.h   |  2 ++
 include/linux/fsnotify.h | 19 +++++++++++
 include/linux/inotify.h  | 11 ++++++
 5 files changed, 117 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 139e5fd22fa6..0f7ec12d65ff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -802,6 +802,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 	if (inode)
 		list_add(&entry->d_alias, &inode->i_dentry);
 	entry->d_inode = inode;
+	fsnotify_d_instantiate(entry, inode);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
@@ -853,6 +854,7 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 	list_add(&entry->d_alias, &inode->i_dentry);
 do_negative:
 	entry->d_inode = inode;
+	fsnotify_d_instantiate(entry, inode);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 	return NULL;
@@ -983,6 +985,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
+			fsnotify_d_instantiate(new, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
 			d_rehash(dentry);
@@ -992,6 +995,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			/* d_instantiate takes dcache_lock, so we do it by hand */
 			list_add(&dentry->d_alias, &inode->i_dentry);
 			dentry->d_inode = inode;
+			fsnotify_d_instantiate(dentry, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
@@ -1176,6 +1180,9 @@ void d_delete(struct dentry * dentry)
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (atomic_read(&dentry->d_count) == 1) {
+		/* remove this and other inotify debug checks after 2.6.18 */
+		dentry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
+
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
@@ -1342,6 +1349,7 @@ already_unhashed:
 
 	list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
 	spin_unlock(&target->d_lock);
+	fsnotify_d_move(dentry);
 	spin_unlock(&dentry->d_lock);
 	write_sequnlock(&rename_lock);
 	spin_unlock(&dcache_lock);
diff --git a/fs/inotify.c b/fs/inotify.c
index 0ee39ef591c6..a61e93e17853 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -38,7 +38,6 @@
 #include <asm/ioctls.h>
 
 static atomic_t inotify_cookie;
-static atomic_t inotify_watches;
 
 static kmem_cache_t *watch_cachep;
 static kmem_cache_t *event_cachep;
@@ -380,6 +379,48 @@ static int find_inode(const char __user *dirname, struct nameidata *nd,
 	return error;
 }
 
+/*
+ * inotify_inode_watched - returns nonzero if there are watches on this inode
+ * and zero otherwise.  We call this lockless, we do not care if we race.
+ */
+static inline int inotify_inode_watched(struct inode *inode)
+{
+	return !list_empty(&inode->inotify_watches);
+}
+
+/*
+ * Get child dentry flag into synch with parent inode.
+ * Flag should always be clear for negative dentrys.
+ */
+static void set_dentry_child_flags(struct inode *inode, int watched)
+{
+	struct dentry *alias;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		struct dentry *child;
+
+		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+			if (!child->d_inode) {
+				WARN_ON(child->d_flags & DCACHE_INOTIFY_PARENT_WATCHED);
+				continue;
+			}
+			spin_lock(&child->d_lock);
+			if (watched) {
+				WARN_ON(child->d_flags &
+						DCACHE_INOTIFY_PARENT_WATCHED);
+				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+			} else {
+				WARN_ON(!(child->d_flags &
+					DCACHE_INOTIFY_PARENT_WATCHED));
+				child->d_flags&=~DCACHE_INOTIFY_PARENT_WATCHED;
+			}
+			spin_unlock(&child->d_lock);
+		}
+	}
+	spin_unlock(&dcache_lock);
+}
+
 /*
  * create_watch - creates a watch on the given device.
  *
@@ -426,7 +467,6 @@ static struct inotify_watch *create_watch(struct inotify_device *dev,
 	get_inotify_watch(watch);
 
 	atomic_inc(&dev->user->inotify_watches);
-	atomic_inc(&inotify_watches);
 
 	return watch;
 }
@@ -458,8 +498,10 @@ static void remove_watch_no_event(struct inotify_watch *watch,
 	list_del(&watch->i_list);
 	list_del(&watch->d_list);
 
+	if (!inotify_inode_watched(watch->inode))
+		set_dentry_child_flags(watch->inode, 0);
+
 	atomic_dec(&dev->user->inotify_watches);
-	atomic_dec(&inotify_watches);
 	idr_remove(&dev->idr, watch->wd);
 	put_inotify_watch(watch);
 }
@@ -481,16 +523,39 @@ static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev)
 	remove_watch_no_event(watch, dev);
 }
 
+/* Kernel API */
+
 /*
- * inotify_inode_watched - returns nonzero if there are watches on this inode
- * and zero otherwise.  We call this lockless, we do not care if we race.
+ * inotify_d_instantiate - instantiate dcache entry for inode
  */
-static inline int inotify_inode_watched(struct inode *inode)
+void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
 {
-	return !list_empty(&inode->inotify_watches);
+	struct dentry *parent;
+
+	if (!inode)
+		return;
+
+	WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED);
+	spin_lock(&entry->d_lock);
+	parent = entry->d_parent;
+	if (inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	spin_unlock(&entry->d_lock);
 }
 
-/* Kernel API */
+/*
+ * inotify_d_move - dcache entry has been moved
+ */
+void inotify_d_move(struct dentry *entry)
+{
+	struct dentry *parent;
+
+	parent = entry->d_parent;
+	if (inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	else
+		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
+}
 
 /**
  * inotify_inode_queue_event - queue an event to all watches on this inode
@@ -538,7 +603,7 @@ void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
 	struct dentry *parent;
 	struct inode *inode;
 
-	if (!atomic_read (&inotify_watches))
+	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
 		return;
 
 	spin_lock(&dentry->d_lock);
@@ -993,6 +1058,9 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 		goto out;
 	}
 
+	if (!inotify_inode_watched(inode))
+		set_dentry_child_flags(inode, 1);
+
 	/* Add the watch to the device's and the inode's list */
 	list_add(&watch->d_list, &dev->watches);
 	list_add(&watch->i_list, &inode->inotify_watches);
@@ -1065,7 +1133,6 @@ static int __init inotify_setup(void)
 	inotify_max_user_watches = 8192;
 
 	atomic_set(&inotify_cookie, 0);
-	atomic_set(&inotify_watches, 0);
 
 	watch_cachep = kmem_cache_create("inotify_watch_cache",
 					 sizeof(struct inotify_watch),
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 4361f3789975..d10bd30c337e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -162,6 +162,8 @@ d_iput:		no		no		no       yes
 #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
 #define DCACHE_UNHASHED		0x0010	
 
+#define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
+
 extern spinlock_t dcache_lock;
 
 /**
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 03b8e7932b83..f7e517c1f1bd 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -16,6 +16,25 @@
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
 
+/*
+ * fsnotify_d_instantiate - instantiate a dentry for inode
+ * Called with dcache_lock held.
+ */
+static inline void fsnotify_d_instantiate(struct dentry *entry,
+						struct inode *inode)
+{
+	inotify_d_instantiate(entry, inode);
+}
+
+/*
+ * fsnotify_d_move - entry has been moved
+ * Called with dcache_lock and entry->d_lock held.
+ */
+static inline void fsnotify_d_move(struct dentry *entry)
+{
+	inotify_d_move(entry);
+}
+
 /*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
  */
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 267c88b5f742..09e00433c78e 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -71,6 +71,8 @@ struct inotify_event {
 
 #ifdef CONFIG_INOTIFY
 
+extern void inotify_d_instantiate(struct dentry *, struct inode *);
+extern void inotify_d_move(struct dentry *);
 extern void inotify_inode_queue_event(struct inode *, __u32, __u32,
 				      const char *);
 extern void inotify_dentry_parent_queue_event(struct dentry *, __u32, __u32,
@@ -81,6 +83,15 @@ extern u32 inotify_get_cookie(void);
 
 #else
 
+static inline void inotify_d_instantiate(struct dentry *dentry,
+					struct inode *inode)
+{
+}
+
+static inline void inotify_d_move(struct dentry *dentry)
+{
+}
+
 static inline void inotify_inode_queue_event(struct inode *inode,
 					     __u32 mask, __u32 cookie,
 					     const char *filename)
-- 
cgit v1.2.3


From e6a6784627483381d012b507bb0d49809658a1fa Mon Sep 17 00:00:00 2001
From: Rene Herman <rene.herman@keyaccess.nl>
Date: Sat, 25 Mar 2006 03:07:13 -0800
Subject: [PATCH] parport: move PP_MAJOR from ppdev.h to major.h

Today I wondered about /dev/parport<n> after not seeing anything in
drivers/parport register char-major-99.  Having PP_MAJOR in
include/linux/major.h would've allowed me to more quickly determine that it
was the ppdev driver driving these.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/ppdev.c  | 3 ++-
 include/linux/major.h | 1 +
 include/linux/ppdev.h | 2 --
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c
index 306ee0f091a4..bee6c47b45bd 100644
--- a/drivers/char/ppdev.c
+++ b/drivers/char/ppdev.c
@@ -65,10 +65,11 @@
 #include <linux/parport.h>
 #include <linux/ctype.h>
 #include <linux/poll.h>
-#include <asm/uaccess.h>
+#include <linux/major.h>
 #include <linux/ppdev.h>
 #include <linux/smp_lock.h>
 #include <linux/device.h>
+#include <asm/uaccess.h>
 
 #define PP_VERSION "ppdev: user-space parallel port driver"
 #define CHRDEV "ppdev"
diff --git a/include/linux/major.h b/include/linux/major.h
index e36a46702d94..0a74c52924c9 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -113,6 +113,7 @@
 
 #define UBD_MAJOR		98
 
+#define PP_MAJOR		99
 #define JSFD_MAJOR		99
 
 #define PHONE_MAJOR		100
diff --git a/include/linux/ppdev.h b/include/linux/ppdev.h
index 141c96586824..f376a7598a78 100644
--- a/include/linux/ppdev.h
+++ b/include/linux/ppdev.h
@@ -14,8 +14,6 @@
  * Added PPGETMODES/PPGETMODE/PPGETPHASE, Fred Barnes <frmb2@ukc.ac.uk>, 03/01/2001
  */
 
-#define PP_MAJOR	99
-
 #define PP_IOCTL	'p'
 
 /* Set mode for read/write (e.g. IEEE1284_MODE_EPP) */
-- 
cgit v1.2.3


From cd02b966bfcad12d1b2e265dc8dbc331d4c184c4 Mon Sep 17 00:00:00 2001
From: "Vladimir V. Saveliev" <vs@namesys.com>
Date: Sat, 25 Mar 2006 03:07:15 -0800
Subject: [PATCH] reiserfs: cleanups

Clean up several places where gcc issues warnings when -W is specified.
Thanks to Neil for finding that.

Signed-off-by: Vladimir V. Saveliev <vs@namesys.com>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Hans Reiser <reiser@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/fix_node.c         | 4 +---
 fs/reiserfs/super.c            | 8 ++++----
 include/linux/reiserfs_xattr.h | 6 +++---
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index aa22588019ec..5600d3d60cf7 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -191,9 +191,7 @@ static void create_virtual_node(struct tree_balance *tb, int h)
 					       "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
 					       key, vn->vn_affected_item_num,
 					       vn->vn_mode, M_DELETE);
-			} else
-				/* we can delete directory item, that has only one directory entry in it */
-				;
+			}
 		}
 #endif
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 93e6ef9360e3..cae2abbc0c71 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -685,14 +685,14 @@ static const arg_desc_t logging_mode[] = {
 	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
 	{"writeback", 1 << REISERFS_DATA_WRITEBACK,
 	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
-	{NULL, 0}
+	{.value = NULL}
 };
 
 /* possible values for -o barrier= */
 static const arg_desc_t barrier_mode[] = {
 	{"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
 	{"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
-	{NULL, 0}
+	{.value = NULL}
 };
 
 /* possible values for "-o block-allocator=" and bits which are to be set in
@@ -890,7 +890,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		{"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
 		{"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
 #endif
-		{"nolog",},	/* This is unsupported */
+		{.option_name = "nolog"},
 		{"replayonly",.setmask = 1 << REPLAYONLY},
 		{"block-allocator",.arg_required = 'a',.values = balloc},
 		{"data",.arg_required = 'd',.values = logging_mode},
@@ -908,7 +908,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		{"grpjquota",.arg_required =
 		 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
 		{"jqfmt",.arg_required = 'f',.values = NULL},
-		{NULL,}
+		{.option_name = NULL}
 	};
 
 	*blocks = 0;
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 87280eb6083d..5353afb11db3 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -101,13 +101,13 @@ static inline void reiserfs_mark_inode_private(struct inode *inode)
 #else
 
 #define is_reiserfs_priv_object(inode) 0
-#define reiserfs_mark_inode_private(inode)
+#define reiserfs_mark_inode_private(inode) do {;} while(0)
 #define reiserfs_getxattr NULL
 #define reiserfs_setxattr NULL
 #define reiserfs_listxattr NULL
 #define reiserfs_removexattr NULL
-#define reiserfs_write_lock_xattrs(sb)
-#define reiserfs_write_unlock_xattrs(sb)
+#define reiserfs_write_lock_xattrs(sb) do {;} while(0)
+#define reiserfs_write_unlock_xattrs(sb) do {;} while(0)
 #define reiserfs_read_lock_xattrs(sb)
 #define reiserfs_read_unlock_xattrs(sb)
 
-- 
cgit v1.2.3


From e51c01b08474ea454a965a937fff0407ab6714c7 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Sat, 25 Mar 2006 03:07:17 -0800
Subject: [PATCH] hp300: fix driver_register() return handling, remove
 dio_module_init()

Remove the assumption that driver_register() returns the number of devices
bound to the driver.  In fact, it returns zero for success or a negative
error value.

dio_module_init() used the device count to automatically unregister and
unload drivers that found no devices.  That might have worked at one time,
but has been broken for some time because dio_register_driver() returned
either a negative error or a positive count (never zero).  So it could only
unregister on failure, when it's not needed anyway.

This functionality could be resurrected in individual drivers by counting
devices in their .probe() methods.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Philip Blundell <philb@gnu.org>
Cc: Jochen Friedrich <jochen@scram.de>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/dio/dio-driver.c    |  9 ++-------
 drivers/net/hplance.c       |  2 +-
 drivers/serial/8250_hp300.c | 10 +++++-----
 drivers/video/hpfb.c        |  4 +++-
 include/linux/dio.h         | 32 --------------------------------
 5 files changed, 11 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dio/dio-driver.c b/drivers/dio/dio-driver.c
index ca8e69d2f64d..e4c48e329367 100644
--- a/drivers/dio/dio-driver.c
+++ b/drivers/dio/dio-driver.c
@@ -71,22 +71,17 @@ static int dio_device_probe(struct device *dev)
 	 *  @drv: the driver structure to register
 	 *
 	 *  Adds the driver structure to the list of registered drivers
-	 *  Returns the number of DIO devices which were claimed by the driver
-	 *  during registration.  The driver remains registered even if the
-	 *  return value is zero.
+	 *  Returns zero or a negative error value.
 	 */
 
 int dio_register_driver(struct dio_driver *drv)
 {
-	int count = 0;
-
 	/* initialize common driver fields */
 	drv->driver.name = drv->name;
 	drv->driver.bus = &dio_bus_type;
 
 	/* register with core */
-	count = driver_register(&drv->driver);
-	return count ? count : 1;
+	return driver_register(&drv->driver);
 }
 
 
diff --git a/drivers/net/hplance.c b/drivers/net/hplance.c
index d8410634bcaf..685693464605 100644
--- a/drivers/net/hplance.c
+++ b/drivers/net/hplance.c
@@ -217,7 +217,7 @@ static int hplance_close(struct net_device *dev)
 
 int __init hplance_init_module(void)
 {
-	return dio_module_init(&hplance_driver);
+	return dio_register_driver(&hplance_driver);
 }
 
 void __exit hplance_cleanup_module(void)
diff --git a/drivers/serial/8250_hp300.c b/drivers/serial/8250_hp300.c
index 4315afe9c080..53e81a44c1a3 100644
--- a/drivers/serial/8250_hp300.c
+++ b/drivers/serial/8250_hp300.c
@@ -55,6 +55,8 @@ static struct dio_driver hpdca_driver = {
 
 #endif
 
+static unsigned int num_ports;
+
 extern int hp300_uart_scode;
 
 /* Offset to UART registers from base of DCA */
@@ -199,6 +201,8 @@ static int __devinit hpdca_init_one(struct dio_dev *d,
 	out_8(d->resource.start + DIO_VIRADDRBASE + DCA_ID, 0xff);
 	udelay(100);
 
+	num_ports++;
+
 	return 0;
 }
 #endif
@@ -206,7 +210,6 @@ static int __devinit hpdca_init_one(struct dio_dev *d,
 static int __init hp300_8250_init(void)
 {
 	static int called = 0;
-	int num_ports;
 #ifdef CONFIG_HPAPCI
 	int line;
 	unsigned long base;
@@ -221,11 +224,8 @@ static int __init hp300_8250_init(void)
 	if (!MACH_IS_HP300)
 		return -ENODEV;
 
-	num_ports = 0;
-
 #ifdef CONFIG_HPDCA
-	if (dio_module_init(&hpdca_driver) == 0)
-		num_ports++;
+	dio_register_driver(&hpdca_driver);
 #endif
 #ifdef CONFIG_HPAPCI
 	if (hp300_model < HP_400) {
diff --git a/drivers/video/hpfb.c b/drivers/video/hpfb.c
index bebdac59d231..abd920a663a0 100644
--- a/drivers/video/hpfb.c
+++ b/drivers/video/hpfb.c
@@ -386,7 +386,9 @@ int __init hpfb_init(void)
 	if (fb_get_options("hpfb", NULL))
 		return -ENODEV;
 
-	dio_module_init(&hpfb_driver);
+	err = dio_register_driver(&hpfb_driver);
+	if (err)
+		return err;
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
diff --git a/include/linux/dio.h b/include/linux/dio.h
index fae9395fcf4f..1e65ebc2a3db 100644
--- a/include/linux/dio.h
+++ b/include/linux/dio.h
@@ -276,37 +276,5 @@ static inline void dio_set_drvdata (struct dio_dev *d, void *data)
 	dev_set_drvdata(&d->dev, data);
 }
 
-/*
- * A helper function which helps ensure correct dio_driver
- * setup and cleanup for commonly-encountered hotplug/modular cases
- *
- * This MUST stay in a header, as it checks for -DMODULE
- */
-static inline int dio_module_init(struct dio_driver *drv)
-{
-	int rc = dio_register_driver(drv);
-
-	if (rc > 0)
-		return 0;
-
-	/* iff CONFIG_HOTPLUG and built into kernel, we should
-	 * leave the driver around for future hotplug events.
-	 * For the module case, a hotplug daemon of some sort
-	 * should load a module in response to an insert event. */
-#if defined(CONFIG_HOTPLUG) && !defined(MODULE)
-	if (rc == 0)
-		return 0;
-#else
-	if (rc == 0)
-		rc = -ENODEV;
-#endif
-
-	/* if we get here, we need to clean up DIO driver instance
-	 * and return some sort of error */
-	dio_unregister_driver(drv);
-
-	return rc;
-}
-
 #endif /* __KERNEL__ */
 #endif /* ndef _LINUX_DIO_H */
-- 
cgit v1.2.3


From 33d8675ea66e79d21da3ed64ce88dfb2a18bc6a7 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Sat, 25 Mar 2006 03:07:20 -0800
Subject: [PATCH] amiga: fix driver_register() return handling, remove
 zorro_module_init()

Remove the assumption that driver_register() returns the number of devices
bound to the driver.  In fact, it returns zero for success or a negative
error value.

zorro_module_init() used the device count to automatically unregister and
unload drivers that found no devices.  That might have worked at one time,
but has been broken for some time because zorro_register_driver() returned
either a negative error or a positive count (never zero).  So it could only
unregister on failure, when it's not needed anyway.

This functionality could be resurrected in individual drivers by counting
devices in their .probe() methods.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/net/a2065.c          |  2 +-
 drivers/net/ariadne.c        |  2 +-
 drivers/net/hydra.c          |  2 +-
 drivers/net/zorro8390.c      |  2 +-
 drivers/video/cirrusfb.c     |  2 +-
 drivers/zorro/zorro-driver.c |  9 ++-------
 include/linux/zorro.h        | 33 ---------------------------------
 7 files changed, 7 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c
index 8e538a6d7d97..79bb56b8dcef 100644
--- a/drivers/net/a2065.c
+++ b/drivers/net/a2065.c
@@ -829,7 +829,7 @@ static void __devexit a2065_remove_one(struct zorro_dev *z)
 
 static int __init a2065_init_module(void)
 {
-	return zorro_module_init(&a2065_driver);
+	return zorro_register_driver(&a2065_driver);
 }
 
 static void __exit a2065_cleanup_module(void)
diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c
index 9fe93acfc8ef..d1b6b1f794e2 100644
--- a/drivers/net/ariadne.c
+++ b/drivers/net/ariadne.c
@@ -864,7 +864,7 @@ static void __devexit ariadne_remove_one(struct zorro_dev *z)
 
 static int __init ariadne_init_module(void)
 {
-    return zorro_module_init(&ariadne_driver);
+    return zorro_register_driver(&ariadne_driver);
 }
 
 static void __exit ariadne_cleanup_module(void)
diff --git a/drivers/net/hydra.c b/drivers/net/hydra.c
index 6e0ca7340a8f..d9fb8e74e631 100644
--- a/drivers/net/hydra.c
+++ b/drivers/net/hydra.c
@@ -242,7 +242,7 @@ static void __devexit hydra_remove_one(struct zorro_dev *z)
 
 static int __init hydra_init_module(void)
 {
-    return zorro_module_init(&hydra_driver);
+    return zorro_register_driver(&hydra_driver);
 }
 
 static void __exit hydra_cleanup_module(void)
diff --git a/drivers/net/zorro8390.c b/drivers/net/zorro8390.c
index 761021603597..8037e5806d0a 100644
--- a/drivers/net/zorro8390.c
+++ b/drivers/net/zorro8390.c
@@ -426,7 +426,7 @@ static void __devexit zorro8390_remove_one(struct zorro_dev *z)
 
 static int __init zorro8390_init_module(void)
 {
-    return zorro_module_init(&zorro8390_driver);
+    return zorro_register_driver(&zorro8390_driver);
 }
 
 static void __exit zorro8390_cleanup_module(void)
diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index e0dbdfc0c8b4..66d6f2f0a219 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -2622,7 +2622,7 @@ static int __init cirrusfb_init(void)
 #endif
 
 #ifdef CONFIG_ZORRO
-	error |= zorro_module_init(&cirrusfb_zorro_driver);
+	error |= zorro_register_driver(&cirrusfb_zorro_driver);
 #endif
 #ifdef CONFIG_PCI
 	error |= pci_register_driver(&cirrusfb_pci_driver);
diff --git a/drivers/zorro/zorro-driver.c b/drivers/zorro/zorro-driver.c
index fcbee748c592..067c07be928c 100644
--- a/drivers/zorro/zorro-driver.c
+++ b/drivers/zorro/zorro-driver.c
@@ -65,22 +65,17 @@ static int zorro_device_probe(struct device *dev)
      *  @drv: the driver structure to register
      *
      *  Adds the driver structure to the list of registered drivers
-     *  Returns the number of Zorro devices which were claimed by the driver
-     *  during registration.  The driver remains registered even if the
-     *  return value is zero.
+     *  Returns zero or a negative error value.
      */
 
 int zorro_register_driver(struct zorro_driver *drv)
 {
-	int count = 0;
-
 	/* initialize common driver fields */
 	drv->driver.name = drv->name;
 	drv->driver.bus = &zorro_bus_type;
 
 	/* register with core */
-	count = driver_register(&drv->driver);
-	return count ? count : 1;
+	return driver_register(&drv->driver);
 }
 
 
diff --git a/include/linux/zorro.h b/include/linux/zorro.h
index ba5b72768bbe..2f135cf6eef1 100644
--- a/include/linux/zorro.h
+++ b/include/linux/zorro.h
@@ -271,39 +271,6 @@ static inline void zorro_set_drvdata (struct zorro_dev *z, void *data)
 }
 
 
-/*
- * A helper function which helps ensure correct zorro_driver
- * setup and cleanup for commonly-encountered hotplug/modular cases
- *
- * This MUST stay in a header, as it checks for -DMODULE
- */
-static inline int zorro_module_init(struct zorro_driver *drv)
-{
-	int rc = zorro_register_driver(drv);
-
-	if (rc > 0)
-		return 0;
-
-	/* iff CONFIG_HOTPLUG and built into kernel, we should
-	 * leave the driver around for future hotplug events.
-	 * For the module case, a hotplug daemon of some sort
-	 * should load a module in response to an insert event. */
-#if defined(CONFIG_HOTPLUG) && !defined(MODULE)
-	if (rc == 0)
-		return 0;
-#else
-	if (rc == 0)
-		rc = -ENODEV;
-#endif
-
-	/* if we get here, we need to clean up Zorro driver instance
-	 * and return some sort of error */
-	zorro_unregister_driver(drv);
-
-	return rc;
-}
-
-
     /*
      *  Bitmask indicating portions of available Zorro II RAM that are unused
      *  by the system. Every bit represents a 64K chunk, for a maximum of 8MB
-- 
cgit v1.2.3


From c777ac5594f772ac760e02c3ac71d067616b579d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:07:36 -0800
Subject: [PATCH] irq: uninline migration functions

Uninline some massive IRQ migration functions.  Put them in the new
kernel/irq/migration.c.

Cc: Andi Kleen <ak@muc.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h    | 49 ++---------------------------------------------
 kernel/irq/Makefile    |  3 +--
 kernel/irq/migration.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 49 deletions(-)
 create mode 100644 kernel/irq/migration.c

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 6c5d4c898ccb..ee2a82a572f7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -114,53 +114,8 @@ static inline void set_native_irq_info(int irq, cpumask_t mask)
 #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
 extern cpumask_t pending_irq_cpumask[NR_IRQS];
 
-static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
-{
-	irq_desc_t *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&desc->lock, flags);
-	desc->move_irq = 1;
-	pending_irq_cpumask[irq] = mask;
-	spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-static inline void
-move_native_irq(int irq)
-{
-	cpumask_t tmp;
-	irq_desc_t *desc = irq_descp(irq);
-
-	if (likely (!desc->move_irq))
-		return;
-
-	desc->move_irq = 0;
-
-	if (likely(cpus_empty(pending_irq_cpumask[irq])))
-		return;
-
-	if (!desc->handler->set_affinity)
-		return;
-
-	/* note - we hold the desc->lock */
-	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
-
-	/*
-	 * If there was a valid mask to work with, please
-	 * do the disable, re-program, enable sequence.
-	 * This is *not* particularly important for level triggered
-	 * but in a edge trigger case, we might be setting rte
-	 * when an active trigger is comming in. This could
-	 * cause some ioapics to mal-function.
-	 * Being paranoid i guess!
-	 */
-	if (unlikely(!cpus_empty(tmp))) {
-		desc->handler->disable(irq);
-		desc->handler->set_affinity(irq,tmp);
-		desc->handler->enable(irq);
-	}
-	cpus_clear(pending_irq_cpumask[irq]);
-}
+void set_pending_irq(unsigned int irq, cpumask_t mask);
+void move_native_irq(int irq);
 
 #ifdef CONFIG_PCI_MSI
 /*
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 49378738ff5e..2b33f852be3e 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,4 @@
 
-obj-y := handle.o manage.o spurious.o
+obj-y := handle.o manage.o spurious.o migration.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
-
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 000000000000..6bdd03c524c7
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,52 @@
+#include <linux/irq.h>
+
+#if defined(CONFIG_GENERIC_PENDING_IRQ)
+
+void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->move_irq = 1;
+	pending_irq_cpumask[irq] = mask;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void move_native_irq(int irq)
+{
+	cpumask_t tmp;
+	irq_desc_t *desc = irq_descp(irq);
+
+	if (likely (!desc->move_irq))
+		return;
+
+	desc->move_irq = 0;
+
+	if (likely(cpus_empty(pending_irq_cpumask[irq])))
+		return;
+
+	if (!desc->handler->set_affinity)
+		return;
+
+	/* note - we hold the desc->lock */
+	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+
+	/*
+	 * If there was a valid mask to work with, please
+	 * do the disable, re-program, enable sequence.
+	 * This is *not* particularly important for level triggered
+	 * but in a edge trigger case, we might be setting rte
+	 * when an active trigger is comming in. This could
+	 * cause some ioapics to mal-function.
+	 * Being paranoid i guess!
+	 */
+	if (unlikely(!cpus_empty(tmp))) {
+		desc->handler->disable(irq);
+		desc->handler->set_affinity(irq,tmp);
+		desc->handler->enable(irq);
+	}
+	cpus_clear(pending_irq_cpumask[irq]);
+}
+
+#endif
-- 
cgit v1.2.3


From 77d47582c2345e071df02afaf9191641009287c4 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 25 Mar 2006 03:07:39 -0800
Subject: [PATCH] add a proper prototype for setup_arch()

This patch adds a proper prototype for setup_arch() in init.h.

This patch is based on a patch by Ben Dooks <ben-linux@fluff.org>.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/init.h | 4 ++++
 init/main.c          | 2 --
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index ff8d8b8632f4..ed0ac7c39fdc 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -69,6 +69,10 @@ extern initcall_t __security_initcall_start[], __security_initcall_end[];
 
 /* Defined in init/main.c */
 extern char saved_command_line[];
+
+/* used by init/main.c */
+extern void setup_arch(char **);
+
 #endif
   
 #ifndef MODULE
diff --git a/init/main.c b/init/main.c
index a596cb8ac982..006dcd547dc2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -306,8 +306,6 @@ static int __init rdinit_setup(char *str)
 }
 __setup("rdinit=", rdinit_setup);
 
-extern void setup_arch(char **);
-
 #ifndef CONFIG_SMP
 
 #ifdef CONFIG_X86_LOCAL_APIC
-- 
cgit v1.2.3


From 12b5989be10011387a9da5dee82e5c0d6f9d02e7 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Sat, 25 Mar 2006 03:07:41 -0800
Subject: [PATCH] refactor capable() to one implementation, add __capable()
 helper

Move capable() to kernel/capability.c and eliminate duplicate
implementations.  Add __capable() function which can be used to check for
capabiilty of any process.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/capability.h |  3 ++-
 include/linux/security.h   | 22 ++++++++++++++++------
 kernel/capability.c        | 16 ++++++++++++++++
 kernel/sys.c               | 12 ------------
 security/security.c        | 23 -----------------------
 5 files changed, 34 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 5a23ce752629..6548b35ab9f6 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -357,7 +357,8 @@ static inline kernel_cap_t cap_invert(kernel_cap_t c)
 
 #define cap_is_fs_cap(c)     (CAP_TO_MASK(c) & CAP_FS_MASK)
 
-extern int capable(int cap);
+int capable(int cap);
+int __capable(struct task_struct *t, int cap);
 
 #endif /* __KERNEL__ */
 
diff --git a/include/linux/security.h b/include/linux/security.h
index b18eb8cfa639..3c19be35124b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1040,6 +1040,11 @@ struct swap_info_struct;
  *	@effective contains the effective capability set.
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
+ * @capable:
+ *	Check whether the @tsk process has the @cap capability.
+ *	@tsk contains the task_struct for the process.
+ *	@cap contains the capability <include/linux/capability.h>.
+ *	Return 0 if the capability is granted for @tsk.
  * @acct:
  *	Check permission before enabling or disabling process accounting.  If
  *	accounting is being enabled, then @file refers to the open file used to
@@ -1053,11 +1058,6 @@ struct swap_info_struct;
  *	@table contains the ctl_table structure for the sysctl variable.
  *	@op contains the operation (001 = search, 002 = write, 004 = read).
  *	Return 0 if permission is granted.
- * @capable:
- *	Check whether the @tsk process has the @cap capability.
- *	@tsk contains the task_struct for the process.
- *	@cap contains the capability <include/linux/capability.h>.
- *	Return 0 if the capability is granted for @tsk.
  * @syslog:
  *	Check permission before accessing the kernel message ring or changing
  *	logging to the console.
@@ -1099,9 +1099,9 @@ struct security_operations {
 			    kernel_cap_t * effective,
 			    kernel_cap_t * inheritable,
 			    kernel_cap_t * permitted);
+	int (*capable) (struct task_struct * tsk, int cap);
 	int (*acct) (struct file * file);
 	int (*sysctl) (struct ctl_table * table, int op);
-	int (*capable) (struct task_struct * tsk, int cap);
 	int (*quotactl) (int cmds, int type, int id, struct super_block * sb);
 	int (*quota_on) (struct dentry * dentry);
 	int (*syslog) (int type);
@@ -1347,6 +1347,11 @@ static inline void security_capset_set (struct task_struct *target,
 	security_ops->capset_set (target, effective, inheritable, permitted);
 }
 
+static inline int security_capable(struct task_struct *tsk, int cap)
+{
+	return security_ops->capable(tsk, cap);
+}
+
 static inline int security_acct (struct file *file)
 {
 	return security_ops->acct (file);
@@ -2050,6 +2055,11 @@ static inline void security_capset_set (struct task_struct *target,
 	cap_capset_set (target, effective, inheritable, permitted);
 }
 
+static inline int security_capable(struct task_struct *tsk, int cap)
+{
+	return cap_capable(tsk, cap);
+}
+
 static inline int security_acct (struct file *file)
 {
 	return 0;
diff --git a/kernel/capability.c b/kernel/capability.c
index bfa3c92e16f2..1a4d8a40d3f9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -233,3 +233,19 @@ out:
 
      return ret;
 }
+
+int __capable(struct task_struct *t, int cap)
+{
+	if (security_capable(t, cap) == 0) {
+		t->flags |= PF_SUPERPRIV;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__capable);
+
+int capable(int cap)
+{
+	return __capable(current, cap);
+}
+EXPORT_SYMBOL(capable);
diff --git a/kernel/sys.c b/kernel/sys.c
index 19d058be49d4..421009cedb51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -224,18 +224,6 @@ int unregister_reboot_notifier(struct notifier_block * nb)
 
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
-#ifndef CONFIG_SECURITY
-int capable(int cap)
-{
-        if (cap_raised(current->cap_effective, cap)) {
-	       current->flags |= PF_SUPERPRIV;
-	       return 1;
-        }
-        return 0;
-}
-EXPORT_SYMBOL(capable);
-#endif
-
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
 	int no_nice;
diff --git a/security/security.c b/security/security.c
index f693e1f66b98..51ef509710b9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -174,31 +174,8 @@ int mod_unreg_security(const char *name, struct security_operations *ops)
 	return security_ops->unregister_security(name, ops);
 }
 
-/**
- * capable - calls the currently loaded security module's capable() function with the specified capability
- * @cap: the requested capability level.
- *
- * This function calls the currently loaded security module's capable()
- * function with a pointer to the current task and the specified @cap value.
- *
- * This allows the security module to implement the capable function call
- * however it chooses to.
- */
-int capable(int cap)
-{
-	if (security_ops->capable(current, cap)) {
-		/* capability denied */
-		return 0;
-	}
-
-	/* capability granted */
-	current->flags |= PF_SUPERPRIV;
-	return 1;
-}
-
 EXPORT_SYMBOL_GPL(register_security);
 EXPORT_SYMBOL_GPL(unregister_security);
 EXPORT_SYMBOL_GPL(mod_reg_security);
 EXPORT_SYMBOL_GPL(mod_unreg_security);
-EXPORT_SYMBOL(capable);
 EXPORT_SYMBOL(security_ops);
-- 
cgit v1.2.3


From 962749af67b145c57917bfbff3c303ebd7d5988c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:08:01 -0800
Subject: [PATCH] roundup_pow_of_two() 64-bit fix

fls() takes an integer, so roundup_pow_of_two() is busted for ulongs larger
than 2^32-1.

Fix this by implementing and using fls_long().

(Why does roundup_pow_of_two() return a long?)

(Why is roundup_pow_of_two() __attribute_const__ whereas long_log2() is
__attribute_pure__?)

(Why does long_log2() suck so much?  Because we were missing fls_long()?)

Cc: Roland Dreier <rdreier@cisco.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bitops.h | 7 +++++++
 include/linux/kernel.h | 5 +++--
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 208650b1ad3a..f17525a963d1 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -175,4 +175,11 @@ static inline __u32 ror32(__u32 word, unsigned int shift)
 	return (word >> shift) | (word << (32 - shift));
 }
 
+static inline unsigned fls_long(unsigned long l)
+{
+	if (sizeof(l) == 4)
+		return fls(l);
+	return fls64(l);
+}
+
 #endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index bb6e7ddee2fd..03d6cfaa5b8a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -154,9 +154,10 @@ static inline int __attribute_pure__ long_log2(unsigned long x)
 	return r;
 }
 
-static inline unsigned long __attribute_const__ roundup_pow_of_two(unsigned long x)
+static inline unsigned long
+__attribute_const__ roundup_pow_of_two(unsigned long x)
 {
-	return (1UL << fls(x - 1));
+	return 1UL << fls_long(x - 1);
 }
 
 extern int printk_ratelimit(void);
-- 
cgit v1.2.3


From daff89f324755f87a060d5125a205c0755811ea9 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Sat, 25 Mar 2006 03:08:05 -0800
Subject: [PATCH] radix-tree documentation cleanups

Documentation changes to help radix tree users avoid overrunning the tags
array.  RADIX_TREE_TAGS moves to linux/radix-tree.h and is now known as
RADIX_TREE_MAX_TAGS (Nick Piggin's idea).  Tag parameters are changed to
unsigned, and some comments are updated.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/radix-tree.h | 13 +++++++-----
 lib/radix-tree.c           | 49 +++++++++++++++++++++++++---------------------
 2 files changed, 35 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c57ff2fcb30a..dd83cca28001 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -45,6 +45,8 @@ do {									\
 	(root)->rnode = NULL;						\
 } while (0)
 
+#define RADIX_TREE_MAX_TAGS 2
+
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
@@ -55,15 +57,16 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 int radix_tree_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, int tag);
+			unsigned long index, unsigned int tag);
 void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, int tag);
+			unsigned long index, unsigned int tag);
 int radix_tree_tag_get(struct radix_tree_root *root,
-			unsigned long index, int tag);
+			unsigned long index, unsigned int tag);
 unsigned int
 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items, int tag);
-int radix_tree_tagged(struct radix_tree_root *root, int tag);
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag);
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
 {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 1e5b17dc7e3d..7097bb239e40 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -37,7 +37,6 @@
 #else
 #define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
 #endif
-#define RADIX_TREE_TAGS		2
 
 #define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
 #define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
@@ -48,7 +47,7 @@
 struct radix_tree_node {
 	unsigned int	count;
 	void		*slots[RADIX_TREE_MAP_SIZE];
-	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
+	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
 struct radix_tree_path {
@@ -135,17 +134,20 @@ out:
 	return ret;
 }
 
-static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+		int offset)
 {
 	__set_bit(offset, node->tags[tag]);
 }
 
-static inline void tag_clear(struct radix_tree_node *node, int tag, int offset)
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+		int offset)
 {
 	__clear_bit(offset, node->tags[tag]);
 }
 
-static inline int tag_get(struct radix_tree_node *node, int tag, int offset)
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+		int offset)
 {
 	return test_bit(offset, node->tags[tag]);
 }
@@ -154,7 +156,7 @@ static inline int tag_get(struct radix_tree_node *node, int tag, int offset)
  * Returns 1 if any slot in the node has this tag set.
  * Otherwise returns 0.
  */
-static inline int any_tag_set(struct radix_tree_node *node, int tag)
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
 {
 	int idx;
 	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
@@ -180,7 +182,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_node *node;
 	unsigned int height;
-	char tags[RADIX_TREE_TAGS];
+	char tags[RADIX_TREE_MAX_TAGS];
 	int tag;
 
 	/* Figure out what the height should be.  */
@@ -197,7 +199,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 	 * Prepare the tag status of the top-level node for propagation
 	 * into the newly-pushed top-level node(s)
 	 */
-	for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 		tags[tag] = 0;
 		if (any_tag_set(root->rnode, tag))
 			tags[tag] = 1;
@@ -211,7 +213,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 		node->slots[0] = root->rnode;
 
 		/* Propagate the aggregated tag info into the new root */
-		for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 			if (tags[tag])
 				tag_set(node, tag, 0);
 		}
@@ -349,14 +351,15 @@ EXPORT_SYMBOL(radix_tree_lookup);
  *	@index:		index key
  *	@tag: 		tag index
  *
- *	Set the search tag corresponging to @index in the radix tree.  From
+ *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *	corresponding to @index in the radix tree.  From
  *	the root all the way down to the leaf node.
  *
  *	Returns the address of the tagged item.   Setting a tag on a not-present
  *	item is a bug.
  */
 void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, int tag)
+			unsigned long index, unsigned int tag)
 {
 	unsigned int height, shift;
 	struct radix_tree_node *slot;
@@ -390,7 +393,8 @@ EXPORT_SYMBOL(radix_tree_tag_set);
  *	@index:		index key
  *	@tag: 		tag index
  *
- *	Clear the search tag corresponging to @index in the radix tree.  If
+ *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *	corresponding to @index in the radix tree.  If
  *	this causes the leaf node to have no tags set then clear the tag in the
  *	next-to-leaf node, etc.
  *
@@ -398,7 +402,7 @@ EXPORT_SYMBOL(radix_tree_tag_set);
  *	has the same return value and semantics as radix_tree_lookup().
  */
 void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, int tag)
+			unsigned long index, unsigned int tag)
 {
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
 	struct radix_tree_node *slot;
@@ -450,7 +454,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
  * radix_tree_tag_get - get a tag on a radix tree node
  * @root:		radix tree root
  * @index:		index key
- * @tag: 		tag index
+ * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
  *
  * Return values:
  *
@@ -459,7 +463,7 @@ EXPORT_SYMBOL(radix_tree_tag_clear);
  * -1: tag present, unset
  */
 int radix_tree_tag_get(struct radix_tree_root *root,
-			unsigned long index, int tag)
+			unsigned long index, unsigned int tag)
 {
 	unsigned int height, shift;
 	struct radix_tree_node *slot;
@@ -592,7 +596,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
  */
 static unsigned int
 __lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index, int tag)
+	unsigned int max_items, unsigned long *next_index, unsigned int tag)
 {
 	unsigned int nr_found = 0;
 	unsigned int shift;
@@ -646,7 +650,7 @@ out:
  *	@results:	where the results of the lookup are placed
  *	@first_index:	start the lookup from this key
  *	@max_items:	place up to this many items at *results
- *	@tag:		the tag index
+ *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
  *
  *	Performs an index-ascending scan of the tree for present items which
  *	have the tag indexed by @tag set.  Places the items at *@results and
@@ -654,7 +658,8 @@ out:
  */
 unsigned int
 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items, int tag)
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag)
 {
 	const unsigned long max_index = radix_tree_maxindex(root->height);
 	unsigned long cur_index = first_index;
@@ -716,7 +721,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 	struct radix_tree_node *slot;
 	unsigned int height, shift;
 	void *ret = NULL;
-	char tags[RADIX_TREE_TAGS];
+	char tags[RADIX_TREE_MAX_TAGS];
 	int nr_cleared_tags;
 	int tag;
 	int offset;
@@ -751,7 +756,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 	 * Clear all tags associated with the just-deleted item
 	 */
 	nr_cleared_tags = 0;
-	for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 		tags[tag] = 1;
 		if (tag_get(pathp->node, tag, pathp->offset)) {
 			tag_clear(pathp->node, tag, pathp->offset);
@@ -763,7 +768,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 	}
 
 	for (pathp--; nr_cleared_tags && pathp->node; pathp--) {
-		for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 			if (tags[tag])
 				continue;
 
@@ -801,7 +806,7 @@ EXPORT_SYMBOL(radix_tree_delete);
  *	@root:		radix tree root
  *	@tag:		tag to test
  */
-int radix_tree_tagged(struct radix_tree_root *root, int tag)
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 {
   	struct radix_tree_node *rnode;
   	rnode = root->rnode;
-- 
cgit v1.2.3


From ccb46000f4bb459777686611157ac0eac928704e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:08:08 -0800
Subject: [PATCH] cpumask: uninline first_cpu()

           text    data     bss     dec     hex filename
before: 3490577 1322408  360000 5172985  4eeef9 vmlinux
after:  3488027 1322496  360128 5170651  4ee5db vmlinux

Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpumask.h | 11 ++++++-----
 lib/Makefile            |  2 ++
 lib/cpumask.c           | 11 +++++++++++
 3 files changed, 19 insertions(+), 5 deletions(-)
 create mode 100644 lib/cpumask.c

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 60e56c6e03dd..9b702fd24a72 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -212,11 +212,12 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
 }
 
-#define first_cpu(src) __first_cpu(&(src), NR_CPUS)
-static inline int __first_cpu(const cpumask_t *srcp, int nbits)
-{
-	return min_t(int, nbits, find_first_bit(srcp->bits, nbits));
-}
+#ifdef CONFIG_SMP
+int __first_cpu(const cpumask_t *srcp);
+#define first_cpu(src) __first_cpu(&(src))
+#else
+#define first_cpu(src)	0
+#endif
 
 #define next_cpu(n, src) __next_cpu((n), &(src), NR_CPUS)
 static inline int __next_cpu(int n, const cpumask_t *srcp, int nbits)
diff --git a/lib/Makefile b/lib/Makefile
index 648b2c1242fd..f827e3c24ec0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,8 @@ lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
 	 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o
 
+lib-$(CONFIG_SMP) += cpumask.o
+
 lib-y	+= kobject.o kref.o kobject_uevent.o klist.o
 
 obj-y += sort.o parser.o halfmd4.o iomap_copy.o
diff --git a/lib/cpumask.c b/lib/cpumask.c
new file mode 100644
index 000000000000..1560d97390dd
--- /dev/null
+++ b/lib/cpumask.c
@@ -0,0 +1,11 @@
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+
+int __first_cpu(const cpumask_t *srcp)
+{
+	return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS));
+}
+EXPORT_SYMBOL(__first_cpu);
+
-- 
cgit v1.2.3


From 3d18bd74a22d0bed3bc81fc64c4ba6344a10f155 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:08:09 -0800
Subject: [PATCH] cpumask: uninline next_cpu()

           text    data     bss     dec     hex filename
before: 3488027 1322496  360128 5170651  4ee5db vmlinux
after:  3485112 1322480  359968 5167560  4ed9c8 vmlinux

2931 bytes saved

Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpumask.h | 11 ++++-------
 lib/cpumask.c           |  5 +++++
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 9b702fd24a72..4b29e508a0b6 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -215,16 +215,13 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 #ifdef CONFIG_SMP
 int __first_cpu(const cpumask_t *srcp);
 #define first_cpu(src) __first_cpu(&(src))
+int __next_cpu(int n, const cpumask_t *srcp);
+#define next_cpu(n, src) __next_cpu((n), &(src))
 #else
-#define first_cpu(src)	0
+#define first_cpu(src)		0
+#define next_cpu(n, src)	1
 #endif
 
-#define next_cpu(n, src) __next_cpu((n), &(src), NR_CPUS)
-static inline int __next_cpu(int n, const cpumask_t *srcp, int nbits)
-{
-	return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1));
-}
-
 #define cpumask_of_cpu(cpu)						\
 ({									\
 	typeof(_unused_cpumask_arg_) m;					\
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 1560d97390dd..ba2f8543052c 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -9,3 +9,8 @@ int __first_cpu(const cpumask_t *srcp)
 }
 EXPORT_SYMBOL(__first_cpu);
 
+int __next_cpu(int n, const cpumask_t *srcp)
+{
+	return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1));
+}
+EXPORT_SYMBOL(__next_cpu);
-- 
cgit v1.2.3


From 8630282070b4a52b12cfa514ba8558e2f3d56360 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:08:09 -0800
Subject: [PATCH] cpumask: uninline highest_possible_processor_id()

Shrinks the only caller (net/bridge/netfilter/ebtables.c) by 174 bytes.

Also, optimise highest_possible_processor_id() out of existence on
CONFIG_SMP=n.

Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpumask.h | 15 ++++++---------
 lib/cpumask.c           | 17 +++++++++++++++++
 2 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 4b29e508a0b6..f770039344c5 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -396,6 +396,12 @@ extern cpumask_t cpu_present_map;
 #define cpu_present(cpu)	((cpu) == 0)
 #endif
 
+#ifdef CONFIG_SMP
+int highest_possible_processor_id(void);
+#else
+#define highest_possible_processor_id()	0
+#endif
+
 #define any_online_cpu(mask)			\
 ({						\
 	int cpu;				\
@@ -409,14 +415,5 @@ extern cpumask_t cpu_present_map;
 #define for_each_online_cpu(cpu)  for_each_cpu_mask((cpu), cpu_online_map)
 #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
 
-/* Find the highest possible smp_processor_id() */
-#define highest_possible_processor_id() \
-({ \
-	unsigned int cpu, highest = 0; \
-	for_each_cpu_mask(cpu, cpu_possible_map) \
-		highest = cpu; \
-	highest; \
-})
-
 
 #endif /* __LINUX_CPUMASK_H */
diff --git a/lib/cpumask.c b/lib/cpumask.c
index ba2f8543052c..ea25a034276c 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -14,3 +14,20 @@ int __next_cpu(int n, const cpumask_t *srcp)
 	return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1));
 }
 EXPORT_SYMBOL(__next_cpu);
+
+/*
+ * Find the highest possible smp_processor_id()
+ *
+ * Note: if we're prepared to assume that cpu_possible_map never changes
+ * (reasonable) then this function should cache its return value.
+ */
+int highest_possible_processor_id(void)
+{
+	unsigned int cpu;
+	unsigned highest = 0;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		highest = cpu;
+	return highest;
+}
+EXPORT_SYMBOL(highest_possible_processor_id);
-- 
cgit v1.2.3


From 96a9b4d31eba4722ba7aad2cc15118a7799f499f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:08:10 -0800
Subject: [PATCH] cpumask: uninline any_online_cpu()

           text    data     bss     dec     hex filename
before: 3605597 1363528  363328 5332453  515de5 vmlinux
after:  3605295 1363612  363200 5332107  515c8b vmlinux

218 bytes saved.

Also, optimise any_online_cpu() out of existence on CONFIG_SMP=n.

This function seems inefficient.  Can't we simply AND the two masks, then use
find_first_bit()?

Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpumask.h | 13 +++----------
 lib/cpumask.c           | 12 ++++++++++++
 2 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index f770039344c5..99e6115d8e52 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -398,22 +398,15 @@ extern cpumask_t cpu_present_map;
 
 #ifdef CONFIG_SMP
 int highest_possible_processor_id(void);
+#define any_online_cpu(mask) __any_online_cpu(&(mask))
+int __any_online_cpu(const cpumask_t *mask);
 #else
 #define highest_possible_processor_id()	0
+#define any_online_cpu(mask)		0
 #endif
 
-#define any_online_cpu(mask)			\
-({						\
-	int cpu;				\
-	for_each_cpu_mask(cpu, (mask))		\
-		if (cpu_online(cpu))		\
-			break;			\
-	cpu;					\
-})
-
 #define for_each_cpu(cpu)	  for_each_cpu_mask((cpu), cpu_possible_map)
 #define for_each_online_cpu(cpu)  for_each_cpu_mask((cpu), cpu_online_map)
 #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
 
-
 #endif /* __LINUX_CPUMASK_H */
diff --git a/lib/cpumask.c b/lib/cpumask.c
index ea25a034276c..3a67dc5ada7d 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -31,3 +31,15 @@ int highest_possible_processor_id(void)
 	return highest;
 }
 EXPORT_SYMBOL(highest_possible_processor_id);
+
+int __any_online_cpu(const cpumask_t *mask)
+{
+	int cpu;
+
+	for_each_cpu_mask(cpu, *mask) {
+		if (cpu_online(cpu))
+			break;
+	}
+	return cpu;
+}
+EXPORT_SYMBOL(__any_online_cpu);
-- 
cgit v1.2.3


From 34f361ade2fb4a869f6a7714d01c04ce4cfa75d9 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Sat, 25 Mar 2006 03:08:18 -0800
Subject: [PATCH] Check if cpu can be onlined before calling smp_prepare_cpu()

- Moved check for online cpu out of smp_prepare_cpu()

- Moved default declaration of smp_prepare_cpu() to kernel/cpu.c

- Removed lock_cpu_hotplug() from smp_prepare_cpu() to around it, since
  its called from cpu_up() as well now.

- Removed clearing from cpu_present_map during cpu_offline as it breaks
  using cpu_up() directly during a subsequent online operation.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smpboot.c | 33 ++++++++++++++++++---------------
 drivers/base/cpu.c         |  9 +--------
 include/linux/cpu.h        |  1 -
 kernel/power/smp.c         |  4 +---
 4 files changed, 20 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4c470e99a742..82371d83bfa9 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1003,7 +1003,6 @@ void cpu_exit_clear(void)
 
 	cpu_clear(cpu, cpu_callout_map);
 	cpu_clear(cpu, cpu_callin_map);
-	cpu_clear(cpu, cpu_present_map);
 
 	cpu_clear(cpu, smp_commenced_mask);
 	unmap_cpu_to_logical_apicid(cpu);
@@ -1015,31 +1014,20 @@ struct warm_boot_cpu_info {
 	int cpu;
 };
 
-static void __devinit do_warm_boot_cpu(void *p)
+static void __cpuinit do_warm_boot_cpu(void *p)
 {
 	struct warm_boot_cpu_info *info = p;
 	do_boot_cpu(info->apicid, info->cpu);
 	complete(info->complete);
 }
 
-int __devinit smp_prepare_cpu(int cpu)
+static int __cpuinit __smp_prepare_cpu(int cpu)
 {
 	DECLARE_COMPLETION(done);
 	struct warm_boot_cpu_info info;
 	struct work_struct task;
 	int	apicid, ret;
 
-	lock_cpu_hotplug();
-
-	/*
-	 * On x86, CPU0 is never offlined.  Trying to bring up an
-	 * already-booted CPU will hang.  So check for that case.
-	 */
-	if (cpu_online(cpu)) {
-		ret = -EINVAL;
-		goto exit;
-	}
-
 	apicid = x86_cpu_to_apicid[cpu];
 	if (apicid == BAD_APICID) {
 		ret = -ENODEV;
@@ -1064,7 +1052,6 @@ int __devinit smp_prepare_cpu(int cpu)
 	zap_low_mappings();
 	ret = 0;
 exit:
-	unlock_cpu_hotplug();
 	return ret;
 }
 #endif
@@ -1392,6 +1379,22 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
+	int ret=0;
+
+	/*
+	 * We do warm boot only on cpus that had booted earlier
+	 * Otherwise cold boot is all handled from smp_boot_cpus().
+	 * cpu_callin_map is set during AP kickstart process. Its reset
+	 * when a cpu is taken offline from cpu_exit_clear().
+	 */
+	if (!cpu_isset(cpu, cpu_callin_map))
+		ret = __smp_prepare_cpu(cpu);
+
+	if (ret)
+		return -EIO;
+#endif
+
 	/* In case one didn't come up */
 	if (!cpu_isset(cpu, cpu_callin_map)) {
 		printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 29f3d7504da1..dd712b24ec91 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -19,11 +19,6 @@ EXPORT_SYMBOL(cpu_sysdev_class);
 static struct sys_device *cpu_sys_devices[NR_CPUS];
 
 #ifdef CONFIG_HOTPLUG_CPU
-int __attribute__((weak)) smp_prepare_cpu (int cpu)
-{
-	return 0;
-}
-
 static ssize_t show_online(struct sys_device *dev, char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -44,9 +39,7 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
 			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
 		break;
 	case '1':
-		ret = smp_prepare_cpu(cpu->sysdev.id);
-		if (!ret)
-			ret = cpu_up(cpu->sysdev.id);
+		ret = cpu_up(cpu->sysdev.id);
 		if (!ret)
 			kobject_uevent(&dev->kobj, KOBJ_ONLINE);
 		break;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d612b89dce33..08d50c53aab4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -74,7 +74,6 @@ extern int lock_cpu_hotplug_interruptible(void);
 	register_cpu_notifier(&fn##_nb);			\
 }
 int cpu_down(unsigned int cpu);
-extern int __attribute__((weak)) smp_prepare_cpu(int cpu);
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
 #else
 #define lock_cpu_hotplug()	do { } while (0)
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 911fc62b8225..5957312b2d68 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -49,9 +49,7 @@ void enable_nonboot_cpus(void)
 
 	printk("Thawing cpus ...\n");
 	for_each_cpu_mask(cpu, frozen_cpus) {
-		error = smp_prepare_cpu(cpu);
-		if (!error)
-			error = cpu_up(cpu);
+		error = cpu_up(cpu);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
 			continue;
-- 
cgit v1.2.3


From d12ddde2bbf46b34eae3fb3fd36c0e42832b537c Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 25 Mar 2006 03:08:21 -0800
Subject: [PATCH] udf: remove duplicate definitions

This patch removes duplicate definitions from include/linux/udf_fs_i.h
which are already defined in fs/udf/ecma_167.h.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/udf_fs_i.h | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udf_fs_i.h b/include/linux/udf_fs_i.h
index 1e7508420fcf..ffaf05679ffb 100644
--- a/include/linux/udf_fs_i.h
+++ b/include/linux/udf_fs_i.h
@@ -15,27 +15,6 @@
 
 #ifdef __KERNEL__
 
-#ifndef _ECMA_167_H
-typedef struct
-{
-	__u32			logicalBlockNum;
-	__u16			partitionReferenceNum;
-} __attribute__ ((packed)) lb_addr;
-
-typedef struct
-{
-	__u32			extLength;
-	__u32			extPosition;
-} __attribute__ ((packed)) short_ad;
-
-typedef struct
-{
-	__u32			extLength;
-	lb_addr			extLocation;
-	__u8			impUse[6];
-} __attribute__ ((packed)) long_ad;
-#endif
-
 struct udf_inode_info
 {
 	struct timespec		i_crtime;
-- 
cgit v1.2.3


From 5ddcfa878d5b10b0ab94251a4229a8a9daaf93ed Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 25 Mar 2006 03:08:28 -0800
Subject: [PATCH] remove pps support

This removes the support for pps.  It's completely unused within the kernel
and is basically in the way for further cleanups.  It should be easier to
readd proper support for it after the rest has been converted to NTP4
(where the pps mechanisms are quite different from NTP3 anyway).

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: Adrian Bunk <bunk@stusta.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h | 41 -----------------------------------
 kernel/time.c         | 59 ++++++++++++++-------------------------------------
 kernel/timer.c        | 13 ++----------
 3 files changed, 18 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index b7ca1204e42a..82dc9ae79d37 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -97,37 +97,10 @@
 
 #define MAXPHASE 512000L        /* max phase error (us) */
 #define MAXFREQ (512L << SHIFT_USEC)  /* max frequency error (ppm) */
-#define MAXTIME (200L << PPS_AVG) /* max PPS error (jitter) (200 us) */
 #define MINSEC 16L              /* min interval between updates (s) */
 #define MAXSEC 1200L            /* max interval between updates (s) */
 #define	NTP_PHASE_LIMIT	(MAXPHASE << 5)	/* beyond max. dispersion */
 
-/*
- * The following defines are used only if a pulse-per-second (PPS)
- * signal is available and connected via a modem control lead, such as
- * produced by the optional ppsclock feature incorporated in the Sun
- * asynch driver. They establish the design parameters of the frequency-
- * lock loop used to discipline the CPU clock oscillator to the PPS
- * signal.
- *
- * PPS_AVG is the averaging factor for the frequency loop, as well as
- * the time and frequency dispersion.
- *
- * PPS_SHIFT and PPS_SHIFTMAX specify the minimum and maximum
- * calibration intervals, respectively, in seconds as a power of two.
- *
- * PPS_VALID is the maximum interval before the PPS signal is considered
- * invalid and protocol updates used directly instead.
- *
- * MAXGLITCH is the maximum interval before a time offset of more than
- * MAXTIME is believed.
- */
-#define PPS_AVG 2		/* pps averaging constant (shift) */
-#define PPS_SHIFT 2		/* min interval duration (s) (shift) */
-#define PPS_SHIFTMAX 8		/* max interval duration (s) (shift) */
-#define PPS_VALID 120		/* pps signal watchdog max (s) */
-#define MAXGLITCH 30		/* pps signal glitch max (s) */
-
 /*
  * syscall interface - used (mainly by NTP daemon)
  * to discipline kernel clock oscillator
@@ -246,20 +219,6 @@ extern long time_reftime;	/* time at last adjustment (s) */
 extern long time_adjust;	/* The amount of adjtime left */
 extern long time_next_adjust;	/* Value for time_adjust at next tick */
 
-/* interface variables pps->timer interrupt */
-extern long pps_offset;		/* pps time offset (us) */
-extern long pps_jitter;		/* time dispersion (jitter) (us) */
-extern long pps_freq;		/* frequency offset (scaled ppm) */
-extern long pps_stabil;		/* frequency dispersion (scaled ppm) */
-extern long pps_valid;		/* pps signal watchdog counter */
-
-/* interface variables pps->adjtimex */
-extern int pps_shift;		/* interval duration (s) (shift) */
-extern long pps_jitcnt;		/* jitter limit exceeded */
-extern long pps_calcnt;		/* calibration intervals */
-extern long pps_errcnt;		/* calibration errors */
-extern long pps_stbcnt;		/* stability limit exceeded */
-
 /**
  * ntp_clear - Clears the NTP state variables
  *
diff --git a/kernel/time.c b/kernel/time.c
index 804539165d8b..e00a97b77241 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
 	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
-long pps_offset;		/* pps time offset (us) */
-long pps_jitter = MAXTIME;	/* time dispersion (jitter) (us) */
-
-long pps_freq;			/* frequency offset (scaled ppm) */
-long pps_stabil = MAXFREQ;	/* frequency dispersion (scaled ppm) */
-
-long pps_valid = PPS_VALID;	/* pps signal watchdog counter */
-
-int pps_shift = PPS_SHIFT;	/* interval duration (s) (shift) */
-
-long pps_jitcnt;		/* jitter limit exceeded */
-long pps_calcnt;		/* calibration intervals */
-long pps_errcnt;		/* calibration errors */
-long pps_stbcnt;		/* stability limit exceeded */
-
-/* hook for a loadable hardpps kernel module */
-void (*hardpps_ptr)(struct timeval *);
-
 /* we call this to notify the arch when the clock is being
  * controlled.  If no such arch routine, do nothing.
  */
@@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc)
 		    result = -EINVAL;
 		    goto leave;
 		}
-		time_freq = txc->freq - pps_freq;
+		time_freq = txc->freq;
 	    }
 
 	    if (txc->modes & ADJ_MAXERROR) {
@@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc)
 		    if ((time_next_adjust = txc->offset) == 0)
 			 time_adjust = 0;
 		}
-		else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
-		    ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
-		            (STA_PPSTIME | STA_PPSSIGNAL) ?
-		            pps_offset : txc->offset;
+		else if (time_status & STA_PLL) {
+		    ltemp = txc->offset;
 
 		    /*
 		     * Scale the phase adjustment and
@@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc)
 		    }
 		    time_freq = min(time_freq, time_tolerance);
 		    time_freq = max(time_freq, -time_tolerance);
-		} /* STA_PLL || STA_PPSTIME */
+		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK) {
 		tick_usec = txc->tick;
 		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
 	    }
 	} /* txc->modes */
-leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
-	    || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
-		&& (time_status & STA_PPSSIGNAL) == 0)
-	    /* p. 24, (b) */
-	    || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
-		== (STA_PPSTIME|STA_PPSJITTER))
-	    /* p. 24, (c) */
-	    || ((time_status & STA_PPSFREQ) != 0
-		&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
-	    /* p. 24, (d) */
+leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 		result = TIME_ERROR;
 	
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
@@ -380,7 +351,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
 	else {
 	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
 	}
-	txc->freq	   = time_freq + pps_freq;
+	txc->freq	   = time_freq;
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
@@ -388,14 +359,16 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
 	txc->precision	   = time_precision;
 	txc->tolerance	   = time_tolerance;
 	txc->tick	   = tick_usec;
-	txc->ppsfreq	   = pps_freq;
-	txc->jitter	   = pps_jitter >> PPS_AVG;
-	txc->shift	   = pps_shift;
-	txc->stabil	   = pps_stabil;
-	txc->jitcnt	   = pps_jitcnt;
-	txc->calcnt	   = pps_calcnt;
-	txc->errcnt	   = pps_errcnt;
-	txc->stbcnt	   = pps_stbcnt;
+
+	/* PPS is not implemented, so these are zero */
+	txc->ppsfreq	   = 0;
+	txc->jitter	   = 0;
+	txc->shift	   = 0;
+	txc->stabil	   = 0;
+	txc->jitcnt	   = 0;
+	txc->calcnt	   = 0;
+	txc->errcnt	   = 0;
+	txc->stbcnt	   = 0;
 	write_sequnlock_irq(&xtime_lock);
 	do_gettimeofday(&txc->time);
 	notify_arch_cmos_timer();
diff --git a/kernel/timer.c b/kernel/timer.c
index 13fa72cac7d8..ab189dd187cb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -697,18 +697,9 @@ static void second_overflow(void)
 
 	/*
 	 * Compute the frequency estimate and additional phase adjustment due
-	 * to frequency error for the next second. When the PPS signal is
-	 * engaged, gnaw on the watchdog counter and update the frequency
-	 * computed by the pll and the PPS signal.
+	 * to frequency error for the next second.
 	 */
-	pps_valid++;
-	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
-		pps_jitter = MAXTIME;
-		pps_stabil = MAXFREQ;
-		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
-				STA_PPSWANDER | STA_PPSERROR);
-	}
-	ltemp = time_freq + pps_freq;
+	ltemp = time_freq;
 	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
-- 
cgit v1.2.3


From f083a329e63d471a5e9238e837772b1b76c218db Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 25 Mar 2006 16:30:19 +0100
Subject: [PATCH] x86_64: Clean up and tweak ACPI blacklist year code

 - Move the core parser into dmi_scan.c.  It can be useful for other
   subsystems too.
 - Differentiate between field doesn't exist and field is 0 or
   unparseable.  The first case is likely an old BIOS with broken ACPI,
   the later is likely a slightly buggy BIOS where someone forget to
   edit the date.  Don't blacklist in the later case.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/dmi_scan.c | 30 ++++++++++++++++++++++++++++++
 drivers/acpi/blacklist.c    | 27 ++++++---------------------
 include/linux/dmi.h         |  2 ++
 3 files changed, 38 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index ca2a0cbcac04..d2dfd9c8d691 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -299,3 +299,33 @@ struct dmi_device * dmi_find_device(int type, const char *name,
 	return NULL;
 }
 EXPORT_SYMBOL(dmi_find_device);
+
+/**
+ *	dmi_get_year - Return year of a DMI date
+ *	@field:	data index (like dmi_get_system_info)
+ *
+ *	Returns -1 when the field doesn't exist. 0 when it is broken.
+ */
+int dmi_get_year(int field)
+{
+	int year;
+	char *s = dmi_get_system_info(field);
+
+	if (!s)
+		return -1;
+	if (*s == '\0')
+		return 0;
+	s = strrchr(s, '/');
+	if (!s)
+		return 0;
+
+	s += 1;
+	year = simple_strtoul(s, NULL, 0);
+	if (year && year < 100) {	/* 2-digit year */
+		year += 1900;
+		if (year < 1996)	/* no dates < spec 1.0 */
+			year += 100;
+	}
+
+	return year;
+}
diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c
index 9824f679a910..f9c972b26f4f 100644
--- a/drivers/acpi/blacklist.c
+++ b/drivers/acpi/blacklist.c
@@ -77,28 +77,13 @@ static struct acpi_blacklist_item acpi_blacklist[] __initdata = {
 
 static int __init blacklist_by_year(void)
 {
-	int year;
-	char *s = dmi_get_system_info(DMI_BIOS_DATE);
-
-	if (!s)
-		return 0;
-	if (!*s)
-		return 0;
-
-	s = strrchr(s, '/');
-	if (!s)
+	int year = dmi_get_year(DMI_BIOS_DATE);
+	/* Doesn't exist? Likely an old system */
+	if (year == -1) 
+		return 1;
+	/* 0? Likely a buggy new BIOS */
+	if (year == 0)
 		return 0;
-
-	s += 1;
-
-	year = simple_strtoul(s, NULL, 0);
-
-	if (year < 100) {	/* 2-digit year */
-		year += 1900;
-		if (year < 1996)	/* no dates < spec 1.0 */
-			year += 100;
-	}
-
 	if (year < CONFIG_ACPI_BLACKLIST_YEAR) {
 		printk(KERN_ERR PREFIX "BIOS age (%d) fails cutoff (%d), "
 		       "acpi=force is required to enable ACPI\n",
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 2e6bbe014157..64fd6c366604 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -68,6 +68,7 @@ extern char * dmi_get_system_info(int field);
 extern struct dmi_device * dmi_find_device(int type, const char *name,
 	struct dmi_device *from);
 extern void dmi_scan_machine(void);
+extern int dmi_get_year(int field);
 
 #else
 
@@ -75,6 +76,7 @@ static inline int dmi_check_system(struct dmi_system_id *list) { return 0; }
 static inline char * dmi_get_system_info(int field) { return NULL; }
 static inline struct dmi_device * dmi_find_device(int type, const char *name,
 	struct dmi_device *from) { return NULL; }
+static inline int dmi_get_year(int year) { return 0; }
 
 #endif
 
-- 
cgit v1.2.3


From 267b48014a5c0c2ae90b04dad5d95ceb903365a6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 25 Mar 2006 16:31:10 +0100
Subject: [PATCH] x86_64: Try to allocate node memmap near the end of node

This fixes problems with very large nodes (over 128GB) filling up all of
the first 4GB with their mem_map and not leaving enough space for the
swiotlb.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/numa.c   | 12 +++++++++++-
 include/linux/bootmem.h |  3 +++
 mm/bootmem.c            |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index e4b62753a19a..07471a3eb190 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -149,7 +149,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 /* Initialize final allocator for a zone */
 void __init setup_node_zones(int nodeid)
 { 
-	unsigned long start_pfn, end_pfn; 
+	unsigned long start_pfn, end_pfn, memmapsize, limit;
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
 
@@ -159,6 +159,16 @@ void __init setup_node_zones(int nodeid)
 	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
 		nodeid, start_pfn, end_pfn);
 
+	/* Try to allocate mem_map at end to not fill up precious <4GB
+	   memory. */
+	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
+	limit = end_pfn << PAGE_SHIFT;
+	NODE_DATA(nodeid)->node_mem_map = 
+		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
+				memmapsize, SMP_CACHE_BYTES, 
+				round_down(limit - memmapsize, PAGE_SIZE), 
+				limit);
+
 	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 993da8cc9706..7155452fb4a8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -51,6 +51,9 @@ extern void * __init __alloc_bootmem_low_node(pg_data_t *pgdat,
 					      unsigned long size,
 					      unsigned long align,
 					      unsigned long goal);
+extern void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
+		unsigned long size, unsigned long align, unsigned long goal,
+		unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f717..b55bd39fc5dd 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
  *
  * NOTE:  This function is _not_ reentrant.
  */
-static void * __init
+void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	      unsigned long align, unsigned long goal, unsigned long limit)
 {
-- 
cgit v1.2.3


From 686f8c5d77149f78ff6090dde774b2e43a7319b2 Mon Sep 17 00:00:00 2001
From: Todd Poynor <tpoynor@mvista.com>
Date: Sat, 25 Mar 2006 18:15:24 +0000
Subject: include/linux/clk.h is betraying its ARM origins

include/linux/clk.h is betraying its ARM origins.

Signed-off-by: Todd Poynor <tpoynor@mvista.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/clk.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 12848f81bb37..5ca8c6fddb56 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -8,8 +8,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#ifndef ASMARM_CLOCK_H
-#define ASMARM_CLOCK_H
+#ifndef __LINUX_CLK_H
+#define __LINUX_CLK_H
 
 struct device;
 
-- 
cgit v1.2.3


From 104c7b03ea0913a24be103db66d8cf1f1f99a49a Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Sat, 25 Mar 2006 23:03:13 +0000
Subject: [ARM] 3383/3: ixp2000: ixdp2x01 platform serial conversion

Patch from Lennert Buytenhek

Add a PLAT8250_DEV_PLATFORM2, and convert the two ixdp2x01 CPLD serial
ports to use platform serial devices with ids PLAT8250_DEV_PLATFORM[12].
(The on-chip xscale UART is PLAT8250_DEV_PLATFORM, id #0.)

Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mach-ixp2000/ixdp2x01.c | 78 ++++++++++++++++++++++++++++++----------
 include/linux/serial_8250.h      |  1 +
 2 files changed, 60 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-ixp2000/ixdp2x01.c b/arch/arm/mach-ixp2000/ixdp2x01.c
index f9d4968c1d66..66915282a463 100644
--- a/arch/arm/mach-ixp2000/ixdp2x01.c
+++ b/arch/arm/mach-ixp2000/ixdp2x01.c
@@ -30,6 +30,7 @@
 #include <linux/tty.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
+#include <linux/serial_8250.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -132,7 +133,7 @@ void __init ixdp2x01_init_irq(void)
 
 
 /*************************************************************************
- * IXDP2x01 memory map and serial ports
+ * IXDP2x01 memory map
  *************************************************************************/
 static struct map_desc ixdp2x01_io_desc __initdata = {
 	.virtual	= IXDP2X01_VIRT_CPLD_BASE, 
@@ -141,40 +142,78 @@ static struct map_desc ixdp2x01_io_desc __initdata = {
 	.type		= MT_DEVICE
 };
 
-static struct uart_port ixdp2x01_serial_ports[2] = {
+static void __init ixdp2x01_map_io(void)
+{
+	ixp2000_map_io();
+	iotable_init(&ixdp2x01_io_desc, 1);
+}
+
+
+/*************************************************************************
+ * IXDP2x01 serial ports
+ *************************************************************************/
+static struct plat_serial8250_port ixdp2x01_serial_port1[] = {
 	{
-		.membase	= (char *)(IXDP2X01_UART1_VIRT_BASE),
 		.mapbase	= (unsigned long)IXDP2X01_UART1_PHYS_BASE,
+		.membase	= (char *)IXDP2X01_UART1_VIRT_BASE,
 		.irq		= IRQ_IXDP2X01_UART1,
-		.flags		= UPF_SKIP_TEST,
+		.flags		= UPF_BOOT_AUTOCONF | UPF_SKIP_TEST,
 		.iotype		= UPIO_MEM32,
 		.regshift	= 2,
 		.uartclk	= IXDP2X01_UART_CLK,
-		.line		= 1,
-		.type		= PORT_16550A,
-		.fifosize	= 16
-	}, {
-		.membase	= (char *)(IXDP2X01_UART2_VIRT_BASE),
+	},
+	{ }
+};
+
+static struct resource ixdp2x01_uart_resource1 = {
+	.start		= IXDP2X01_UART1_PHYS_BASE,
+	.end		= IXDP2X01_UART1_PHYS_BASE + 0xffff,
+	.flags		= IORESOURCE_MEM,
+};
+
+static struct platform_device ixdp2x01_serial_device1 = {
+	.name		= "serial8250",
+	.id		= PLAT8250_DEV_PLATFORM1,
+	.dev		= {
+		.platform_data		= ixdp2x01_serial_port1,
+	},
+	.num_resources	= 1,
+	.resource	= &ixdp2x01_uart_resource1,
+};
+
+static struct plat_serial8250_port ixdp2x01_serial_port2[] = {
+	{
 		.mapbase	= (unsigned long)IXDP2X01_UART2_PHYS_BASE,
+		.membase	= (char *)IXDP2X01_UART2_VIRT_BASE,
 		.irq		= IRQ_IXDP2X01_UART2,
-		.flags		= UPF_SKIP_TEST,
+		.flags		= UPF_BOOT_AUTOCONF | UPF_SKIP_TEST,
 		.iotype		= UPIO_MEM32,
 		.regshift	= 2,
 		.uartclk	= IXDP2X01_UART_CLK,
-		.line		= 2,
-		.type		= PORT_16550A,
-		.fifosize	= 16
 	}, 
+	{ }
 };
 
-static void __init ixdp2x01_map_io(void)
-{
-	ixp2000_map_io();	
+static struct resource ixdp2x01_uart_resource2 = {
+	.start		= IXDP2X01_UART2_PHYS_BASE,
+	.end		= IXDP2X01_UART2_PHYS_BASE + 0xffff,
+	.flags		= IORESOURCE_MEM,
+};
 
-	iotable_init(&ixdp2x01_io_desc, 1);
+static struct platform_device ixdp2x01_serial_device2 = {
+	.name		= "serial8250",
+	.id		= PLAT8250_DEV_PLATFORM2,
+	.dev		= {
+		.platform_data		= ixdp2x01_serial_port2,
+	},
+	.num_resources	= 1,
+	.resource	= &ixdp2x01_uart_resource2,
+};
 
-	early_serial_setup(&ixdp2x01_serial_ports[0]);
-	early_serial_setup(&ixdp2x01_serial_ports[1]);
+static void ixdp2x01_uart_init(void)
+{
+	platform_device_register(&ixdp2x01_serial_device1);
+	platform_device_register(&ixdp2x01_serial_device2);
 }
 
 
@@ -374,6 +413,7 @@ static void __init ixdp2x01_init_machine(void)
 
 	platform_add_devices(ixdp2x01_devices, ARRAY_SIZE(ixdp2x01_devices));
 	ixp2000_uart_init();
+	ixdp2x01_uart_init();
 }
 
 
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 73b464f0926a..8e9681413726 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -37,6 +37,7 @@ enum {
 	PLAT8250_DEV_LEGACY = -1,
 	PLAT8250_DEV_PLATFORM,
 	PLAT8250_DEV_PLATFORM1,
+	PLAT8250_DEV_PLATFORM2,
 	PLAT8250_DEV_FOURPORT,
 	PLAT8250_DEV_ACCENT,
 	PLAT8250_DEV_BOCA,
-- 
cgit v1.2.3


From 64a07bd82ed526d813b64b0957543eef55bdf9c0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sun, 26 Mar 2006 01:36:55 -0800
Subject: [PATCH] protect remove_proc_entry

It has been discovered that the remove_proc_entry has a race in the removing
of entries in the proc file system that are siblings.  There's no protection
around the traversing and removing of elements that belong in the same
subdirectory.

This subdirectory list is protected in other areas by the BKL.  So the BKL was
at first used to protect this area too, but unfortunately, remove_proc_entry
may be called with spinlocks held.  The BKL may schedule, so this was not a
solution.

The final solution was to add a new global spin lock to protect this list,
called proc_subdir_lock.  This lock now protects the list in
remove_proc_entry, and I also went around looking for other areas that this
list is modified and added this protection there too.  Care must be taken
since these locations call several functions that may also schedule.

Since I don't see any location that these functions that modify the
subdirectory list are called by interrupts, the irqsave/restore versions of
the spin lock was _not_ used.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/generic.c       | 32 +++++++++++++++++++++++++++++---
 fs/proc/proc_devtree.c  |  2 ++
 include/linux/proc_fs.h |  3 +++
 3 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 20e5c4509a43..47b7a20d45eb 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/namei.h>
 #include <linux/bitops.h>
+#include <linux/spinlock.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -29,6 +30,8 @@ static ssize_t proc_file_write(struct file *file, const char __user *buffer,
 			       size_t count, loff_t *ppos);
 static loff_t proc_file_lseek(struct file *, loff_t, int);
 
+DEFINE_SPINLOCK(proc_subdir_lock);
+
 int proc_match(int len, const char *name, struct proc_dir_entry *de)
 {
 	if (de->namelen != len)
@@ -277,7 +280,9 @@ static int xlate_proc_name(const char *name,
 	const char     		*cp = name, *next;
 	struct proc_dir_entry	*de;
 	int			len;
+	int 			rtn = 0;
 
+	spin_lock(&proc_subdir_lock);
 	de = &proc_root;
 	while (1) {
 		next = strchr(cp, '/');
@@ -289,13 +294,17 @@ static int xlate_proc_name(const char *name,
 			if (proc_match(len, cp, de))
 				break;
 		}
-		if (!de)
-			return -ENOENT;
+		if (!de) {
+			rtn = -ENOENT;
+			goto out;
+		}
 		cp += len + 1;
 	}
 	*residual = cp;
 	*ret = de;
-	return 0;
+out:
+	spin_unlock(&proc_subdir_lock);
+	return rtn;
 }
 
 static DEFINE_IDR(proc_inum_idr);
@@ -380,6 +389,7 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
 	int error = -ENOENT;
 
 	lock_kernel();
+	spin_lock(&proc_subdir_lock);
 	de = PDE(dir);
 	if (de) {
 		for (de = de->subdir; de ; de = de->next) {
@@ -388,12 +398,15 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
 			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
 				unsigned int ino = de->low_ino;
 
+				spin_unlock(&proc_subdir_lock);
 				error = -EINVAL;
 				inode = proc_get_inode(dir->i_sb, ino, de);
+				spin_lock(&proc_subdir_lock);
 				break;
 			}
 		}
 	}
+	spin_unlock(&proc_subdir_lock);
 	unlock_kernel();
 
 	if (inode) {
@@ -447,11 +460,13 @@ int proc_readdir(struct file * filp,
 			filp->f_pos++;
 			/* fall through */
 		default:
+			spin_lock(&proc_subdir_lock);
 			de = de->subdir;
 			i -= 2;
 			for (;;) {
 				if (!de) {
 					ret = 1;
+					spin_unlock(&proc_subdir_lock);
 					goto out;
 				}
 				if (!i)
@@ -461,12 +476,16 @@ int proc_readdir(struct file * filp,
 			}
 
 			do {
+				/* filldir passes info to user space */
+				spin_unlock(&proc_subdir_lock);
 				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
 					    de->low_ino, de->mode >> 12) < 0)
 					goto out;
+				spin_lock(&proc_subdir_lock);
 				filp->f_pos++;
 				de = de->next;
 			} while (de);
+			spin_unlock(&proc_subdir_lock);
 	}
 	ret = 1;
 out:	unlock_kernel();
@@ -500,9 +519,13 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 	if (i == 0)
 		return -EAGAIN;
 	dp->low_ino = i;
+
+	spin_lock(&proc_subdir_lock);
 	dp->next = dir->subdir;
 	dp->parent = dir;
 	dir->subdir = dp;
+	spin_unlock(&proc_subdir_lock);
+
 	if (S_ISDIR(dp->mode)) {
 		if (dp->proc_iops == NULL) {
 			dp->proc_fops = &proc_dir_operations;
@@ -694,6 +717,8 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
 		goto out;
 	len = strlen(fn);
+
+	spin_lock(&proc_subdir_lock);
 	for (p = &parent->subdir; *p; p=&(*p)->next ) {
 		if (!proc_match(len, fn, *p))
 			continue;
@@ -714,6 +739,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 		}
 		break;
 	}
+	spin_unlock(&proc_subdir_lock);
 out:
 	return;
 }
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 9bdd077d6f55..596b4b4f1cc8 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -136,9 +136,11 @@ void proc_device_tree_add_node(struct device_node *np,
 		 * properties are quite unimportant for us though, thus we
 		 * simply "skip" them here, but we do have to check.
 		 */
+		spin_lock(&proc_subdir_lock);
 		for (ent = de->subdir; ent != NULL; ent = ent->next)
 			if (!strcmp(ent->name, pp->name))
 				break;
+		spin_unlock(&proc_subdir_lock);
 		if (ent != NULL) {
 			printk(KERN_WARNING "device-tree: property \"%s\" name"
 			       " conflicts with node in %s\n", pp->name,
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index aa6322d45198..6b12b0f661b4 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -4,6 +4,7 @@
 #include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
+#include <linux/spinlock.h>
 #include <asm/atomic.h>
 
 /*
@@ -92,6 +93,8 @@ extern struct proc_dir_entry *proc_bus;
 extern struct proc_dir_entry *proc_root_driver;
 extern struct proc_dir_entry *proc_root_kcore;
 
+extern spinlock_t proc_subdir_lock;
+
 extern void proc_root_init(void);
 extern void proc_misc_init(void);
 
-- 
cgit v1.2.3


From 03beb07664d768db97bf454ae5c9581cd4737bb4 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Sun, 26 Mar 2006 01:36:57 -0800
Subject: [PATCH] Add API for flushing Anon pages

Currently, get_user_pages() returns fully coherent pages to the kernel for
anything other than anonymous pages.  This is a problem for things like
fuse and the SCSI generic ioctl SG_IO which can potentially wish to do DMA
to anonymous pages passed in by users.

The fix is to add a new memory management API: flush_anon_page() which
is used in get_user_pages() to make anonymous pages coherent.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cachetlb.txt | 9 +++++++++
 include/linux/highmem.h    | 6 ++++++
 mm/memory.c                | 2 ++
 3 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 4ae418889b88..1f312a9893d9 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -362,6 +362,15 @@ maps this page at its virtual address.
 	likely that you will need to flush the instruction cache
 	for copy_to_user_page().
 
+  void flush_anon_page(struct page *page, unsigned long vmaddr)
+  	When the kernel needs to access the contents of an anonymous
+	page, it calls this function (currently only
+	get_user_pages()).  Note: flush_dcache_page() deliberately
+	doesn't work for an anonymous page.  The default
+	implementation is a nop (and should remain so for all coherent
+	architectures).  For incoherent architectures, it should flush
+	the cache of the page at vmaddr in the current user process.
+
   void flush_icache_range(unsigned long start, unsigned long end)
   	When the kernel stores into addresses that it will execute
 	out of (eg when loading modules), this function is called.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 6bece9280eb7..7bd2593dbef9 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -7,6 +7,12 @@
 
 #include <asm/cacheflush.h>
 
+#ifndef ARCH_HAS_FLUSH_ANON_PAGE
+static inline void flush_anon_page(struct page *page, unsigned long vmaddr)
+{
+}
+#endif
+
 #ifdef CONFIG_HIGHMEM
 
 #include <asm/highmem.h>
diff --git a/mm/memory.c b/mm/memory.c
index e347e106ca3a..67686048f094 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1071,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			}
 			if (pages) {
 				pages[i] = page;
+
+				flush_anon_page(page, start);
 				flush_dcache_page(page);
 			}
 			if (vmas)
-- 
cgit v1.2.3


From 5a3a5a98b6422d05c39eaa32c8b3f83840c7b768 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Sun, 26 Mar 2006 01:36:59 -0800
Subject: [PATCH] Add flush_kernel_dcache_page() API

We have a problem in a lot of emulated storage in that it takes a page from
get_user_pages() and does something like

kmap_atomic(page)
modify page
kunmap_atomic(page)

However, nothing has flushed the kernel cache view of the page before the
kunmap.  We need a lightweight API to do this, so this new API would
specifically be for flushing the kernel cache view of a user page which the
kernel has modified.  The driver would need to add
flush_kernel_dcache_page(page) before the final kunmap.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cachetlb.txt | 12 ++++++++++++
 include/linux/highmem.h    |  6 ++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 1f312a9893d9..53245c429f7d 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -371,6 +371,18 @@ maps this page at its virtual address.
 	architectures).  For incoherent architectures, it should flush
 	the cache of the page at vmaddr in the current user process.
 
+  void flush_kernel_dcache_page(struct page *page)
+	When the kernel needs to modify a user page is has obtained
+	with kmap, it calls this function after all modifications are
+	complete (but before kunmapping it) to bring the underlying
+	page up to date.  It is assumed here that the user has no
+	incoherent cached copies (i.e. the original page was obtained
+	from a mechanism like get_user_pages()).  The default
+	implementation is a nop and should remain so on all coherent
+	architectures.  On incoherent architectures, this should flush
+	the kernel cache for page (using page_address(page)).
+
+
   void flush_icache_range(unsigned long start, unsigned long end)
   	When the kernel stores into addresses that it will execute
 	out of (eg when loading modules), this function is called.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 7bd2593dbef9..892c4ea1b425 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -13,6 +13,12 @@ static inline void flush_anon_page(struct page *page, unsigned long vmaddr)
 }
 #endif
 
+#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+static inline void flush_kernel_dcache_page(struct page *page)
+{
+}
+#endif
+
 #ifdef CONFIG_HIGHMEM
 
 #include <asm/highmem.h>
-- 
cgit v1.2.3


From 136939a2b5aa4302281215745ccd567e1df2e8d4 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Sun, 26 Mar 2006 01:37:05 -0800
Subject: [PATCH] EFI, /dev/mem: simplify efi_mem_attribute_range()

Pass the size, not a pointer to the size, to efi_mem_attribute_range().

This function validates memory regions for the /dev/mem read/write/mmap paths.
The pointer allows arches to reduce the size of the range, but I think that's
unnecessary complexity.  Simplifying it will let me use
efi_mem_attribute_range() to improve the ia64 ioremap() implementation.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Matt Domsch <Matt_Domsch@dell.com>
Cc: "Tolentino, Matthew E" <matthew.e.tolentino@intel.com>
Cc: "Brown, Len" <len.brown@intel.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/efi.c | 41 ++++++++++++++++++++---------------------
 drivers/char/mem.c     | 18 ++++++------------
 include/asm-ia64/io.h  |  4 ++--
 include/linux/efi.h    |  2 ++
 4 files changed, 30 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 9990320b6f9a..2993748c13df 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -677,27 +677,34 @@ EXPORT_SYMBOL(efi_mem_attributes);
 /*
  * Determines whether the memory at phys_addr supports the desired
  * attribute (WB, UC, etc).  If this returns 1, the caller can safely
- * access *size bytes at phys_addr with the specified attribute.
+ * access size bytes at phys_addr with the specified attribute.
  */
-static int
-efi_mem_attribute_range (unsigned long phys_addr, unsigned long *size, u64 attr)
+int
+efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, u64 attr)
 {
+	unsigned long end = phys_addr + size;
 	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
-	unsigned long md_end;
 
-	if (!md || (md->attribute & attr) != attr)
+	/*
+	 * Some firmware doesn't report MMIO regions in the EFI memory
+	 * map.  The Intel BigSur (a.k.a. HP i2000) has this problem.
+	 * On those platforms, we have to assume UC is valid everywhere.
+	 */
+	if (!md || (md->attribute & attr) != attr) {
+		if (attr == EFI_MEMORY_UC && !efi_memmap_has_mmio())
+			return 1;
 		return 0;
+	}
 
 	do {
-		md_end = efi_md_end(md);
-		if (phys_addr + *size <= md_end)
+		unsigned long md_end = efi_md_end(md);
+
+		if (end <= md_end)
 			return 1;
 
 		md = efi_memory_descriptor(md_end);
-		if (!md || (md->attribute & attr) != attr) {
-			*size = md_end - phys_addr;
-			return 1;
-		}
+		if (!md || (md->attribute & attr) != attr)
+			return 0;
 	} while (md);
 	return 0;
 }
@@ -708,7 +715,7 @@ efi_mem_attribute_range (unsigned long phys_addr, unsigned long *size, u64 attr)
  * control access size.
  */
 int
-valid_phys_addr_range (unsigned long phys_addr, unsigned long *size)
+valid_phys_addr_range (unsigned long phys_addr, unsigned long size)
 {
 	return efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB);
 }
@@ -723,7 +730,7 @@ valid_phys_addr_range (unsigned long phys_addr, unsigned long *size)
  * because that doesn't appear in the boot-time EFI memory map.
  */
 int
-valid_mmap_phys_addr_range (unsigned long phys_addr, unsigned long *size)
+valid_mmap_phys_addr_range (unsigned long phys_addr, unsigned long size)
 {
 	if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB))
 		return 1;
@@ -731,14 +738,6 @@ valid_mmap_phys_addr_range (unsigned long phys_addr, unsigned long *size)
 	if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_UC))
 		return 1;
 
-	/*
-	 * Some firmware doesn't report MMIO regions in the EFI memory map.
-	 * The Intel BigSur (a.k.a. HP i2000) has this problem.  In this
-	 * case, we can't use the EFI memory map to validate mmap requests.
-	 */
-	if (!efi_memmap_has_mmio())
-		return 1;
-
 	return 0;
 }
 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 26d0116b48d4..5245ba1649ed 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -88,21 +88,15 @@ static inline int uncached_access(struct file *file, unsigned long addr)
 }
 
 #ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE
-static inline int valid_phys_addr_range(unsigned long addr, size_t *count)
+static inline int valid_phys_addr_range(unsigned long addr, size_t count)
 {
-	unsigned long end_mem;
-
-	end_mem = __pa(high_memory);
-	if (addr >= end_mem)
+	if (addr + count > __pa(high_memory))
 		return 0;
 
-	if (*count > end_mem - addr)
-		*count = end_mem - addr;
-
 	return 1;
 }
 
-static inline int valid_mmap_phys_addr_range(unsigned long addr, size_t *size)
+static inline int valid_mmap_phys_addr_range(unsigned long addr, size_t size)
 {
 	return 1;
 }
@@ -119,7 +113,7 @@ static ssize_t read_mem(struct file * file, char __user * buf,
 	ssize_t read, sz;
 	char *ptr;
 
-	if (!valid_phys_addr_range(p, &count))
+	if (!valid_phys_addr_range(p, count))
 		return -EFAULT;
 	read = 0;
 #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED
@@ -177,7 +171,7 @@ static ssize_t write_mem(struct file * file, const char __user * buf,
 	unsigned long copied;
 	void *ptr;
 
-	if (!valid_phys_addr_range(p, &count))
+	if (!valid_phys_addr_range(p, count))
 		return -EFAULT;
 
 	written = 0;
@@ -249,7 +243,7 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma)
 {
 	size_t size = vma->vm_end - vma->vm_start;
 
-	if (!valid_mmap_phys_addr_range(vma->vm_pgoff << PAGE_SHIFT, &size))
+	if (!valid_mmap_phys_addr_range(vma->vm_pgoff << PAGE_SHIFT, size))
 		return -EINVAL;
 
 	vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
diff --git a/include/asm-ia64/io.h b/include/asm-ia64/io.h
index 0d9bcc36f2a9..acba019b08e9 100644
--- a/include/asm-ia64/io.h
+++ b/include/asm-ia64/io.h
@@ -88,8 +88,8 @@ phys_to_virt (unsigned long address)
 }
 
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
-extern int valid_phys_addr_range (unsigned long addr, size_t *count); /* efi.c */
-extern int valid_mmap_phys_addr_range (unsigned long addr, size_t *count);
+extern int valid_phys_addr_range (unsigned long addr, size_t count); /* efi.c */
+extern int valid_mmap_phys_addr_range (unsigned long addr, size_t count);
 
 /*
  * The following two macros are deprecated and scheduled for removal.
diff --git a/include/linux/efi.h b/include/linux/efi.h
index c7c5dd316182..d15725470aa4 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -292,6 +292,8 @@ extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if pos
 extern u64 efi_get_iobase (void);
 extern u32 efi_mem_type (unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
+extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size,
+				    u64 attr);
 extern int __init efi_uart_console_only (void);
 extern void efi_initialize_iomem_resources(struct resource *code_resource,
 					struct resource *data_resource);
-- 
cgit v1.2.3


From b2c99e3c70d77fb194df5aa1642030080d28ea48 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Sun, 26 Mar 2006 01:37:08 -0800
Subject: [PATCH] EFI: keep physical table addresses in efi structure

Almost all users of the table addresses from the EFI system table want
physical addresses.  So rather than doing the pa->va->pa conversion, just keep
physical addresses in struct efi.

This fixes a DMI bug: the efi structure contained the physical SMBIOS address
on x86 but the virtual address on ia64, so dmi_scan_machine() used ioremap()
on a virtual address on ia64.

This is essentially the same as an earlier patch by Matt Tolentino:
	http://marc.theaimsgroup.com/?l=linux-kernel&m=112130292316281&w=2
except that this changes all table addresses, not just ACPI addresses.

Matt's original patch was backed out because it caused MCAs on HP sx1000
systems.  That problem is resolved by the ioremap() attribute checking added
for ia64.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Matt Domsch <Matt_Domsch@dell.com>
Cc: "Tolentino, Matthew E" <matthew.e.tolentino@intel.com>
Cc: "Brown, Len" <len.brown@intel.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/acpi/boot.c |  8 ++++----
 arch/i386/kernel/dmi_scan.c  |  4 ++--
 arch/i386/kernel/efi.c       | 21 +++++++++++++++------
 arch/ia64/kernel/acpi.c      |  6 +++---
 arch/ia64/kernel/efi.c       | 21 +++++++++++++++------
 arch/ia64/kernel/setup.c     |  2 +-
 arch/ia64/sn/kernel/setup.c  |  5 +++--
 drivers/acpi/osl.c           | 10 ++++------
 drivers/firmware/efivars.c   | 28 ++++++++++++++--------------
 drivers/firmware/pcdp.c      | 19 ++++++++++++-------
 include/asm-ia64/sn/sn_sal.h |  2 +-
 include/linux/efi.h          | 18 ++++++++++--------
 12 files changed, 84 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index f1a21945963d..033066176b3e 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -668,10 +668,10 @@ unsigned long __init acpi_find_rsdp(void)
 	unsigned long rsdp_phys = 0;
 
 	if (efi_enabled) {
-		if (efi.acpi20)
-			return __pa(efi.acpi20);
-		else if (efi.acpi)
-			return __pa(efi.acpi);
+		if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
+			return efi.acpi20;
+		else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
+			return efi.acpi;
 	}
 	/*
 	 * Scan memory looking for the RSDP signature. First search EBDA (low
diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index c032f9e06bb6..170d4c9f9bc3 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -217,14 +217,14 @@ void __init dmi_scan_machine(void)
 	int rc;
 
 	if (efi_enabled) {
-		if (!efi.smbios)
+		if (efi.smbios == EFI_INVALID_TABLE_ADDR)
 			goto out;
 
                /* This is called as a core_initcall() because it isn't
                 * needed during early boot.  This also means we can
                 * iounmap the space when we're done with it.
 		*/
-		p = dmi_ioremap((unsigned long)efi.smbios, 32);
+		p = dmi_ioremap(efi.smbios, 32);
 		if (p == NULL)
 			goto out;
 
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 7ec6cfa01fb3..c224c2aebbab 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -381,29 +381,38 @@ void __init efi_init(void)
 	if (config_tables == NULL)
 		printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
 
+	efi.mps        = EFI_INVALID_TABLE_ADDR;
+	efi.acpi       = EFI_INVALID_TABLE_ADDR;
+	efi.acpi20     = EFI_INVALID_TABLE_ADDR;
+	efi.smbios     = EFI_INVALID_TABLE_ADDR;
+	efi.sal_systab = EFI_INVALID_TABLE_ADDR;
+	efi.boot_info  = EFI_INVALID_TABLE_ADDR;
+	efi.hcdp       = EFI_INVALID_TABLE_ADDR;
+	efi.uga        = EFI_INVALID_TABLE_ADDR;
+
 	for (i = 0; i < num_config_tables; i++) {
 		if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
-			efi.mps = (void *)config_tables[i].table;
+			efi.mps = config_tables[i].table;
 			printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
 		} else
 		    if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
-			efi.acpi20 = __va(config_tables[i].table);
+			efi.acpi20 = config_tables[i].table;
 			printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
 		} else
 		    if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
-			efi.acpi = __va(config_tables[i].table);
+			efi.acpi = config_tables[i].table;
 			printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
 		} else
 		    if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
-			efi.smbios = (void *) config_tables[i].table;
+			efi.smbios = config_tables[i].table;
 			printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
 		} else
 		    if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
-			efi.hcdp = (void *)config_tables[i].table;
+			efi.hcdp = config_tables[i].table;
 			printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
 		} else
 		    if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
-			efi.uga = (void *)config_tables[i].table;
+			efi.uga = config_tables[i].table;
 			printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
 		}
 	}
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index a4e218ce2edb..58c93a30348c 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -651,9 +651,9 @@ unsigned long __init acpi_find_rsdp(void)
 {
 	unsigned long rsdp_phys = 0;
 
-	if (efi.acpi20)
-		rsdp_phys = __pa(efi.acpi20);
-	else if (efi.acpi)
+	if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
+		rsdp_phys = efi.acpi20;
+	else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
 		printk(KERN_WARNING PREFIX
 		       "v1.0/r0.71 tables no longer supported\n");
 	return rsdp_phys;
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 2993748c13df..12cfedce73b1 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -458,24 +458,33 @@ efi_init (void)
 	printk(KERN_INFO "EFI v%u.%.02u by %s:",
 	       efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor);
 
+	efi.mps        = EFI_INVALID_TABLE_ADDR;
+	efi.acpi       = EFI_INVALID_TABLE_ADDR;
+	efi.acpi20     = EFI_INVALID_TABLE_ADDR;
+	efi.smbios     = EFI_INVALID_TABLE_ADDR;
+	efi.sal_systab = EFI_INVALID_TABLE_ADDR;
+	efi.boot_info  = EFI_INVALID_TABLE_ADDR;
+	efi.hcdp       = EFI_INVALID_TABLE_ADDR;
+	efi.uga        = EFI_INVALID_TABLE_ADDR;
+
 	for (i = 0; i < (int) efi.systab->nr_tables; i++) {
 		if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
-			efi.mps = __va(config_tables[i].table);
+			efi.mps = config_tables[i].table;
 			printk(" MPS=0x%lx", config_tables[i].table);
 		} else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
-			efi.acpi20 = __va(config_tables[i].table);
+			efi.acpi20 = config_tables[i].table;
 			printk(" ACPI 2.0=0x%lx", config_tables[i].table);
 		} else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
-			efi.acpi = __va(config_tables[i].table);
+			efi.acpi = config_tables[i].table;
 			printk(" ACPI=0x%lx", config_tables[i].table);
 		} else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
-			efi.smbios = __va(config_tables[i].table);
+			efi.smbios = config_tables[i].table;
 			printk(" SMBIOS=0x%lx", config_tables[i].table);
 		} else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) {
-			efi.sal_systab = __va(config_tables[i].table);
+			efi.sal_systab = config_tables[i].table;
 			printk(" SALsystab=0x%lx", config_tables[i].table);
 		} else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
-			efi.hcdp = __va(config_tables[i].table);
+			efi.hcdp = config_tables[i].table;
 			printk(" HCDP=0x%lx", config_tables[i].table);
 		}
 	}
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index a4421a66ea5b..e4dfda1eb7dd 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -434,7 +434,7 @@ setup_arch (char **cmdline_p)
 	find_memory();
 
 	/* process SAL system table: */
-	ia64_sal_init(efi.sal_systab);
+	ia64_sal_init(__va(efi.sal_systab));
 
 	ia64_setup_printk_clock();
 
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 8b6d5c844708..30988dfbddff 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -327,10 +327,11 @@ sn_scan_pcdp(void)
 	struct pcdp_interface_pci if_pci;
 	extern struct efi efi;
 
-	pcdp = efi.hcdp;
-	if (! pcdp)
+	if (efi.hcdp == EFI_INVALID_TABLE_ADDR)
 		return;		/* no hcdp/pcdp table */
 
+	pcdp = __va(efi.hcdp);
+
 	if (pcdp->rev < 3)
 		return;		/* only support PCDP (rev >= 3) */
 
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index ac5bbaedac1b..fc8a3bce6cbb 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -156,12 +156,10 @@ acpi_status acpi_os_get_root_pointer(u32 flags, struct acpi_pointer *addr)
 {
 	if (efi_enabled) {
 		addr->pointer_type = ACPI_PHYSICAL_POINTER;
-		if (efi.acpi20)
-			addr->pointer.physical =
-			    (acpi_physical_address) virt_to_phys(efi.acpi20);
-		else if (efi.acpi)
-			addr->pointer.physical =
-			    (acpi_physical_address) virt_to_phys(efi.acpi);
+		if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
+			addr->pointer.physical = efi.acpi20;
+		else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
+			addr->pointer.physical = efi.acpi;
 		else {
 			printk(KERN_ERR PREFIX
 			       "System description tables not found\n");
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 343379f23a53..9b7e4d52ffd4 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -568,20 +568,20 @@ systab_read(struct subsystem *entry, char *buf)
 	if (!entry || !buf)
 		return -EINVAL;
 
-	if (efi.mps)
-		str += sprintf(str, "MPS=0x%lx\n", __pa(efi.mps));
-	if (efi.acpi20)
-		str += sprintf(str, "ACPI20=0x%lx\n", __pa(efi.acpi20));
-	if (efi.acpi)
-		str += sprintf(str, "ACPI=0x%lx\n", __pa(efi.acpi));
-	if (efi.smbios)
-		str += sprintf(str, "SMBIOS=0x%lx\n", __pa(efi.smbios));
-	if (efi.hcdp)
-		str += sprintf(str, "HCDP=0x%lx\n", __pa(efi.hcdp));
-	if (efi.boot_info)
-		str += sprintf(str, "BOOTINFO=0x%lx\n", __pa(efi.boot_info));
-	if (efi.uga)
-		str += sprintf(str, "UGA=0x%lx\n", __pa(efi.uga));
+	if (efi.mps != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "MPS=0x%lx\n", efi.mps);
+	if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "ACPI20=0x%lx\n", efi.acpi20);
+	if (efi.acpi != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "ACPI=0x%lx\n", efi.acpi);
+	if (efi.smbios != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
+	if (efi.hcdp != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "HCDP=0x%lx\n", efi.hcdp);
+	if (efi.boot_info != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "BOOTINFO=0x%lx\n", efi.boot_info);
+	if (efi.uga != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "UGA=0x%lx\n", efi.uga);
 
 	return str - buf;
 }
diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c
index ae1fb45dbb40..c37baf9448bc 100644
--- a/drivers/firmware/pcdp.c
+++ b/drivers/firmware/pcdp.c
@@ -89,19 +89,20 @@ efi_setup_pcdp_console(char *cmdline)
 	struct pcdp_uart *uart;
 	struct pcdp_device *dev, *end;
 	int i, serial = 0;
+	int rc = -ENODEV;
 
-	pcdp = efi.hcdp;
-	if (!pcdp)
+	if (efi.hcdp == EFI_INVALID_TABLE_ADDR)
 		return -ENODEV;
 
-	printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, __pa(pcdp));
+	pcdp = ioremap(efi.hcdp, 4096);
+	printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp);
 
 	if (strstr(cmdline, "console=hcdp")) {
 		if (pcdp->rev < 3)
 			serial = 1;
 	} else if (strstr(cmdline, "console=")) {
 		printk(KERN_INFO "Explicit \"console=\"; ignoring PCDP\n");
-		return -ENODEV;
+		goto out;
 	}
 
 	if (pcdp->rev < 3 && efi_uart_console_only())
@@ -110,7 +111,8 @@ efi_setup_pcdp_console(char *cmdline)
 	for (i = 0, uart = pcdp->uart; i < pcdp->num_uarts; i++, uart++) {
 		if (uart->flags & PCDP_UART_PRIMARY_CONSOLE || serial) {
 			if (uart->type == PCDP_CONSOLE_UART) {
-				return setup_serial_console(uart);
+				rc = setup_serial_console(uart);
+				goto out;
 			}
 		}
 	}
@@ -121,10 +123,13 @@ efi_setup_pcdp_console(char *cmdline)
 	     dev = (struct pcdp_device *) ((u8 *) dev + dev->length)) {
 		if (dev->flags & PCDP_PRIMARY_CONSOLE) {
 			if (dev->type == PCDP_CONSOLE_VGA) {
-				return setup_vga_console(dev);
+				rc = setup_vga_console(dev);
+				goto out;
 			}
 		}
 	}
 
-	return -ENODEV;
+out:
+	iounmap(pcdp);
+	return rc;
 }
diff --git a/include/asm-ia64/sn/sn_sal.h b/include/asm-ia64/sn/sn_sal.h
index 244449df7411..bf4cc867a698 100644
--- a/include/asm-ia64/sn/sn_sal.h
+++ b/include/asm-ia64/sn/sn_sal.h
@@ -159,7 +159,7 @@
 static inline u32
 sn_sal_rev(void)
 {
-	struct ia64_sal_systab *systab = efi.sal_systab;
+	struct ia64_sal_systab *systab = __va(efi.sal_systab);
 
 	return (u32)(systab->sal_b_rev_major << 8 | systab->sal_b_rev_minor);
 }
diff --git a/include/linux/efi.h b/include/linux/efi.h
index d15725470aa4..e203613d3aec 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -240,19 +240,21 @@ struct efi_memory_map {
 	unsigned long desc_size;
 };
 
+#define EFI_INVALID_TABLE_ADDR		(~0UL)
+
 /*
  * All runtime access to EFI goes through this structure:
  */
 extern struct efi {
 	efi_system_table_t *systab;	/* EFI system table */
-	void *mps;			/* MPS table */
-	void *acpi;			/* ACPI table  (IA64 ext 0.71) */
-	void *acpi20;			/* ACPI table  (ACPI 2.0) */
-	void *smbios;			/* SM BIOS table */
-	void *sal_systab;		/* SAL system table */
-	void *boot_info;		/* boot info table */
-	void *hcdp;			/* HCDP table */
-	void *uga;			/* UGA table */
+	unsigned long mps;		/* MPS table */
+	unsigned long acpi;		/* ACPI table  (IA64 ext 0.71) */
+	unsigned long acpi20;		/* ACPI table  (ACPI 2.0) */
+	unsigned long smbios;		/* SM BIOS table */
+	unsigned long sal_systab;	/* SAL system table */
+	unsigned long boot_info;	/* boot info table */
+	unsigned long hcdp;		/* HCDP table */
+	unsigned long uga;		/* UGA table */
 	efi_get_time_t *get_time;
 	efi_set_time_t *set_time;
 	efi_get_wakeup_time_t *get_wakeup_time;
-- 
cgit v1.2.3


From 3978d7179d3849848df8a37dd0a5acc20bcb8750 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sun, 26 Mar 2006 01:37:17 -0800
Subject: [PATCH] Make address_space_operations->sync_page return void

The only user ignores the return value, and the only instanace
(block_sync_page) always returns 0...

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 | 3 +--
 fs/cifs/file.c              | 6 ++++--
 include/linux/buffer_head.h | 2 +-
 include/linux/fs.h          | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 3b3ab5281920..0b9456fd074f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3007,7 +3007,7 @@ out:
 }
 EXPORT_SYMBOL(try_to_free_buffers);
 
-int block_sync_page(struct page *page)
+void block_sync_page(struct page *page)
 {
 	struct address_space *mapping;
 
@@ -3015,7 +3015,6 @@ int block_sync_page(struct page *page)
 	mapping = page_mapping(page);
 	if (mapping)
 		blk_run_backing_dev(mapping->backing_dev_info, page);
-	return 0;
 }
 
 /*
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 165d67426381..fb49aef1f2ec 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1339,7 +1339,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	return rc;
 }
 
-/* static int cifs_sync_page(struct page *page)
+/* static void cifs_sync_page(struct page *page)
 {
 	struct address_space *mapping;
 	struct inode *inode;
@@ -1353,16 +1353,18 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 		return 0;
 	inode = mapping->host;
 	if (!inode)
-		return 0; */
+		return; */
 
 /*	fill in rpages then 
 	result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
 
 /*	cFYI(1, ("rpages is %d for sync page of Index %ld ", rpages, index));
 
+#if 0
 	if (rc < 0)
 		return rc;
 	return 0;
+#endif
 } */
 
 /*
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 9f159baf153f..27dd97b3fce9 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -200,7 +200,7 @@ int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 int generic_cont_expand(struct inode *inode, loff_t size);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
-int block_sync_page(struct page *);
+void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
 int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5adf32b90f36..972435d4df5c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -350,7 +350,7 @@ struct writeback_control;
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
-	int (*sync_page)(struct page *);
+	void (*sync_page)(struct page *);
 
 	/* Write back some dirty pages from this mapping. */
 	int (*writepages)(struct address_space *, struct writeback_control *);
-- 
cgit v1.2.3


From 2ff28e22bdb8727fbc7d7889807bc5a73aae56c5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sun, 26 Mar 2006 01:37:18 -0800
Subject: [PATCH] Make address_space_operations->invalidatepage return void

The return value of this function is never used, so let's be honest and
declare it as void.

Some places where invalidatepage returned 0, I have inserted comments
suggesting a BUG_ON.

[akpm@osdl.org: JBD BUG fix]
[akpm@osdl.org: rework for git-nfs]
[akpm@osdl.org: don't go BUG in block_invalidate_page()]
Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/afs/file.c               |  6 +++---
 fs/buffer.c                 | 18 ++++++++----------
 fs/ext3/inode.c             |  4 ++--
 fs/jbd/transaction.c        | 13 +++++--------
 fs/jfs/jfs_metapage.c       |  7 +++----
 fs/nfs/file.c               |  3 +--
 fs/reiserfs/inode.c         |  8 +++++---
 fs/xfs/linux-2.6/xfs_aops.c |  4 ++--
 include/linux/buffer_head.h |  4 ++--
 include/linux/fs.h          |  2 +-
 include/linux/jbd.h         |  2 +-
 11 files changed, 33 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 150b19227922..7bb716887e29 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -28,7 +28,7 @@ static int afs_file_release(struct inode *inode, struct file *file);
 #endif
 
 static int afs_file_readpage(struct file *file, struct page *page);
-static int afs_file_invalidatepage(struct page *page, unsigned long offset);
+static void afs_file_invalidatepage(struct page *page, unsigned long offset);
 static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
 
 struct inode_operations afs_file_inode_operations = {
@@ -212,7 +212,7 @@ int afs_cache_get_page_cookie(struct page *page,
 /*
  * invalidate part or all of a page
  */
-static int afs_file_invalidatepage(struct page *page, unsigned long offset)
+static void afs_file_invalidatepage(struct page *page, unsigned long offset)
 {
 	int ret = 1;
 
@@ -238,11 +238,11 @@ static int afs_file_invalidatepage(struct page *page, unsigned long offset)
 			if (!PageWriteback(page))
 				ret = page->mapping->a_ops->releasepage(page,
 									0);
+			/* possibly should BUG_ON(!ret); - neilb */
 		}
 	}
 
 	_leave(" = %d", ret);
-	return ret;
 } /* end afs_file_invalidatepage() */
 
 /*****************************************************************************/
diff --git a/fs/buffer.c b/fs/buffer.c
index 0b9456fd074f..f25f58096428 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1593,11 +1593,10 @@ EXPORT_SYMBOL(try_to_release_page);
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-int block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
-	int ret = 1;
 
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
@@ -1624,19 +1623,18 @@ int block_invalidatepage(struct page *page, unsigned long offset)
 	 * so real IO is not possible anymore.
 	 */
 	if (offset == 0)
-		ret = try_to_release_page(page, 0);
+		try_to_release_page(page, 0);
 out:
-	return ret;
+	return;
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
-int do_invalidatepage(struct page *page, unsigned long offset)
+void do_invalidatepage(struct page *page, unsigned long offset)
 {
-	int (*invalidatepage)(struct page *, unsigned long);
-	invalidatepage = page->mapping->a_ops->invalidatepage;
-	if (invalidatepage == NULL)
-		invalidatepage = block_invalidatepage;
-	return (*invalidatepage)(page, offset);
+	void (*invalidatepage)(struct page *, unsigned long);
+	invalidatepage = page->mapping->a_ops->invalidatepage ? :
+		block_invalidatepage;
+	(*invalidatepage)(page, offset);
 }
 
 /*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2c361377e0a5..76e22c9c9c6c 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1430,7 +1430,7 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static int ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
@@ -1440,7 +1440,7 @@ static int ext3_invalidatepage(struct page *page, unsigned long offset)
 	if (offset == 0)
 		ClearPageChecked(page);
 
-	return journal_invalidatepage(journal, page, offset);
+	journal_invalidatepage(journal, page, offset);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ada31fa272e3..c609f5034fcd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1873,16 +1873,15 @@ zap_buffer_unlocked:
 }
 
 /** 
- * int journal_invalidatepage() 
+ * void journal_invalidatepage()
  * @journal: journal to use for flush... 
  * @page:    page to flush
  * @offset:  length of page to invalidate.
  *
  * Reap page buffers containing data after offset in page.
  *
- * Return non-zero if the page's buffers were successfully reaped.
  */
-int journal_invalidatepage(journal_t *journal, 
+void journal_invalidatepage(journal_t *journal,
 		      struct page *page, 
 		      unsigned long offset)
 {
@@ -1893,7 +1892,7 @@ int journal_invalidatepage(journal_t *journal,
 	if (!PageLocked(page))
 		BUG();
 	if (!page_has_buffers(page))
-		return 1;
+		return;
 
 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
@@ -1916,11 +1915,9 @@ int journal_invalidatepage(journal_t *journal,
 	} while (bh != head);
 
 	if (!offset) {
-		if (!may_free || !try_to_free_buffers(page))
-			return 0;
-		J_ASSERT(!page_has_buffers(page));
+		if (may_free && try_to_free_buffers(page))
+			J_ASSERT(!page_has_buffers(page));
 	}
-	return 1;
 }
 
 /* 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 5fbaeaadccd3..8508043849f3 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -578,14 +578,13 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 	return 0;
 }
 
-static int metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned long offset)
 {
 	BUG_ON(offset);
 
-	if (PageWriteback(page))
-		return 0;
+	BUG_ON(PageWriteback(page));
 
-	return metapage_releasepage(page, 0);
+	metapage_releasepage(page, 0);
 }
 
 struct address_space_operations jfs_metapage_aops = {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5263b2864a44..dee49a0cb995 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -318,10 +318,9 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
 	return status;
 }
 
-static int nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
 	/* FIXME: we really should cancel any unstarted writes on this page */
-	return 1;
 }
 
 static int nfs_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d60f6238c66a..62e18c19b446 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2793,7 +2793,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 }
 
 /* clm -- taken from fs/buffer.c:block_invalidate_page */
-static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct buffer_head *head, *bh, *next;
 	struct inode *inode = page->mapping->host;
@@ -2832,10 +2832,12 @@ static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
 	 * The get_block cached value has been unconditionally invalidated,
 	 * so real IO is not possible anymore.
 	 */
-	if (!offset && ret)
+	if (!offset && ret) {
 		ret = try_to_release_page(page, 0);
+		/* maybe should BUG_ON(!ret); - neilb */
+	}
       out:
-	return ret;
+	return;
 }
 
 static int reiserfs_set_page_dirty(struct page *page)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 97fc056130eb..4f2476f188b0 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1442,14 +1442,14 @@ xfs_vm_readpages(
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_block);
 }
 
-STATIC int
+STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
 	unsigned long		offset)
 {
 	xfs_page_trace(XFS_INVALIDPAGE_ENTER,
 			page->mapping->host, page, offset);
-	return block_invalidatepage(page, offset);
+	block_invalidatepage(page, offset);
 }
 
 struct address_space_operations xfs_address_space_operations = {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 27dd97b3fce9..da917ed096a3 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -189,8 +189,8 @@ extern int buffer_heads_over_limit;
  * address_spaces.
  */
 int try_to_release_page(struct page * page, gfp_t gfp_mask);
-int block_invalidatepage(struct page *page, unsigned long offset);
-int do_invalidatepage(struct page *page, unsigned long offset);
+void block_invalidatepage(struct page *page, unsigned long offset);
+void do_invalidatepage(struct page *page, unsigned long offset);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 972435d4df5c..9674679525f9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -369,7 +369,7 @@ struct address_space_operations {
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
-	int (*invalidatepage) (struct page *, unsigned long);
+	void (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, gfp_t);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 4fc7dffd66ef..6a425e370cb3 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -895,7 +895,7 @@ extern int	 journal_dirty_metadata (handle_t *, struct buffer_head *);
 extern void	 journal_release_buffer (handle_t *, struct buffer_head *);
 extern int	 journal_forget (handle_t *, struct buffer_head *);
 extern void	 journal_sync_buffer (struct buffer_head *);
-extern int	 journal_invalidatepage(journal_t *,
+extern void	 journal_invalidatepage(journal_t *,
 				struct page *, unsigned long);
 extern int	 journal_try_to_free_buffers(journal_t *, struct page *, gfp_t);
 extern int	 journal_stop(handle_t *);
-- 
cgit v1.2.3


From 3c30b06df404c8892c225a99ecfd3f02789c0513 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Sun, 26 Mar 2006 01:37:19 -0800
Subject: [PATCH] cleanup smp_call_function UP build

net/core/flow.c: In function 'flow_cache_flush':
net/core/flow.c:299: warning: statement with no effect

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/smp.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index d699a16b0cb2..e2fa3ab4afc5 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -82,7 +82,11 @@ void smp_prepare_boot_cpu(void);
  */
 #define raw_smp_processor_id()			0
 #define hard_smp_processor_id()			0
-#define smp_call_function(func,info,retry,wait)	({ 0; })
+static inline int up_smp_call_function(void)
+{
+	return 0;
+}
+#define smp_call_function(func,info,retry,wait)	(up_smp_call_function())
 #define on_each_cpu(func,info,retry,wait)	\
 	({					\
 		local_irq_disable();		\
-- 
cgit v1.2.3


From 50c812b2b9513e3df34eae8c30cb2c221b79b2cb Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Sun, 26 Mar 2006 01:37:21 -0800
Subject: [PATCH] ipmi: add full sysfs support

Add full driver model support for the IPMI driver.  It links in the proper
bus and device support.

It adds an "ipmi" driver interface that has each BMC discovered by the
driver (as a device).  These BMCs appear in the devices/platform directory.
 If there are multiple interfaces to the same BMC, the driver should
discover this and will only have one BMC entry.  The BMC entry will have
pointers to each interface device that connects to it.

The device information (statistics and config information) has not yet been
ported over to the driver model from proc, that will come later.

This work was based on work by Yani Ioannou.  I basically rewrote it using
that code as a guide, but he still deserves credit :).

[bunk@stusta.de: make ipmi_find_bmc_guid() static]
Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Yani Ioannou <yani.ioannou@gmail.com>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/ipmi/ipmi_devintf.c    |  48 +++-
 drivers/char/ipmi/ipmi_msghandler.c | 552 +++++++++++++++++++++++++++++++++++-
 drivers/char/ipmi/ipmi_poweroff.c   |   2 +-
 drivers/char/ipmi/ipmi_si_intf.c    | 114 +++++---
 drivers/char/ipmi/ipmi_watchdog.c   |   2 +-
 include/linux/ipmi.h                |   3 +-
 include/linux/ipmi_msgdefs.h        |   1 +
 include/linux/ipmi_smi.h            |  47 ++-
 8 files changed, 712 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index 7c0684deea06..932feedda262 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -90,7 +90,7 @@ static unsigned int ipmi_poll(struct file *file, poll_table *wait)
 
 	spin_lock_irqsave(&priv->recv_msg_lock, flags);
 
-	if (! list_empty(&(priv->recv_msgs)))
+	if (!list_empty(&(priv->recv_msgs)))
 		mask |= (POLLIN | POLLRDNORM);
 
 	spin_unlock_irqrestore(&priv->recv_msg_lock, flags);
@@ -789,21 +789,53 @@ MODULE_PARM_DESC(ipmi_major, "Sets the major number of the IPMI device.  By"
 		 " interface.  Other values will set the major device number"
 		 " to that value.");
 
+/* Keep track of the devices that are registered. */
+struct ipmi_reg_list {
+	dev_t            dev;
+	struct list_head link;
+};
+static LIST_HEAD(reg_list);
+static DEFINE_MUTEX(reg_list_mutex);
+
 static struct class *ipmi_class;
 
-static void ipmi_new_smi(int if_num)
+static void ipmi_new_smi(int if_num, struct device *device)
 {
 	dev_t dev = MKDEV(ipmi_major, if_num);
+	struct ipmi_reg_list *entry;
 
 	devfs_mk_cdev(dev, S_IFCHR | S_IRUSR | S_IWUSR,
 		      "ipmidev/%d", if_num);
 
-	class_device_create(ipmi_class, NULL, dev, NULL, "ipmi%d", if_num);
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		printk(KERN_ERR "ipmi_devintf: Unable to create the"
+		       " ipmi class device link\n");
+		return;
+	}
+	entry->dev = dev;
+
+	mutex_lock(&reg_list_mutex);
+	class_device_create(ipmi_class, NULL, dev, device, "ipmi%d", if_num);
+	list_add(&entry->link, &reg_list);
+	mutex_unlock(&reg_list_mutex);
 }
 
 static void ipmi_smi_gone(int if_num)
 {
-	class_device_destroy(ipmi_class, MKDEV(ipmi_major, if_num));
+	dev_t dev = MKDEV(ipmi_major, if_num);
+	struct ipmi_reg_list *entry;
+
+	mutex_lock(&reg_list_mutex);
+	list_for_each_entry(entry, &reg_list, link) {
+		if (entry->dev == dev) {
+			list_del(&entry->link);
+			kfree(entry);
+			break;
+		}
+	}
+	class_device_destroy(ipmi_class, dev);
+	mutex_unlock(&reg_list_mutex);
 	devfs_remove("ipmidev/%d", if_num);
 }
 
@@ -856,6 +888,14 @@ module_init(init_ipmi_devintf);
 
 static __exit void cleanup_ipmi(void)
 {
+	struct ipmi_reg_list *entry, *entry2;
+	mutex_lock(&reg_list_mutex);
+	list_for_each_entry_safe(entry, entry2, &reg_list, link) {
+		list_del(&entry->link);
+		class_device_destroy(ipmi_class, entry->dev);
+		kfree(entry);
+	}
+	mutex_unlock(&reg_list_mutex);
 	class_destroy(ipmi_class);
 	ipmi_smi_watcher_unregister(&smi_watcher);
 	devfs_remove(DEVICE_NAME);
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index abd4c5118a1b..f553b7a86841 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -162,6 +162,28 @@ struct ipmi_proc_entry
 };
 #endif
 
+struct bmc_device
+{
+	struct platform_device *dev;
+	struct ipmi_device_id  id;
+	unsigned char          guid[16];
+	int                    guid_set;
+
+	struct kref	       refcount;
+
+	/* bmc device attributes */
+	struct device_attribute device_id_attr;
+	struct device_attribute provides_dev_sdrs_attr;
+	struct device_attribute revision_attr;
+	struct device_attribute firmware_rev_attr;
+	struct device_attribute version_attr;
+	struct device_attribute add_dev_support_attr;
+	struct device_attribute manufacturer_id_attr;
+	struct device_attribute product_id_attr;
+	struct device_attribute guid_attr;
+	struct device_attribute aux_firmware_rev_attr;
+};
+
 #define IPMI_IPMB_NUM_SEQ	64
 #define IPMI_MAX_CHANNELS       16
 struct ipmi_smi
@@ -178,9 +200,8 @@ struct ipmi_smi
 	/* Used for wake ups at startup. */
 	wait_queue_head_t waitq;
 
-	/* The IPMI version of the BMC on the other end. */
-	unsigned char       version_major;
-	unsigned char       version_minor;
+	struct bmc_device *bmc;
+	char *my_dev_name;
 
 	/* This is the lower-layer's sender routine. */
 	struct ipmi_smi_handlers *handlers;
@@ -194,6 +215,9 @@ struct ipmi_smi
 	struct ipmi_proc_entry *proc_entries;
 #endif
 
+	/* Driver-model device for the system interface. */
+	struct device          *si_dev;
+
 	/* A table of sequence numbers for this interface.  We use the
            sequence numbers for IPMB messages that go out of the
            interface to match them up with their responses.  A routine
@@ -312,6 +336,7 @@ struct ipmi_smi
 	/* Events that were received with the proper format. */
 	unsigned int events;
 };
+#define to_si_intf_from_dev(device) container_of(device, struct ipmi_smi, dev)
 
 /* Used to mark an interface entry that cannot be used but is not a
  * free entry, either, primarily used at creation and deletion time so
@@ -320,6 +345,15 @@ struct ipmi_smi
 #define IPMI_INVALID_INTERFACE(i) (((i) == NULL) \
 				   || (i == IPMI_INVALID_INTERFACE_ENTRY))
 
+/**
+ * The driver model view of the IPMI messaging driver.
+ */
+static struct device_driver ipmidriver = {
+	.name = "ipmi",
+	.bus = &platform_bus_type
+};
+static DEFINE_MUTEX(ipmidriver_mutex);
+
 #define MAX_IPMI_INTERFACES 4
 static ipmi_smi_t ipmi_interfaces[MAX_IPMI_INTERFACES];
 
@@ -393,7 +427,7 @@ int ipmi_smi_watcher_register(struct ipmi_smi_watcher *watcher)
 		if (IPMI_INVALID_INTERFACE(intf))
 			continue;
 		spin_unlock_irqrestore(&interfaces_lock, flags);
-		watcher->new_smi(i);
+		watcher->new_smi(i, intf->si_dev);
 		spin_lock_irqsave(&interfaces_lock, flags);
 	}
 	spin_unlock_irqrestore(&interfaces_lock, flags);
@@ -409,14 +443,14 @@ int ipmi_smi_watcher_unregister(struct ipmi_smi_watcher *watcher)
 }
 
 static void
-call_smi_watchers(int i)
+call_smi_watchers(int i, struct device *dev)
 {
 	struct ipmi_smi_watcher *w;
 
 	down_read(&smi_watchers_sem);
 	list_for_each_entry(w, &smi_watchers, link) {
 		if (try_module_get(w->owner)) {
-			w->new_smi(i);
+			w->new_smi(i, dev);
 			module_put(w->owner);
 		}
 	}
@@ -844,8 +878,8 @@ void ipmi_get_version(ipmi_user_t   user,
 		      unsigned char *major,
 		      unsigned char *minor)
 {
-	*major = user->intf->version_major;
-	*minor = user->intf->version_minor;
+	*major = ipmi_version_major(&user->intf->bmc->id);
+	*minor = ipmi_version_minor(&user->intf->bmc->id);
 }
 
 int ipmi_set_my_address(ipmi_user_t   user,
@@ -1553,7 +1587,8 @@ static int version_file_read_proc(char *page, char **start, off_t off,
 	ipmi_smi_t intf = data;
 
 	return sprintf(out, "%d.%d\n",
-		       intf->version_major, intf->version_minor);
+		       ipmi_version_major(&intf->bmc->id),
+		       ipmi_version_minor(&intf->bmc->id));
 }
 
 static int stat_file_read_proc(char *page, char **start, off_t off,
@@ -1712,6 +1747,470 @@ static void remove_proc_entries(ipmi_smi_t smi)
 #endif /* CONFIG_PROC_FS */
 }
 
+static int __find_bmc_guid(struct device *dev, void *data)
+{
+	unsigned char *id = data;
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+	return memcmp(bmc->guid, id, 16) == 0;
+}
+
+static struct bmc_device *ipmi_find_bmc_guid(struct device_driver *drv,
+					     unsigned char *guid)
+{
+	struct device *dev;
+
+	dev = driver_find_device(drv, NULL, guid, __find_bmc_guid);
+	if (dev)
+		return dev_get_drvdata(dev);
+	else
+		return NULL;
+}
+
+struct prod_dev_id {
+	unsigned int  product_id;
+	unsigned char device_id;
+};
+
+static int __find_bmc_prod_dev_id(struct device *dev, void *data)
+{
+	struct prod_dev_id *id = data;
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return (bmc->id.product_id == id->product_id
+		&& bmc->id.product_id == id->product_id
+		&& bmc->id.device_id == id->device_id);
+}
+
+static struct bmc_device *ipmi_find_bmc_prod_dev_id(
+	struct device_driver *drv,
+	unsigned char product_id, unsigned char device_id)
+{
+	struct prod_dev_id id = {
+		.product_id = product_id,
+		.device_id = device_id,
+	};
+	struct device *dev;
+
+	dev = driver_find_device(drv, NULL, &id, __find_bmc_prod_dev_id);
+	if (dev)
+		return dev_get_drvdata(dev);
+	else
+		return NULL;
+}
+
+static ssize_t device_id_show(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 10, "%u\n", bmc->id.device_id);
+}
+
+static ssize_t provides_dev_sdrs_show(struct device *dev,
+				      struct device_attribute *attr,
+				      char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 10, "%u\n",
+			bmc->id.device_revision && 0x80 >> 7);
+}
+
+static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 20, "%u\n",
+			bmc->id.device_revision && 0x0F);
+}
+
+static ssize_t firmware_rev_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 20, "%u.%x\n", bmc->id.firmware_revision_1,
+			bmc->id.firmware_revision_2);
+}
+
+static ssize_t ipmi_version_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 20, "%u.%u\n",
+			ipmi_version_major(&bmc->id),
+			ipmi_version_minor(&bmc->id));
+}
+
+static ssize_t add_dev_support_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 10, "0x%02x\n",
+			bmc->id.additional_device_support);
+}
+
+static ssize_t manufacturer_id_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 20, "0x%6.6x\n", bmc->id.manufacturer_id);
+}
+
+static ssize_t product_id_show(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 10, "0x%4.4x\n", bmc->id.product_id);
+}
+
+static ssize_t aux_firmware_rev_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 21, "0x%02x 0x%02x 0x%02x 0x%02x\n",
+			bmc->id.aux_firmware_revision[3],
+			bmc->id.aux_firmware_revision[2],
+			bmc->id.aux_firmware_revision[1],
+			bmc->id.aux_firmware_revision[0]);
+}
+
+static ssize_t guid_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct bmc_device *bmc = dev_get_drvdata(dev);
+
+	return snprintf(buf, 100, "%Lx%Lx\n",
+			(long long) bmc->guid[0],
+			(long long) bmc->guid[8]);
+}
+
+static void
+cleanup_bmc_device(struct kref *ref)
+{
+	struct bmc_device *bmc;
+
+	bmc = container_of(ref, struct bmc_device, refcount);
+
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->device_id_attr);
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->provides_dev_sdrs_attr);
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->revision_attr);
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->firmware_rev_attr);
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->version_attr);
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->add_dev_support_attr);
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->manufacturer_id_attr);
+	device_remove_file(&bmc->dev->dev,
+			   &bmc->product_id_attr);
+	if (bmc->id.aux_firmware_revision_set)
+		device_remove_file(&bmc->dev->dev,
+				   &bmc->aux_firmware_rev_attr);
+	if (bmc->guid_set)
+		device_remove_file(&bmc->dev->dev,
+				   &bmc->guid_attr);
+	platform_device_unregister(bmc->dev);
+	kfree(bmc);
+}
+
+static void ipmi_bmc_unregister(ipmi_smi_t intf)
+{
+	struct bmc_device *bmc = intf->bmc;
+
+	sysfs_remove_link(&intf->si_dev->kobj, "bmc");
+	if (intf->my_dev_name) {
+		sysfs_remove_link(&bmc->dev->dev.kobj, intf->my_dev_name);
+		kfree(intf->my_dev_name);
+		intf->my_dev_name = NULL;
+	}
+
+	mutex_lock(&ipmidriver_mutex);
+	kref_put(&bmc->refcount, cleanup_bmc_device);
+	mutex_unlock(&ipmidriver_mutex);
+}
+
+static int ipmi_bmc_register(ipmi_smi_t intf)
+{
+	int               rv;
+	struct bmc_device *bmc = intf->bmc;
+	struct bmc_device *old_bmc;
+	int               size;
+	char              dummy[1];
+
+	mutex_lock(&ipmidriver_mutex);
+
+	/*
+	 * Try to find if there is an bmc_device struct
+	 * representing the interfaced BMC already
+	 */
+	if (bmc->guid_set)
+		old_bmc = ipmi_find_bmc_guid(&ipmidriver, bmc->guid);
+	else
+		old_bmc = ipmi_find_bmc_prod_dev_id(&ipmidriver,
+						    bmc->id.product_id,
+						    bmc->id.device_id);
+
+	/*
+	 * If there is already an bmc_device, free the new one,
+	 * otherwise register the new BMC device
+	 */
+	if (old_bmc) {
+		kfree(bmc);
+		intf->bmc = old_bmc;
+		bmc = old_bmc;
+
+		kref_get(&bmc->refcount);
+		mutex_unlock(&ipmidriver_mutex);
+
+		printk(KERN_INFO
+		       "ipmi: interfacing existing BMC (man_id: 0x%6.6x,"
+		       " prod_id: 0x%4.4x, dev_id: 0x%2.2x)\n",
+		       bmc->id.manufacturer_id,
+		       bmc->id.product_id,
+		       bmc->id.device_id);
+	} else {
+		bmc->dev = platform_device_alloc("ipmi_bmc",
+						 bmc->id.device_id);
+		if (! bmc->dev) {
+			printk(KERN_ERR
+			       "ipmi_msghandler:"
+			       " Unable to allocate platform device\n");
+			return -ENOMEM;
+		}
+		bmc->dev->dev.driver = &ipmidriver;
+		dev_set_drvdata(&bmc->dev->dev, bmc);
+		kref_init(&bmc->refcount);
+
+		rv = platform_device_register(bmc->dev);
+		mutex_unlock(&ipmidriver_mutex);
+		if (rv) {
+			printk(KERN_ERR
+			       "ipmi_msghandler:"
+			       " Unable to register bmc device: %d\n",
+			       rv);
+			/* Don't go to out_err, you can only do that if
+			   the device is registered already. */
+			return rv;
+		}
+
+		bmc->device_id_attr.attr.name = "device_id";
+		bmc->device_id_attr.attr.owner = THIS_MODULE;
+		bmc->device_id_attr.attr.mode = S_IRUGO;
+		bmc->device_id_attr.show = device_id_show;
+
+		bmc->provides_dev_sdrs_attr.attr.name = "provides_device_sdrs";
+		bmc->provides_dev_sdrs_attr.attr.owner = THIS_MODULE;
+		bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO;
+		bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show;
+
+
+		bmc->revision_attr.attr.name = "revision";
+		bmc->revision_attr.attr.owner = THIS_MODULE;
+		bmc->revision_attr.attr.mode = S_IRUGO;
+		bmc->revision_attr.show = revision_show;
+
+		bmc->firmware_rev_attr.attr.name = "firmware_revision";
+		bmc->firmware_rev_attr.attr.owner = THIS_MODULE;
+		bmc->firmware_rev_attr.attr.mode = S_IRUGO;
+		bmc->firmware_rev_attr.show = firmware_rev_show;
+
+		bmc->version_attr.attr.name = "ipmi_version";
+		bmc->version_attr.attr.owner = THIS_MODULE;
+		bmc->version_attr.attr.mode = S_IRUGO;
+		bmc->version_attr.show = ipmi_version_show;
+
+		bmc->add_dev_support_attr.attr.name
+			= "additional_device_support";
+		bmc->add_dev_support_attr.attr.owner = THIS_MODULE;
+		bmc->add_dev_support_attr.attr.mode = S_IRUGO;
+		bmc->add_dev_support_attr.show = add_dev_support_show;
+
+		bmc->manufacturer_id_attr.attr.name = "manufacturer_id";
+		bmc->manufacturer_id_attr.attr.owner = THIS_MODULE;
+		bmc->manufacturer_id_attr.attr.mode = S_IRUGO;
+		bmc->manufacturer_id_attr.show = manufacturer_id_show;
+
+		bmc->product_id_attr.attr.name = "product_id";
+		bmc->product_id_attr.attr.owner = THIS_MODULE;
+		bmc->product_id_attr.attr.mode = S_IRUGO;
+		bmc->product_id_attr.show = product_id_show;
+
+		bmc->guid_attr.attr.name = "guid";
+		bmc->guid_attr.attr.owner = THIS_MODULE;
+		bmc->guid_attr.attr.mode = S_IRUGO;
+		bmc->guid_attr.show = guid_show;
+
+		bmc->aux_firmware_rev_attr.attr.name = "aux_firmware_revision";
+		bmc->aux_firmware_rev_attr.attr.owner = THIS_MODULE;
+		bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO;
+		bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show;
+
+		device_create_file(&bmc->dev->dev,
+				   &bmc->device_id_attr);
+		device_create_file(&bmc->dev->dev,
+				   &bmc->provides_dev_sdrs_attr);
+		device_create_file(&bmc->dev->dev,
+				   &bmc->revision_attr);
+		device_create_file(&bmc->dev->dev,
+				   &bmc->firmware_rev_attr);
+		device_create_file(&bmc->dev->dev,
+				   &bmc->version_attr);
+		device_create_file(&bmc->dev->dev,
+				   &bmc->add_dev_support_attr);
+		device_create_file(&bmc->dev->dev,
+				   &bmc->manufacturer_id_attr);
+		device_create_file(&bmc->dev->dev,
+				   &bmc->product_id_attr);
+		if (bmc->id.aux_firmware_revision_set)
+			device_create_file(&bmc->dev->dev,
+					   &bmc->aux_firmware_rev_attr);
+		if (bmc->guid_set)
+			device_create_file(&bmc->dev->dev,
+					   &bmc->guid_attr);
+
+		printk(KERN_INFO
+		       "ipmi: Found new BMC (man_id: 0x%6.6x, "
+		       " prod_id: 0x%4.4x, dev_id: 0x%2.2x)\n",
+		       bmc->id.manufacturer_id,
+		       bmc->id.product_id,
+		       bmc->id.device_id);
+	}
+
+	/*
+	 * create symlink from system interface device to bmc device
+	 * and back.
+	 */
+	rv = sysfs_create_link(&intf->si_dev->kobj,
+			       &bmc->dev->dev.kobj, "bmc");
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_msghandler: Unable to create bmc symlink: %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	size = snprintf(dummy, 0, "ipmi%d", intf->intf_num);
+	intf->my_dev_name = kmalloc(size+1, GFP_KERNEL);
+	if (!intf->my_dev_name) {
+		rv = -ENOMEM;
+		printk(KERN_ERR
+		       "ipmi_msghandler: allocate link from BMC: %d\n",
+		       rv);
+		goto out_err;
+	}
+	snprintf(intf->my_dev_name, size+1, "ipmi%d", intf->intf_num);
+
+	rv = sysfs_create_link(&bmc->dev->dev.kobj, &intf->si_dev->kobj,
+			       intf->my_dev_name);
+	if (rv) {
+		kfree(intf->my_dev_name);
+		intf->my_dev_name = NULL;
+		printk(KERN_ERR
+		       "ipmi_msghandler:"
+		       " Unable to create symlink to bmc: %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	ipmi_bmc_unregister(intf);
+	return rv;
+}
+
+static int
+send_guid_cmd(ipmi_smi_t intf, int chan)
+{
+	struct kernel_ipmi_msg            msg;
+	struct ipmi_system_interface_addr si;
+
+	si.addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE;
+	si.channel = IPMI_BMC_CHANNEL;
+	si.lun = 0;
+
+	msg.netfn = IPMI_NETFN_APP_REQUEST;
+	msg.cmd = IPMI_GET_DEVICE_GUID_CMD;
+	msg.data = NULL;
+	msg.data_len = 0;
+	return i_ipmi_request(NULL,
+			      intf,
+			      (struct ipmi_addr *) &si,
+			      0,
+			      &msg,
+			      intf,
+			      NULL,
+			      NULL,
+			      0,
+			      intf->channels[0].address,
+			      intf->channels[0].lun,
+			      -1, 0);
+}
+
+static void
+guid_handler(ipmi_smi_t intf, struct ipmi_recv_msg *msg)
+{
+	if ((msg->addr.addr_type != IPMI_SYSTEM_INTERFACE_ADDR_TYPE)
+	    || (msg->msg.netfn != IPMI_NETFN_APP_RESPONSE)
+	    || (msg->msg.cmd != IPMI_GET_DEVICE_GUID_CMD))
+		/* Not for me */
+		return;
+
+	if (msg->msg.data[0] != 0) {
+		/* Error from getting the GUID, the BMC doesn't have one. */
+		intf->bmc->guid_set = 0;
+		goto out;
+	}
+
+	if (msg->msg.data_len < 17) {
+		intf->bmc->guid_set = 0;
+		printk(KERN_WARNING PFX
+		       "guid_handler: The GUID response from the BMC was too"
+		       " short, it was %d but should have been 17.  Assuming"
+		       " GUID is not available.\n",
+		       msg->msg.data_len);
+		goto out;
+	}
+
+	memcpy(intf->bmc->guid, msg->msg.data, 16);
+	intf->bmc->guid_set = 1;
+ out:
+	wake_up(&intf->waitq);
+}
+
+static void
+get_guid(ipmi_smi_t intf)
+{
+	int rv;
+
+	intf->bmc->guid_set = 0x2;
+	intf->null_user_handler = guid_handler;
+	rv = send_guid_cmd(intf, 0);
+	if (rv)
+		/* Send failed, no GUID available. */
+		intf->bmc->guid_set = 0;
+	wait_event(intf->waitq, intf->bmc->guid_set != 2);
+	intf->null_user_handler = NULL;
+}
+
 static int
 send_channel_info_cmd(ipmi_smi_t intf, int chan)
 {
@@ -1804,8 +2303,8 @@ channel_handler(ipmi_smi_t intf, struct ipmi_recv_msg *msg)
 
 int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		      void		       *send_info,
-		      unsigned char            version_major,
-		      unsigned char            version_minor,
+		      struct ipmi_device_id    *device_id,
+		      struct device            *si_dev,
 		      unsigned char            slave_addr,
 		      ipmi_smi_t               *new_intf)
 {
@@ -1813,7 +2312,11 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	int              rv;
 	ipmi_smi_t       intf;
 	unsigned long    flags;
+	int              version_major;
+	int              version_minor;
 
+	version_major = ipmi_version_major(device_id);
+	version_minor = ipmi_version_minor(device_id);
 
 	/* Make sure the driver is actually initialized, this handles
 	   problems with initialization order. */
@@ -1831,10 +2334,15 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	if (!intf)
 		return -ENOMEM;
 	memset(intf, 0, sizeof(*intf));
+	intf->bmc = kzalloc(sizeof(*intf->bmc), GFP_KERNEL);
+	if (!intf->bmc) {
+		kfree(intf);
+		return -ENOMEM;
+	}
 	intf->intf_num = -1;
 	kref_init(&intf->refcount);
-	intf->version_major = version_major;
-	intf->version_minor = version_minor;
+	intf->bmc->id = *device_id;
+	intf->si_dev = si_dev;
 	for (j = 0; j < IPMI_MAX_CHANNELS; j++) {
 		intf->channels[j].address = IPMI_BMC_SLAVE_ADDR;
 		intf->channels[j].lun = 2;
@@ -1884,6 +2392,8 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	   caller before sending any messages with it. */
 	*new_intf = intf;
 
+	get_guid(intf);
+
 	if ((version_major > 1)
 	    || ((version_major == 1) && (version_minor >= 5)))
 	{
@@ -1898,6 +2408,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		/* Wait for the channel info to be read. */
 		wait_event(intf->waitq,
 			   intf->curr_channel >= IPMI_MAX_CHANNELS);
+		intf->null_user_handler = NULL;
 	} else {
 		/* Assume a single IPMB channel at zero. */
 		intf->channels[0].medium = IPMI_CHANNEL_MEDIUM_IPMB;
@@ -1907,6 +2418,8 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	if (rv == 0)
 		rv = add_proc_entries(intf, i);
 
+	rv = ipmi_bmc_register(intf);
+
  out:
 	if (rv) {
 		if (intf->proc_dir)
@@ -1921,7 +2434,7 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		spin_lock_irqsave(&interfaces_lock, flags);
 		ipmi_interfaces[i] = intf;
 		spin_unlock_irqrestore(&interfaces_lock, flags);
-		call_smi_watchers(i);
+		call_smi_watchers(i, intf->si_dev);
 	}
 
 	return rv;
@@ -1933,6 +2446,8 @@ int ipmi_unregister_smi(ipmi_smi_t intf)
 	struct ipmi_smi_watcher *w;
 	unsigned long           flags;
 
+	ipmi_bmc_unregister(intf);
+
 	spin_lock_irqsave(&interfaces_lock, flags);
 	for (i = 0; i < MAX_IPMI_INTERFACES; i++) {
 		if (ipmi_interfaces[i] == intf) {
@@ -3196,10 +3711,17 @@ static struct notifier_block panic_block = {
 static int ipmi_init_msghandler(void)
 {
 	int i;
+	int rv;
 
 	if (initialized)
 		return 0;
 
+	rv = driver_register(&ipmidriver);
+	if (rv) {
+		printk(KERN_ERR PFX "Could not register IPMI driver\n");
+		return rv;
+	}
+
 	printk(KERN_INFO "ipmi message handler version "
 	       IPMI_DRIVER_VERSION "\n");
 
@@ -3256,6 +3778,8 @@ static __exit void cleanup_ipmi(void)
 	remove_proc_entry(proc_ipmi_root->name, &proc_root);
 #endif /* CONFIG_PROC_FS */
 
+	driver_unregister(&ipmidriver);
+
 	initialized = 0;
 
 	/* Check for buffer leaks. */
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index e8ed26b77d4c..786a2802ca34 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -464,7 +464,7 @@ static void ipmi_poweroff_function (void)
 
 /* Wait for an IPMI interface to be installed, the first one installed
    will be grabbed by this code and used to perform the powerdown. */
-static void ipmi_po_new_smi(int if_num)
+static void ipmi_po_new_smi(int if_num, struct device *device)
 {
 	struct ipmi_system_interface_addr smi_addr;
 	struct kernel_ipmi_msg            send_msg;
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index f3b3b23c5330..12f858dc9994 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -112,20 +112,13 @@ enum si_type {
 };
 static char *si_to_str[] = { "KCS", "SMIC", "BT" };
 
-struct ipmi_device_id {
-	unsigned char device_id;
-	unsigned char device_revision;
-	unsigned char firmware_revision_1;
-	unsigned char firmware_revision_2;
-	unsigned char ipmi_version;
-	unsigned char additional_device_support;
-	unsigned char manufacturer_id[3];
-	unsigned char product_id[2];
-	unsigned char aux_firmware_revision[4];
-} __attribute__((packed));
-
-#define ipmi_version_major(v) ((v)->ipmi_version & 0xf)
-#define ipmi_version_minor(v) ((v)->ipmi_version >> 4)
+#define DEVICE_NAME "ipmi_si"
+
+static struct device_driver ipmi_driver =
+{
+	.name = DEVICE_NAME,
+	.bus = &platform_bus_type
+};
 
 struct smi_info
 {
@@ -208,8 +201,17 @@ struct smi_info
 	   interrupts. */
 	int interrupt_disabled;
 
+	/* From the get device id response... */
 	struct ipmi_device_id device_id;
 
+	/* Driver model stuff. */
+	struct device *dev;
+	struct platform_device *pdev;
+
+	 /* True if we allocated the device, false if it came from
+	  * someplace else (like PCI). */
+	int dev_registered;
+
 	/* Slave address, could be reported from DMI. */
 	unsigned char slave_addr;
 
@@ -987,8 +989,6 @@ static LIST_HEAD(smi_infos);
 static DECLARE_MUTEX(smi_infos_lock);
 static int smi_num; /* Used to sequence the SMIs */
 
-#define DEVICE_NAME "ipmi_si"
-
 #define DEFAULT_REGSPACING	1
 
 static int           si_trydefaults = 1;
@@ -1164,7 +1164,6 @@ static void port_cleanup(struct smi_info *info)
 
 		release_region (addr, mapsize);
 	}
-	kfree(info);
 }
 
 static int port_setup(struct smi_info *info)
@@ -1273,7 +1272,6 @@ static void mem_cleanup(struct smi_info *info)
 
 		release_mem_region(addr, mapsize);
 	}
-	kfree(info);
 }
 
 static int mem_setup(struct smi_info *info)
@@ -1858,6 +1856,8 @@ static int __devinit ipmi_pci_probe(struct pci_dev *pdev,
 	if (info->irq)
 		info->irq_setup = std_irq_setup;
 
+	info->dev = &pdev->dev;
+
 	return try_smi_init(info);
 }
 
@@ -1898,11 +1898,11 @@ static struct pci_driver ipmi_pci_driver = {
 
 static int try_get_dev_id(struct smi_info *smi_info)
 {
-	unsigned char      msg[2];
-	unsigned char      *resp;
-	unsigned long      resp_len;
-	enum si_sm_result smi_result;
-	int               rv = 0;
+	unsigned char         msg[2];
+	unsigned char         *resp;
+	unsigned long         resp_len;
+	enum si_sm_result     smi_result;
+	int                   rv = 0;
 
 	resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
 	if (!resp)
@@ -1941,7 +1941,7 @@ static int try_get_dev_id(struct smi_info *smi_info)
 	/* Otherwise, we got some data. */
 	resp_len = smi_info->handlers->get_result(smi_info->si_sm,
 						  resp, IPMI_MAX_MSG_LENGTH);
-	if (resp_len < 6) {
+	if (resp_len < 14) {
 		/* That's odd, it should be longer. */
 		rv = -EINVAL;
 		goto out;
@@ -1954,8 +1954,7 @@ static int try_get_dev_id(struct smi_info *smi_info)
 	}
 
 	/* Record info from the get device id, in case we need it. */
-	memcpy(&smi_info->device_id, &resp[3],
-	       min_t(unsigned long, resp_len-3, sizeof(smi_info->device_id)));
+	ipmi_demangle_device_id(resp+3, resp_len-3, &smi_info->device_id);
 
  out:
 	kfree(resp);
@@ -2058,15 +2057,14 @@ static int oem_data_avail_to_receive_msg_avail(struct smi_info *smi_info)
 #define DELL_POWEREDGE_8G_BMC_DEVICE_ID  0x20
 #define DELL_POWEREDGE_8G_BMC_DEVICE_REV 0x80
 #define DELL_POWEREDGE_8G_BMC_IPMI_VERSION 0x51
-#define DELL_IANA_MFR_ID {0xA2, 0x02, 0x00}
+#define DELL_IANA_MFR_ID 0x0002a2
 static void setup_dell_poweredge_oem_data_handler(struct smi_info *smi_info)
 {
 	struct ipmi_device_id *id = &smi_info->device_id;
-	const char mfr[3]=DELL_IANA_MFR_ID;
-	if (!memcmp(mfr, id->manufacturer_id, sizeof(mfr))) {
+	if (id->manufacturer_id == DELL_IANA_MFR_ID) {
 		if (id->device_id       == DELL_POWEREDGE_8G_BMC_DEVICE_ID  &&
 		    id->device_revision == DELL_POWEREDGE_8G_BMC_DEVICE_REV &&
-		    id->ipmi_version    == DELL_POWEREDGE_8G_BMC_IPMI_VERSION) {
+		    id->ipmi_version   == DELL_POWEREDGE_8G_BMC_IPMI_VERSION) {
 			smi_info->oem_data_avail_handler =
 				oem_data_avail_to_receive_msg_avail;
 		}
@@ -2138,8 +2136,7 @@ static void
 setup_dell_poweredge_bt_xaction_handler(struct smi_info *smi_info)
 {
 	struct ipmi_device_id *id = &smi_info->device_id;
-	const char mfr[3]=DELL_IANA_MFR_ID;
- 	if (!memcmp(mfr, id->manufacturer_id, sizeof(mfr)) &&
+	if (id->manufacturer_id == DELL_IANA_MFR_ID &&
 	    smi_info->si_type == SI_BT)
 		register_xaction_notifier(&dell_poweredge_bt_xaction_notifier);
 }
@@ -2358,10 +2355,36 @@ static int try_smi_init(struct smi_info *new_smi)
 		new_smi->thread = kthread_run(ipmi_thread, new_smi,
 					      "kipmi%d", new_smi->intf_num);
 
+	if (!new_smi->dev) {
+		/* If we don't already have a device from something
+		 * else (like PCI), then register a new one. */
+		new_smi->pdev = platform_device_alloc("ipmi_si",
+						      new_smi->intf_num);
+		if (rv) {
+			printk(KERN_ERR
+			       "ipmi_si_intf:"
+			       " Unable to allocate platform device\n");
+			goto out_err_stop_timer;
+		}
+		new_smi->dev = &new_smi->pdev->dev;
+		new_smi->dev->driver = &ipmi_driver;
+
+		rv = platform_device_register(new_smi->pdev);
+		if (rv) {
+			printk(KERN_ERR
+			       "ipmi_si_intf:"
+			       " Unable to register system interface device:"
+			       " %d\n",
+			       rv);
+			goto out_err_stop_timer;
+		}
+		new_smi->dev_registered = 1;
+	}
+
 	rv = ipmi_register_smi(&handlers,
 			       new_smi,
-			       ipmi_version_major(&new_smi->device_id),
-			       ipmi_version_minor(&new_smi->device_id),
+			       &new_smi->device_id,
+			       new_smi->dev,
 			       new_smi->slave_addr,
 			       &(new_smi->intf));
 	if (rv) {
@@ -2425,6 +2448,11 @@ static int try_smi_init(struct smi_info *new_smi)
 	if (new_smi->io_cleanup)
 		new_smi->io_cleanup(new_smi);
 
+	if (new_smi->dev_registered)
+		platform_device_unregister(new_smi->pdev);
+
+	kfree(new_smi);
+
 	up(&smi_infos_lock);
 
 	return rv;
@@ -2434,11 +2462,22 @@ static __devinit int init_ipmi_si(void)
 {
 	int  i;
 	char *str;
+	int  rv;
 
 	if (initialized)
 		return 0;
 	initialized = 1;
 
+	/* Register the device drivers. */
+	rv = driver_register(&ipmi_driver);
+	if (rv) {
+		printk(KERN_ERR
+		       "init_ipmi_si: Unable to register driver: %d\n",
+		       rv);
+		return rv;
+	}
+
+
 	/* Parse out the si_type string into its components. */
 	str = si_type_str;
 	if (*str != '\0') {
@@ -2549,6 +2588,11 @@ static void __devexit cleanup_one_si(struct smi_info *to_clean)
 		to_clean->addr_source_cleanup(to_clean);
 	if (to_clean->io_cleanup)
 		to_clean->io_cleanup(to_clean);
+
+	if (to_clean->dev_registered)
+		platform_device_unregister(to_clean->pdev);
+
+	kfree(to_clean);
 }
 
 static __exit void cleanup_ipmi_si(void)
@@ -2566,6 +2610,8 @@ static __exit void cleanup_ipmi_si(void)
 	list_for_each_entry_safe(e, tmp_e, &smi_infos, link)
 		cleanup_one_si(e);
 	up(&smi_infos_lock);
+
+	driver_unregister(&ipmi_driver);
 }
 module_exit(cleanup_ipmi_si);
 
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 1f3159eb1ede..616539310d9a 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -996,7 +996,7 @@ static struct notifier_block wdog_panic_notifier = {
 };
 
 
-static void ipmi_new_smi(int if_num)
+static void ipmi_new_smi(int if_num, struct device *device)
 {
 	ipmi_register_watchdog(if_num);
 }
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index d6276e60b3bf..0a84b56935c2 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -36,6 +36,7 @@
 
 #include <linux/ipmi_msgdefs.h>
 #include <linux/compiler.h>
+#include <linux/device.h>
 
 /*
  * This file describes an interface to an IPMI driver.  You have to
@@ -397,7 +398,7 @@ struct ipmi_smi_watcher
 	   the watcher list.  So you can add and remove users from the
 	   IPMI interface, send messages, etc., but you cannot add
 	   or remove SMI watchers or SMI interfaces. */
-	void (*new_smi)(int if_num);
+	void (*new_smi)(int if_num, struct device *dev);
 	void (*smi_gone)(int if_num);
 };
 
diff --git a/include/linux/ipmi_msgdefs.h b/include/linux/ipmi_msgdefs.h
index 03bc64dc2ec1..22f5e2afda4f 100644
--- a/include/linux/ipmi_msgdefs.h
+++ b/include/linux/ipmi_msgdefs.h
@@ -47,6 +47,7 @@
 #define IPMI_NETFN_APP_RESPONSE			0x07
 #define IPMI_GET_DEVICE_ID_CMD		0x01
 #define IPMI_CLEAR_MSG_FLAGS_CMD	0x30
+#define IPMI_GET_DEVICE_GUID_CMD	0x08
 #define IPMI_GET_MSG_FLAGS_CMD		0x31
 #define IPMI_SEND_MSG_CMD		0x34
 #define IPMI_GET_MSG_CMD		0x33
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index e36ee157ad67..53571288a9fc 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -37,6 +37,9 @@
 #include <linux/ipmi_msgdefs.h>
 #include <linux/proc_fs.h>
 #include <linux/module.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/ipmi_smi.h>
 
 /* This files describes the interface for IPMI system management interface
    drivers to bind into the IPMI message handler. */
@@ -113,12 +116,52 @@ struct ipmi_smi_handlers
 	void (*dec_usecount)(void *send_info);
 };
 
+struct ipmi_device_id {
+	unsigned char device_id;
+	unsigned char device_revision;
+	unsigned char firmware_revision_1;
+	unsigned char firmware_revision_2;
+	unsigned char ipmi_version;
+	unsigned char additional_device_support;
+	unsigned int  manufacturer_id;
+	unsigned int  product_id;
+	unsigned char aux_firmware_revision[4];
+	unsigned int  aux_firmware_revision_set : 1;
+};
+
+#define ipmi_version_major(v) ((v)->ipmi_version & 0xf)
+#define ipmi_version_minor(v) ((v)->ipmi_version >> 4)
+
+/* Take a pointer to a raw data buffer and a length and extract device
+   id information from it.  The first byte of data must point to the
+   byte from the get device id response after the completion code.
+   The caller is responsible for making sure the length is at least
+   11 and the command completed without error. */
+static inline void ipmi_demangle_device_id(unsigned char *data,
+					   unsigned int  data_len,
+					   struct ipmi_device_id *id)
+{
+	id->device_id = data[0];
+	id->device_revision = data[1];
+	id->firmware_revision_1 = data[2];
+	id->firmware_revision_2 = data[3];
+	id->ipmi_version = data[4];
+	id->additional_device_support = data[5];
+	id->manufacturer_id = data[6] | (data[7] << 8) | (data[8] << 16);
+	id->product_id = data[9] | (data[10] << 8);
+	if (data_len >= 15) {
+		memcpy(id->aux_firmware_revision, data+11, 4);
+		id->aux_firmware_revision_set = 1;
+	} else
+		id->aux_firmware_revision_set = 0;
+}
+
 /* Add a low-level interface to the IPMI driver.  Note that if the
    interface doesn't know its slave address, it should pass in zero. */
 int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 		      void                     *send_info,
-		      unsigned char            version_major,
-		      unsigned char            version_minor,
+		      struct ipmi_device_id    *device_id,
+		      struct device            *dev,
 		      unsigned char            slave_addr,
 		      ipmi_smi_t               *intf);
 
-- 
cgit v1.2.3


From 878a9f30d7b13015f3aa4534d7877d985f150183 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Sun, 26 Mar 2006 01:37:23 -0800
Subject: [PATCH] hpet header sanitization

Add __KERNEL__ block.
Use __KERNEL__ to allow ioctl interface to be usable.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hpet.h | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hpet.h b/include/linux/hpet.h
index 27238194b212..707f7cb9e795 100644
--- a/include/linux/hpet.h
+++ b/include/linux/hpet.h
@@ -3,6 +3,8 @@
 
 #include <linux/compiler.h>
 
+#ifdef __KERNEL__
+
 /*
  * Offsets into HPET Registers
  */
@@ -85,22 +87,6 @@ struct hpet {
 #define	Tn_FSB_INT_ADDR_SHIFT		(32UL)
 #define	Tn_FSB_INT_VAL_MASK		(0x00000000ffffffffULL)
 
-struct hpet_info {
-	unsigned long hi_ireqfreq;	/* Hz */
-	unsigned long hi_flags;	/* information */
-	unsigned short hi_hpet;
-	unsigned short hi_timer;
-};
-
-#define	HPET_INFO_PERIODIC	0x0001	/* timer is periodic */
-
-#define	HPET_IE_ON	_IO('h', 0x01)	/* interrupt on */
-#define	HPET_IE_OFF	_IO('h', 0x02)	/* interrupt off */
-#define	HPET_INFO	_IOR('h', 0x03, struct hpet_info)
-#define	HPET_EPI	_IO('h', 0x04)	/* enable periodic */
-#define	HPET_DPI	_IO('h', 0x05)	/* disable periodic */
-#define	HPET_IRQFREQ	_IOW('h', 0x6, unsigned long)	/* IRQFREQ usec */
-
 /*
  * exported interfaces
  */
@@ -133,4 +119,22 @@ int hpet_register(struct hpet_task *, int);
 int hpet_unregister(struct hpet_task *);
 int hpet_control(struct hpet_task *, unsigned int, unsigned long);
 
+#endif /* __KERNEL__ */
+
+struct hpet_info {
+	unsigned long hi_ireqfreq;	/* Hz */
+	unsigned long hi_flags;	/* information */
+	unsigned short hi_hpet;
+	unsigned short hi_timer;
+};
+
+#define	HPET_INFO_PERIODIC	0x0001	/* timer is periodic */
+
+#define	HPET_IE_ON	_IO('h', 0x01)	/* interrupt on */
+#define	HPET_IE_OFF	_IO('h', 0x02)	/* interrupt off */
+#define	HPET_INFO	_IOR('h', 0x03, struct hpet_info)
+#define	HPET_EPI	_IO('h', 0x04)	/* enable periodic */
+#define	HPET_DPI	_IO('h', 0x05)	/* disable periodic */
+#define	HPET_IRQFREQ	_IOW('h', 0x6, unsigned long)	/* IRQFREQ usec */
+
 #endif				/* !__HPET__ */
-- 
cgit v1.2.3


From 5842add2f3b519111b6401f3a35862bd00a3aa7e Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Sun, 26 Mar 2006 01:37:26 -0800
Subject: [PATCH] VFS,fs/locks.c,NFSD4: add race_free posix_lock_file_conf()
 interface

Lockd and the NFSv4 server both exercise a race condition where
posix_test_lock() is called either before or after posix_lock_file() to
deal with a denied lock request due to a conflicting lock.

Remove the race condition for the NFSv4 server by adding a new conflicting
lock parameter to __posix_lock_file() , changing the name to
__posix_lock_file_conf().

Keep posix_lock_file() interface, add posix_lock_conf() interface, both
call __posix_lock_file_conf().

[akpm@osdl.org: Put the EXPORT_SYMBOL() where it belongs]
Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c         | 32 ++++++++++++++++++++++++--------
 include/linux/fs.h |  1 +
 2 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 4badf6a0e7b6..4d9e71d43e7e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -792,9 +792,7 @@ out:
 	return error;
 }
 
-EXPORT_SYMBOL(posix_lock_file);
-
-static int __posix_lock_file(struct inode *inode, struct file_lock *request)
+static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
 {
 	struct file_lock *fl;
 	struct file_lock *new_fl, *new_fl2;
@@ -818,6 +816,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request)
 				continue;
 			if (!posix_locks_conflict(request, fl))
 				continue;
+			if (conflock)
+				locks_copy_lock(conflock, fl);
 			error = -EAGAIN;
 			if (!(request->fl_flags & FL_SLEEP))
 				goto out;
@@ -987,8 +987,24 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request)
  */
 int posix_lock_file(struct file *filp, struct file_lock *fl)
 {
-	return __posix_lock_file(filp->f_dentry->d_inode, fl);
+	return __posix_lock_file_conf(filp->f_dentry->d_inode, fl, NULL);
+}
+EXPORT_SYMBOL(posix_lock_file);
+
+/**
+ * posix_lock_file_conf - Apply a POSIX-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ * @conflock: Place to return a copy of the conflicting lock, if found.
+ *
+ * Except for the conflock parameter, acts just like posix_lock_file.
+ */
+int posix_lock_file_conf(struct file *filp, struct file_lock *fl,
+			struct file_lock *conflock)
+{
+	return __posix_lock_file_conf(filp->f_dentry->d_inode, fl, conflock);
 }
+EXPORT_SYMBOL(posix_lock_file_conf);
 
 /**
  * posix_lock_file_wait - Apply a POSIX-style lock to a file
@@ -1004,7 +1020,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	int error;
 	might_sleep ();
 	for (;;) {
-		error = __posix_lock_file(filp->f_dentry->d_inode, fl);
+		error = posix_lock_file(filp, fl);
 		if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP))
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1076,7 +1092,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 	fl.fl_end = offset + count - 1;
 
 	for (;;) {
-		error = __posix_lock_file(inode, &fl);
+		error = __posix_lock_file_conf(inode, &fl, NULL);
 		if (error != -EAGAIN)
 			break;
 		if (!(fl.fl_flags & FL_SLEEP))
@@ -1689,7 +1705,7 @@ again:
 		error = filp->f_op->lock(filp, cmd, file_lock);
 	else {
 		for (;;) {
-			error = __posix_lock_file(inode, file_lock);
+			error = posix_lock_file(filp, file_lock);
 			if ((error != -EAGAIN) || (cmd == F_SETLK))
 				break;
 			error = wait_event_interruptible(file_lock->fl_wait,
@@ -1832,7 +1848,7 @@ again:
 		error = filp->f_op->lock(filp, cmd, file_lock);
 	else {
 		for (;;) {
-			error = __posix_lock_file(inode, file_lock);
+			error = posix_lock_file(filp, file_lock);
 			if ((error != -EAGAIN) || (cmd == F_SETLK64))
 				break;
 			error = wait_event_interruptible(file_lock->fl_wait,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9674679525f9..ab67181a5a55 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -763,6 +763,7 @@ extern void locks_copy_lock(struct file_lock *, struct file_lock *);
 extern void locks_remove_posix(struct file *, fl_owner_t);
 extern void locks_remove_flock(struct file *);
 extern int posix_test_lock(struct file *, struct file_lock *, struct file_lock *);
+extern int posix_lock_file_conf(struct file *, struct file_lock *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file *, struct file_lock *);
-- 
cgit v1.2.3


From 88959ea968709c35e8b979ac9f5a398fa748091a Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Sun, 26 Mar 2006 01:37:27 -0800
Subject: [PATCH] create struct compat_timex and use it everywhere

We had a copy of the compatibility version of struct timex in each 64 bit
architecture.  This patch just creates a global one and replaces all the
usages of the old ones.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kyle McMartin <kyle@parisc-linux.org>
Acked-by: Tony Luck <tony.luck@intel.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/ia32/sys_ia32.c         | 15 +--------------
 arch/mips/kernel/linux32.c        | 15 +--------------
 arch/parisc/kernel/sys_parisc32.c | 37 ++++---------------------------------
 arch/powerpc/kernel/sys_ppc32.c   | 15 +--------------
 arch/s390/kernel/compat_linux.c   | 15 +--------------
 arch/sparc64/kernel/sys_sparc32.c | 15 +--------------
 arch/x86_64/ia32/sys_ia32.c       | 19 +++----------------
 include/linux/compat.h            | 26 ++++++++++++++++++++++++++
 8 files changed, 38 insertions(+), 119 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 13e739e4c84d..1e1a2353aefd 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -2594,23 +2594,10 @@ sys32_setresgid(compat_gid_t rgid, compat_gid_t egid,
 
 /* Handle adjtimex compatibility. */
 
-struct timex32 {
-	u32 modes;
-	s32 offset, freq, maxerror, esterror;
-	s32 status, constant, precision, tolerance;
-	struct compat_timeval time;
-	s32 tick;
-	s32 ppsfreq, jitter, shift, stabil;
-	s32 jitcnt, calcnt, errcnt, stbcnt;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-};
-
 extern int do_adjtimex(struct timex *);
 
 asmlinkage long
-sys32_adjtimex(struct timex32 *utp)
+sys32_adjtimex(struct compat_timex *utp)
 {
 	struct timex txc;
 	int ret;
diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index 013bc93688e8..f33e779796f0 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -1159,22 +1159,9 @@ out:
 
 /* Handle adjtimex compatibility. */
 
-struct timex32 {
-	u32 modes;
-	s32 offset, freq, maxerror, esterror;
-	s32 status, constant, precision, tolerance;
-	struct compat_timeval time;
-	s32 tick;
-	s32 ppsfreq, jitter, shift, stabil;
-	s32 jitcnt, calcnt, errcnt, stbcnt;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-};
-
 extern int do_adjtimex(struct timex *);
 
-asmlinkage int sys32_adjtimex(struct timex32 __user *utp)
+asmlinkage int sys32_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
 	int ret;
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index 613569018410..ca1d8dbbadf3 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -567,43 +567,14 @@ asmlinkage int sys32_sendfile64(int out_fd, int in_fd, compat_loff_t __user *off
 }
 
 
-struct timex32 {
-	unsigned int modes;	/* mode selector */
-	int offset;		/* time offset (usec) */
-	int freq;		/* frequency offset (scaled ppm) */
-	int maxerror;		/* maximum error (usec) */
-	int esterror;		/* estimated error (usec) */
-	int status;		/* clock command/status */
-	int constant;		/* pll time constant */
-	int precision;		/* clock precision (usec) (read only) */
-	int tolerance;		/* clock frequency tolerance (ppm)
-				 * (read only)
-				 */
-	struct compat_timeval time;	/* (read only) */
-	int tick;		/* (modified) usecs between clock ticks */
-
-	int ppsfreq;           /* pps frequency (scaled ppm) (ro) */
-	int jitter;            /* pps jitter (us) (ro) */
-	int shift;              /* interval duration (s) (shift) (ro) */
-	int stabil;            /* pps stability (scaled ppm) (ro) */
-	int jitcnt;            /* jitter limit exceeded (ro) */
-	int calcnt;            /* calibration intervals (ro) */
-	int errcnt;            /* calibration errors (ro) */
-	int stbcnt;            /* stability limit exceeded (ro) */
-
-	int  :32; int  :32; int  :32; int  :32;
-	int  :32; int  :32; int  :32; int  :32;
-	int  :32; int  :32; int  :32; int  :32;
-};
-
-asmlinkage long sys32_adjtimex(struct timex32 __user *txc_p32)
+asmlinkage long sys32_adjtimex(struct compat_timex __user *txc_p32)
 {
 	struct timex txc;
-	struct timex32 t32;
+	struct compat_timex t32;
 	int ret;
 	extern int do_adjtimex(struct timex *txc);
 
-	if(copy_from_user(&t32, txc_p32, sizeof(struct timex32)))
+	if(copy_from_user(&t32, txc_p32, sizeof(struct compat_timex)))
 		return -EFAULT;
 #undef CP
 #define CP(x) txc.x = t32.x
@@ -620,7 +591,7 @@ asmlinkage long sys32_adjtimex(struct timex32 __user *txc_p32)
 	CP(time.tv_sec); CP(time.tv_usec); CP(tick); CP(ppsfreq); CP(jitter);
 	CP(shift); CP(stabil); CP(jitcnt); CP(calcnt); CP(errcnt);
 	CP(stbcnt);
-	return copy_to_user(txc_p32, &t32, sizeof(struct timex32)) ? -EFAULT : ret;
+	return copy_to_user(txc_p32, &t32, sizeof(struct compat_timex)) ? -EFAULT : ret;
 }
 
 
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index cd75ab2908fa..d9867f583c68 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -162,22 +162,9 @@ asmlinkage long compat_sys_sysfs(u32 option, u32 arg1, u32 arg2)
 }
 
 /* Handle adjtimex compatibility. */
-struct timex32 {
-	u32 modes;
-	s32 offset, freq, maxerror, esterror;
-	s32 status, constant, precision, tolerance;
-	struct compat_timeval time;
-	s32 tick;
-	s32 ppsfreq, jitter, shift, stabil;
-	s32 jitcnt, calcnt, errcnt, stbcnt;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-};
-
 extern int do_adjtimex(struct timex *);
 
-asmlinkage long compat_sys_adjtimex(struct timex32 __user *utp)
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
 	int ret;
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index cc058dc3bc8b..9809264f2f4d 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -707,22 +707,9 @@ asmlinkage long sys32_sendfile64(int out_fd, int in_fd,
 
 /* Handle adjtimex compatibility. */
 
-struct timex32 {
-	u32 modes;
-	s32 offset, freq, maxerror, esterror;
-	s32 status, constant, precision, tolerance;
-	struct compat_timeval time;
-	s32 tick;
-	s32 ppsfreq, jitter, shift, stabil;
-	s32 jitcnt, calcnt, errcnt, stbcnt;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-};
-
 extern int do_adjtimex(struct timex *);
 
-asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
+asmlinkage long sys32_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
 	int ret;
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 0e41df024489..48f02f201918 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -947,22 +947,9 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
 
 /* Handle adjtimex compatibility. */
 
-struct timex32 {
-	u32 modes;
-	s32 offset, freq, maxerror, esterror;
-	s32 status, constant, precision, tolerance;
-	struct compat_timeval time;
-	s32 tick;
-	s32 ppsfreq, jitter, shift, stabil;
-	s32 jitcnt, calcnt, errcnt, stbcnt;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-};
-
 extern int do_adjtimex(struct timex *);
 
-asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
+asmlinkage long sys32_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
 	int ret;
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 2b2d029f477c..b13121e451a8 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -769,30 +769,17 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
 
 /* Handle adjtimex compatibility. */
 
-struct timex32 {
-	u32 modes;
-	s32 offset, freq, maxerror, esterror;
-	s32 status, constant, precision, tolerance;
-	struct compat_timeval time;
-	s32 tick;
-	s32 ppsfreq, jitter, shift, stabil;
-	s32 jitcnt, calcnt, errcnt, stbcnt;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-};
-
 extern int do_adjtimex(struct timex *);
 
 asmlinkage long
-sys32_adjtimex(struct timex32 __user *utp)
+sys32_adjtimex(struct compat_timex __user *utp)
 {
 	struct timex txc;
 	int ret;
 
 	memset(&txc, 0, sizeof(struct timex));
 
-	if (!access_ok(VERIFY_READ, utp, sizeof(struct timex32)) ||
+	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
 	   __get_user(txc.modes, &utp->modes) ||
 	   __get_user(txc.offset, &utp->offset) ||
 	   __get_user(txc.freq, &utp->freq) ||
@@ -817,7 +804,7 @@ sys32_adjtimex(struct timex32 __user *utp)
 
 	ret = do_adjtimex(&txc);
 
-	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct timex32)) ||
+	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
 	   __put_user(txc.modes, &utp->modes) ||
 	   __put_user(txc.offset, &utp->offset) ||
 	   __put_user(txc.freq, &utp->freq) ||
diff --git a/include/linux/compat.h b/include/linux/compat.h
index c9ab2a26348c..859f95700d36 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -45,6 +45,32 @@ struct compat_tms {
 	compat_clock_t		tms_cstime;
 };
 
+struct compat_timex {
+	compat_uint_t modes;
+	compat_long_t offset;
+	compat_long_t freq;
+	compat_long_t maxerror;
+	compat_long_t esterror;
+	compat_int_t status;
+	compat_long_t constant;
+	compat_long_t precision;
+	compat_long_t tolerance;
+	struct compat_timeval time;
+	compat_long_t tick;
+	compat_long_t ppsfreq;
+	compat_long_t jitter;
+	compat_int_t shift;
+	compat_long_t stabil;
+	compat_long_t jitcnt;
+	compat_long_t calcnt;
+	compat_long_t errcnt;
+	compat_long_t stbcnt;
+
+	compat_int_t :32; compat_int_t :32; compat_int_t :32; compat_int_t :32;
+	compat_int_t :32; compat_int_t :32; compat_int_t :32; compat_int_t :32;
+	compat_int_t :32; compat_int_t :32; compat_int_t :32; compat_int_t :32;
+};
+
 #define _COMPAT_NSIG_WORDS	(_COMPAT_NSIG / _COMPAT_NSIG_BPW)
 
 typedef struct {
-- 
cgit v1.2.3


From 3158e9411a66fb98d495ac441c242264f31aaf3e Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Sun, 26 Mar 2006 01:37:29 -0800
Subject: [PATCH] consolidate sys32/compat_adjtimex

Create compat_sys_adjtimex and use it an all appropriate places.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/osf_sys.c        |  1 -
 arch/ia64/ia32/sys_ia32.c          | 62 ------------------------------------
 arch/mips/kernel/linux32.c         | 61 ------------------------------------
 arch/mips/kernel/scall64-n32.S     |  2 +-
 arch/mips/kernel/scall64-o32.S     |  2 +-
 arch/parisc/kernel/sys_parisc32.c  | 29 -----------------
 arch/parisc/kernel/syscall_table.S |  2 +-
 arch/powerpc/kernel/sys_ppc32.c    | 60 -----------------------------------
 arch/s390/kernel/compat_linux.c    | 61 ------------------------------------
 arch/s390/kernel/compat_wrapper.S  |  8 ++---
 arch/s390/kernel/syscalls.S        |  2 +-
 arch/sparc64/kernel/sys_sparc32.c  | 61 ------------------------------------
 arch/sparc64/kernel/systbls.S      |  2 +-
 arch/x86_64/ia32/ia32entry.S       |  2 +-
 arch/x86_64/ia32/sys_ia32.c        | 64 --------------------------------------
 include/linux/compat.h             |  2 ++
 include/linux/timex.h              |  2 ++
 kernel/compat.c                    | 59 +++++++++++++++++++++++++++++++++++
 18 files changed, 73 insertions(+), 409 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 7fb14f42a125..31afe3d91ac6 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -821,7 +821,6 @@ osf_setsysinfo(unsigned long op, void __user *buffer, unsigned long nbytes,
    affects all sorts of things, like timeval and itimerval.  */
 
 extern struct timezone sys_tz;
-extern int do_adjtimex(struct timex *);
 
 struct timeval32
 {
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 1e1a2353aefd..5366b3b23d09 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -25,7 +25,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -2591,65 +2590,4 @@ sys32_setresgid(compat_gid_t rgid, compat_gid_t egid,
 	ssgid = (sgid == (compat_gid_t)-1) ? ((gid_t)-1) : ((gid_t)sgid);
 	return sys_setresgid(srgid, segid, ssgid);
 }
-
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long
-sys32_adjtimex(struct compat_timex *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if(get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if(put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
 #endif /* NOTYET */
diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index f33e779796f0..3f40c37a9ee6 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -30,7 +30,6 @@
 #include <linux/utime.h>
 #include <linux/utsname.h>
 #include <linux/personality.h>
-#include <linux/timex.h>
 #include <linux/dnotify.h>
 #include <linux/module.h>
 #include <linux/binfmts.h>
@@ -1157,66 +1156,6 @@ out:
 	return err;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage int sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if (put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 asmlinkage int sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset,
 	s32 count)
 {
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 02c8267e45e7..05a2c0567dae 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -273,7 +273,7 @@ EXPORT(sysn32_call_table)
 	PTR	sys_pivot_root
 	PTR	sys32_sysctl
 	PTR	sys_prctl
-	PTR	sys32_adjtimex
+	PTR	compat_sys_adjtimex
 	PTR	compat_sys_setrlimit		/* 6155 */
 	PTR	sys_chroot
 	PTR	sys_sync
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index 797e0d874889..19c4ca481b02 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -328,7 +328,7 @@ sys_call_table:
 	PTR	sys_setdomainname
 	PTR	sys32_newuname
 	PTR	sys_ni_syscall			/* sys_modify_ldt */
-	PTR	sys32_adjtimex
+	PTR	compat_sys_adjtimex
 	PTR	sys_mprotect			/* 4125 */
 	PTR	compat_sys_sigprocmask
 	PTR	sys_ni_syscall			/* was creat_module */
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index ca1d8dbbadf3..d286f68a3d3a 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -21,7 +21,6 @@
 #include <linux/times.h>
 #include <linux/utsname.h>
 #include <linux/time.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -567,34 +566,6 @@ asmlinkage int sys32_sendfile64(int out_fd, int in_fd, compat_loff_t __user *off
 }
 
 
-asmlinkage long sys32_adjtimex(struct compat_timex __user *txc_p32)
-{
-	struct timex txc;
-	struct compat_timex t32;
-	int ret;
-	extern int do_adjtimex(struct timex *txc);
-
-	if(copy_from_user(&t32, txc_p32, sizeof(struct compat_timex)))
-		return -EFAULT;
-#undef CP
-#define CP(x) txc.x = t32.x
-	CP(modes); CP(offset); CP(freq); CP(maxerror); CP(esterror);
-	CP(status); CP(constant); CP(precision); CP(tolerance);
-	CP(time.tv_sec); CP(time.tv_usec); CP(tick); CP(ppsfreq); CP(jitter);
-	CP(shift); CP(stabil); CP(jitcnt); CP(calcnt); CP(errcnt);
-	CP(stbcnt);
-	ret = do_adjtimex(&txc);
-#undef CP
-#define CP(x) t32.x = txc.x
-	CP(modes); CP(offset); CP(freq); CP(maxerror); CP(esterror);
-	CP(status); CP(constant); CP(precision); CP(tolerance);
-	CP(time.tv_sec); CP(time.tv_usec); CP(tick); CP(ppsfreq); CP(jitter);
-	CP(shift); CP(stabil); CP(jitcnt); CP(calcnt); CP(errcnt);
-	CP(stbcnt);
-	return copy_to_user(txc_p32, &t32, sizeof(struct compat_timex)) ? -EFAULT : ret;
-}
-
-
 struct sysinfo32 {
 	s32 uptime;
 	u32 loads[3];
diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S
index 71011eadb872..89b6c56ea0a8 100644
--- a/arch/parisc/kernel/syscall_table.S
+++ b/arch/parisc/kernel/syscall_table.S
@@ -207,7 +207,7 @@
 	/* struct sockaddr... */
 	ENTRY_SAME(recvfrom)
 	/* struct timex contains longs */
-	ENTRY_DIFF(adjtimex)
+	ENTRY_COMP(adjtimex)
 	ENTRY_SAME(mprotect)		/* 125 */
 	/* old_sigset_t forced to 32 bits.  Beware glibc sigset_t */
 	ENTRY_COMP(sigprocmask)
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index d9867f583c68..ec274e688816 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -24,7 +24,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -161,65 +160,6 @@ asmlinkage long compat_sys_sysfs(u32 option, u32 arg1, u32 arg2)
 	return sys_sysfs((int)option, arg1, arg2);
 }
 
-/* Handle adjtimex compatibility. */
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-	
-	memset(&txc, 0, sizeof(struct timex));
-
-	if(get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if(put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 asmlinkage long compat_sys_pause(void)
 {
 	current->state = TASK_INTERRUPTIBLE;
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 9809264f2f4d..5e14de37c17b 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -26,7 +26,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -705,66 +704,6 @@ asmlinkage long sys32_sendfile64(int out_fd, int in_fd,
 	return ret;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if(get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if(put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 #ifdef CONFIG_SYSCTL
 struct __sysctl_args32 {
 	u32 name;
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index 50e80138e7ad..199da68bd7be 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -551,10 +551,10 @@ sys32_newuname_wrapper:
 	llgtr	%r2,%r2			# struct new_utsname *
 	jg	s390x_newuname		# branch to system call
 
-	.globl  sys32_adjtimex_wrapper 
-sys32_adjtimex_wrapper:
-	llgtr	%r2,%r2			# struct timex_emu31 *
-	jg	sys32_adjtimex		# branch to system call
+	.globl  compat_sys_adjtimex_wrapper
+compat_sys_adjtimex_wrapper:
+	llgtr	%r2,%r2			# struct compat_timex *
+	jg	compat_sys_adjtimex	# branch to system call
 
 	.globl  sys32_mprotect_wrapper 
 sys32_mprotect_wrapper:
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 7c88d85c3597..2f56654da821 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -132,7 +132,7 @@ SYSCALL(sys_clone_glue,sys_clone_glue,sys32_clone_glue)		/* 120 */
 SYSCALL(sys_setdomainname,sys_setdomainname,sys32_setdomainname_wrapper)
 SYSCALL(sys_newuname,s390x_newuname,sys32_newuname_wrapper)
 NI_SYSCALL							/* modify_ldt for i386 */
-SYSCALL(sys_adjtimex,sys_adjtimex,sys32_adjtimex_wrapper)
+SYSCALL(sys_adjtimex,sys_adjtimex,compat_sys_adjtimex_wrapper)
 SYSCALL(sys_mprotect,sys_mprotect,sys32_mprotect_wrapper)	/* 125 */
 SYSCALL(sys_sigprocmask,sys_sigprocmask,compat_sys_sigprocmask_wrapper)
 NI_SYSCALL							/* old "create module" */
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 48f02f201918..2e906bad56fa 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -19,7 +19,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -945,66 +944,6 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
 	return ret;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (get_user(txc.modes, &utp->modes) ||
-	    __get_user(txc.offset, &utp->offset) ||
-	    __get_user(txc.freq, &utp->freq) ||
-	    __get_user(txc.maxerror, &utp->maxerror) ||
-	    __get_user(txc.esterror, &utp->esterror) ||
-	    __get_user(txc.status, &utp->status) ||
-	    __get_user(txc.constant, &utp->constant) ||
-	    __get_user(txc.precision, &utp->precision) ||
-	    __get_user(txc.tolerance, &utp->tolerance) ||
-	    __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	    __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	    __get_user(txc.tick, &utp->tick) ||
-	    __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	    __get_user(txc.jitter, &utp->jitter) ||
-	    __get_user(txc.shift, &utp->shift) ||
-	    __get_user(txc.stabil, &utp->stabil) ||
-	    __get_user(txc.jitcnt, &utp->jitcnt) ||
-	    __get_user(txc.calcnt, &utp->calcnt) ||
-	    __get_user(txc.errcnt, &utp->errcnt) ||
-	    __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if (put_user(txc.modes, &utp->modes) ||
-	    __put_user(txc.offset, &utp->offset) ||
-	    __put_user(txc.freq, &utp->freq) ||
-	    __put_user(txc.maxerror, &utp->maxerror) ||
-	    __put_user(txc.esterror, &utp->esterror) ||
-	    __put_user(txc.status, &utp->status) ||
-	    __put_user(txc.constant, &utp->constant) ||
-	    __put_user(txc.precision, &utp->precision) ||
-	    __put_user(txc.tolerance, &utp->tolerance) ||
-	    __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	    __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	    __put_user(txc.tick, &utp->tick) ||
-	    __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	    __put_user(txc.jitter, &utp->jitter) ||
-	    __put_user(txc.shift, &utp->shift) ||
-	    __put_user(txc.stabil, &utp->stabil) ||
-	    __put_user(txc.jitcnt, &utp->jitcnt) ||
-	    __put_user(txc.calcnt, &utp->calcnt) ||
-	    __put_user(txc.errcnt, &utp->errcnt) ||
-	    __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 /* This is just a version for 32-bit applications which does
  * not force O_LARGEFILE on.
  */
diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
index c3adb7ac167d..3b250f2318fd 100644
--- a/arch/sparc64/kernel/systbls.S
+++ b/arch/sparc64/kernel/systbls.S
@@ -63,7 +63,7 @@ sys_call_table32:
 /*200*/	.word sys32_ssetmask, sys_sigsuspend, compat_sys_newlstat, sys_uselib, compat_sys_old_readdir
 	.word sys32_readahead, sys32_socketcall, sys32_syslog, sys32_lookup_dcookie, sys32_fadvise64
 /*210*/	.word sys32_fadvise64_64, sys32_tgkill, sys32_waitpid, sys_swapoff, sys32_sysinfo
-	.word sys32_ipc, sys32_sigreturn, sys_clone, sys32_ioprio_get, sys32_adjtimex
+	.word sys32_ipc, sys32_sigreturn, sys_clone, sys32_ioprio_get, compat_sys_adjtimex
 /*220*/	.word sys32_sigprocmask, sys_ni_syscall, sys32_delete_module, sys_ni_syscall, sys32_getpgid
 	.word sys32_bdflush, sys32_sysfs, sys_nis_syscall, sys32_setfsuid16, sys32_setfsgid16
 /*230*/	.word sys32_select, compat_sys_time, sys_nis_syscall, compat_sys_stime, compat_sys_statfs64
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 00dee176c08e..7549a4389fbf 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -501,7 +501,7 @@ ia32_sys_call_table:
 	.quad sys_setdomainname
 	.quad sys_uname
 	.quad sys_modify_ldt
-	.quad sys32_adjtimex
+	.quad compat_sys_adjtimex
 	.quad sys32_mprotect		/* 125 */
 	.quad compat_sys_sigprocmask
 	.quad quiet_ni_syscall		/* create_module */
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index b13121e451a8..f182b20858e2 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -30,7 +30,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -767,69 +766,6 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
 	return ret;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long
-sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
-	   __get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
-	   __put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
 	unsigned long prot, unsigned long flags,
 	unsigned long fd, unsigned long pgoff)
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 859f95700d36..24d659cdbafe 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -207,5 +207,7 @@ static inline int compat_timespec_compare(struct compat_timespec *lhs,
 	return lhs->tv_nsec - rhs->tv_nsec;
 }
 
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 82dc9ae79d37..03914b7e41b1 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -307,6 +307,8 @@ time_interpolator_reset(void)
 /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
 extern u64 current_tick_length(void);
 
+extern int do_adjtimex(struct timex *);
+
 #endif /* KERNEL */
 
 #endif /* LINUX_TIMEX_H */
diff --git a/kernel/compat.c b/kernel/compat.c
index 8c9cd88b6785..b9bdd1271f44 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/security.h>
+#include <linux/timex.h>
 
 #include <asm/uaccess.h>
 
@@ -898,3 +899,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 	return -ERESTARTNOHAND;
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
+
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+{
+	struct timex txc;
+	int ret;
+
+	memset(&txc, 0, sizeof(struct timex));
+
+	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+			__get_user(txc.modes, &utp->modes) ||
+			__get_user(txc.offset, &utp->offset) ||
+			__get_user(txc.freq, &utp->freq) ||
+			__get_user(txc.maxerror, &utp->maxerror) ||
+			__get_user(txc.esterror, &utp->esterror) ||
+			__get_user(txc.status, &utp->status) ||
+			__get_user(txc.constant, &utp->constant) ||
+			__get_user(txc.precision, &utp->precision) ||
+			__get_user(txc.tolerance, &utp->tolerance) ||
+			__get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+			__get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+			__get_user(txc.tick, &utp->tick) ||
+			__get_user(txc.ppsfreq, &utp->ppsfreq) ||
+			__get_user(txc.jitter, &utp->jitter) ||
+			__get_user(txc.shift, &utp->shift) ||
+			__get_user(txc.stabil, &utp->stabil) ||
+			__get_user(txc.jitcnt, &utp->jitcnt) ||
+			__get_user(txc.calcnt, &utp->calcnt) ||
+			__get_user(txc.errcnt, &utp->errcnt) ||
+			__get_user(txc.stbcnt, &utp->stbcnt))
+		return -EFAULT;
+
+	ret = do_adjtimex(&txc);
+
+	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+			__put_user(txc.modes, &utp->modes) ||
+			__put_user(txc.offset, &utp->offset) ||
+			__put_user(txc.freq, &utp->freq) ||
+			__put_user(txc.maxerror, &utp->maxerror) ||
+			__put_user(txc.esterror, &utp->esterror) ||
+			__put_user(txc.status, &utp->status) ||
+			__put_user(txc.constant, &utp->constant) ||
+			__put_user(txc.precision, &utp->precision) ||
+			__put_user(txc.tolerance, &utp->tolerance) ||
+			__put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+			__put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+			__put_user(txc.tick, &utp->tick) ||
+			__put_user(txc.ppsfreq, &utp->ppsfreq) ||
+			__put_user(txc.jitter, &utp->jitter) ||
+			__put_user(txc.shift, &utp->shift) ||
+			__put_user(txc.stabil, &utp->stabil) ||
+			__put_user(txc.jitcnt, &utp->jitcnt) ||
+			__put_user(txc.calcnt, &utp->calcnt) ||
+			__put_user(txc.errcnt, &utp->errcnt) ||
+			__put_user(txc.stbcnt, &utp->stbcnt))
+		ret = -EFAULT;
+
+	return ret;
+}
-- 
cgit v1.2.3


From 22e6c1b39c648850438decd491f62d311800c7db Mon Sep 17 00:00:00 2001
From: Maneesh Soni <maneesh@in.ibm.com>
Date: Sun, 26 Mar 2006 01:37:29 -0800
Subject: [PATCH] Use loff_t for size in struct proc_dir_entry

Change proc_dir_entry->size to be loff_t to represent files like
/proc/vmcore for 32bit systems with more than 4G memory.

Needed for seeing correct size for /proc/vmcore for 32-bit systems with >
4G RAM.

Signed-off-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/proc_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 6b12b0f661b4..cb224cf653b1 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -56,7 +56,7 @@ struct proc_dir_entry {
 	nlink_t nlink;
 	uid_t uid;
 	gid_t gid;
-	unsigned long size;
+	loff_t size;
 	struct inode_operations * proc_iops;
 	struct file_operations * proc_fops;
 	get_info_t *get_info;
-- 
cgit v1.2.3


From 6e0678f394c7bd21bfa5d252b071a09e10e7a749 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:44 -0800
Subject: [PATCH] mempool: add page allocator

This will be used by the next patch in the series to replace duplicate
mempool-backed page allocators in 2 places in the kernel.  It is also likely
that there will be more users in the future.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempool.h | 12 ++++++++++++
 mm/mempool.c            | 18 ++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index f2427d7394b0..9787572e0ae7 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -38,4 +38,16 @@ extern void mempool_free(void *element, mempool_t *pool);
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
+/*
+ * A mempool_alloc_t and mempool_free_t for a simple page allocator that
+ * allocates pages of the order specified by pool_data
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
+void mempool_free_pages(void *element, void *pool_data);
+static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
+{
+	return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
+			      (void *)(long)order);
+}
+
 #endif /* _LINUX_MEMPOOL_H */
diff --git a/mm/mempool.c b/mm/mempool.c
index f71893ed3543..45c0112ca7b2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -289,3 +289,21 @@ void mempool_free_slab(void *element, void *pool_data)
 	kmem_cache_free(mem, element);
 }
 EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	__free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
-- 
cgit v1.2.3


From 53184082b070dfb077218828fdf839826102ed96 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:46 -0800
Subject: [PATCH] mempool: add kmalloc allocator

Add another allocator to the common mempool code: a kmalloc/kfree allocator

This will be used by the next patch in the series to replace duplicate
mempool-backed kmalloc allocators in several places in the kernel.  It is also
very likely that there will be more users in the future.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempool.h | 12 ++++++++++++
 mm/mempool.c            | 17 +++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 9787572e0ae7..d23dbf076b57 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -38,6 +38,18 @@ extern void mempool_free(void *element, mempool_t *pool);
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
+/*
+ * A mempool_alloc_t and mempool_free_t to kmalloc the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
+void mempool_kfree(void *element, void *pool_data);
+static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
+{
+	return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
+			      (void *) size);
+}
+
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
  * allocates pages of the order specified by pool_data
diff --git a/mm/mempool.c b/mm/mempool.c
index 45c0112ca7b2..a1397c6a4864 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -290,6 +290,23 @@ void mempool_free_slab(void *element, void *pool_data)
 }
 EXPORT_SYMBOL(mempool_free_slab);
 
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specfied by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+	return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+void mempool_kfree(void *element, void *pool_data)
+{
+	kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+
 /*
  * A simple mempool-backed page allocator that allocates pages
  * of the order specified by pool_data.
-- 
cgit v1.2.3


From f183323d3822dee4d7b3147a59b6e8987fe201e0 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:48 -0800
Subject: [PATCH] mempool: add kzalloc allocator

Add another allocator to the common mempool code: a kzalloc/kfree allocator

This will be used by the next patch in the series to replace a mempool-backed
kzalloc allocator.  It is also very likely that there will be more users in
the future.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempool.h | 10 ++++++++--
 mm/mempool.c            |  9 ++++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index d23dbf076b57..41570ce353eb 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -39,16 +39,22 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
 /*
- * A mempool_alloc_t and mempool_free_t to kmalloc the amount of memory
- * specified by pool_data
+ * 2 mempool_alloc_t's and a mempool_free_t to kmalloc/kzalloc and kfree
+ * the amount of memory specified by pool_data
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
+void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data);
 void mempool_kfree(void *element, void *pool_data);
 static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
 {
 	return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
 			      (void *) size);
 }
+static inline mempool_t *mempool_create_kzalloc_pool(int min_nr, size_t size)
+{
+	return mempool_create(min_nr, mempool_kzalloc, mempool_kfree,
+			      (void *) size);
+}
 
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
diff --git a/mm/mempool.c b/mm/mempool.c
index a1397c6a4864..7bf064e6a345 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -296,11 +296,18 @@ EXPORT_SYMBOL(mempool_free_slab);
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t) pool_data;
+	size_t size = (size_t)(long)pool_data;
 	return kmalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 
+void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+	return kzalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kzalloc);
+
 void mempool_kfree(void *element, void *pool_data)
 {
 	kfree(element);
-- 
cgit v1.2.3


From fec433aaaae32a02329ad7d71b0f3c91b7525077 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:49 -0800
Subject: [PATCH] mempool: add mempool_create_slab_pool()

Create a simple wrapper function for the common case of creating a slab-based
mempool.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempool.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 41570ce353eb..9be484d11283 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -6,6 +6,8 @@
 
 #include <linux/wait.h>
 
+struct kmem_cache;
+
 typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
 typedef void (mempool_free_t)(void *element, void *pool_data);
 
@@ -37,6 +39,12 @@ extern void mempool_free(void *element, mempool_t *pool);
  */
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
+static inline mempool_t *
+mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
+{
+	return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
+			      (void *) kc);
+}
 
 /*
  * 2 mempool_alloc_t's and a mempool_free_t to kmalloc/kzalloc and kfree
-- 
cgit v1.2.3


From 93d2341c750cda0df48a6cc67b35fe25f1ec47df Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:50 -0800
Subject: [PATCH] mempool: use mempool_create_slab_pool()

Modify well over a dozen mempool users to call mempool_create_slab_pool()
rather than calling mempool_create() with extra arguments, saving about 30
lines of code and increasing readability.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/cfq-iosched.c             |  2 +-
 drivers/block/aoe/aoeblk.c      |  4 +---
 drivers/md/dm-crypt.c           |  3 +--
 drivers/md/dm-mpath.c           |  3 +--
 drivers/md/dm-snap.c            |  3 +--
 drivers/md/dm.c                 |  6 ++----
 drivers/md/kcopyd.c             |  3 +--
 drivers/message/i2o/i2o_block.c |  7 +++----
 drivers/scsi/iscsi_tcp.c        |  4 ++--
 drivers/scsi/qla2xxx/qla_os.c   |  3 +--
 drivers/scsi/scsi_lib.c         |  5 ++---
 fs/bio.c                        |  7 ++-----
 fs/cifs/cifsfs.c                | 18 ++++++------------
 fs/jfs/jfs_metapage.c           |  4 ++--
 fs/nfs/read.c                   |  6 ++----
 fs/nfs/write.c                  | 12 ++++--------
 fs/xfs/linux-2.6/xfs_super.c    |  5 ++---
 include/linux/i2o.h             |  4 +---
 net/sunrpc/sched.c              | 12 ++++--------
 19 files changed, 39 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c4a0d5d8d7f0..bde40a6ae665 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2191,7 +2191,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	if (!cfqd->cfq_hash)
 		goto out_cfqhash;
 
-	cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
+	cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
 	if (!cfqd->crq_pool)
 		goto out_crqpool;
 
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 32fea55fac48..393b86a3dbf8 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -211,9 +211,7 @@ aoeblk_gdalloc(void *vp)
 		return;
 	}
 
-	d->bufpool = mempool_create(MIN_BUFS,
-				    mempool_alloc_slab, mempool_free_slab,
-				    buf_pool_cache);
+	d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache);
 	if (d->bufpool == NULL) {
 		printk(KERN_ERR "aoe: aoeblk_gdalloc: cannot allocate bufpool "
 			"for %ld.%ld\n", d->aoemajor, d->aoeminor);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d88b8eda3903..259e86f26549 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -616,8 +616,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		}
 	}
 
-	cc->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
-				     mempool_free_slab, _crypt_io_pool);
+	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
 	if (!cc->io_pool) {
 		ti->error = PFX "Cannot allocate crypt io mempool";
 		goto bad3;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f72a82fb9434..1816f30678ed 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -179,8 +179,7 @@ static struct multipath *alloc_multipath(void)
 		m->queue_io = 1;
 		INIT_WORK(&m->process_queued_ios, process_queued_ios, m);
 		INIT_WORK(&m->trigger_event, trigger_event, m);
-		m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
-					      mempool_free_slab, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f3759dd7828e..7401540086df 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1174,8 +1174,7 @@ static int __init dm_snapshot_init(void)
 		goto bad4;
 	}
 
-	pending_pool = mempool_create(128, mempool_alloc_slab,
-				      mempool_free_slab, pending_cache);
+	pending_pool = mempool_create_slab_pool(128, pending_cache);
 	if (!pending_pool) {
 		DMERR("Couldn't create pending pool.");
 		r = -ENOMEM;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8c82373f7ff3..a64798ef481e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -823,13 +823,11 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
 	md->queue->unplug_fn = dm_unplug_all;
 	md->queue->issue_flush_fn = dm_flush_all;
 
-	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
-				     mempool_free_slab, _io_cache);
+	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
  	if (!md->io_pool)
  		goto bad2;
 
-	md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
-				      mempool_free_slab, _tio_cache);
+	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
 	if (!md->tio_pool)
 		goto bad3;
 
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index 0d54e8b7d9de..9dcb2c8a3853 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -227,8 +227,7 @@ static int jobs_init(void)
 	if (!_job_cache)
 		return -ENOMEM;
 
-	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
-				   mempool_free_slab, _job_cache);
+	_job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
 	if (!_job_pool) {
 		kmem_cache_destroy(_job_cache);
 		return -ENOMEM;
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index b09fb6307153..7d4c5497785b 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -1179,10 +1179,9 @@ static int __init i2o_block_init(void)
 		goto exit;
 	}
 
-	i2o_blk_req_pool.pool = mempool_create(I2O_BLOCK_REQ_MEMPOOL_SIZE,
-					       mempool_alloc_slab,
-					       mempool_free_slab,
-					       i2o_blk_req_pool.slab);
+	i2o_blk_req_pool.pool =
+		mempool_create_slab_pool(I2O_BLOCK_REQ_MEMPOOL_SIZE,
+					 i2o_blk_req_pool.slab);
 	if (!i2o_blk_req_pool.pool) {
 		osm_err("can't init request mempool\n");
 		rc = -ENOMEM;
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 7b82ff090d42..2068b66822b7 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -3200,8 +3200,8 @@ iscsi_r2tpool_alloc(struct iscsi_session *session)
 		 * Data-Out PDU's within R2T-sequence can be quite big;
 		 * using mempool
 		 */
-		ctask->datapool = mempool_create(ISCSI_DTASK_DEFAULT_MAX,
-			 mempool_alloc_slab, mempool_free_slab, taskcache);
+		ctask->datapool = mempool_create_slab_pool(ISCSI_DTASK_DEFAULT_MAX,
+							   taskcache);
 		if (ctask->datapool == NULL) {
 			kfifo_free(ctask->r2tqueue);
 			iscsi_pool_free(&ctask->r2tpool, (void**)ctask->r2ts);
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 029bbf461bb2..017729c59a49 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -2154,8 +2154,7 @@ qla2x00_allocate_sp_pool(scsi_qla_host_t *ha)
 	int      rval;
 
 	rval = QLA_SUCCESS;
-	ha->srb_mempool = mempool_create(SRB_MIN_REQ, mempool_alloc_slab,
-	    mempool_free_slab, srb_cachep);
+	ha->srb_mempool = mempool_create_slab_pool(SRB_MIN_REQ, srb_cachep);
 	if (ha->srb_mempool == NULL) {
 		qla_printk(KERN_INFO, ha, "Unable to allocate SRB mempool.\n");
 		rval = QLA_FUNCTION_FAILED;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ede158d08d9d..8f010a314a3d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1787,9 +1787,8 @@ int __init scsi_init_queue(void)
 					sgp->name);
 		}
 
-		sgp->pool = mempool_create(SG_MEMPOOL_SIZE,
-				mempool_alloc_slab, mempool_free_slab,
-				sgp->slab);
+		sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
+						     sgp->slab);
 		if (!sgp->pool) {
 			printk(KERN_ERR "SCSI: can't init sg mempool %s\n",
 					sgp->name);
diff --git a/fs/bio.c b/fs/bio.c
index 377046d82945..eb8fbc53f2cd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1141,8 +1141,7 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
 		if (i >= scale)
 			pool_entries >>= 1;
 
-		*bvp = mempool_create(pool_entries, mempool_alloc_slab,
-					mempool_free_slab, bp->slab);
+		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
 		if (!*bvp)
 			return -ENOMEM;
 	}
@@ -1179,9 +1178,7 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size, int scale)
 	if (!bs)
 		return NULL;
 
-	bs->bio_pool = mempool_create(bio_pool_size, mempool_alloc_slab,
-			mempool_free_slab, bio_slab);
-
+	bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
 	if (!bs->bio_pool)
 		goto bad;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 221b3334b737..6b99b51d6694 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -738,10 +738,8 @@ cifs_init_request_bufs(void)
 		cERROR(1,("cifs_min_rcv set to maximum (64)"));
 	}
 
-	cifs_req_poolp = mempool_create(cifs_min_rcv,
-					mempool_alloc_slab,
-					mempool_free_slab,
-					cifs_req_cachep);
+	cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
+						  cifs_req_cachep);
 
 	if(cifs_req_poolp == NULL) {
 		kmem_cache_destroy(cifs_req_cachep);
@@ -771,10 +769,8 @@ cifs_init_request_bufs(void)
 		cFYI(1,("cifs_min_small set to maximum (256)"));
 	}
 
-	cifs_sm_req_poolp = mempool_create(cifs_min_small,
-				mempool_alloc_slab,
-				mempool_free_slab,
-				cifs_sm_req_cachep);
+	cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
+						     cifs_sm_req_cachep);
 
 	if(cifs_sm_req_poolp == NULL) {
 		mempool_destroy(cifs_req_poolp);
@@ -808,10 +804,8 @@ cifs_init_mids(void)
 	if (cifs_mid_cachep == NULL)
 		return -ENOMEM;
 
-	cifs_mid_poolp = mempool_create(3 /* a reasonable min simultan opers */,
-					mempool_alloc_slab,
-					mempool_free_slab,
-					cifs_mid_cachep);
+	/* 3 is a reasonable minimum number of simultaneous operations */
+	cifs_mid_poolp = mempool_create_slab_pool(3, cifs_mid_cachep);
 	if(cifs_mid_poolp == NULL) {
 		kmem_cache_destroy(cifs_mid_cachep);
 		return -ENOMEM;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 8508043849f3..f28696f235c4 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -220,8 +220,8 @@ int __init metapage_init(void)
 	if (metapage_cache == NULL)
 		return -ENOMEM;
 
-	metapage_mempool = mempool_create(METAPOOL_MIN_PAGES, mempool_alloc_slab,
-					  mempool_free_slab, metapage_cache);
+	metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES,
+						    metapage_cache);
 
 	if (metapage_mempool == NULL) {
 		kmem_cache_destroy(metapage_cache);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 3961524fd4ab..624ca7146b6b 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -663,10 +663,8 @@ int nfs_init_readpagecache(void)
 	if (nfs_rdata_cachep == NULL)
 		return -ENOMEM;
 
-	nfs_rdata_mempool = mempool_create(MIN_POOL_READ,
-					   mempool_alloc_slab,
-					   mempool_free_slab,
-					   nfs_rdata_cachep);
+	nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
+						     nfs_rdata_cachep);
 	if (nfs_rdata_mempool == NULL)
 		return -ENOMEM;
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3f5225404c97..4cfada2cc09f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1521,17 +1521,13 @@ int nfs_init_writepagecache(void)
 	if (nfs_wdata_cachep == NULL)
 		return -ENOMEM;
 
-	nfs_wdata_mempool = mempool_create(MIN_POOL_WRITE,
-					   mempool_alloc_slab,
-					   mempool_free_slab,
-					   nfs_wdata_cachep);
+	nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
+						     nfs_wdata_cachep);
 	if (nfs_wdata_mempool == NULL)
 		return -ENOMEM;
 
-	nfs_commit_mempool = mempool_create(MIN_POOL_COMMIT,
-					   mempool_alloc_slab,
-					   mempool_free_slab,
-					   nfs_wdata_cachep);
+	nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
+						      nfs_wdata_cachep);
 	if (nfs_commit_mempool == NULL)
 		return -ENOMEM;
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 8355faf8ffde..1884300417e3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -375,9 +375,8 @@ xfs_init_zones(void)
 	if (!xfs_ioend_zone)
 		goto out_destroy_vnode_zone;
 
-	xfs_ioend_pool = mempool_create(4 * MAX_BUF_PER_PAGE,
-					mempool_alloc_slab, mempool_free_slab,
-					xfs_ioend_zone);
+	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
+						  xfs_ioend_zone);
 	if (!xfs_ioend_pool)
 		goto out_free_ioend_zone;
 	return 0;
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 5a9d8c599171..dd7d627bf66f 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -950,9 +950,7 @@ static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
 	if (!pool->slab)
 		goto free_name;
 
-	pool->mempool =
-	    mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
-			   pool->slab);
+	pool->mempool = mempool_create_slab_pool(min_nr, pool->slab);
 	if (!pool->mempool)
 		goto free_slab;
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index b9969b91a9f7..5c3eee768504 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1167,16 +1167,12 @@ rpc_init_mempool(void)
 					     NULL, NULL);
 	if (!rpc_buffer_slabp)
 		goto err_nomem;
-	rpc_task_mempool = mempool_create(RPC_TASK_POOLSIZE,
-					    mempool_alloc_slab,
-					    mempool_free_slab,
-					    rpc_task_slabp);
+	rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE,
+						    rpc_task_slabp);
 	if (!rpc_task_mempool)
 		goto err_nomem;
-	rpc_buffer_mempool = mempool_create(RPC_BUFFER_POOLSIZE,
-					    mempool_alloc_slab,
-					    mempool_free_slab,
-					    rpc_buffer_slabp);
+	rpc_buffer_mempool = mempool_create_slab_pool(RPC_BUFFER_POOLSIZE,
+						      rpc_buffer_slabp);
 	if (!rpc_buffer_mempool)
 		goto err_nomem;
 	return 0;
-- 
cgit v1.2.3


From abcb6c9fd13fc2ad7757b818924dc8109a0e3775 Mon Sep 17 00:00:00 2001
From: Takashi Sato <sho@tnes.nec.co.jp>
Date: Sun, 26 Mar 2006 01:37:51 -0800
Subject: [PATCH] 2TB files: st_blocks is invalid when calling stat64

This patch series fixes the following problems on 32 bits architecture.

o stat64 returns the lower 32 bits of blocks, although userland st_blocks
  has 64 bits, because i_blocks has only 32 bits.  The ioctl with FIOQSIZE has
  the same problem.

o As Dave Kleikamp said, making >2TB file on JFS results in writing an
  invalid block number to disk inode.  The cause is the same as above too.

o In generic quota code dquot_transfer(), the file usage is calculated from
  i_blocks via inode_get_bytes().  If the file is over 2TB, the change of
  usage is less than expected.  The cause is the same as above too.

o As Trond Myklebust said, statfs64's entries related to blocks are invalid
  on statfs64 for a network filesystem which has more than 2^32-1 blocks with
  CONFIG_LBD disabled.  [PATCH 3/3]

We made patches to fix problems that occur when handling a large filesystem
and a large file.  It was discussed on the mails titled "stat64 for over 2TB
file returned invalid st_blocks".

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Jan Kara <jack@ucw.cz>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/stat.h | 3 +--
 include/asm-m68k/stat.h | 3 +--
 include/asm-sh/stat.h   | 8 +-------
 include/linux/fs.h      | 2 +-
 include/linux/stat.h    | 2 +-
 5 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/stat.h b/include/asm-i386/stat.h
index b464f8020ec4..67eae78323ba 100644
--- a/include/asm-i386/stat.h
+++ b/include/asm-i386/stat.h
@@ -58,8 +58,7 @@ struct stat64 {
 	long long	st_size;
 	unsigned long	st_blksize;
 
-	unsigned long	st_blocks;	/* Number 512-byte blocks allocated. */
-	unsigned long	__pad4;		/* future possible st_blocks high bits */
+	unsigned long long	st_blocks;	/* Number 512-byte blocks allocated. */
 
 	unsigned long	st_atime;
 	unsigned long	st_atime_nsec;
diff --git a/include/asm-m68k/stat.h b/include/asm-m68k/stat.h
index c4c402a45e21..dd38bc2e9f98 100644
--- a/include/asm-m68k/stat.h
+++ b/include/asm-m68k/stat.h
@@ -60,8 +60,7 @@ struct stat64 {
 	long long	st_size;
 	unsigned long	st_blksize;
 
-	unsigned long	__pad4;		/* future possible st_blocks high bits */
-	unsigned long	st_blocks;	/* Number 512-byte blocks allocated. */
+	unsigned long long	st_blocks;	/* Number 512-byte blocks allocated. */
 
 	unsigned long	st_atime;
 	unsigned long	st_atime_nsec;
diff --git a/include/asm-sh/stat.h b/include/asm-sh/stat.h
index 914e3fcbbd37..6c41a60657f1 100644
--- a/include/asm-sh/stat.h
+++ b/include/asm-sh/stat.h
@@ -60,13 +60,7 @@ struct stat64 {
 	long long	st_size;
 	unsigned long	st_blksize;
 
-#if defined(__BIG_ENDIAN__)
-	unsigned long	__pad4;		/* Future possible st_blocks hi bits */
-	unsigned long	st_blocks;	/* Number 512-byte blocks allocated. */
-#else /* Must be little */
-	unsigned long	st_blocks;	/* Number 512-byte blocks allocated. */
-	unsigned long	__pad4;		/* Future possible st_blocks hi bits */
-#endif
+	unsigned long long	st_blocks;	/* Number 512-byte blocks allocated. */
 
 	unsigned long	st_atime;
 	unsigned long	st_atime_nsec;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ab67181a5a55..64b0ca4f14e3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -490,7 +490,7 @@ struct inode {
 	unsigned int		i_blkbits;
 	unsigned long		i_blksize;
 	unsigned long		i_version;
-	unsigned long		i_blocks;
+	sector_t		i_blocks;
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct mutex		i_mutex;
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 8ff2a122dfef..8669291352db 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -69,7 +69,7 @@ struct kstat {
 	struct timespec	mtime;
 	struct timespec	ctime;
 	unsigned long	blksize;
-	unsigned long	blocks;
+	unsigned long long	blocks;
 };
 
 #endif
-- 
cgit v1.2.3


From a0f62ac6362c168754cccb36f196b3dfbddc3bc3 Mon Sep 17 00:00:00 2001
From: Takashi Sato <sho@tnes.nec.co.jp>
Date: Sun, 26 Mar 2006 01:37:52 -0800
Subject: [PATCH] 2TB files: add blkcnt_t

Add blkcnt_t as the type of inode.i_blocks.  This enables you to make the size
of blkcnt_t either 4 bytes or 8 bytes on 32 bits architecture with CONFIG_LSF.

- CONFIG_LSF
  Add new configuration parameter.
- blkcnt_t
  On h8300, i386, mips, powerpc, s390 and sh that define sector_t,
  blkcnt_t is defined as u64 if CONFIG_LSF is enabled; otherwise it is
  defined as unsigned long.
  On other architectures, it is defined as unsigned long.
- inode.i_blocks
  Change the type from sector_t to blkcnt_t.

Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/Kconfig               | 9 +++++++++
 include/asm-h8300/types.h   | 3 +++
 include/asm-i386/types.h    | 5 +++++
 include/asm-mips/types.h    | 5 +++++
 include/asm-powerpc/types.h | 5 +++++
 include/asm-s390/types.h    | 5 +++++
 include/asm-sh/types.h      | 5 +++++
 include/linux/fs.h          | 2 +-
 include/linux/types.h       | 4 ++++
 9 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 96783645092d..43ca070dc0f8 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -23,4 +23,13 @@ config BLK_DEV_IO_TRACE
 
 	  git://brick.kernel.dk/data/git/blktrace.git
 
+config LSF
+	bool "Support for Large Single Files"
+	depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
+	default n
+	help
+	  When CONFIG_LBD is disabled, say Y here if you want to
+	  handle large file(bigger than 2TB), otherwise say N.
+	  When CONFIG_LBD is enabled, Y is set automatically.
+
 source block/Kconfig.iosched
diff --git a/include/asm-h8300/types.h b/include/asm-h8300/types.h
index bf91e0d4dde7..da2402b86540 100644
--- a/include/asm-h8300/types.h
+++ b/include/asm-h8300/types.h
@@ -58,6 +58,9 @@ typedef u32 dma_addr_t;
 #define HAVE_SECTOR_T
 typedef u64 sector_t;
 
+#define HAVE_BLKCNT_T
+typedef u64 blkcnt_t;
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/asm-i386/types.h b/include/asm-i386/types.h
index ced00fe8fe61..e50a08bd7ced 100644
--- a/include/asm-i386/types.h
+++ b/include/asm-i386/types.h
@@ -63,6 +63,11 @@ typedef u64 sector_t;
 #define HAVE_SECTOR_T
 #endif
 
+#ifdef CONFIG_LSF
+typedef u64 blkcnt_t;
+#define HAVE_BLKCNT_T
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/include/asm-mips/types.h b/include/asm-mips/types.h
index 421b3aea14cc..cd2813d8e136 100644
--- a/include/asm-mips/types.h
+++ b/include/asm-mips/types.h
@@ -99,6 +99,11 @@ typedef u64 sector_t;
 #define HAVE_SECTOR_T
 #endif
 
+#ifdef CONFIG_LSF
+typedef u64 blkcnt_t;
+#define HAVE_BLKCNT_T
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/include/asm-powerpc/types.h b/include/asm-powerpc/types.h
index ec3c2ee8bf86..baabba96e313 100644
--- a/include/asm-powerpc/types.h
+++ b/include/asm-powerpc/types.h
@@ -103,6 +103,11 @@ typedef u64 sector_t;
 #define HAVE_SECTOR_T
 #endif
 
+#ifdef CONFIG_LSF
+typedef u64 blkcnt_t;
+#define HAVE_BLKCNT_T
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/include/asm-s390/types.h b/include/asm-s390/types.h
index d0be3e477013..5738ad63537c 100644
--- a/include/asm-s390/types.h
+++ b/include/asm-s390/types.h
@@ -93,6 +93,11 @@ typedef u64 sector_t;
 #define HAVE_SECTOR_T
 #endif
 
+#ifdef CONFIG_LSF
+typedef u64 blkcnt_t;
+#define HAVE_BLKCNT_T
+#endif
+
 #endif /* ! __s390x__   */
 #endif /* __ASSEMBLY__  */
 #endif /* __KERNEL__    */
diff --git a/include/asm-sh/types.h b/include/asm-sh/types.h
index cb7e183a0a6b..488552f43b2a 100644
--- a/include/asm-sh/types.h
+++ b/include/asm-sh/types.h
@@ -58,6 +58,11 @@ typedef u64 sector_t;
 #define HAVE_SECTOR_T
 #endif
 
+#ifdef CONFIG_LSF
+typedef u64 blkcnt_t;
+#define HAVE_BLKCNT_T
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 64b0ca4f14e3..155d29d5e5e4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -490,7 +490,7 @@ struct inode {
 	unsigned int		i_blkbits;
 	unsigned long		i_blksize;
 	unsigned long		i_version;
-	sector_t		i_blocks;
+	blkcnt_t		i_blocks;
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct mutex		i_mutex;
diff --git a/include/linux/types.h b/include/linux/types.h
index 54ae2d59e71b..1046c7ad86d9 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -137,6 +137,10 @@ typedef		__s64		int64_t;
 typedef unsigned long sector_t;
 #endif
 
+#ifndef HAVE_BLKCNT_T
+typedef unsigned long blkcnt_t;
+#endif
+
 /*
  * The type of an index into the pagecache.  Use a #define so asm/types.h
  * can override it.
-- 
cgit v1.2.3


From e2d53f9525790dfacbcf09f359536311d3913d98 Mon Sep 17 00:00:00 2001
From: Takashi Sato <sho@tnes.nec.co.jp>
Date: Sun, 26 Mar 2006 01:37:54 -0800
Subject: [PATCH] 2TB files: change type of kstatfs entries

This fix was proposed by Trond Myklebust.  He says: The type "sector_t" is
heavily tied in to the block layer interface as an offset/handle to a block,
and is subject to a supposedly block-specific configuration option:
CONFIG_LBD.  Despite this, it is used in struct kstatfs to save a couple of
bytes on the stack whenever we call the filesystems' ->statfs().

So kstatfs's entries related to blocks are invalid on statfs64 for a network
filesystem which has more than 2^32-1 blocks when CONFIG_LBD is disabled.

- struct kstatfs
  Change the type of following entries from sector_t to u64.
  f_blocks
  f_bfree
  f_bavail
  f_files
  f_ffree

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Takashi Sato <sho@tnes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/statfs.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/statfs.h b/include/linux/statfs.h
index ad83a2bdb821..b34cc829f98d 100644
--- a/include/linux/statfs.h
+++ b/include/linux/statfs.h
@@ -8,11 +8,11 @@
 struct kstatfs {
 	long f_type;
 	long f_bsize;
-	sector_t f_blocks;
-	sector_t f_bfree;
-	sector_t f_bavail;
-	sector_t f_files;
-	sector_t f_ffree;
+	u64 f_blocks;
+	u64 f_bfree;
+	u64 f_bavail;
+	u64 f_files;
+	u64 f_ffree;
 	__kernel_fsid_t f_fsid;
 	long f_namelen;
 	long f_frsize;
-- 
cgit v1.2.3


From 89747d369d34e333b9b60f10f333a0b727b4e4e2 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:55 -0800
Subject: [PATCH] ext3_get_blocks: Mapping multiple blocks at a once

Currently ext3_get_block() only maps or allocates one block at a time.  This
is quite inefficient for sequential IO workload.

I have posted a early implements a simply multiple block map and allocation
with current ext3.  The basic idea is allocating the 1st block in the existing
way, and attempting to allocate the next adjacent blocks on a best effort
basis.  More description about the implementation could be found here:
http://marc.theaimsgroup.com/?l=ext2-devel&m=112162230003522&w=2

The following the latest version of the patch: break the original patch into 5
patches, re-worked some logicals, and fixed some bugs.  The break ups are:

 [patch 1] Adding map multiple blocks at a time in ext3_get_blocks()
 [patch 2] Extend ext3_get_blocks() to support multiple block allocation
 [patch 3] Implement multiple block allocation in ext3-try-to-allocate
 (called via ext3_new_block()).
 [patch 4] Proper accounting updates in ext3_new_blocks()
 [patch 5] Adjust reservation window size properly (by the given number
 of blocks to allocate) before block allocation to increase the
 possibility of allocating multiple blocks in a single call.

Tests done so far includes fsx,tiobench and dbench.  The following numbers
collected from Direct IO tests (1G file creation/read) shows the system time
have been greatly reduced (more than 50% on my 8 cpu system) with the patches.

 1G file DIO write:
 	2.6.15		2.6.15+patches
 real    0m31.275s	0m31.161s
 user    0m0.000s	0m0.000s
 sys     0m3.384s	0m0.564s

 1G file DIO read:
 	2.6.15		2.6.15+patches
 real    0m30.733s	0m30.624s
 user    0m0.000s	0m0.004s
 sys     0m0.748s	0m0.380s

Some previous test we did on buffered IO with using multiple blocks allocation
and delayed allocation shows noticeable improvement on throughput and system
time.

This patch:

Add support of mapping multiple blocks in one call.

This is useful for DIO reads and re-writes (where blocks are already
allocated), also is in line with Christoph's proposal of using getblocks() in
mpage_readpage() or mpage_readpages().

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/dir.c           |   5 ++-
 fs/ext3/inode.c         | 105 +++++++++++++++++++++++++++++++++++-------------
 include/linux/ext3_fs.h |   6 +--
 3 files changed, 82 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 773459164bb2..38bd3f6ec147 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -131,8 +131,9 @@ static int ext3_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext3_get_block_handle(NULL, inode, blk, &map_bh, 0, 0);
-		if (!err) {
+		err = ext3_get_blocks_handle(NULL, inode, blk, 1,
+						&map_bh, 0, 0);
+		if (err > 0) {
 			page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
 				&filp->f_ra,
 				filp,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 76e22c9c9c6c..fcfb10f77120 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -330,7 +330,7 @@ static int ext3_block_to_path(struct inode *inode,
 		ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
 	}
 	if (boundary)
-		*boundary = (i_block & (ptrs - 1)) == (final - 1);
+		*boundary = final - 1 - (i_block & (ptrs - 1));
 	return n;
 }
 
@@ -669,11 +669,15 @@ err_out:
  * akpm: `handle' can be NULL if create == 0.
  *
  * The BKL may not be held on entry here.  Be sure to take it early.
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
  */
 
 int
-ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
-		struct buffer_head *bh_result, int create, int extend_disksize)
+ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock,
+		unsigned long maxblocks, struct buffer_head *bh_result,
+		int create, int extend_disksize)
 {
 	int err = -EIO;
 	int offsets[4];
@@ -681,11 +685,15 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 	Indirect *partial;
 	unsigned long goal;
 	int left;
-	int boundary = 0;
-	const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
+	int blocks_to_boundary = 0;
+	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
+	int count = 0;
+	unsigned long first_block = 0;
+
 
 	J_ASSERT(handle != NULL || create == 0);
+	depth = ext3_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
 
 	if (depth == 0)
 		goto out;
@@ -694,8 +702,31 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
+		first_block = chain[depth - 1].key;
 		clear_buffer_new(bh_result);
-		goto got_it;
+		count++;
+		/*map more blocks*/
+		while (count < maxblocks && count <= blocks_to_boundary) {
+			if (!verify_chain(chain, partial)) {
+				/*
+				 * Indirect block might be removed by
+				 * truncate while we were reading it.
+				 * Handling of that case: forget what we've
+				 * got now. Flag the err as EAGAIN, so it
+				 * will reread.
+				 */
+				err = -EAGAIN;
+				count = 0;
+				break;
+			}
+			if (le32_to_cpu(*(chain[depth-1].p+count) ==
+					(first_block + count)))
+				count++;
+			else
+				break;
+		}
+		if (err != -EAGAIN)
+			goto got_it;
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
@@ -723,6 +754,7 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 		}
 		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
 		if (!partial) {
+			count++;
 			mutex_unlock(&ei->truncate_mutex);
 			if (err)
 				goto cleanup;
@@ -772,8 +804,9 @@ ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 	set_buffer_new(bh_result);
 got_it:
 	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
-	if (boundary)
+	if (blocks_to_boundary == 0)
 		set_buffer_boundary(bh_result);
+	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
 cleanup:
@@ -787,21 +820,6 @@ out:
 	return err;
 }
 
-static int ext3_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	handle_t *handle = NULL;
-	int ret;
-
-	if (create) {
-		handle = ext3_journal_current_handle();
-		J_ASSERT(handle != 0);
-	}
-	ret = ext3_get_block_handle(handle, inode, iblock,
-				bh_result, create, 1);
-	return ret;
-}
-
 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
 
 static int
@@ -812,9 +830,12 @@ ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
 	handle_t *handle = journal_current_handle();
 	int ret = 0;
 
-	if (!handle)
+	if (!create)
 		goto get_block;		/* A read */
 
+	if (max_blocks == 1)
+		goto get_block;		/* A single block get */
+
 	if (handle->h_transaction->t_state == T_LOCKED) {
 		/*
 		 * Huge direct-io writes can hold off commits for long
@@ -841,13 +862,31 @@ ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
 	}
 
 get_block:
-	if (ret == 0)
-		ret = ext3_get_block_handle(handle, inode, iblock,
-					bh_result, create, 0);
-	bh_result->b_size = (1 << inode->i_blkbits);
+	if (ret == 0) {
+		ret = ext3_get_blocks_handle(handle, inode, iblock,
+					max_blocks, bh_result, create, 0);
+		if (ret > 0) {
+			bh_result->b_size = (ret << inode->i_blkbits);
+			ret = 0;
+		}
+	}
 	return ret;
 }
 
+static int ext3_get_blocks(struct inode *inode, sector_t iblock,
+		unsigned long maxblocks, struct buffer_head *bh_result,
+		int create)
+{
+	return ext3_direct_io_get_blocks(inode, iblock, maxblocks,
+					bh_result, create);
+}
+
+static int ext3_get_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	return ext3_get_blocks(inode, iblock, 1, bh_result, create);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
@@ -862,8 +901,16 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
-	if (!*errp && buffer_mapped(&dummy)) {
+	err = ext3_get_blocks_handle(handle, inode, block, 1,
+					&dummy, create, 1);
+	if (err == 1) {
+		err = 0;
+	} else if (err >= 0) {
+		WARN_ON(1);
+		err = -EIO;
+	}
+	*errp = err;
+	if (!err && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
 		if (!bh) {
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index e7239f2f97a1..0adadd85fa66 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -775,9 +775,9 @@ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
 int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
 struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
 struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
-int ext3_get_block_handle(handle_t *handle, struct inode *inode,
-	sector_t iblock, struct buffer_head *bh_result, int create,
-	int extend_disksize);
+int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
+	sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
+	int create, int extend_disksize);
 
 extern void ext3_read_inode (struct inode *);
 extern int  ext3_write_inode (struct inode *, int);
-- 
cgit v1.2.3


From b54e41ec17ae91dce174eb5a3515e7af4a440d42 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:57 -0800
Subject: [PATCH] ext3_get_blocks: support multiple blocks allocation in
 ext3_new_block()

Change ext3_try_to_allocate() (called via ext3_new_blocks()) to try to
allocate the requested number of blocks on a best effort basis: After
allocated the first block, it will always attempt to allocate the next few(up
to the requested size and not beyond the reservation window) adjacent blocks
at the same time.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/balloc.c        | 46 ++++++++++++++++++++++++++++++++++++----------
 include/linux/ext3_fs.h |  5 ++++-
 2 files changed, 40 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 46623f77666b..bba4aeb4a708 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -653,9 +653,11 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
  */
 static int
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-	struct buffer_head *bitmap_bh, int goal, struct ext3_reserve_window *my_rsv)
+			struct buffer_head *bitmap_bh, int goal,
+			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
 	int group_first_block, start, end;
+	unsigned long num = 0;
 
 	/* we do allocation within the reservation window if we have a window */
 	if (my_rsv) {
@@ -713,8 +715,18 @@ repeat:
 			goto fail_access;
 		goto repeat;
 	}
-	return goal;
+	num++;
+	goal++;
+	while (num < *count && goal < end
+		&& ext3_test_allocatable(goal, bitmap_bh)
+		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+		num++;
+		goal++;
+	}
+	*count = num;
+	return goal - num;
 fail_access:
+	*count = num;
 	return -1;
 }
 
@@ -1024,11 +1036,12 @@ static int
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			unsigned int group, struct buffer_head *bitmap_bh,
 			int goal, struct ext3_reserve_window_node * my_rsv,
-			int *errp)
+			unsigned long *count, int *errp)
 {
 	unsigned long group_first_block;
 	int ret = 0;
 	int fatal;
+	unsigned long num = *count;
 
 	*errp = 0;
 
@@ -1051,7 +1064,8 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 * or last attempt to allocate a block with reservation turned on failed
 	 */
 	if (my_rsv == NULL ) {
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL);
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
+						goal, count, NULL);
 		goto out;
 	}
 	/*
@@ -1093,11 +1107,13 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 		    || (my_rsv->rsv_end < group_first_block))
 			BUG();
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
-					   &my_rsv->rsv_window);
+					   &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit++;
+			my_rsv->rsv_alloc_hit += num;
+			*count = num;
 			break;				/* succeed */
 		}
+		num = *count;
 	}
 out:
 	if (ret >= 0) {
@@ -1154,8 +1170,8 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext3_new_block(handle_t *handle, struct inode *inode,
-			unsigned long goal, int *errp)
+int ext3_new_blocks(handle_t *handle, struct inode *inode,
+			unsigned long goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
@@ -1178,6 +1194,7 @@ int ext3_new_block(handle_t *handle, struct inode *inode,
 	static int goal_hits, goal_attempts;
 #endif
 	unsigned long ngroups;
+	unsigned long num = *count;
 
 	*errp = -ENOSPC;
 	sb = inode->i_sb;
@@ -1244,7 +1261,7 @@ retry:
 		if (!bitmap_bh)
 			goto io_error;
 		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, ret_block, my_rsv, &fatal);
+					bitmap_bh, ret_block, my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
 		if (ret_block >= 0)
@@ -1281,7 +1298,7 @@ retry:
 		if (!bitmap_bh)
 			goto io_error;
 		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, -1, my_rsv, &fatal);
+					bitmap_bh, -1, my_rsv, &num, &fatal);
 		if (fatal)
 			goto out;
 		if (ret_block >= 0) 
@@ -1388,6 +1405,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
+	*count = num;
 	return ret_block;
 
 io_error:
@@ -1406,6 +1424,14 @@ out:
 	return 0;
 }
 
+int ext3_new_block(handle_t *handle, struct inode *inode,
+			unsigned long goal, int *errp)
+{
+	unsigned long count = 1;
+
+	return ext3_new_blocks(handle, inode, goal, &count, errp);
+}
+
 unsigned long ext3_count_free_blocks(struct super_block *sb)
 {
 	unsigned long desc_count;
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 0adadd85fa66..8bb4f842cded 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -36,7 +36,8 @@ struct statfs;
  * Define EXT3_RESERVATION to reserve data blocks for expanding files
  */
 #define EXT3_DEFAULT_RESERVE_BLOCKS     8
-#define EXT3_MAX_RESERVE_BLOCKS         1024
+/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
+#define EXT3_MAX_RESERVE_BLOCKS         1027
 #define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
 /*
  * Always enable hashed directories
@@ -732,6 +733,8 @@ struct dir_private_info {
 extern int ext3_bg_has_super(struct super_block *sb, int group);
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
 extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+extern int ext3_new_blocks (handle_t *, struct inode *, unsigned long,
+			unsigned long *, int *);
 extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
 			      unsigned long);
 extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
-- 
cgit v1.2.3


From 205f87f6b342444f722e4559d33318686f7df2ca Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sun, 26 Mar 2006 01:38:00 -0800
Subject: [PATCH] change buffer_head.b_size to size_t

Increase the size of the buffer_head b_size field (only) for 64 bit platforms.
Update some old and moldy comments in and around the structure as well.

The b_size increase allows us to perform larger mappings and allocations for
large I/O requests from userspace, which tie in with other changes allowing
the get_block_t() interface to map multiple blocks at once.

Signed-off-by: Nathan Scott <nathans@sgi.com>
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 |  6 ++++--
 fs/ocfs2/journal.c          |  2 +-
 fs/reiserfs/prints.c        |  2 +-
 include/linux/buffer_head.h | 19 +++++++++++--------
 4 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index f25f58096428..e7a1461f4387 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -426,8 +426,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	if (all_mapped) {
 		printk("__find_get_block_slow() failed. "
 			"block=%llu, b_blocknr=%llu\n",
-			(unsigned long long)block, (unsigned long long)bh->b_blocknr);
-		printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
+			(unsigned long long)block,
+			(unsigned long long)bh->b_blocknr);
+		printk("b_state=0x%08lx, b_size=%zu\n",
+			bh->b_state, bh->b_size);
 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 	}
 out_unlock:
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 90641929a437..6a610ae53583 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -377,7 +377,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
 	BUG_ON(!bh);
 	BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
 
-	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
+	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
 		   (unsigned long long)bh->b_blocknr, type,
 		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
 		   "OCFS2_JOURNAL_ACCESS_CREATE" :
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 78b40621b88b..27bd3a1df2ad 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -143,7 +143,7 @@ static void sprintf_buffer_head(char *buf, struct buffer_head *bh)
 	char b[BDEVNAME_SIZE];
 
 	sprintf(buf,
-		"dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+		"dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
 		bdevname(bh->b_bdev, b), bh->b_size,
 		(unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)),
 		bh->b_state, bh->b_page,
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index da917ed096a3..464f068f8b16 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -46,25 +46,28 @@ struct address_space;
 typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
 
 /*
- * Keep related fields in common cachelines.  The most commonly accessed
- * field (b_state) goes at the start so the compiler does not generate
- * indexed addressing for it.
+ * Historically, a buffer_head was used to map a single block
+ * within a page, and of course as the unit of I/O through the
+ * filesystem and block layers.  Nowadays the basic I/O unit
+ * is the bio, and buffer_heads are used for extracting block
+ * mappings (via a get_block_t call), for tracking state within
+ * a page (via a page_mapping) and for wrapping bio submission
+ * for backward compatibility reasons (e.g. submit_bh).
  */
 struct buffer_head {
-	/* First cache line: */
 	unsigned long b_state;		/* buffer state bitmap (see above) */
 	struct buffer_head *b_this_page;/* circular list of page's buffers */
 	struct page *b_page;		/* the page this bh is mapped to */
-	atomic_t b_count;		/* users using this block */
-	u32 b_size;			/* block size */
 
-	sector_t b_blocknr;		/* block number */
-	char *b_data;			/* pointer to data block */
+	sector_t b_blocknr;		/* start block number */
+	size_t b_size;			/* size of mapping */
+	char *b_data;			/* pointer to data within the page */
 
 	struct block_device *b_bdev;
 	bh_end_io_t *b_end_io;		/* I/O completion */
  	void *b_private;		/* reserved for b_end_io */
 	struct list_head b_assoc_buffers; /* associated with another mapping */
+	atomic_t b_count;		/* users using this buffer_head */
 };
 
 /*
-- 
cgit v1.2.3


From b0cf2321c6599138f860517745503691556d8453 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sun, 26 Mar 2006 01:38:00 -0800
Subject: [PATCH] pass b_size to ->get_block()

Pass amount of disk needs to be mapped to get_block().  This way one can
modify the fs ->get_block() functions to map multiple blocks at the same time.

[akpm@osdl.org: performance tweak]
[akpm@osdl.org: remove unneeded assignments]
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 | 9 ++++++++-
 fs/mpage.c                  | 2 ++
 include/linux/buffer_head.h | 1 +
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index e7a1461f4387..a507b58550f1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1738,6 +1738,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	sector_t block;
 	sector_t last_block;
 	struct buffer_head *bh, *head;
+	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
 
 	BUG_ON(!PageLocked(page));
@@ -1745,7 +1746,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
 
 	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, 1 << inode->i_blkbits,
+		create_empty_buffers(page, blocksize,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
 	}
 
@@ -1780,6 +1781,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 			clear_buffer_dirty(bh);
 			set_buffer_uptodate(bh);
 		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+			WARN_ON(bh->b_size != blocksize);
 			err = get_block(inode, block, bh, 1);
 			if (err)
 				goto recover;
@@ -1933,6 +1935,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (buffer_new(bh))
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {
+			WARN_ON(bh->b_size != blocksize);
 			err = get_block(inode, block, bh, 1);
 			if (err)
 				break;
@@ -2088,6 +2091,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 
 			fully_mapped = 0;
 			if (iblock < lblock) {
+				WARN_ON(bh->b_size != blocksize);
 				err = get_block(inode, iblock, bh, 0);
 				if (err)
 					SetPageError(page);
@@ -2409,6 +2413,7 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
 		create = 1;
 		if (block_start >= to)
 			create = 0;
+		map_bh.b_size = blocksize;
 		ret = get_block(inode, block_in_file + block_in_page,
 					&map_bh, create);
 		if (ret)
@@ -2669,6 +2674,7 @@ int block_truncate_page(struct address_space *mapping,
 
 	err = 0;
 	if (!buffer_mapped(bh)) {
+		WARN_ON(bh->b_size != blocksize);
 		err = get_block(inode, iblock, bh, 0);
 		if (err)
 			goto unlock;
@@ -2755,6 +2761,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 	struct inode *inode = mapping->host;
 	tmp.b_state = 0;
 	tmp.b_blocknr = 0;
+	tmp.b_size = 1 << inode->i_blkbits;
 	get_block(inode, block, &tmp, 0);
 	return tmp.b_blocknr;
 }
diff --git a/fs/mpage.c b/fs/mpage.c
index e431cb3878d6..7903b740cc11 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -192,6 +192,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 				page_block++, block_in_file++) {
 		bh.b_state = 0;
 		if (block_in_file < last_block) {
+			bh.b_size = blocksize;
 			if (get_block(inode, block_in_file, &bh, 0))
 				goto confused;
 		}
@@ -472,6 +473,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
 	for (page_block = 0; page_block < blocks_per_page; ) {
 
 		map_bh.b_state = 0;
+		map_bh.b_size = 1 << blkbits;
 		if (get_block(inode, block_in_file, &map_bh, 1))
 			goto confused;
 		if (buffer_new(&map_bh))
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 464f068f8b16..fb7e9b7ccbe3 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -280,6 +280,7 @@ map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
 	set_buffer_mapped(bh);
 	bh->b_bdev = sb->s_bdev;
 	bh->b_blocknr = block;
+	bh->b_size = sb->s_blocksize;
 }
 
 /*
-- 
cgit v1.2.3


From 1d8fa7a2b9a39d18727acc5c468e870df606c852 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sun, 26 Mar 2006 01:38:02 -0800
Subject: [PATCH] remove ->get_blocks() support

Now that get_block() can handle mapping multiple disk blocks, no need to have
->get_blocks().  This patch removes fs specific ->get_blocks() added for DIO
and makes it users use get_block() instead.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/block_dev.c              |  3 ++-
 fs/direct-io.c              | 27 ++++++++++++++-------------
 fs/ext2/inode.c             | 14 +-------------
 fs/ext3/inode.c             | 12 ++----------
 fs/fat/inode.c              |  2 +-
 fs/hfs/inode.c              | 13 +------------
 fs/hfsplus/inode.c          | 13 +------------
 fs/jfs/inode.c              |  2 +-
 fs/ocfs2/aops.c             |  2 +-
 fs/reiserfs/inode.c         |  1 -
 fs/xfs/linux-2.6/xfs_aops.c |  6 +++---
 include/linux/fs.h          | 17 +++++++----------
 12 files changed, 34 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9a451a9ffad4..5983d42df015 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -131,9 +131,10 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
 
 static int
 blkdev_get_blocks(struct inode *inode, sector_t iblock,
-		unsigned long max_blocks, struct buffer_head *bh, int create)
+		struct buffer_head *bh, int create)
 {
 	sector_t end_block = max_block(I_BDEV(inode));
+	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
 
 	if ((iblock + max_blocks) > end_block) {
 		max_blocks = end_block - iblock;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 235ed8d1f11e..9d1d2aa73e42 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -86,12 +86,12 @@ struct dio {
 	unsigned first_block_in_page;	/* doesn't change, Used only once */
 	int boundary;			/* prev block is at a boundary */
 	int reap_counter;		/* rate limit reaping */
-	get_blocks_t *get_blocks;	/* block mapping function */
+	get_block_t *get_block;		/* block mapping function */
 	dio_iodone_t *end_io;		/* IO completion function */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
-	struct buffer_head map_bh;	/* last get_blocks() result */
+	struct buffer_head map_bh;	/* last get_block() result */
 
 	/*
 	 * Deferred addition of a page to the dio.  These variables are
@@ -211,9 +211,9 @@ static struct page *dio_get_page(struct dio *dio)
 
 /*
  * Called when all DIO BIO I/O has been completed - let the filesystem
- * know, if it registered an interest earlier via get_blocks.  Pass the
+ * know, if it registered an interest earlier via get_block.  Pass the
  * private field of the map buffer_head so that filesystems can use it
- * to hold additional state between get_blocks calls and dio_complete.
+ * to hold additional state between get_block calls and dio_complete.
  */
 static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
 {
@@ -493,7 +493,7 @@ static int dio_bio_reap(struct dio *dio)
  * The fs is allowed to map lots of blocks at once.  If it wants to do that,
  * it uses the passed inode-relative block number as the file offset, as usual.
  *
- * get_blocks() is passed the number of i_blkbits-sized blocks which direct_io
+ * get_block() is passed the number of i_blkbits-sized blocks which direct_io
  * has remaining to do.  The fs should not map more than this number of blocks.
  *
  * If the fs has mapped a lot of blocks, it should populate bh->b_size to
@@ -506,7 +506,7 @@ static int dio_bio_reap(struct dio *dio)
  * In the case of filesystem holes: the fs may return an arbitrarily-large
  * hole by returning an appropriate value in b_size and by clearing
  * buffer_mapped().  However the direct-io code will only process holes one
- * block at a time - it will repeatedly call get_blocks() as it walks the hole.
+ * block at a time - it will repeatedly call get_block() as it walks the hole.
  */
 static int get_more_blocks(struct dio *dio)
 {
@@ -548,7 +548,8 @@ static int get_more_blocks(struct dio *dio)
 		 * at a higher level for inside-i_size block-instantiating
 		 * writes.
 		 */
-		ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
+		map_bh->b_size = fs_count << dio->blkbits;
+		ret = (*dio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
 	}
 	return ret;
@@ -783,11 +784,11 @@ static void dio_zero_block(struct dio *dio, int end)
  * happily perform page-sized but 512-byte aligned IOs.  It is important that
  * blockdev IO be able to have fine alignment and large sizes.
  *
- * So what we do is to permit the ->get_blocks function to populate bh.b_size
+ * So what we do is to permit the ->get_block function to populate bh.b_size
  * with the size of IO which is permitted at this offset and this i_blkbits.
  *
  * For best results, the blockdev should be set up with 512-byte i_blkbits and
- * it should set b_size to PAGE_SIZE or more inside get_blocks().  This gives
+ * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
  * fine alignment but still allows this function to work in PAGE_SIZE units.
  */
 static int do_direct_IO(struct dio *dio)
@@ -947,7 +948,7 @@ out:
 static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io,
+	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
 	struct dio *dio)
 {
 	unsigned long user_addr; 
@@ -969,7 +970,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 	dio->boundary = 0;
 	dio->reap_counter = 0;
-	dio->get_blocks = get_blocks;
+	dio->get_block = get_block;
 	dio->end_io = end_io;
 	dio->map_bh.b_private = NULL;
 	dio->final_block_in_bio = -1;
@@ -1177,7 +1178,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	int dio_lock_type)
 {
 	int seg;
@@ -1273,7 +1274,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_blocks, end_io, dio);
+				nr_segs, blkbits, get_block, end_io, dio);
 
 	if (rw == READ && dio_lock_type == DIO_LOCKING)
 		release_i_mutex = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a717837f272e..04af9c45dce2 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -667,18 +667,6 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,ext2_get_block);
 }
 
-static int
-ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
-			struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	ret = ext2_get_block(inode, iblock, bh_result, create);
-	if (ret == 0)
-		bh_result->b_size = (1 << inode->i_blkbits);
-	return ret;
-}
-
 static ssize_t
 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
@@ -687,7 +675,7 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = file->f_mapping->host;
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, ext2_get_blocks, NULL);
+				offset, nr_segs, ext2_get_block, NULL);
 }
 
 static int
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 34e5b0dc9168..0cd126176bbb 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -940,11 +940,11 @@ out:
 
 static int
 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
-		unsigned long max_blocks,
 		struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = journal_current_handle();
 	int ret = 0;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 
 	if (!create)
 		goto get_block;		/* A read */
@@ -989,18 +989,10 @@ get_block:
 	return ret;
 }
 
-static int ext3_get_blocks(struct inode *inode, sector_t iblock,
-		unsigned long maxblocks, struct buffer_head *bh_result,
-		int create)
-{
-	return ext3_direct_io_get_blocks(inode, iblock, maxblocks,
-					bh_result, create);
-}
-
 static int ext3_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
-	return ext3_get_blocks(inode, iblock, 1, bh_result, create);
+	return ext3_direct_io_get_blocks(inode, iblock, bh_result, create);
 }
 
 /*
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 297300fe81c2..404bfc9f7385 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -101,11 +101,11 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock,
 }
 
 static int fat_get_blocks(struct inode *inode, sector_t iblock,
-			  unsigned long max_blocks,
 			  struct buffer_head *bh_result, int create)
 {
 	struct super_block *sb = inode->i_sb;
 	int err;
+	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 
 	err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
 	if (err)
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 39fd85b9b916..2c564701724f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -98,17 +98,6 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 	return res ? try_to_free_buffers(page) : 0;
 }
 
-static int hfs_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
-			  struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	ret = hfs_get_block(inode, iblock, bh_result, create);
-	if (!ret)
-		bh_result->b_size = (1 << inode->i_blkbits);
-	return ret;
-}
-
 static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 		const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
@@ -116,7 +105,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, hfs_get_blocks, NULL);
+				  offset, nr_segs, hfs_get_block, NULL);
 }
 
 static int hfs_writepages(struct address_space *mapping,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 12ed2b7d046b..9fbe4d2aeece 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -93,17 +93,6 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 	return res ? try_to_free_buffers(page) : 0;
 }
 
-static int hfsplus_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
-			      struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	ret = hfsplus_get_block(inode, iblock, bh_result, create);
-	if (!ret)
-		bh_result->b_size = (1 << inode->i_blkbits);
-	return ret;
-}
-
 static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
 		const struct iovec *iov, loff_t offset, unsigned long nr_segs)
 {
@@ -111,7 +100,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, hfsplus_get_blocks, NULL);
+				  offset, nr_segs, hfsplus_get_block, NULL);
 }
 
 static int hfsplus_writepages(struct address_space *mapping,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 7239ef339489..04eb78f1252e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -302,7 +302,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode = file->f_mapping->host;
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				offset, nr_segs, jfs_get_blocks, NULL);
+				offset, nr_segs, jfs_get_block, NULL);
 }
 
 struct address_space_operations jfs_aops = {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index bf931ba1d364..0d858d0b25be 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -540,7 +540,6 @@ bail:
  * 					fs_count, map_bh, dio->rw == WRITE);
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
-				     unsigned long max_blocks,
 				     struct buffer_head *bh_result, int create)
 {
 	int ret;
@@ -548,6 +547,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	u64 p_blkno;
 	int contig_blocks;
 	unsigned char blocksize_bits;
+	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 
 	if (!inode || !bh_result) {
 		mlog(ML_ERROR, "inode or bh_result is null\n");
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 62e18c19b446..9857e50f85e7 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -466,7 +466,6 @@ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
    direct_IO request. */
 static int reiserfs_get_blocks_direct_io(struct inode *inode,
 					 sector_t iblock,
-					 unsigned long max_blocks,
 					 struct buffer_head *bh_result,
 					 int create)
 {
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a79b84f8b55c..c02f7c5b7462 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1319,12 +1319,12 @@ STATIC int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
-	unsigned long		max_blocks,
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_block(inode, iblock, max_blocks, bh_result,
-					create, 1, BMAPI_WRITE|BMAPI_DIRECT);
+	return __xfs_get_block(inode, iblock,
+				bh_result->b_size >> inode->i_blkbits,
+				bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
 }
 
 STATIC void
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 155d29d5e5e4..9d9674946956 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -252,9 +252,6 @@ extern void __init files_init(unsigned long);
 struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
-typedef int (get_blocks_t)(struct inode *inode, sector_t iblock,
-			unsigned long max_blocks,
-			struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 			ssize_t bytes, void *private);
 
@@ -1645,7 +1642,7 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
 
 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
-	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	int lock_type);
 
 enum {
@@ -1656,29 +1653,29 @@ enum {
 
 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	loff_t offset, unsigned long nr_segs, get_block_t get_block,
 	dio_iodone_t end_io)
 {
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_blocks, end_io, DIO_LOCKING);
+				nr_segs, get_block, end_io, DIO_LOCKING);
 }
 
 static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
 	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	loff_t offset, unsigned long nr_segs, get_block_t get_block,
 	dio_iodone_t end_io)
 {
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_blocks, end_io, DIO_NO_LOCKING);
+				nr_segs, get_block, end_io, DIO_NO_LOCKING);
 }
 
 static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
 	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	loff_t offset, unsigned long nr_segs, get_block_t get_block,
 	dio_iodone_t end_io)
 {
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_blocks, end_io, DIO_OWN_LOCKING);
+				nr_segs, get_block, end_io, DIO_OWN_LOCKING);
 }
 
 extern struct file_operations generic_ro_fops;
-- 
cgit v1.2.3


From 92127c7a45d4d167d9b015a5f9de6b41ed66f1d0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 26 Mar 2006 01:38:05 -0800
Subject: [PATCH] hrtimers: optimize softirq runqueues

The hrtimer softirq is called from the timer softirq every tick.  Retrieve the
current time from xtime and wall_to_monotonic instead of calling
base->get_time() for each timer base.  Store the time in the base structure
and provide a hook once clock source abstractions are in place and to keep the
code open for new base clocks.

Based on a patch from: Roman Zippel <zippel@linux-m68k.org>

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h | 20 ++++++++++++--------
 kernel/hrtimer.c        | 28 ++++++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6401c31d6add..64e2754ca734 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -72,14 +72,16 @@ struct hrtimer {
 /**
  * struct hrtimer_base - the timer base for a specific clock
  *
- * @index:	clock type index for per_cpu support when moving a timer
- *		to a base on another cpu.
- * @lock:	lock protecting the base and associated timers
- * @active:	red black tree root node for the active timers
- * @first:	pointer to the timer node which expires first
- * @resolution:	the resolution of the clock, in nanoseconds
- * @get_time:	function to retrieve the current time of the clock
- * @curr_timer:	the timer which is executing a callback right now
+ * @index:		clock type index for per_cpu support when moving a timer
+ *			to a base on another cpu.
+ * @lock:		lock protecting the base and associated timers
+ * @active:		red black tree root node for the active timers
+ * @first:		pointer to the timer node which expires first
+ * @resolution:		the resolution of the clock, in nanoseconds
+ * @get_time:		function to retrieve the current time of the clock
+ * @get_sofirq_time:	function to retrieve the current time from the softirq
+ * @curr_timer:		the timer which is executing a callback right now
+ * @softirq_time:	the time when running the hrtimer queue in the softirq
  */
 struct hrtimer_base {
 	clockid_t		index;
@@ -88,7 +90,9 @@ struct hrtimer_base {
 	struct rb_node		*first;
 	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
+	ktime_t			(*get_softirq_time)(void);
 	struct hrtimer		*curr_timer;
+	ktime_t			softirq_time;
 };
 
 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14bc9cfa6399..b728cc53452b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -122,6 +122,26 @@ void ktime_get_ts(struct timespec *ts)
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
 
+/*
+ * Get the coarse grained time at the softirq based on xtime and
+ * wall_to_monotonic.
+ */
+static void hrtimer_get_softirq_time(struct hrtimer_base *base)
+{
+	ktime_t xtim, tomono;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		xtim = timespec_to_ktime(xtime);
+		tomono = timespec_to_ktime(wall_to_monotonic);
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	base[CLOCK_REALTIME].softirq_time = xtim;
+	base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
+}
+
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
@@ -586,9 +606,11 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
  */
 static inline void run_hrtimer_queue(struct hrtimer_base *base)
 {
-	ktime_t now = base->get_time();
 	struct rb_node *node;
 
+	if (base->get_softirq_time)
+		base->softirq_time = base->get_softirq_time();
+
 	spin_lock_irq(&base->lock);
 
 	while ((node = base->first)) {
@@ -598,7 +620,7 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 		void *data;
 
 		timer = rb_entry(node, struct hrtimer, node);
-		if (now.tv64 <= timer->expires.tv64)
+		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
 		fn = timer->function;
@@ -641,6 +663,8 @@ void hrtimer_run_queues(void)
 	struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
 	int i;
 
+	hrtimer_get_softirq_time(base);
+
 	for (i = 0; i < MAX_HRTIMER_BASES; i++)
 		run_hrtimer_queue(&base[i]);
 }
-- 
cgit v1.2.3


From 44f21475511bbc0135b52c66ad74dcc6a9026da3 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:06 -0800
Subject: [PATCH] hrtimers: pass current time to hrtimer_forward()

Pass current time to hrtimer_forward().  This allows to use the softirq time
in the timer base when the forward function is called from the timer callback.
 Other places pass current time with a call to timer->base->get_time().

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h |  3 ++-
 kernel/hrtimer.c        |  7 +++----
 kernel/itimer.c         |  3 ++-
 kernel/posix-timers.c   | 14 ++++++++++----
 4 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 64e2754ca734..84fc186324e6 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -130,7 +130,8 @@ static inline int hrtimer_active(const struct hrtimer *timer)
 }
 
 /* Forward a hrtimer so it expires after now: */
-extern unsigned long hrtimer_forward(struct hrtimer *timer, ktime_t interval);
+extern unsigned long
+hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
 
 /* Precise sleep: */
 extern long hrtimer_nanosleep(struct timespec *rqtp,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b728cc53452b..e989c9981a96 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -301,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  * hrtimer_forward - forward the timer expiry
  *
  * @timer:	hrtimer to forward
+ * @now:	forward past this time
  * @interval:	the interval to forward
  *
  * Forward the timer expiry so it will expire in the future.
  * Returns the number of overruns.
  */
 unsigned long
-hrtimer_forward(struct hrtimer *timer, ktime_t interval)
+hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
 	unsigned long orun = 1;
-	ktime_t delta, now;
-
-	now = timer->base->get_time();
+	ktime_t delta;
 
 	delta = ktime_sub(now, timer->expires);
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 680e6b70c872..af2ec6b4392c 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -136,7 +136,8 @@ int it_real_fn(void *data)
 
 	if (tsk->signal->it_real_incr.tv64 != 0) {
 		hrtimer_forward(&tsk->signal->real_timer,
-			       tsk->signal->it_real_incr);
+				tsk->signal->real_timer.base->softirq_time,
+				tsk->signal->it_real_incr);
 
 		return HRTIMER_RESTART;
 	}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9944379360b5..255657accf02 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -251,15 +251,18 @@ __initcall(init_posix_timers);
 
 static void schedule_next_timer(struct k_itimer *timr)
 {
+	struct hrtimer *timer = &timr->it.real.timer;
+
 	if (timr->it.real.interval.tv64 == 0)
 		return;
 
-	timr->it_overrun += hrtimer_forward(&timr->it.real.timer,
+	timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
 					    timr->it.real.interval);
+
 	timr->it_overrun_last = timr->it_overrun;
 	timr->it_overrun = -1;
 	++timr->it_requeue_pending;
-	hrtimer_restart(&timr->it.real.timer);
+	hrtimer_restart(timer);
 }
 
 /*
@@ -334,6 +337,7 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 static int posix_timer_fn(void *data)
 {
 	struct k_itimer *timr = data;
+	struct hrtimer *timer = &timr->it.real.timer;
 	unsigned long flags;
 	int si_private = 0;
 	int ret = HRTIMER_NORESTART;
@@ -351,7 +355,8 @@ static int posix_timer_fn(void *data)
 		 */
 		if (timr->it.real.interval.tv64 != 0) {
 			timr->it_overrun +=
-				hrtimer_forward(&timr->it.real.timer,
+				hrtimer_forward(timer,
+						timer->base->softirq_time,
 						timr->it.real.interval);
 			ret = HRTIMER_RESTART;
 			++timr->it_requeue_pending;
@@ -623,7 +628,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 	if (timr->it_requeue_pending & REQUEUE_PENDING ||
 	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
 		timr->it_overrun +=
-			hrtimer_forward(timer, timr->it.real.interval);
+			hrtimer_forward(timer, timer->base->get_time(),
+					timr->it.real.interval);
 		remaining = hrtimer_get_remaining(timer);
 	}
  calci:
-- 
cgit v1.2.3


From 432569bb9d9d424d7ffe5b21f8205c55bdd1aaa8 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:08 -0800
Subject: [PATCH] hrtimers: simplify nanosleep

nanosleep is the only user of the expired state, so let it manage this itself,
which makes the hrtimer code a bit simpler.  The remaining time is also only
calculated if requested.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h |   4 +-
 kernel/hrtimer.c        | 142 +++++++++++++++++++++---------------------------
 2 files changed, 63 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 84fc186324e6..0e8f4762f6f8 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -38,9 +38,7 @@ enum hrtimer_restart {
  * Timer states:
  */
 enum hrtimer_state {
-	HRTIMER_INACTIVE,	/* Timer is inactive */
-	HRTIMER_EXPIRED,		/* Timer is expired */
-	HRTIMER_RUNNING,		/* Timer is running the callback function */
+	HRTIMER_INACTIVE,		/* Timer is inactive */
 	HRTIMER_PENDING,		/* Timer is pending */
 };
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e989c9981a96..59ec50c1e905 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -625,30 +625,20 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 		fn = timer->function;
 		data = timer->data;
 		set_curr_timer(base, timer);
-		timer->state = HRTIMER_RUNNING;
+		timer->state = HRTIMER_INACTIVE;
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
-		/*
-		 * fn == NULL is special case for the simplest timer
-		 * variant - wake up process and do not restart:
-		 */
-		if (!fn) {
-			wake_up_process(data);
-			restart = HRTIMER_NORESTART;
-		} else
-			restart = fn(data);
+		restart = fn(data);
 
 		spin_lock_irq(&base->lock);
 
 		/* Another CPU has added back the timer */
-		if (timer->state != HRTIMER_RUNNING)
+		if (timer->state != HRTIMER_INACTIVE)
 			continue;
 
-		if (restart == HRTIMER_RESTART)
+		if (restart != HRTIMER_NORESTART)
 			enqueue_hrtimer(timer, base);
-		else
-			timer->state = HRTIMER_EXPIRED;
 	}
 	set_curr_timer(base, NULL);
 	spin_unlock_irq(&base->lock);
@@ -672,79 +662,70 @@ void hrtimer_run_queues(void)
  * Sleep related functions:
  */
 
-/**
- * schedule_hrtimer - sleep until timeout
- *
- * @timer:	hrtimer variable initialized with the correct clock base
- * @mode:	timeout value is abs/rel
- *
- * Make the current task sleep until @timeout is
- * elapsed.
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
- * pass before the routine returns. The routine will return 0
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task. In this case the remaining time
- * will be returned
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- */
-static ktime_t __sched
-schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
-{
-	/* fn stays NULL, meaning single-shot wakeup: */
-	timer->data = current;
+struct sleep_hrtimer {
+	struct hrtimer timer;
+	struct task_struct *task;
+	int expired;
+};
 
-	hrtimer_start(timer, timer->expires, mode);
+static int nanosleep_wakeup(void *data)
+{
+	struct sleep_hrtimer *t = data;
 
-	schedule();
-	hrtimer_cancel(timer);
+	t->expired = 1;
+	wake_up_process(t->task);
 
-	/* Return the remaining time: */
-	if (timer->state != HRTIMER_EXPIRED)
-		return ktime_sub(timer->expires, timer->base->get_time());
-	else
-		return (ktime_t) {.tv64 = 0 };
+	return HRTIMER_NORESTART;
 }
 
-static inline ktime_t __sched
-schedule_hrtimer_interruptible(struct hrtimer *timer,
-			       const enum hrtimer_mode mode)
+static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 {
-	set_current_state(TASK_INTERRUPTIBLE);
+	t->timer.function = nanosleep_wakeup;
+	t->timer.data = t;
+	t->task = current;
+	t->expired = 0;
+
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		hrtimer_start(&t->timer, t->timer.expires, mode);
+
+		schedule();
 
-	return schedule_hrtimer(timer, mode);
+		if (unlikely(!t->expired)) {
+			hrtimer_cancel(&t->timer);
+			mode = HRTIMER_ABS;
+		}
+	} while (!t->expired && !signal_pending(current));
+
+	return t->expired;
 }
 
 static long __sched nanosleep_restart(struct restart_block *restart)
 {
+	struct sleep_hrtimer t;
 	struct timespec __user *rmtp;
 	struct timespec tu;
-	void *rfn_save = restart->fn;
-	struct hrtimer timer;
-	ktime_t rem;
+	ktime_t time;
 
 	restart->fn = do_no_restart_syscall;
 
-	hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS);
-
-	timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+	hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
+	t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
 
-	rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
-
-	if (rem.tv64 <= 0)
+	if (do_nanosleep(&t, HRTIMER_ABS))
 		return 0;
 
 	rmtp = (struct timespec __user *) restart->arg2;
-	tu = ktime_to_timespec(rem);
-	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
-		return -EFAULT;
+	if (rmtp) {
+		time = ktime_sub(t.timer.expires, t.timer.base->get_time());
+		if (time.tv64 <= 0)
+			return 0;
+		tu = ktime_to_timespec(time);
+		if (copy_to_user(rmtp, &tu, sizeof(tu)))
+			return -EFAULT;
+	}
 
-	restart->fn = rfn_save;
+	restart->fn = nanosleep_restart;
 
 	/* The other values in restart are already filled in */
 	return -ERESTART_RESTARTBLOCK;
@@ -754,33 +735,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
 	struct restart_block *restart;
-	struct hrtimer timer;
+	struct sleep_hrtimer t;
 	struct timespec tu;
 	ktime_t rem;
 
-	hrtimer_init(&timer, clockid, mode);
-
-	timer.expires = timespec_to_ktime(*rqtp);
-
-	rem = schedule_hrtimer_interruptible(&timer, mode);
-	if (rem.tv64 <= 0)
+	hrtimer_init(&t.timer, clockid, mode);
+	t.timer.expires = timespec_to_ktime(*rqtp);
+	if (do_nanosleep(&t, mode))
 		return 0;
 
 	/* Absolute timers do not update the rmtp value and restart: */
 	if (mode == HRTIMER_ABS)
 		return -ERESTARTNOHAND;
 
-	tu = ktime_to_timespec(rem);
-
-	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
-		return -EFAULT;
+	if (rmtp) {
+		rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
+		if (rem.tv64 <= 0)
+			return 0;
+		tu = ktime_to_timespec(rem);
+		if (copy_to_user(rmtp, &tu, sizeof(tu)))
+			return -EFAULT;
+	}
 
 	restart = &current_thread_info()->restart_block;
 	restart->fn = nanosleep_restart;
-	restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
-	restart->arg1 = timer.expires.tv64 >> 32;
+	restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
+	restart->arg1 = t.timer.expires.tv64 >> 32;
 	restart->arg2 = (unsigned long) rmtp;
-	restart->arg3 = (unsigned long) timer.base->index;
+	restart->arg3 = (unsigned long) t.timer.base->index;
 
 	return -ERESTART_RESTARTBLOCK;
 }
-- 
cgit v1.2.3


From b75f7a51ca75c977d7d77f735d7a7859194eb39e Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:09 -0800
Subject: [PATCH] hrtimers: remove state field

Remove the state field and encode this information in the rb_node similiar to
normal timer.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h | 11 ++---------
 kernel/hrtimer.c        | 14 +++++---------
 2 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0e8f4762f6f8..f57cc7bd7008 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -34,13 +34,7 @@ enum hrtimer_restart {
 	HRTIMER_RESTART,
 };
 
-/*
- * Timer states:
- */
-enum hrtimer_state {
-	HRTIMER_INACTIVE,		/* Timer is inactive */
-	HRTIMER_PENDING,		/* Timer is pending */
-};
+#define HRTIMER_INACTIVE	((void *)1UL)
 
 struct hrtimer_base;
 
@@ -61,7 +55,6 @@ struct hrtimer_base;
 struct hrtimer {
 	struct rb_node		node;
 	ktime_t			expires;
-	enum hrtimer_state	state;
 	int			(*function)(void *);
 	void			*data;
 	struct hrtimer_base	*base;
@@ -124,7 +117,7 @@ extern ktime_t hrtimer_get_next_event(void);
 
 static inline int hrtimer_active(const struct hrtimer *timer)
 {
-	return timer->state == HRTIMER_PENDING;
+	return timer->node.rb_parent != HRTIMER_INACTIVE;
 }
 
 /* Forward a hrtimer so it expires after now: */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 59ec50c1e905..658d49feedb9 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -374,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 	rb_link_node(&timer->node, parent, link);
 	rb_insert_color(&timer->node, &base->active);
 
-	timer->state = HRTIMER_PENDING;
-
 	if (!base->first || timer->expires.tv64 <
 	    rb_entry(base->first, struct hrtimer, node)->expires.tv64)
 		base->first = &timer->node;
@@ -395,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 	if (base->first == &timer->node)
 		base->first = rb_next(&timer->node);
 	rb_erase(&timer->node, &base->active);
+	timer->node.rb_parent = HRTIMER_INACTIVE;
 }
 
 /*
@@ -405,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	if (hrtimer_active(timer)) {
 		__remove_hrtimer(timer, base);
-		timer->state = HRTIMER_INACTIVE;
 		return 1;
 	}
 	return 0;
@@ -579,6 +577,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		clock_id = CLOCK_MONOTONIC;
 
 	timer->base = &bases[clock_id];
+	timer->node.rb_parent = HRTIMER_INACTIVE;
 }
 
 /**
@@ -625,7 +624,6 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 		fn = timer->function;
 		data = timer->data;
 		set_curr_timer(base, timer);
-		timer->state = HRTIMER_INACTIVE;
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
@@ -633,12 +631,10 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 
 		spin_lock_irq(&base->lock);
 
-		/* Another CPU has added back the timer */
-		if (timer->state != HRTIMER_INACTIVE)
-			continue;
-
-		if (restart != HRTIMER_NORESTART)
+		if (restart != HRTIMER_NORESTART) {
+			BUG_ON(hrtimer_active(timer));
 			enqueue_hrtimer(timer, base);
+		}
 	}
 	set_curr_timer(base, NULL);
 	spin_unlock_irq(&base->lock);
-- 
cgit v1.2.3


From 272705c5979c114e63dbfcd28ea15093038a4c42 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:10 -0800
Subject: [PATCH] hrtimers: remove DEFINE_KTIME and ktime_to_clock_t()

Now that it_real_value is gone, the last user of DEFINE_KTIME and
ktime_to_clock_t are also gone, so remove it before someone starts using it
again.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/ktime.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index f3dec45ef874..62bc57580707 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -64,9 +64,6 @@ typedef union {
 
 #if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
 
-/* Define a ktime_t variable and initialize it to zero: */
-#define DEFINE_KTIME(kt)		ktime_t kt = { .tv64 = 0 }
-
 /**
  * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
  *
@@ -113,9 +110,6 @@ static inline ktime_t timeval_to_ktime(struct timeval tv)
 /* Map the ktime_t to timeval conversion to ns_to_timeval function */
 #define ktime_to_timeval(kt)		ns_to_timeval((kt).tv64)
 
-/* Map the ktime_t to clock_t conversion to the inline in jiffies.h: */
-#define ktime_to_clock_t(kt)		nsec_to_clock_t((kt).tv64)
-
 /* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */
 #define ktime_to_ns(kt)			((kt).tv64)
 
@@ -136,9 +130,6 @@ static inline ktime_t timeval_to_ktime(struct timeval tv)
  *   tv.sec < 0 and 0 >= tv.nsec < NSEC_PER_SEC
  */
 
-/* Define a ktime_t variable and initialize it to zero: */
-#define DEFINE_KTIME(kt)		ktime_t kt = { .tv64 = 0 }
-
 /* Set a ktime_t variable to a value in sec/nsec representation: */
 static inline ktime_t ktime_set(const long secs, const unsigned long nsecs)
 {
@@ -254,17 +245,6 @@ static inline struct timeval ktime_to_timeval(const ktime_t kt)
 		.tv_usec = (suseconds_t) (kt.tv.nsec / NSEC_PER_USEC) };
 }
 
-/**
- * ktime_to_clock_t - convert a ktime_t variable to clock_t format
- * @kt:		the ktime_t variable to convert
- *
- * Returns a clock_t variable with the converted value
- */
-static inline clock_t ktime_to_clock_t(const ktime_t kt)
-{
-	return nsec_to_clock_t( (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec);
-}
-
 /**
  * ktime_to_ns - convert a ktime_t variable to scalar nanoseconds
  * @kt:		the ktime_t variable to convert
-- 
cgit v1.2.3


From df869b630d9d9131c10cf073fb61646048874b2f Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:11 -0800
Subject: [PATCH] hrtimers: remove nsec_t typedef

nsec_t predates ktime_t and has mostly been superseded by it.  In the few
places that are left it's better to make it explicit that we're dealing with
64 bit values here.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 18 ++++++------------
 kernel/hrtimer.c     |  4 ++--
 kernel/time.c        |  4 ++--
 3 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index bf0e785e2e03..0cd696cee998 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -73,12 +73,6 @@ extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
 #define timespec_valid(ts) \
 	(((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC))
 
-/*
- * 64-bit nanosec type. Large enough to span 292+ years in nanosecond
- * resolution. Ought to be enough for a while.
- */
-typedef s64 nsec_t;
-
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
@@ -114,9 +108,9 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
  * Returns the scalar nanosecond representation of the timespec
  * parameter.
  */
-static inline nsec_t timespec_to_ns(const struct timespec *ts)
+static inline s64 timespec_to_ns(const struct timespec *ts)
 {
-	return ((nsec_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+	return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
 }
 
 /**
@@ -126,9 +120,9 @@ static inline nsec_t timespec_to_ns(const struct timespec *ts)
  * Returns the scalar nanosecond representation of the timeval
  * parameter.
  */
-static inline nsec_t timeval_to_ns(const struct timeval *tv)
+static inline s64 timeval_to_ns(const struct timeval *tv)
 {
-	return ((nsec_t) tv->tv_sec * NSEC_PER_SEC) +
+	return ((s64) tv->tv_sec * NSEC_PER_SEC) +
 		tv->tv_usec * NSEC_PER_USEC;
 }
 
@@ -138,7 +132,7 @@ static inline nsec_t timeval_to_ns(const struct timeval *tv)
  *
  * Returns the timespec representation of the nsec parameter.
  */
-extern struct timespec ns_to_timespec(const nsec_t nsec);
+extern struct timespec ns_to_timespec(const s64 nsec);
 
 /**
  * ns_to_timeval - Convert nanoseconds to timeval
@@ -146,7 +140,7 @@ extern struct timespec ns_to_timespec(const nsec_t nsec);
  *
  * Returns the timeval representation of the nsec parameter.
  */
-extern struct timeval ns_to_timeval(const nsec_t nsec);
+extern struct timeval ns_to_timeval(const s64 nsec);
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 658d49feedb9..44108de4f028 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -266,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
 /*
  * Divide a ktime value by a nanosecond value
  */
-static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
+static unsigned long ktime_divns(const ktime_t kt, s64 div)
 {
 	u64 dclc, inc, dns;
 	int sft = 0;
@@ -322,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 		interval.tv64 = timer->base->resolution.tv64;
 
 	if (unlikely(delta.tv64 >= interval.tv64)) {
-		nsec_t incr = ktime_to_ns(interval);
+		s64 incr = ktime_to_ns(interval);
 
 		orun = ktime_divns(delta, incr);
 		timer->expires = ktime_add_ns(timer->expires, incr * orun);
diff --git a/kernel/time.c b/kernel/time.c
index e00a97b77241..ff8e7019c4c4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -610,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
  *
  * Returns the timespec representation of the nsec parameter.
  */
-struct timespec ns_to_timespec(const nsec_t nsec)
+struct timespec ns_to_timespec(const s64 nsec)
 {
 	struct timespec ts;
 
@@ -630,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec)
  *
  * Returns the timeval representation of the nsec parameter.
  */
-struct timeval ns_to_timeval(const nsec_t nsec)
+struct timeval ns_to_timeval(const s64 nsec)
 {
 	struct timespec ts = ns_to_timespec(nsec);
 	struct timeval tv;
-- 
cgit v1.2.3


From 05cfb614ddbf3181540ce09d44d96486f8ba8d6a Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:12 -0800
Subject: [PATCH] hrtimers: remove data field

The nanosleep cleanup allows to remove the data field of hrtimer.  The
callback function can use container_of() to get it's own data.  Since the
hrtimer structure is anyway embedded in other structures, this adds no
overhead.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c               |  2 +-
 include/linux/hrtimer.h |  5 +----
 include/linux/sched.h   |  1 +
 include/linux/timer.h   |  3 ++-
 kernel/fork.c           |  2 +-
 kernel/hrtimer.c        | 12 +++++-------
 kernel/itimer.c         | 15 +++++++--------
 kernel/posix-timers.c   |  9 ++++-----
 8 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 995cba3c62b8..c7397c46ad6d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -632,7 +632,7 @@ static int de_thread(struct task_struct *tsk)
 		 * synchronize with any firing (by calling del_timer_sync)
 		 * before we can safely let the old group leader die.
 		 */
-		sig->real_timer.data = current;
+		sig->tsk = current;
 		spin_unlock_irq(lock);
 		if (hrtimer_cancel(&sig->real_timer))
 			hrtimer_restart(&sig->real_timer);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f57cc7bd7008..93830158348e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -45,9 +45,7 @@ struct hrtimer_base;
  * @expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
  *		which the timer is based.
- * @state:	state of the timer
  * @function:	timer expiry callback function
- * @data:	argument for the callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  *
  * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE()
@@ -55,8 +53,7 @@ struct hrtimer_base;
 struct hrtimer {
 	struct rb_node		node;
 	ktime_t			expires;
-	int			(*function)(void *);
-	void			*data;
+	int			(*function)(struct hrtimer *);
 	struct hrtimer_base	*base;
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e0054c1b9a09..036d14d2bf90 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,6 +402,7 @@ struct signal_struct {
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
+	struct task_struct *tsk;
 	ktime_t it_real_incr;
 
 	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ee5a09e806e8..b5caabca553c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -96,6 +96,7 @@ static inline void add_timer(struct timer_list *timer)
 
 extern void init_timers(void);
 extern void run_local_timers(void);
-extern int it_real_fn(void *);
+struct hrtimer;
+extern int it_real_fn(struct hrtimer *);
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a02063903aaa..4bd6486aa67d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -848,7 +848,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
-	sig->real_timer.data = tsk;
+	sig->tsk = tsk;
 
 	sig->it_virt_expires = cputime_zero;
 	sig->it_virt_incr = cputime_zero;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 44108de4f028..0237a556eb1f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -613,21 +613,19 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		int (*fn)(void *);
+		int (*fn)(struct hrtimer *);
 		int restart;
-		void *data;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
 		fn = timer->function;
-		data = timer->data;
 		set_curr_timer(base, timer);
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
-		restart = fn(data);
+		restart = fn(timer);
 
 		spin_lock_irq(&base->lock);
 
@@ -664,9 +662,10 @@ struct sleep_hrtimer {
 	int expired;
 };
 
-static int nanosleep_wakeup(void *data)
+static int nanosleep_wakeup(struct hrtimer *timer)
 {
-	struct sleep_hrtimer *t = data;
+	struct sleep_hrtimer *t =
+		container_of(timer, struct sleep_hrtimer, timer);
 
 	t->expired = 1;
 	wake_up_process(t->task);
@@ -677,7 +676,6 @@ static int nanosleep_wakeup(void *data)
 static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 {
 	t->timer.function = nanosleep_wakeup;
-	t->timer.data = t;
 	t->task = current;
 	t->expired = 0;
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index af2ec6b4392c..204ed7939e75 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,17 +128,16 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
 /*
  * The timer is automagically restarted, when interval != 0
  */
-int it_real_fn(void *data)
+int it_real_fn(struct hrtimer *timer)
 {
-	struct task_struct *tsk = (struct task_struct *) data;
+	struct signal_struct *sig =
+	    container_of(timer, struct signal_struct, real_timer);
 
-	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
-
-	if (tsk->signal->it_real_incr.tv64 != 0) {
-		hrtimer_forward(&tsk->signal->real_timer,
-				tsk->signal->real_timer.base->softirq_time,
-				tsk->signal->it_real_incr);
+	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
 
+	if (sig->it_real_incr.tv64 != 0) {
+		hrtimer_forward(timer, timer->base->softirq_time,
+				sig->it_real_incr);
 		return HRTIMER_RESTART;
 	}
 	return HRTIMER_NORESTART;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7c5f44787c8c..ac6dc8744429 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
 			    struct itimerspec *, struct itimerspec *);
 static int common_timer_del(struct k_itimer *timer);
 
-static int posix_timer_fn(void *data);
+static int posix_timer_fn(struct hrtimer *data);
 
 static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
 
@@ -334,14 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 
  * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
  */
-static int posix_timer_fn(void *data)
+static int posix_timer_fn(struct hrtimer *timer)
 {
-	struct k_itimer *timr = data;
-	struct hrtimer *timer = &timr->it.real.timer;
+	struct k_itimer *timr;
 	unsigned long flags;
 	int si_private = 0;
 	int ret = HRTIMER_NORESTART;
 
+	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
 
 	if (timr->it.real.interval.tv64 != 0)
@@ -725,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
 
 	mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
 	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
-	timr->it.real.timer.data = timr;
 	timr->it.real.timer.function = posix_timer_fn;
 
 	timer->expires = timespec_to_ktime(new_setting->it_value);
-- 
cgit v1.2.3


From ee8a4b7f857fe7ba243e65c8925798cf8eda5ab0 Mon Sep 17 00:00:00 2001
From: Hansjoerg Lipp <hjlipp@web.de>
Date: Sun, 26 Mar 2006 01:38:32 -0800
Subject: [PATCH] isdn4linux: Siemens Gigaset drivers - tty interface

And: Tilman Schmidt <tilman@imap.cc>

This patch adds the tty interface to the gigaset module.  The tty interface
provides direct access to the AT command set of the Gigaset devices.

Signed-off-by: Hansjoerg Lipp <hjlipp@web.de>
Signed-off-by: Tilman Schmidt <tilman@imap.cc>
Cc: Karsten Keil <kkeil@suse.de>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/isdn/gigaset/interface.c | 718 +++++++++++++++++++++++++++++++++++++++
 include/linux/gigaset_dev.h      |  32 ++
 2 files changed, 750 insertions(+)
 create mode 100644 drivers/isdn/gigaset/interface.c
 create mode 100644 include/linux/gigaset_dev.h

(limited to 'include/linux')

diff --git a/drivers/isdn/gigaset/interface.c b/drivers/isdn/gigaset/interface.c
new file mode 100644
index 000000000000..3a81d9c65141
--- /dev/null
+++ b/drivers/isdn/gigaset/interface.c
@@ -0,0 +1,718 @@
+/*
+ * interface to user space for the gigaset driver
+ *
+ * Copyright (c) 2004 by Hansjoerg Lipp <hjlipp@web.de>
+ *
+ * =====================================================================
+ *    This program is free software; you can redistribute it and/or
+ *    modify it under the terms of the GNU General Public License as
+ *    published by the Free Software Foundation; either version 2 of
+ *    the License, or (at your option) any later version.
+ * =====================================================================
+ * Version: $Id: interface.c,v 1.14.4.15 2006/02/04 18:28:16 hjlipp Exp $
+ * =====================================================================
+ */
+
+#include "gigaset.h"
+#include <linux/gigaset_dev.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+/*** our ioctls ***/
+
+static int if_lock(struct cardstate *cs, int *arg)
+{
+	int cmd = *arg;
+
+	dbg(DEBUG_IF, "%u: if_lock (%d)", cs->minor_index, cmd);
+
+	if (cmd > 1)
+		return -EINVAL;
+
+	if (cmd < 0) {
+		*arg = atomic_read(&cs->mstate) == MS_LOCKED; //FIXME remove?
+		return 0;
+	}
+
+	if (!cmd && atomic_read(&cs->mstate) == MS_LOCKED
+	    && atomic_read(&cs->connected)) {
+		cs->ops->set_modem_ctrl(cs, 0, TIOCM_DTR|TIOCM_RTS);
+		cs->ops->baud_rate(cs, B115200);
+		cs->ops->set_line_ctrl(cs, CS8);
+		cs->control_state = TIOCM_DTR|TIOCM_RTS;
+	}
+
+	cs->waiting = 1;
+	if (!gigaset_add_event(cs, &cs->at_state, EV_IF_LOCK,
+		               NULL, cmd, NULL)) {
+		cs->waiting = 0;
+		return -ENOMEM;
+	}
+
+	dbg(DEBUG_CMD, "scheduling IF_LOCK");
+	gigaset_schedule_event(cs);
+
+	wait_event(cs->waitqueue, !cs->waiting);
+
+	if (cs->cmd_result >= 0) {
+		*arg = cs->cmd_result;
+		return 0;
+	}
+
+	return cs->cmd_result;
+}
+
+static int if_version(struct cardstate *cs, unsigned arg[4])
+{
+	static const unsigned version[4] = GIG_VERSION;
+	static const unsigned compat[4] = GIG_COMPAT;
+	unsigned cmd = arg[0];
+
+	dbg(DEBUG_IF, "%u: if_version (%d)", cs->minor_index, cmd);
+
+	switch (cmd) {
+	case GIGVER_DRIVER:
+		memcpy(arg, version, sizeof version);
+		return 0;
+	case GIGVER_COMPAT:
+		memcpy(arg, compat, sizeof compat);
+		return 0;
+	case GIGVER_FWBASE:
+		cs->waiting = 1;
+		if (!gigaset_add_event(cs, &cs->at_state, EV_IF_VER,
+			               NULL, 0, arg)) {
+			cs->waiting = 0;
+			return -ENOMEM;
+		}
+
+		dbg(DEBUG_CMD, "scheduling IF_VER");
+		gigaset_schedule_event(cs);
+
+		wait_event(cs->waitqueue, !cs->waiting);
+
+		if (cs->cmd_result >= 0)
+			return 0;
+
+		return cs->cmd_result;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int if_config(struct cardstate *cs, int *arg)
+{
+	dbg(DEBUG_IF, "%u: if_config (%d)", cs->minor_index, *arg);
+
+	if (*arg != 1)
+		return -EINVAL;
+
+	if (atomic_read(&cs->mstate) != MS_LOCKED)
+		return -EBUSY;
+
+	*arg = 0;
+	return gigaset_enterconfigmode(cs);
+}
+
+/*** the terminal driver ***/
+/* stolen from usbserial and some other tty drivers */
+
+static int  if_open(struct tty_struct *tty, struct file *filp);
+static void if_close(struct tty_struct *tty, struct file *filp);
+static int  if_ioctl(struct tty_struct *tty, struct file *file,
+                     unsigned int cmd, unsigned long arg);
+static int  if_write_room(struct tty_struct *tty);
+static int  if_chars_in_buffer(struct tty_struct *tty);
+static void if_throttle(struct tty_struct *tty);
+static void if_unthrottle(struct tty_struct *tty);
+static void if_set_termios(struct tty_struct *tty, struct termios *old);
+static int  if_tiocmget(struct tty_struct *tty, struct file *file);
+static int  if_tiocmset(struct tty_struct *tty, struct file *file,
+                        unsigned int set, unsigned int clear);
+static int  if_write(struct tty_struct *tty,
+                     const unsigned char *buf, int count);
+
+static struct tty_operations if_ops = {
+	.open =			if_open,
+	.close =		if_close,
+	.ioctl =		if_ioctl,
+	.write =		if_write,
+	.write_room =		if_write_room,
+	.chars_in_buffer =	if_chars_in_buffer,
+	.set_termios =		if_set_termios,
+	.throttle =		if_throttle,
+	.unthrottle =		if_unthrottle,
+#if 0
+	.break_ctl =		serial_break,
+#endif
+	.tiocmget =		if_tiocmget,
+	.tiocmset =		if_tiocmset,
+};
+
+static int if_open(struct tty_struct *tty, struct file *filp)
+{
+	struct cardstate *cs;
+	unsigned long flags;
+
+	dbg(DEBUG_IF, "%d+%d: %s()", tty->driver->minor_start, tty->index,
+	    __FUNCTION__);
+
+	tty->driver_data = NULL;
+
+	cs = gigaset_get_cs_by_tty(tty);
+	if (!cs)
+		return -ENODEV;
+
+	if (down_interruptible(&cs->sem))
+		return -ERESTARTSYS; // FIXME -EINTR?
+	tty->driver_data = cs;
+
+	++cs->open_count;
+
+	if (cs->open_count == 1) {
+		spin_lock_irqsave(&cs->lock, flags);
+		cs->tty = tty;
+		spin_unlock_irqrestore(&cs->lock, flags);
+		tty->low_latency = 1; //FIXME test
+		//FIXME
+	}
+
+	up(&cs->sem);
+	return 0;
+}
+
+static void if_close(struct tty_struct *tty, struct file *filp)
+{
+	struct cardstate *cs;
+	unsigned long flags;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	down(&cs->sem);
+
+	if (!cs->open_count)
+		warn("%s: device not opened", __FUNCTION__);
+	else {
+		if (!--cs->open_count) {
+			spin_lock_irqsave(&cs->lock, flags);
+			cs->tty = NULL;
+			spin_unlock_irqrestore(&cs->lock, flags);
+			//FIXME
+		}
+	}
+
+	up(&cs->sem);
+}
+
+static int if_ioctl(struct tty_struct *tty, struct file *file,
+                    unsigned int cmd, unsigned long arg)
+{
+	struct cardstate *cs;
+	int retval = -ENODEV;
+	int int_arg;
+	unsigned char buf[6];
+	unsigned version[4];
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return -ENODEV;
+	}
+
+	dbg(DEBUG_IF, "%u: %s(0x%x)", cs->minor_index, __FUNCTION__, cmd);
+
+	if (down_interruptible(&cs->sem))
+		return -ERESTARTSYS; // FIXME -EINTR?
+
+	if (!cs->open_count)
+		warn("%s: device not opened", __FUNCTION__);
+	else {
+		retval = 0;
+		switch (cmd) {
+		case GIGASET_REDIR:
+			retval = get_user(int_arg, (int __user *) arg);
+			if (retval >= 0)
+				retval = if_lock(cs, &int_arg);
+			if (retval >= 0)
+				retval = put_user(int_arg, (int __user *) arg);
+			break;
+		case GIGASET_CONFIG:
+			retval = get_user(int_arg, (int __user *) arg);
+			if (retval >= 0)
+				retval = if_config(cs, &int_arg);
+			if (retval >= 0)
+				retval = put_user(int_arg, (int __user *) arg);
+			break;
+		case GIGASET_BRKCHARS:
+			//FIXME test if MS_LOCKED
+			gigaset_dbg_buffer(DEBUG_IF, "GIGASET_BRKCHARS",
+			                   6, (const unsigned char *) arg, 1);
+			if (!atomic_read(&cs->connected)) {
+				dbg(DEBUG_ANY, "can't communicate with unplugged device");
+				retval = -ENODEV;
+				break;
+			}
+			retval = copy_from_user(&buf,
+			                        (const unsigned char __user *) arg, 6)
+			         ? -EFAULT : 0;
+			if (retval >= 0)
+				retval = cs->ops->brkchars(cs, buf);
+			break;
+		case GIGASET_VERSION:
+			retval = copy_from_user(version, (unsigned __user *) arg,
+			                        sizeof version) ? -EFAULT : 0;
+			if (retval >= 0)
+				retval = if_version(cs, version);
+			if (retval >= 0)
+				retval = copy_to_user((unsigned __user *) arg, version,
+				                      sizeof version)
+				         ? -EFAULT : 0;
+			break;
+	        default:
+			dbg(DEBUG_ANY, "%s: arg not supported - 0x%04x",
+			    __FUNCTION__, cmd);
+			retval = -ENOIOCTLCMD;
+		}
+	}
+
+	up(&cs->sem);
+
+	return retval;
+}
+
+static int if_tiocmget(struct tty_struct *tty, struct file *file)
+{
+	struct cardstate *cs;
+	int retval;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return -ENODEV;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	if (down_interruptible(&cs->sem))
+		return -ERESTARTSYS; // FIXME -EINTR?
+
+	// FIXME read from device?
+	retval = cs->control_state & (TIOCM_RTS|TIOCM_DTR);
+
+	up(&cs->sem);
+
+	return retval;
+}
+
+static int if_tiocmset(struct tty_struct *tty, struct file *file,
+                       unsigned int set, unsigned int clear)
+{
+	struct cardstate *cs;
+	int retval;
+	unsigned mc;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return -ENODEV;
+	}
+
+	dbg(DEBUG_IF,
+	    "%u: %s(0x%x, 0x%x)", cs->minor_index, __FUNCTION__, set, clear);
+
+	if (down_interruptible(&cs->sem))
+		return -ERESTARTSYS; // FIXME -EINTR?
+
+	if (!atomic_read(&cs->connected)) {
+		dbg(DEBUG_ANY, "can't communicate with unplugged device");
+		retval = -ENODEV;
+	} else {
+		mc = (cs->control_state | set) & ~clear & (TIOCM_RTS|TIOCM_DTR);
+		retval = cs->ops->set_modem_ctrl(cs, cs->control_state, mc);
+		cs->control_state = mc;
+	}
+
+	up(&cs->sem);
+
+	return retval;
+}
+
+static int if_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+	struct cardstate *cs;
+	int retval = -ENODEV;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return -ENODEV;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	if (down_interruptible(&cs->sem))
+		return -ERESTARTSYS; // FIXME -EINTR?
+
+	if (!cs->open_count)
+		warn("%s: device not opened", __FUNCTION__);
+	else if (atomic_read(&cs->mstate) != MS_LOCKED) {
+		warn("can't write to unlocked device");
+		retval = -EBUSY;
+	} else if (!atomic_read(&cs->connected)) {
+		dbg(DEBUG_ANY, "can't write to unplugged device");
+		retval = -EBUSY; //FIXME
+	} else {
+		retval = cs->ops->write_cmd(cs, buf, count,
+		                            &cs->if_wake_tasklet);
+	}
+
+	up(&cs->sem);
+
+	return retval;
+}
+
+static int if_write_room(struct tty_struct *tty)
+{
+	struct cardstate *cs;
+	int retval = -ENODEV;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return -ENODEV;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	if (down_interruptible(&cs->sem))
+		return -ERESTARTSYS; // FIXME -EINTR?
+
+	if (!cs->open_count)
+		warn("%s: device not opened", __FUNCTION__);
+	else if (atomic_read(&cs->mstate) != MS_LOCKED) {
+		warn("can't write to unlocked device");
+		retval = -EBUSY; //FIXME
+	} else if (!atomic_read(&cs->connected)) {
+		dbg(DEBUG_ANY, "can't write to unplugged device");
+		retval = -EBUSY; //FIXME
+	} else
+		retval = cs->ops->write_room(cs);
+
+	up(&cs->sem);
+
+	return retval;
+}
+
+static int if_chars_in_buffer(struct tty_struct *tty)
+{
+	struct cardstate *cs;
+	int retval = -ENODEV;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return -ENODEV;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	if (down_interruptible(&cs->sem))
+		return -ERESTARTSYS; // FIXME -EINTR?
+
+	if (!cs->open_count)
+		warn("%s: device not opened", __FUNCTION__);
+	else if (atomic_read(&cs->mstate) != MS_LOCKED) {
+		warn("can't write to unlocked device");
+		retval = -EBUSY;
+	} else if (!atomic_read(&cs->connected)) {
+		dbg(DEBUG_ANY, "can't write to unplugged device");
+		retval = -EBUSY; //FIXME
+	} else
+		retval = cs->ops->chars_in_buffer(cs);
+
+	up(&cs->sem);
+
+	return retval;
+}
+
+static void if_throttle(struct tty_struct *tty)
+{
+	struct cardstate *cs;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	down(&cs->sem);
+
+	if (!cs->open_count)
+		warn("%s: device not opened", __FUNCTION__);
+	else {
+		//FIXME
+	}
+
+	up(&cs->sem);
+}
+
+static void if_unthrottle(struct tty_struct *tty)
+{
+	struct cardstate *cs;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	down(&cs->sem);
+
+	if (!cs->open_count)
+		warn("%s: device not opened", __FUNCTION__);
+	else {
+		//FIXME
+	}
+
+	up(&cs->sem);
+}
+
+static void if_set_termios(struct tty_struct *tty, struct termios *old)
+{
+	struct cardstate *cs;
+	unsigned int iflag;
+	unsigned int cflag;
+	unsigned int old_cflag;
+	unsigned int control_state, new_state;
+
+	cs = (struct cardstate *) tty->driver_data;
+	if (!cs) {
+		err("cs==NULL in %s", __FUNCTION__);
+		return;
+	}
+
+	dbg(DEBUG_IF, "%u: %s()", cs->minor_index, __FUNCTION__);
+
+	down(&cs->sem);
+
+	if (!cs->open_count) {
+		warn("%s: device not opened", __FUNCTION__);
+		goto out;
+	}
+
+	if (!atomic_read(&cs->connected)) {
+		dbg(DEBUG_ANY, "can't communicate with unplugged device");
+		goto out;
+	}
+
+	// stolen from mct_u232.c
+	iflag = tty->termios->c_iflag;
+	cflag = tty->termios->c_cflag;
+	old_cflag = old ? old->c_cflag : cflag; //FIXME?
+	dbg(DEBUG_IF, "%u: iflag %x cflag %x old %x", cs->minor_index,
+	    iflag, cflag, old_cflag);
+
+	/* get a local copy of the current port settings */
+	control_state = cs->control_state;
+
+	/*
+	 * Update baud rate.
+	 * Do not attempt to cache old rates and skip settings,
+	 * disconnects screw such tricks up completely.
+	 * Premature optimization is the root of all evil.
+	 */
+
+        /* reassert DTR and (maybe) RTS on transition from B0 */
+	if ((old_cflag & CBAUD) == B0) {
+		new_state = control_state | TIOCM_DTR;
+		/* don't set RTS if using hardware flow control */
+		if (!(old_cflag & CRTSCTS))
+			new_state |= TIOCM_RTS;
+		dbg(DEBUG_IF, "%u: from B0 - set DTR%s", cs->minor_index,
+		    (new_state & TIOCM_RTS) ? " only" : "/RTS");
+		cs->ops->set_modem_ctrl(cs, control_state, new_state);
+		control_state = new_state;
+	}
+
+	cs->ops->baud_rate(cs, cflag & CBAUD);
+
+	if ((cflag & CBAUD) == B0) {
+		/* Drop RTS and DTR */
+		dbg(DEBUG_IF, "%u: to B0 - drop DTR/RTS", cs->minor_index);
+		new_state = control_state & ~(TIOCM_DTR | TIOCM_RTS);
+		cs->ops->set_modem_ctrl(cs, control_state, new_state);
+		control_state = new_state;
+	}
+
+	/*
+	 * Update line control register (LCR)
+	 */
+
+	cs->ops->set_line_ctrl(cs, cflag);
+
+#if 0
+	//FIXME this hangs M101 [ts 2005-03-09]
+	//FIXME do we need this?
+	/*
+	 * Set flow control: well, I do not really now how to handle DTR/RTS.
+	 * Just do what we have seen with SniffUSB on Win98.
+	 */
+	/* Drop DTR/RTS if no flow control otherwise assert */
+	dbg(DEBUG_IF, "%u: control_state %x", cs->minor_index, control_state);
+	new_state = control_state;
+	if ((iflag & IXOFF) || (iflag & IXON) || (cflag & CRTSCTS))
+		new_state |= TIOCM_DTR | TIOCM_RTS;
+	else
+		new_state &= ~(TIOCM_DTR | TIOCM_RTS);
+	if (new_state != control_state) {
+		dbg(DEBUG_IF, "%u: new_state %x", cs->minor_index, new_state);
+		gigaset_set_modem_ctrl(cs, control_state, new_state); // FIXME: mct_u232.c sets the old state here. is this a bug?
+		control_state = new_state;
+	}
+#endif
+
+	/* save off the modified port settings */
+	cs->control_state = control_state;
+
+out:
+	up(&cs->sem);
+}
+
+
+/* wakeup tasklet for the write operation */
+static void if_wake(unsigned long data)
+{
+	struct cardstate *cs = (struct cardstate *) data;
+	struct tty_struct *tty;
+
+	tty = cs->tty;
+	if (!tty)
+		return;
+
+	if ((tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
+	    tty->ldisc.write_wakeup) {
+		dbg(DEBUG_IF, "write wakeup call");
+		tty->ldisc.write_wakeup(tty);
+	}
+
+	wake_up_interruptible(&tty->write_wait);
+}
+
+/*** interface to common ***/
+
+void gigaset_if_init(struct cardstate *cs)
+{
+	struct gigaset_driver *drv;
+
+	drv = cs->driver;
+	if (!drv->have_tty)
+		return;
+
+	tasklet_init(&cs->if_wake_tasklet, &if_wake, (unsigned long) cs);
+	tty_register_device(drv->tty, cs->minor_index, NULL);
+}
+
+void gigaset_if_free(struct cardstate *cs)
+{
+	struct gigaset_driver *drv;
+
+	drv = cs->driver;
+	if (!drv->have_tty)
+		return;
+
+	tasklet_disable(&cs->if_wake_tasklet);
+	tasklet_kill(&cs->if_wake_tasklet);
+	tty_unregister_device(drv->tty, cs->minor_index);
+}
+
+void gigaset_if_receive(struct cardstate *cs,
+                        unsigned char *buffer, size_t len)
+{
+	unsigned long flags;
+	struct tty_struct *tty;
+
+	spin_lock_irqsave(&cs->lock, flags);
+	if ((tty = cs->tty) == NULL)
+		dbg(DEBUG_ANY, "receive on closed device");
+	else {
+		tty_buffer_request_room(tty, len);
+		tty_insert_flip_string(tty, buffer, len);
+		tty_flip_buffer_push(tty);
+	}
+	spin_unlock_irqrestore(&cs->lock, flags);
+}
+EXPORT_SYMBOL_GPL(gigaset_if_receive);
+
+/* gigaset_if_initdriver
+ * Initialize tty interface.
+ * parameters:
+ *      drv             Driver
+ *      procname        Name of the driver (e.g. for /proc/tty/drivers)
+ *      devname         Name of the device files (prefix without minor number)
+ *      devfsname       Devfs name of the device files without %d
+ */
+void gigaset_if_initdriver(struct gigaset_driver *drv, const char *procname,
+                           const char *devname, const char *devfsname)
+{
+	unsigned minors = drv->minors;
+	int ret;
+	struct tty_driver *tty;
+
+	drv->have_tty = 0;
+
+	if ((drv->tty = alloc_tty_driver(minors)) == NULL)
+		goto enomem;
+	tty = drv->tty;
+
+	tty->magic =		TTY_DRIVER_MAGIC,
+	tty->major =		GIG_MAJOR,
+	tty->type =		TTY_DRIVER_TYPE_SERIAL,
+	tty->subtype =		SERIAL_TYPE_NORMAL,
+	tty->flags =		TTY_DRIVER_REAL_RAW | TTY_DRIVER_NO_DEVFS,
+
+	tty->driver_name =	procname;
+	tty->name =		devname;
+	tty->minor_start =	drv->minor;
+	tty->num =		drv->minors;
+
+	tty->owner =		THIS_MODULE;
+	tty->devfs_name =	devfsname;
+
+	tty->init_termios          = tty_std_termios; //FIXME
+	tty->init_termios.c_cflag  = B9600 | CS8 | CREAD | HUPCL | CLOCAL; //FIXME
+	tty_set_operations(tty, &if_ops);
+
+	ret = tty_register_driver(tty);
+	if (ret < 0) {
+		warn("failed to register tty driver (error %d)", ret);
+		goto error;
+	}
+	dbg(DEBUG_IF, "tty driver initialized");
+	drv->have_tty = 1;
+	return;
+
+enomem:
+	warn("could not allocate tty structures");
+error:
+	if (drv->tty)
+		put_tty_driver(drv->tty);
+}
+
+void gigaset_if_freedriver(struct gigaset_driver *drv)
+{
+	if (!drv->have_tty)
+		return;
+
+	drv->have_tty = 0;
+	tty_unregister_driver(drv->tty);
+	put_tty_driver(drv->tty);
+}
diff --git a/include/linux/gigaset_dev.h b/include/linux/gigaset_dev.h
new file mode 100644
index 000000000000..70ad09c8ad1e
--- /dev/null
+++ b/include/linux/gigaset_dev.h
@@ -0,0 +1,32 @@
+/*
+ * interface to user space for the gigaset driver
+ *
+ * Copyright (c) 2004 by Hansjoerg Lipp <hjlipp@web.de>
+ *
+ * =====================================================================
+ *    This program is free software; you can redistribute it and/or
+ *    modify it under the terms of the GNU General Public License as
+ *    published by the Free Software Foundation; either version 2 of
+ *    the License, or (at your option) any later version.
+ * =====================================================================
+ * Version: $Id: gigaset_dev.h,v 1.4.4.4 2005/11/21 22:28:09 hjlipp Exp $
+ * =====================================================================
+ */
+
+#ifndef GIGASET_INTERFACE_H
+#define GIGASET_INTERFACE_H
+
+#include <linux/ioctl.h>
+
+#define GIGASET_IOCTL 0x47
+
+#define GIGVER_DRIVER 0
+#define GIGVER_COMPAT 1
+#define GIGVER_FWBASE 2
+
+#define GIGASET_REDIR    _IOWR (GIGASET_IOCTL, 0, int)
+#define GIGASET_CONFIG   _IOWR (GIGASET_IOCTL, 1, int)
+#define GIGASET_BRKCHARS _IOW  (GIGASET_IOCTL, 2, unsigned char[6]) //FIXME [6] okay?
+#define GIGASET_VERSION  _IOWR (GIGASET_IOCTL, 3, unsigned[4])
+
+#endif
-- 
cgit v1.2.3


From 0b28002fdf2d5b6ce3135a544c04940a16c5b0ba Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Sun, 26 Mar 2006 01:38:58 -0800
Subject: [PATCH] more s/fucn/func/ typo fixes

s/fucntion/function/ typo fixes

Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl | 2 +-
 arch/m68k/bvme6000/config.c                                  | 2 +-
 arch/s390/crypto/crypt_s390_query.c                          | 2 +-
 drivers/acpi/processor_core.c                                | 2 +-
 drivers/net/sis900.c                                         | 4 ++--
 include/linux/gameport.h                                     | 4 ++--
 include/linux/serio.h                                        | 6 +++---
 7 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index 6dc9d9f622ca..6feef9e82b63 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -2836,7 +2836,7 @@ struct _snd_pcm_runtime {
 
         <para>
 	Note that this callback became non-atomic since the recent version.
-	You can use schedule-related fucntions safely in this callback now.
+	You can use schedule-related functions safely in this callback now.
         </para>
 
         <para>
diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c
index 3ffc84f9c291..c90cb5fcc8ef 100644
--- a/arch/m68k/bvme6000/config.c
+++ b/arch/m68k/bvme6000/config.c
@@ -142,7 +142,7 @@ void __init config_bvme6000(void)
     /* Now do the PIT configuration */
 
     pit->pgcr	= 0x00;	/* Unidirectional 8 bit, no handshake for now */
-    pit->psrr	= 0x18;	/* PIACK and PIRQ fucntions enabled */
+    pit->psrr	= 0x18;	/* PIACK and PIRQ functions enabled */
     pit->pacr	= 0x00;	/* Sub Mode 00, H2 i/p, no DMA */
     pit->padr	= 0x00;	/* Just to be tidy! */
     pit->paddr	= 0x00;	/* All inputs for now (safest) */
diff --git a/arch/s390/crypto/crypt_s390_query.c b/arch/s390/crypto/crypt_s390_query.c
index def02bdc44a4..54fb11d7fadd 100644
--- a/arch/s390/crypto/crypt_s390_query.c
+++ b/arch/s390/crypto/crypt_s390_query.c
@@ -55,7 +55,7 @@ static void query_available_functions(void)
 	printk(KERN_INFO "KMC_AES_256: %d\n",
 		crypt_s390_func_available(KMC_AES_256_ENCRYPT));
 
-	/* query available KIMD fucntions */
+	/* query available KIMD functions */
 	printk(KERN_INFO "KIMD_QUERY: %d\n",
 		crypt_s390_func_available(KIMD_QUERY));
 	printk(KERN_INFO "KIMD_SHA_1: %d\n",
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 99a3a28594da..713b763884a9 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -246,7 +246,7 @@ static int acpi_processor_errata(struct acpi_processor *pr)
 }
 
 /* --------------------------------------------------------------------------
-                              Common ACPI processor fucntions
+                              Common ACPI processor functions
    -------------------------------------------------------------------------- */
 
 /*
diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c
index 253440a98022..8429ceb01389 100644
--- a/drivers/net/sis900.c
+++ b/drivers/net/sis900.c
@@ -1693,7 +1693,7 @@ static irqreturn_t sis900_interrupt(int irq, void *dev_instance, struct pt_regs
  *
  *	Process receive interrupt events, 
  *	put buffer to higher layer and refill buffer pool
- *	Note: This fucntion is called by interrupt handler, 
+ *	Note: This function is called by interrupt handler,
  *	don't do "too much" work here
  */
 
@@ -1840,7 +1840,7 @@ static int sis900_rx(struct net_device *net_dev)
  *
  *	Check for error condition and free socket buffer etc 
  *	schedule for more transmission as needed
- *	Note: This fucntion is called by interrupt handler, 
+ *	Note: This function is called by interrupt handler,
  *	don't do "too much" work here
  */
 
diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index 2401dea2b867..9c8e6da2393b 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -119,7 +119,7 @@ static inline void gameport_set_name(struct gameport *gameport, const char *name
 }
 
 /*
- * Use the following fucntions to manipulate gameport's per-port
+ * Use the following functions to manipulate gameport's per-port
  * driver-specific data.
  */
 static inline void *gameport_get_drvdata(struct gameport *gameport)
@@ -133,7 +133,7 @@ static inline void gameport_set_drvdata(struct gameport *gameport, void *data)
 }
 
 /*
- * Use the following fucntions to pin gameport's driver in process context
+ * Use the following functions to pin gameport's driver in process context
  */
 static inline int gameport_pin_driver(struct gameport *gameport)
 {
diff --git a/include/linux/serio.h b/include/linux/serio.h
index aa4d6493a034..690aabca8ed0 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -119,7 +119,7 @@ static inline void serio_cleanup(struct serio *serio)
 }
 
 /*
- * Use the following fucntions to manipulate serio's per-port
+ * Use the following functions to manipulate serio's per-port
  * driver-specific data.
  */
 static inline void *serio_get_drvdata(struct serio *serio)
@@ -133,7 +133,7 @@ static inline void serio_set_drvdata(struct serio *serio, void *data)
 }
 
 /*
- * Use the following fucntions to protect critical sections in
+ * Use the following functions to protect critical sections in
  * driver code from port's interrupt handler
  */
 static inline void serio_pause_rx(struct serio *serio)
@@ -147,7 +147,7 @@ static inline void serio_continue_rx(struct serio *serio)
 }
 
 /*
- * Use the following fucntions to pin serio's driver in process context
+ * Use the following functions to pin serio's driver in process context
  */
 static inline int serio_pin_driver(struct serio *serio)
 {
-- 
cgit v1.2.3


From e9bebd6f3acee68fa07d44726895b40733cb1dc0 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Sun, 26 Mar 2006 01:39:55 -0800
Subject: [PATCH] bitops: remove unused generic bitops in
 include/linux/bitops.h

generic_{ffs,fls,fls64,hweight{64,32,16,8}}() were moved into
include/asm-generic/bitops.h.  So all architectures don't use them.

Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bitops.h | 124 +------------------------------------------------
 1 file changed, 1 insertion(+), 123 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index f17525a963d1..5d1eabcde5d5 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -2,89 +2,12 @@
 #define _LINUX_BITOPS_H
 #include <asm/types.h>
 
-/*
- * ffs: find first bit set. This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
- */
-
-static inline int generic_ffs(int x)
-{
-	int r = 1;
-
-	if (!x)
-		return 0;
-	if (!(x & 0xffff)) {
-		x >>= 16;
-		r += 16;
-	}
-	if (!(x & 0xff)) {
-		x >>= 8;
-		r += 8;
-	}
-	if (!(x & 0xf)) {
-		x >>= 4;
-		r += 4;
-	}
-	if (!(x & 3)) {
-		x >>= 2;
-		r += 2;
-	}
-	if (!(x & 1)) {
-		x >>= 1;
-		r += 1;
-	}
-	return r;
-}
-
-/*
- * fls: find last bit set.
- */
-
-static __inline__ int generic_fls(int x)
-{
-	int r = 32;
-
-	if (!x)
-		return 0;
-	if (!(x & 0xffff0000u)) {
-		x <<= 16;
-		r -= 16;
-	}
-	if (!(x & 0xff000000u)) {
-		x <<= 8;
-		r -= 8;
-	}
-	if (!(x & 0xf0000000u)) {
-		x <<= 4;
-		r -= 4;
-	}
-	if (!(x & 0xc0000000u)) {
-		x <<= 2;
-		r -= 2;
-	}
-	if (!(x & 0x80000000u)) {
-		x <<= 1;
-		r -= 1;
-	}
-	return r;
-}
-
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
  */
 #include <asm/bitops.h>
 
-
-static inline int generic_fls64(__u64 x)
-{
-	__u32 h = x >> 32;
-	if (h)
-		return fls(h) + 32;
-	return fls(x);
-}
-
 static __inline__ int get_bitmask_order(unsigned int count)
 {
 	int order;
@@ -103,54 +26,9 @@ static __inline__ int get_count_order(unsigned int count)
 	return order;
 }
 
-/*
- * hweightN: returns the hamming weight (i.e. the number
- * of bits set) of a N-bit word
- */
-
-static inline unsigned int generic_hweight32(unsigned int w)
-{
-        unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
-        res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
-        res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
-        res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
-        return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
-}
-
-static inline unsigned int generic_hweight16(unsigned int w)
-{
-        unsigned int res = (w & 0x5555) + ((w >> 1) & 0x5555);
-        res = (res & 0x3333) + ((res >> 2) & 0x3333);
-        res = (res & 0x0F0F) + ((res >> 4) & 0x0F0F);
-        return (res & 0x00FF) + ((res >> 8) & 0x00FF);
-}
-
-static inline unsigned int generic_hweight8(unsigned int w)
-{
-        unsigned int res = (w & 0x55) + ((w >> 1) & 0x55);
-        res = (res & 0x33) + ((res >> 2) & 0x33);
-        return (res & 0x0F) + ((res >> 4) & 0x0F);
-}
-
-static inline unsigned long generic_hweight64(__u64 w)
-{
-#if BITS_PER_LONG < 64
-	return generic_hweight32((unsigned int)(w >> 32)) +
-				generic_hweight32((unsigned int)w);
-#else
-	u64 res;
-	res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul);
-	res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
-	res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful);
-	res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul);
-	res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul);
-	return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul);
-#endif
-}
-
 static inline unsigned long hweight_long(unsigned long w)
 {
-	return sizeof(w) == 4 ? generic_hweight32(w) : generic_hweight64(w);
+	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
 }
 
 /*
-- 
cgit v1.2.3


From fbb18a277a6f192404aa20ece49529acb1e1e76d Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 26 Mar 2006 23:13:39 +0100
Subject: [SERIAL] amba-pl010: allow platforms to specify modem control method

The amba-pl010 hardware does not provide RTS and DTR control lines; it
is expected that these will be implemented using GPIO.  Allow platforms
to supply a function to implement manipulation of modem control lines.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mach-integrator/core.c |  46 ++++++++++++
 drivers/serial/amba-pl010.c     | 160 +++++++++++++++++-----------------------
 include/linux/amba/serial.h     |   6 ++
 3 files changed, 119 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c
index 20071a2767cc..576a5e979c00 100644
--- a/arch/arm/mach-integrator/core.c
+++ b/arch/arm/mach-integrator/core.c
@@ -15,7 +15,9 @@
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/termios.h>
 #include <linux/amba/bus.h>
+#include <linux/amba/serial.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
@@ -28,6 +30,8 @@
 
 #include "common.h"
 
+static struct amba_pl010_data integrator_uart_data;
+
 static struct amba_device rtc_device = {
 	.dev		= {
 		.bus_id	= "mb:15",
@@ -44,6 +48,7 @@ static struct amba_device rtc_device = {
 static struct amba_device uart0_device = {
 	.dev		= {
 		.bus_id	= "mb:16",
+		.platform_data = &integrator_uart_data,
 	},
 	.res		= {
 		.start	= INTEGRATOR_UART0_BASE,
@@ -57,6 +62,7 @@ static struct amba_device uart0_device = {
 static struct amba_device uart1_device = {
 	.dev		= {
 		.bus_id	= "mb:17",
+		.platform_data = &integrator_uart_data,
 	},
 	.res		= {
 		.start	= INTEGRATOR_UART1_BASE,
@@ -115,6 +121,46 @@ static int __init integrator_init(void)
 
 arch_initcall(integrator_init);
 
+/*
+ * On the Integrator platform, the port RTS and DTR are provided by
+ * bits in the following SC_CTRLS register bits:
+ *        RTS  DTR
+ *  UART0  7    6
+ *  UART1  5    4
+ */
+#define SC_CTRLC	(IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLC_OFFSET)
+#define SC_CTRLS	(IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLS_OFFSET)
+
+static void integrator_uart_set_mctrl(struct amba_device *dev, void __iomem *base, unsigned int mctrl)
+{
+	unsigned int ctrls = 0, ctrlc = 0, rts_mask, dtr_mask;
+
+	if (dev == &uart0_device) {
+		rts_mask = 1 << 4;
+		dtr_mask = 1 << 5;
+	} else {
+		rts_mask = 1 << 6;
+		dtr_mask = 1 << 7;
+	}
+
+	if (mctrl & TIOCM_RTS)
+		ctrlc |= rts_mask;
+	else
+		ctrls |= rts_mask;
+
+	if (mctrl & TIOCM_DTR)
+		ctrlc |= dtr_mask;
+	else
+		ctrls |= dtr_mask;
+
+	__raw_writel(ctrls, SC_CTRLS);
+	__raw_writel(ctrlc, SC_CTRLC);
+}
+
+static struct amba_pl010_data integrator_uart_data = {
+	.set_mctrl = integrator_uart_set_mctrl,
+};
+
 #define CM_CTRL	IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET
 
 static DEFINE_SPINLOCK(cm_lock);
diff --git a/drivers/serial/amba-pl010.c b/drivers/serial/amba-pl010.c
index 127d6cd5de7f..1631414000a2 100644
--- a/drivers/serial/amba-pl010.c
+++ b/drivers/serial/amba-pl010.c
@@ -51,8 +51,6 @@
 #include <linux/amba/serial.h>
 
 #include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/hardware.h>
 
 #define UART_NR		2
 
@@ -65,26 +63,16 @@
 #define UART_RX_DATA(s)		(((s) & UART01x_FR_RXFE) == 0)
 #define UART_TX_READY(s)	(((s) & UART01x_FR_TXFF) == 0)
 
-#define UART_DUMMY_RSR_RX	/*256*/0
+#define UART_DUMMY_RSR_RX	256
 #define UART_PORT_SIZE		64
 
-/*
- * On the Integrator platform, the port RTS and DTR are provided by
- * bits in the following SC_CTRLS register bits:
- *        RTS  DTR
- *  UART0  7    6
- *  UART1  5    4
- */
-#define SC_CTRLC	(IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLC_OFFSET)
-#define SC_CTRLS	(IO_ADDRESS(INTEGRATOR_SC_BASE) + INTEGRATOR_SC_CTRLS_OFFSET)
-
 /*
  * We wrap our port structure around the generic uart_port.
  */
 struct uart_amba_port {
 	struct uart_port	port;
-	unsigned int		dtr_mask;
-	unsigned int		rts_mask;
+	struct amba_device	*dev;
+	struct amba_pl010_data	*data;
 	unsigned int		old_status;
 };
 
@@ -300,20 +288,9 @@ static unsigned int pl010_get_mctrl(struct uart_port *port)
 static void pl010_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
 	struct uart_amba_port *uap = (struct uart_amba_port *)port;
-	unsigned int ctrls = 0, ctrlc = 0;
-
-	if (mctrl & TIOCM_RTS)
-		ctrlc |= uap->rts_mask;
-	else
-		ctrls |= uap->rts_mask;
 
-	if (mctrl & TIOCM_DTR)
-		ctrlc |= uap->dtr_mask;
-	else
-		ctrls |= uap->dtr_mask;
-
-	__raw_writel(ctrls, SC_CTRLS);
-	__raw_writel(ctrlc, SC_CTRLC);
+	if (uap->data)
+		uap->data->set_mctrl(uap->dev, uap->port.membase, mctrl);
 }
 
 static void pl010_break_ctl(struct uart_port *port, int break_state)
@@ -539,38 +516,7 @@ static struct uart_ops amba_pl010_pops = {
 	.verify_port	= pl010_verify_port,
 };
 
-static struct uart_amba_port amba_ports[UART_NR] = {
-	{
-		.port	= {
-			.membase	= (void *)IO_ADDRESS(INTEGRATOR_UART0_BASE),
-			.mapbase	= INTEGRATOR_UART0_BASE,
-			.iotype		= UPIO_MEM,
-			.irq		= IRQ_UARTINT0,
-			.uartclk	= 14745600,
-			.fifosize	= 16,
-			.ops		= &amba_pl010_pops,
-			.flags		= UPF_BOOT_AUTOCONF,
-			.line		= 0,
-		},
-		.dtr_mask	= 1 << 5,
-		.rts_mask	= 1 << 4,
-	},
-	{
-		.port	= {
-			.membase	= (void *)IO_ADDRESS(INTEGRATOR_UART1_BASE),
-			.mapbase	= INTEGRATOR_UART1_BASE,
-			.iotype		= UPIO_MEM,
-			.irq		= IRQ_UARTINT1,
-			.uartclk	= 14745600,
-			.fifosize	= 16,
-			.ops		= &amba_pl010_pops,
-			.flags		= UPF_BOOT_AUTOCONF,
-			.line		= 1,
-		},
-		.dtr_mask	= 1 << 7,
-		.rts_mask	= 1 << 6,
-	}
-};
+static struct uart_amba_port *amba_ports[UART_NR];
 
 #ifdef CONFIG_SERIAL_AMBA_PL010_CONSOLE
 
@@ -588,7 +534,7 @@ static void pl010_console_putchar(struct uart_port *port, int ch)
 static void
 pl010_console_write(struct console *co, const char *s, unsigned int count)
 {
-	struct uart_port *port = &amba_ports[co->index].port;
+	struct uart_port *port = &amba_ports[co->index]->port;
 	unsigned int status, old_cr;
 
 	/*
@@ -651,7 +597,7 @@ static int __init pl010_console_setup(struct console *co, char *options)
 	 */
 	if (co->index >= UART_NR)
 		co->index = 0;
-	port = &amba_ports[co->index].port;
+	port = &amba_ports[co->index]->port;
 
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
@@ -672,24 +618,6 @@ static struct console amba_console = {
 	.data		= &amba_reg,
 };
 
-static int __init amba_console_init(void)
-{
-	/*
-	 * All port initializations are done statically
-	 */
-	register_console(&amba_console);
-	return 0;
-}
-console_initcall(amba_console_init);
-
-static int __init amba_late_console_init(void)
-{
-	if (!(amba_console.flags & CON_ENABLED))
-		register_console(&amba_console);
-	return 0;
-}
-late_initcall(amba_late_console_init);
-
 #define AMBA_CONSOLE	&amba_console
 #else
 #define AMBA_CONSOLE	NULL
@@ -707,30 +635,76 @@ static struct uart_driver amba_reg = {
 
 static int pl010_probe(struct amba_device *dev, void *id)
 {
-	int i;
+	struct uart_amba_port *port;
+	void __iomem *base;
+	int i, ret;
 
-	for (i = 0; i < UART_NR; i++) {
-		if (amba_ports[i].port.mapbase != dev->res.start)
-			continue;
+	for (i = 0; i < ARRAY_SIZE(amba_ports); i++)
+		if (amba_ports[i] == NULL)
+			break;
 
-		amba_ports[i].port.dev = &dev->dev;
-		uart_add_one_port(&amba_reg, &amba_ports[i].port);
-		amba_set_drvdata(dev, &amba_ports[i]);
-		break;
+	if (i == ARRAY_SIZE(amba_ports)) {
+		ret = -EBUSY;
+		goto out;
 	}
 
-	return 0;
+	port = kzalloc(sizeof(struct uart_amba_port), GFP_KERNEL);
+	if (!port) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	base = ioremap(dev->res.start, PAGE_SIZE);
+	if (!base) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	port->port.dev = &dev->dev;
+	port->port.mapbase = dev->res.start;
+	port->port.membase = base;
+	port->port.iotype = UPIO_MEM;
+	port->port.irq = dev->irq[0];
+	port->port.uartclk = 14745600;
+	port->port.fifosize = 16;
+	port->port.ops = &amba_pl010_pops;
+	port->port.flags = UPF_BOOT_AUTOCONF;
+	port->port.line = i;
+	port->dev = dev;
+	port->data = dev->dev.platform_data;
+
+	amba_ports[i] = port;
+
+	amba_set_drvdata(dev, port);
+	ret = uart_add_one_port(&amba_reg, &port->port);
+	if (ret) {
+		amba_set_drvdata(dev, NULL);
+		amba_ports[i] = NULL;
+		iounmap(base);
+ free:
+		kfree(port);
+	}
+
+ out:
+	return ret;
 }
 
 static int pl010_remove(struct amba_device *dev)
 {
-	struct uart_amba_port *uap = amba_get_drvdata(dev);
-
-	if (uap)
-		uart_remove_one_port(&amba_reg, &uap->port);
+	struct uart_amba_port *port = amba_get_drvdata(dev);
+	int i;
 
 	amba_set_drvdata(dev, NULL);
 
+	uart_remove_one_port(&amba_reg, &port->port);
+
+	for (i = 0; i < ARRAY_SIZE(amba_ports); i++)
+		if (amba_ports[i] == port)
+			amba_ports[i] = NULL;
+
+	iounmap(port->port.membase);
+	kfree(port);
+
 	return 0;
 }
 
diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h
index dc726ffccebd..48ee32a18ac5 100644
--- a/include/linux/amba/serial.h
+++ b/include/linux/amba/serial.h
@@ -158,4 +158,10 @@
 #define UART01x_RSR_ANY		(UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE)
 #define UART01x_FR_MODEM_ANY	(UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS)
 
+#ifndef __ASSEMBLY__
+struct amba_pl010_data {
+	void (*set_mctrl)(struct amba_device *dev, void __iomem *base, unsigned int mctrl);
+};
+#endif
+
 #endif
-- 
cgit v1.2.3


From 5931c4350059ce9bd5fe398b628c478753a11e44 Mon Sep 17 00:00:00 2001
From: Sylvain Munaut <tnt@246tNt.com>
Date: Sun, 26 Mar 2006 13:37:07 +0200
Subject: [PATCH] ppc32: Adds support for the PCI hostbridge in MPC5200B

ppc32: Adds support for the PCI hostbridge in MPC5200B

Signed-off-by: John Rigby <jrigby@freescale.com>
Signed-off-by: Sylvain Munaut <tnt@246tNt.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/ppc/syslib/mpc52xx_pci.c | 3 ++-
 include/linux/pci_ids.h       | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/ppc/syslib/mpc52xx_pci.c b/arch/ppc/syslib/mpc52xx_pci.c
index 9ec525f9fe98..5a5a7a9cd248 100644
--- a/arch/ppc/syslib/mpc52xx_pci.c
+++ b/arch/ppc/syslib/mpc52xx_pci.c
@@ -225,7 +225,8 @@ mpc52xx_pci_fixup_resources(struct pci_dev *dev)
 	/* The PCI Host bridge of MPC52xx has a prefetch memory resource
 	   fixed to 1Gb. Doesn't fit in the resource system so we remove it */
 	if ( (dev->vendor == PCI_VENDOR_ID_MOTOROLA) &&
-	     (dev->device == PCI_DEVICE_ID_MOTOROLA_MPC5200) ) {
+	     (   dev->device == PCI_DEVICE_ID_MOTOROLA_MPC5200
+	      || dev->device == PCI_DEVICE_ID_MOTOROLA_MPC5200B) ) {
 		struct resource *res = &dev->resource[1];
 		res->start = res->end = res->flags = 0;
 	}
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 6f080ae59286..72d1b678e0e9 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -772,6 +772,7 @@
 #define PCI_DEVICE_ID_MOTOROLA_HAWK	0x4803
 #define PCI_DEVICE_ID_MOTOROLA_HARRIER	0x480b
 #define PCI_DEVICE_ID_MOTOROLA_MPC5200	0x5803
+#define PCI_DEVICE_ID_MOTOROLA_MPC5200B	0x5809
 
 #define PCI_VENDOR_ID_PROMISE		0x105a
 #define PCI_DEVICE_ID_PROMISE_20265	0x0d30
-- 
cgit v1.2.3


From 837c7878771c15ed8d85ecf814ece7fcb4551b46 Mon Sep 17 00:00:00 2001
From: Ben Woodard <woodard@redhat.com>
Date: Wed, 22 Mar 2006 08:09:31 +0100
Subject: [BLOCK] increase size of disk stat counters

The kernel's representation of the disk statistics uses the type unsigned
which is 32b on both 32b and 64b platforms.  Unfortunately, most system
tools that work with these numbers that are exported in /proc/diskstats
including iostat read these numbers into unsigned longs.  This works fine
on 32b platforms and when the number of IO transactions are small on 64b
platforms.  However, when the numbers wrap on 64b platforms & you read the
numbers into unsigned longs, and compare the numbers to previous readings,
then you get an unsigned representation of a negative number.  This looks
like a very large 64b number & gives you bizarre readouts in iostat:

ilc4: Device:    rrqm/s wrqm/s r/s    w/s  rsec/s  wsec/s    rkB/s wkB/s avgrq-sz avgqu-sz   await  svctm  %util
ilc4: sda        5.50   0.00   143.96 0.00 307496983987862656.00 0.00 153748491993931328.00     0.00 2136028725038430.00     7.94   55.12    5.59  80.42

Though fixing iostat in user space is possible, and a quick survey
indicates that several other similar tools also use unsigned longs when
processing /proc/diskstats.  Therefore, it seems like a better approach
would be to extend the length of the disk_stats structure on 64b
architectures to 64b.  The following patch does that.  It should not affect
the operation on 32b platforms.

Signed-off-by: Ben Woodard <woodard@redhat.com>
Cc: Rick Lindsley <ricklind@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/genhd.c         |  6 +++---
 include/linux/genhd.h | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 64510fd88621..db4c60c802d6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -454,8 +454,8 @@ static ssize_t disk_stats_read(struct gendisk * disk, char *page)
 	disk_round_stats(disk);
 	preempt_enable();
 	return sprintf(page,
-		"%8u %8u %8llu %8u "
-		"%8u %8u %8llu %8u "
+		"%8lu %8lu %8llu %8u "
+		"%8lu %8lu %8llu %8u "
 		"%8u %8u %8u"
 		"\n",
 		disk_stat_read(disk, ios[READ]),
@@ -649,7 +649,7 @@ static int diskstats_show(struct seq_file *s, void *v)
 	preempt_disable();
 	disk_round_stats(gp);
 	preempt_enable();
-	seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n",
+	seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
 		gp->major, n + gp->first_minor, disk_name(gp, n, buf),
 		disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
 		(unsigned long long)disk_stat_read(gp, sectors[0]),
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index fd647fde5ec1..179fea53fc81 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -89,12 +89,12 @@ struct hd_struct {
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	32
 
 struct disk_stats {
-	unsigned sectors[2];		/* READs and WRITEs */
-	unsigned ios[2];
-	unsigned merges[2];
-	unsigned ticks[2];
-	unsigned io_ticks;
-	unsigned time_in_queue;
+	unsigned long sectors[2];	/* READs and WRITEs */
+	unsigned long ios[2];
+	unsigned long merges[2];
+	unsigned long ticks[2];
+	unsigned long io_ticks;
+	unsigned long time_in_queue;
 };
 	
 struct gendisk {
-- 
cgit v1.2.3


From f75ba3ade8a4599d67040a9493d75a864e7b329c Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Mon, 27 Mar 2006 01:14:52 -0800
Subject: [PATCH] autofs4: increase module version

Update autofs4 version.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/auto_fs4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h
index 9343c89d843c..d998ddcf7288 100644
--- a/include/linux/auto_fs4.h
+++ b/include/linux/auto_fs4.h
@@ -23,7 +23,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	3
 #define AUTOFS_MAX_PROTO_VERSION	4
 
-#define AUTOFS_PROTO_SUBVERSION		7
+#define AUTOFS_PROTO_SUBVERSION		10
 
 /* Mask for expire behaviour */
 #define AUTOFS_EXP_IMMEDIATE		1
-- 
cgit v1.2.3


From 5c0a32fc2cd0be912511199449a37a4a6f0f582d Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Mon, 27 Mar 2006 01:14:55 -0800
Subject: [PATCH] autofs4: add new packet type for v5 communications

This patch define a new autofs packet for autofs v5 and updates the waitq.c
functions to handle the additional packet type.

Signed-off-by: Ian Kent <raven@themaw.net>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/autofs4/autofs_i.h    | 23 +++++++++----
 fs/autofs4/waitq.c       | 86 +++++++++++++++++++++++++++++++++++++++++-------
 include/linux/auto_fs4.h | 51 +++++++++++++++++++++++++---
 3 files changed, 136 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index ed388a1d8fc4..37c8d909d1e9 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -77,6 +77,12 @@ struct autofs_wait_queue {
 	int hash;
 	int len;
 	char *name;
+	u32 dev;
+	u64 ino;
+	uid_t uid;
+	gid_t gid;
+	pid_t pid;
+	pid_t tgid;
 	/* This is for status reporting upon return */
 	int status;
 	atomic_t notified;
@@ -180,13 +186,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info
 
 /* Queue management functions */
 
-enum autofs_notify
-{
-	NFY_NONE,
-	NFY_MOUNT,
-	NFY_EXPIRE
-};
-
 int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
@@ -204,6 +203,16 @@ static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **de
 	return res;
 }
 
+static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
+{
+	return new_encode_dev(sbi->sb->s_dev);
+}
+
+static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
+{
+	return sbi->sb->s_root->d_inode->i_ino;
+}
+
 static inline int simple_positive(struct dentry *dentry)
 {
 	return dentry->d_inode && !d_unhashed(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index b0bb9d43bcd9..12da2c977b0a 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -3,7 +3,7 @@
  * linux/fs/autofs/waitq.c
  *
  *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *  Copyright 2001-2003 Ian Kent <raven@themaw.net>
+ *  Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
@@ -97,7 +97,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = type;
-	if (type == autofs_ptype_missing) {
+	switch (type) {
+	/* Kernel protocol v4 missing and expire packets */
+	case autofs_ptype_missing:
+	{
 		struct autofs_packet_missing *mp = &pkt.missing;
 
 		pktsz = sizeof(*mp);
@@ -106,7 +109,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		mp->len = wq->len;
 		memcpy(mp->name, wq->name, wq->len);
 		mp->name[wq->len] = '\0';
-	} else if (type == autofs_ptype_expire_multi) {
+		break;
+	}
+	case autofs_ptype_expire_multi:
+	{
 		struct autofs_packet_expire_multi *ep = &pkt.expire_multi;
 
 		pktsz = sizeof(*ep);
@@ -115,7 +121,34 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		ep->len = wq->len;
 		memcpy(ep->name, wq->name, wq->len);
 		ep->name[wq->len] = '\0';
-	} else {
+		break;
+	}
+	/*
+	 * Kernel protocol v5 packet for handling indirect and direct
+	 * mount missing and expire requests
+	 */
+	case autofs_ptype_missing_indirect:
+	case autofs_ptype_expire_indirect:
+	case autofs_ptype_missing_direct:
+	case autofs_ptype_expire_direct:
+	{
+		struct autofs_v5_packet *packet = &pkt.v5_packet;
+
+		pktsz = sizeof(*packet);
+
+		packet->wait_queue_token = wq->wait_queue_token;
+		packet->len = wq->len;
+		memcpy(packet->name, wq->name, wq->len);
+		packet->name[wq->len] = '\0';
+		packet->dev = wq->dev;
+		packet->ino = wq->ino;
+		packet->uid = wq->uid;
+		packet->gid = wq->gid;
+		packet->pid = wq->pid;
+		packet->tgid = wq->tgid;
+		break;
+	}
+	default:
 		printk("autofs4_notify_daemon: bad type %d!\n", type);
 		return;
 	}
@@ -161,7 +194,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 {
 	struct autofs_wait_queue *wq;
 	char *name;
-	int len, status;
+	unsigned int len = 0;
+	unsigned int hash = 0;
+	int status;
 
 	/* In catatonic mode, we don't wait for nobody */
 	if (sbi->catatonic)
@@ -171,11 +206,17 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 	if (!name)
 		return -ENOMEM;
 
-	len = autofs4_getpath(sbi, dentry, &name);
-	if (!len) {
-		kfree(name);
-		return -ENOENT;
+	/* If this is a direct mount request create a dummy name */
+	if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYP_DIRECT))
+		len = sprintf(name, "%p", dentry);
+	else {
+		len = autofs4_getpath(sbi, dentry, &name);
+		if (!len) {
+			kfree(name);
+			return -ENOENT;
+		}
 	}
+	hash = full_name_hash(name, len);
 
 	if (mutex_lock_interruptible(&sbi->wq_mutex)) {
 		kfree(name);
@@ -211,9 +252,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->next = sbi->queues;
 		sbi->queues = wq;
 		init_waitqueue_head(&wq->queue);
-		wq->hash = dentry->d_name.hash;
+		wq->hash = hash;
 		wq->name = name;
 		wq->len = len;
+		wq->dev = autofs4_get_dev(sbi);
+		wq->ino = autofs4_get_ino(sbi);
+		wq->uid = current->uid;
+		wq->gid = current->gid;
+		wq->pid = current->pid;
+		wq->tgid = current->tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
 		atomic_set(&wq->wait_ctr, 2);
 		atomic_set(&wq->notified, 1);
@@ -227,8 +274,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 	}
 
 	if (notify != NFY_NONE && atomic_dec_and_test(&wq->notified)) {
-		int type = (notify == NFY_MOUNT ?
-			autofs_ptype_missing : autofs_ptype_expire_multi);
+		int type;
+
+		if (sbi->version < 5) {
+			if (notify == NFY_MOUNT)
+				type = autofs_ptype_missing;
+			else
+				type = autofs_ptype_expire_multi;
+		} else {
+			if (notify == NFY_MOUNT)
+				type = (sbi->type & AUTOFS_TYP_DIRECT) ?
+					autofs_ptype_missing_direct :
+					 autofs_ptype_missing_indirect;
+			else
+				type = (sbi->type & AUTOFS_TYP_DIRECT) ?
+					autofs_ptype_expire_direct :
+					autofs_ptype_expire_indirect;
+		}
 
 		DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
 			(unsigned long) wq->wait_queue_token, wq->len, wq->name, notify);
diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h
index d998ddcf7288..0a6bc52ffe88 100644
--- a/include/linux/auto_fs4.h
+++ b/include/linux/auto_fs4.h
@@ -19,18 +19,37 @@
 #undef AUTOFS_MIN_PROTO_VERSION
 #undef AUTOFS_MAX_PROTO_VERSION
 
-#define AUTOFS_PROTO_VERSION		4
+#define AUTOFS_PROTO_VERSION		5
 #define AUTOFS_MIN_PROTO_VERSION	3
-#define AUTOFS_MAX_PROTO_VERSION	4
+#define AUTOFS_MAX_PROTO_VERSION	5
 
-#define AUTOFS_PROTO_SUBVERSION		10
+#define AUTOFS_PROTO_SUBVERSION		0
 
 /* Mask for expire behaviour */
 #define AUTOFS_EXP_IMMEDIATE		1
 #define AUTOFS_EXP_LEAVES		2
 
-/* New message type */
-#define autofs_ptype_expire_multi	2	/* Expire entry (umount request) */
+/* Daemon notification packet types */
+enum autofs_notify {
+	NFY_NONE,
+	NFY_MOUNT,
+	NFY_EXPIRE
+};
+
+/* Kernel protocol version 4 packet types */
+
+/* Expire entry (umount request) */
+#define autofs_ptype_expire_multi	2
+
+/* Kernel protocol version 5 packet types */
+
+/* Indirect mount missing and expire requests. */
+#define autofs_ptype_missing_indirect	3
+#define autofs_ptype_expire_indirect	4
+
+/* Direct mount missing and expire requests */
+#define autofs_ptype_missing_direct	5
+#define autofs_ptype_expire_direct	6
 
 /* v4 multi expire (via pipe) */
 struct autofs_packet_expire_multi {
@@ -40,14 +59,36 @@ struct autofs_packet_expire_multi {
 	char name[NAME_MAX+1];
 };
 
+/* autofs v5 common packet struct */
+struct autofs_v5_packet {
+	struct autofs_packet_hdr hdr;
+	autofs_wqt_t wait_queue_token;
+	__u32 dev;
+	__u64 ino;
+	__u32 uid;
+	__u32 gid;
+	__u32 pid;
+	__u32 tgid;
+	__u32 len;
+	char name[NAME_MAX+1];
+};
+
+typedef struct autofs_v5_packet autofs_packet_missing_indirect_t;
+typedef struct autofs_v5_packet autofs_packet_expire_indirect_t;
+typedef struct autofs_v5_packet autofs_packet_missing_direct_t;
+typedef struct autofs_v5_packet autofs_packet_expire_direct_t;
+
 union autofs_packet_union {
 	struct autofs_packet_hdr hdr;
 	struct autofs_packet_missing missing;
 	struct autofs_packet_expire expire;
 	struct autofs_packet_expire_multi expire_multi;
+	struct autofs_v5_packet v5_packet;
 };
 
 #define AUTOFS_IOC_EXPIRE_MULTI		_IOW(0x93,0x66,int)
+#define AUTOFS_IOC_EXPIRE_INDIRECT	AUTOFS_IOC_EXPIRE_MULTI
+#define AUTOFS_IOC_EXPIRE_DIRECT	AUTOFS_IOC_EXPIRE_MULTI
 #define AUTOFS_IOC_PROTOSUBVER		_IOR(0x93,0x67,int)
 #define AUTOFS_IOC_ASKREGHOST           _IOR(0x93,0x68,int)
 #define AUTOFS_IOC_TOGGLEREGHOST        _IOR(0x93,0x69,int)
-- 
cgit v1.2.3


From efc36aa5608f5717338747e152c23f2cfdb14697 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:14:59 -0800
Subject: [PATCH] knfsd: Change the store of auth_domains to not be a 'cache'

The 'auth_domain's are simply handles on internal data structures.  They do
not cache information from user-space, and forcing them into the mold of a
'cache' misrepresents their true nature and causes confusion.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c                  |   5 +-
 include/linux/sunrpc/svcauth.h    |  12 ++--
 net/sunrpc/auth_gss/svcauth_gss.c |  14 ++---
 net/sunrpc/sunrpc_syms.c          |   4 +-
 net/sunrpc/svcauth.c              | 122 +++++++++++---------------------------
 net/sunrpc/svcauth_unix.c         |  69 ++++++++++-----------
 6 files changed, 81 insertions(+), 145 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 417ec02df44f..ac0997731fce 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -242,7 +242,7 @@ static inline int svc_expkey_match (struct svc_expkey *a, struct svc_expkey *b)
 
 static inline void svc_expkey_init(struct svc_expkey *new, struct svc_expkey *item)
 {
-	cache_get(&item->ek_client->h);
+	kref_get(&item->ek_client->ref);
 	new->ek_client = item->ek_client;
 	new->ek_fsidtype = item->ek_fsidtype;
 	new->ek_fsid[0] = item->ek_fsid[0];
@@ -474,7 +474,7 @@ static inline int svc_export_match(struct svc_export *a, struct svc_export *b)
 }
 static inline void svc_export_init(struct svc_export *new, struct svc_export *item)
 {
-	cache_get(&item->ex_client->h);
+	kref_get(&item->ex_client->ref);
 	new->ex_client = item->ex_client;
 	new->ex_dentry = dget(item->ex_dentry);
 	new->ex_mnt = mntget(item->ex_mnt);
@@ -1129,7 +1129,6 @@ exp_delclient(struct nfsctl_client *ncp)
 	 */
 	if (dom) {
 		err = auth_unix_forget_old(dom);
-		dom->h.expiry_time = get_seconds();
 		auth_domain_put(dom);
 	}
 
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index c119ce7cbd22..2fe2087edd66 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -45,9 +45,10 @@ struct svc_rqst;		/* forward decl */
  * of ip addresses to the given client.
  */
 struct auth_domain {
-	struct	cache_head	h;
+	struct kref		ref;
+	struct hlist_node	hash;
 	char			*name;
-	int			flavour;
+	struct auth_ops		*flavour;
 };
 
 /*
@@ -86,6 +87,9 @@ struct auth_domain {
  *
  * domain_release()
  *   This call releases a domain.
+ * set_client()
+ *   Givens a pending request (struct svc_rqst), finds and assigns
+ *   an appropriate 'auth_domain' as the client.
  */
 struct auth_ops {
 	char *	name;
@@ -117,7 +121,7 @@ extern void	svc_auth_unregister(rpc_authflavor_t flavor);
 extern struct auth_domain *unix_domain_find(char *name);
 extern void auth_domain_put(struct auth_domain *item);
 extern int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom);
-extern struct auth_domain *auth_domain_lookup(struct auth_domain *item, int set);
+extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new);
 extern struct auth_domain *auth_domain_find(char *name);
 extern struct auth_domain *auth_unix_lookup(struct in_addr addr);
 extern int auth_unix_forget_old(struct auth_domain *dom);
@@ -160,8 +164,6 @@ static inline unsigned long hash_mem(char *buf, int length, int bits)
 	return hash >> (BITS_PER_LONG - bits);
 }
 
-extern struct cache_detail auth_domain_cache, ip_map_cache;
-
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_SUNRPC_SVCAUTH_H_ */
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 23632d84d8d7..6b073c2e6930 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -645,6 +645,8 @@ find_gss_auth_domain(struct gss_ctx *ctx, u32 svc)
 	return auth_domain_find(name);
 }
 
+static struct auth_ops svcauthops_gss;
+
 int
 svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
 {
@@ -655,20 +657,18 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
 	new = kmalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
 		goto out;
-	cache_init(&new->h.h);
+	kref_init(&new->h.ref);
 	new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
 	if (!new->h.name)
 		goto out_free_dom;
 	strcpy(new->h.name, name);
-	new->h.flavour = RPC_AUTH_GSS;
+	new->h.flavour = &svcauthops_gss;
 	new->pseudoflavor = pseudoflavor;
-	new->h.h.expiry_time = NEVER;
 
-	test = auth_domain_lookup(&new->h, 1);
-	if (test == &new->h) {
-		BUG_ON(atomic_dec_and_test(&new->h.h.refcnt));
-	} else { /* XXX Duplicate registration? */
+	test = auth_domain_lookup(name, &new->h);
+	if (test != &new->h) { /* XXX Duplicate registration? */
 		auth_domain_put(&new->h);
+		/* dangling ref-count... */
 		goto out;
 	}
 	return 0;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 9f7373203592..40401196e7de 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -142,6 +142,7 @@ EXPORT_SYMBOL(nlm_debug);
 
 extern int register_rpc_pipefs(void);
 extern void unregister_rpc_pipefs(void);
+extern struct cache_detail ip_map_cache;
 
 static int __init
 init_sunrpc(void)
@@ -158,7 +159,6 @@ init_sunrpc(void)
 #ifdef CONFIG_PROC_FS
 	rpc_proc_init();
 #endif
-	cache_register(&auth_domain_cache);
 	cache_register(&ip_map_cache);
 out:
 	return err;
@@ -169,8 +169,6 @@ cleanup_sunrpc(void)
 {
 	unregister_rpc_pipefs();
 	rpc_destroy_mempool();
-	if (cache_unregister(&auth_domain_cache))
-		printk(KERN_ERR "sunrpc: failed to unregister auth_domain cache\n");
 	if (cache_unregister(&ip_map_cache))
 		printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n");
 #ifdef RPC_DEBUG
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index dda4f0c63511..5b28c6176806 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -106,112 +106,56 @@ svc_auth_unregister(rpc_authflavor_t flavor)
 EXPORT_SYMBOL(svc_auth_unregister);
 
 /**************************************************
- * cache for domain name to auth_domain
- * Entries are only added by flavours which will normally
- * have a structure that 'inherits' from auth_domain.
- * e.g. when an IP -> domainname is given to  auth_unix,
- * and the domain name doesn't exist, it will create a
- * auth_unix_domain and add it to this hash table.
- * If it finds the name does exist, but isn't AUTH_UNIX,
- * it will complain.
+ * 'auth_domains' are stored in a hash table indexed by name.
+ * When the last reference to an 'auth_domain' is dropped,
+ * the object is unhashed and freed.
+ * If auth_domain_lookup fails to find an entry, it will return
+ * it's second argument 'new'.  If this is non-null, it will
+ * have been atomically linked into the table.
  */
 
-/*
- * Auth auth_domain cache is somewhat different to other caches,
- * largely because the entries are possibly of different types:
- * each auth flavour has it's own type.
- * One consequence of this that DefineCacheLookup cannot
- * allocate a new structure as it cannot know the size.
- * Notice that the "INIT" code fragment is quite different
- * from other caches.  When auth_domain_lookup might be
- * creating a new domain, the new domain is passed in
- * complete and it is used as-is rather than being copied into
- * another structure.
- */
 #define	DN_HASHBITS	6
 #define	DN_HASHMAX	(1<<DN_HASHBITS)
 #define	DN_HASHMASK	(DN_HASHMAX-1)
 
-static struct cache_head	*auth_domain_table[DN_HASHMAX];
-
-static void auth_domain_drop(struct cache_head *item, struct cache_detail *cd)
-{
-	struct auth_domain *dom = container_of(item, struct auth_domain, h);
-	if (cache_put(item,cd))
-		authtab[dom->flavour]->domain_release(dom);
-}
-
-
-struct cache_detail auth_domain_cache = {
-	.owner		= THIS_MODULE,
-	.hash_size	= DN_HASHMAX,
-	.hash_table	= auth_domain_table,
-	.name		= "auth.domain",
-	.cache_put	= auth_domain_drop,
-};
+static struct hlist_head	auth_domain_table[DN_HASHMAX];
+static spinlock_t	auth_domain_lock = SPIN_LOCK_UNLOCKED;
 
 void auth_domain_put(struct auth_domain *dom)
 {
-	auth_domain_drop(&dom->h, &auth_domain_cache);
-}
-
-static inline int auth_domain_hash(struct auth_domain *item)
-{
-	return hash_str(item->name, DN_HASHBITS);
-}
-static inline int auth_domain_match(struct auth_domain *tmp, struct auth_domain *item)
-{
-	return strcmp(tmp->name, item->name) == 0;
+	if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) {
+		hlist_del(&dom->hash);
+		dom->flavour->domain_release(dom);
+	}
 }
 
 struct auth_domain *
-auth_domain_lookup(struct auth_domain *item, int set)
+auth_domain_lookup(char *name, struct auth_domain *new)
 {
-	struct auth_domain *tmp = NULL;
-	struct cache_head **hp, **head;
-	head = &auth_domain_cache.hash_table[auth_domain_hash(item)];
-
-	if (set)
-		write_lock(&auth_domain_cache.hash_lock);
-	else
-		read_lock(&auth_domain_cache.hash_lock);
-	for (hp=head; *hp != NULL; hp = &tmp->h.next) {
-		tmp = container_of(*hp, struct auth_domain, h);
-		if (!auth_domain_match(tmp, item))
-			continue;
-		if (!set) {
-			cache_get(&tmp->h);
-			goto out_noset;
+	struct auth_domain *hp;
+	struct hlist_head *head;
+	struct hlist_node *np;
+
+	head = &auth_domain_table[hash_str(name, DN_HASHBITS)];
+
+	spin_lock(&auth_domain_lock);
+
+	hlist_for_each_entry(hp, np, head, hash) {
+		if (strcmp(hp->name, name)==0) {
+			kref_get(&hp->ref);
+			spin_unlock(&auth_domain_lock);
+			return hp;
 		}
-		*hp = tmp->h.next;
-		tmp->h.next = NULL;
-		auth_domain_drop(&tmp->h, &auth_domain_cache);
-		goto out_set;
 	}
-	/* Didn't find anything */
-	if (!set)
-		goto out_nada;
-	auth_domain_cache.entries++;
-out_set:
-	item->h.next = *head;
-	*head = &item->h;
-	cache_get(&item->h);
-	write_unlock(&auth_domain_cache.hash_lock);
-	cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time);
-	cache_get(&item->h);
-	return item;
-out_nada:
-	tmp = NULL;
-out_noset:
-	read_unlock(&auth_domain_cache.hash_lock);
-	return tmp;
+	if (new) {
+		hlist_add_head(&new->hash, head);
+		kref_get(&new->ref);
+	}
+	spin_unlock(&auth_domain_lock);
+	return new;
 }
 
 struct auth_domain *auth_domain_find(char *name)
 {
-	struct auth_domain *rv, ad;
-
-	ad.name = name;
-	rv = auth_domain_lookup(&ad, 0);
-	return rv;
+	return auth_domain_lookup(name, NULL);
 }
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 3e6c694bbad1..17e8b2a3130c 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -27,41 +27,35 @@ struct unix_domain {
 	/* other stuff later */
 };
 
+extern struct auth_ops svcauth_unix;
+
 struct auth_domain *unix_domain_find(char *name)
 {
-	struct auth_domain *rv, ud;
-	struct unix_domain *new;
-
-	ud.name = name;
-	
-	rv = auth_domain_lookup(&ud, 0);
-
- foundit:
-	if (rv && rv->flavour != RPC_AUTH_UNIX) {
-		auth_domain_put(rv);
-		return NULL;
-	}
-	if (rv)
-		return rv;
-
-	new = kmalloc(sizeof(*new), GFP_KERNEL);
-	if (new == NULL)
-		return NULL;
-	cache_init(&new->h.h);
-	new->h.name = kstrdup(name, GFP_KERNEL);
-	new->h.flavour = RPC_AUTH_UNIX;
-	new->addr_changes = 0;
-	new->h.h.expiry_time = NEVER;
-
-	rv = auth_domain_lookup(&new->h, 2);
-	if (rv == &new->h) {
-		if (atomic_dec_and_test(&new->h.h.refcnt)) BUG();
-	} else {
-		auth_domain_put(&new->h);
-		goto foundit;
+	struct auth_domain *rv;
+	struct unix_domain *new = NULL;
+
+	rv = auth_domain_lookup(name, NULL);
+	while(1) {
+		if (rv != &new->h) {
+			if (new) auth_domain_put(&new->h);
+			return rv;
+		}
+		if (rv && rv->flavour != &svcauth_unix) {
+			auth_domain_put(rv);
+			return NULL;
+		}
+		if (rv)
+			return rv;
+
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		if (new == NULL)
+			return NULL;
+		kref_init(&new->h.ref);
+		new->h.name = kstrdup(name, GFP_KERNEL);
+		new->h.flavour = &svcauth_unix;
+		new->addr_changes = 0;
+		rv = auth_domain_lookup(name, &new->h);
 	}
-
-	return rv;
 }
 
 static void svcauth_unix_domain_release(struct auth_domain *dom)
@@ -130,7 +124,7 @@ static inline void ip_map_init(struct ip_map *new, struct ip_map *item)
 }
 static inline void ip_map_update(struct ip_map *new, struct ip_map *item)
 {
-	cache_get(&item->m_client->h.h);
+	kref_get(&item->m_client->h.ref);
 	new->m_client = item->m_client;
 	new->m_add_change = item->m_add_change;
 }
@@ -272,7 +266,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
 	struct unix_domain *udom;
 	struct ip_map ip, *ipmp;
 
-	if (dom->flavour != RPC_AUTH_UNIX)
+	if (dom->flavour != &svcauth_unix)
 		return -EINVAL;
 	udom = container_of(dom, struct unix_domain, h);
 	strcpy(ip.m_class, "nfsd");
@@ -295,7 +289,7 @@ int auth_unix_forget_old(struct auth_domain *dom)
 {
 	struct unix_domain *udom;
 	
-	if (dom->flavour != RPC_AUTH_UNIX)
+	if (dom->flavour != &svcauth_unix)
 		return -EINVAL;
 	udom = container_of(dom, struct unix_domain, h);
 	udom->addr_changes++;
@@ -323,7 +317,7 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
 		rv = NULL;
 	} else {
 		rv = &ipm->m_client->h;
-		cache_get(&rv->h);
+		kref_get(&rv->ref);
 	}
 	ip_map_put(&ipm->h, &ip_map_cache);
 	return rv;
@@ -332,7 +326,6 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
 void svcauth_unix_purge(void)
 {
 	cache_purge(&ip_map_cache);
-	cache_purge(&auth_domain_cache);
 }
 
 static int
@@ -361,7 +354,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 			return SVC_DENIED;
 		case 0:
 			rqstp->rq_client = &ipm->m_client->h;
-			cache_get(&rqstp->rq_client->h);
+			kref_get(&rqstp->rq_client->ref);
 			ip_map_put(&ipm->h, &ip_map_cache);
 			break;
 	}
-- 
cgit v1.2.3


From eab7e2e647c348b418e8715ecaca0177e1b473c7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:15:00 -0800
Subject: [PATCH] knfsd: Break the hard linkage from svc_expkey to svc_export

Current svc_expkey holds a pointer to the svc_export structure, so updates to
that structure have to be in-place, which is a wart on the whole cache
infrastruct.  So we break that linkage and just do a second lookup.

If this became a performance issue, it would be possible to put a direct link
back in which was only used conditionally.  i.e.  when an object is replaced
in the cache, we set a flag in the old object.  When dereferencing the link
from svc_expkey, if the flag is set, we drop the reference and do a fresh
lookup.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c            | 60 ++++++++++++++++++++++++++++++---------------
 include/linux/nfsd/export.h | 20 +++------------
 2 files changed, 44 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ac0997731fce..587829ed651c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -73,8 +73,10 @@ void expkey_put(struct cache_head *item, struct cache_detail *cd)
 	if (cache_put(item, cd)) {
 		struct svc_expkey *key = container_of(item, struct svc_expkey, h);
 		if (test_bit(CACHE_VALID, &item->flags) &&
-		    !test_bit(CACHE_NEGATIVE, &item->flags))
-			exp_put(key->ek_export);
+		    !test_bit(CACHE_NEGATIVE, &item->flags)) {
+			dput(key->ek_dentry);
+			mntput(key->ek_mnt);
+		}
 		auth_domain_put(key->ek_client);
 		kfree(key);
 	}
@@ -164,26 +166,18 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 	} else {
 		struct nameidata nd;
 		struct svc_expkey *ek;
-		struct svc_export *exp;
 		err = path_lookup(buf, 0, &nd);
 		if (err)
 			goto out;
 
 		dprintk("Found the path %s\n", buf);
-		exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL);
-
-		err = -ENOENT;
-		if (!exp)
-			goto out_nd;
-		key.ek_export = exp;
-		dprintk("And found export\n");
+		key.ek_mnt = nd.mnt;
+		key.ek_dentry = nd.dentry;
 		
 		ek = svc_expkey_lookup(&key, 1);
 		if (ek)
 			expkey_put(&ek->h, &svc_expkey_cache);
-		exp_put(exp);
 		err = 0;
-	out_nd:
 		path_release(&nd);
 	}
 	cache_flush();
@@ -214,7 +208,7 @@ static int expkey_show(struct seq_file *m,
 	if (test_bit(CACHE_VALID, &h->flags) && 
 	    !test_bit(CACHE_NEGATIVE, &h->flags)) {
 		seq_printf(m, " ");
-		seq_path(m, ek->ek_export->ex_mnt, ek->ek_export->ex_dentry, "\\ \t\n");
+		seq_path(m, ek->ek_mnt, ek->ek_dentry, "\\ \t\n");
 	}
 	seq_printf(m, "\n");
 	return 0;
@@ -252,8 +246,8 @@ static inline void svc_expkey_init(struct svc_expkey *new, struct svc_expkey *it
 
 static inline void svc_expkey_update(struct svc_expkey *new, struct svc_expkey *item)
 {
-	cache_get(&item->ek_export->h);
-	new->ek_export = item->ek_export;
+	new->ek_mnt = mntget(item->ek_mnt);
+	new->ek_dentry = dget(item->ek_dentry);
 }
 
 static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */
@@ -519,7 +513,8 @@ static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
 	key.ek_client = clp;
 	key.ek_fsidtype = fsid_type;
 	memcpy(key.ek_fsid, fsidv, key_len(fsid_type));
-	key.ek_export = exp;
+	key.ek_mnt = exp->ex_mnt;
+	key.ek_dentry = exp->ex_dentry;
 	key.h.expiry_time = NEVER;
 	key.h.flags = 0;
 
@@ -741,8 +736,8 @@ exp_export(struct nfsctl_export *nxp)
 	if ((nxp->ex_flags & NFSEXP_FSID) &&
 	    (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) &&
 	    !IS_ERR(fsid_key) &&
-	    fsid_key->ek_export &&
-	    fsid_key->ek_export != exp)
+	    fsid_key->ek_mnt &&
+	    (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) )
 		goto finish;
 
 	if (exp) {
@@ -912,6 +907,24 @@ out:
 	return err;
 }
 
+struct svc_export *
+exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
+	 struct cache_req *reqp)
+{
+	struct svc_export *exp;
+	struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp);
+	if (!ek || IS_ERR(ek))
+		return ERR_PTR(PTR_ERR(ek));
+
+	exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp);
+	expkey_put(&ek->h, &svc_expkey_cache);
+
+	if (!exp || IS_ERR(exp))
+		return ERR_PTR(PTR_ERR(exp));
+	return exp;
+}
+
+
 /*
  * Called when we need the filehandle for the root of the pseudofs,
  * for a given NFSv4 client.   The root is defined to be the
@@ -922,6 +935,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 	       struct cache_req *creq)
 {
 	struct svc_expkey *fsid_key;
+	struct svc_export *exp;
 	int rv;
 	u32 fsidv[2];
 
@@ -933,8 +947,14 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 	if (!fsid_key || IS_ERR(fsid_key))
 		return nfserr_perm;
 
-	rv = fh_compose(fhp, fsid_key->ek_export, 
-			  fsid_key->ek_export->ex_dentry, NULL);
+	exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq);
+	if (exp == NULL)
+		rv = nfserr_perm;
+	else if (IS_ERR(exp))
+		rv = nfserrno(PTR_ERR(exp));
+	else
+		rv = fh_compose(fhp, exp,
+				fsid_key->ek_dentry, NULL);
 	expkey_put(&fsid_key->h, &svc_expkey_cache);
 	return rv;
 }
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 6bad4766d3d9..d52e0b7ad37b 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -67,7 +67,8 @@ struct svc_expkey {
 	int			ek_fsidtype;
 	u32			ek_fsid[3];
 
-	struct svc_export *	ek_export;
+	struct vfsmount *	ek_mnt;
+	struct dentry *		ek_dentry;
 };
 
 #define EX_SECURE(exp)		(!((exp)->ex_flags & NFSEXP_INSECURE_PORT))
@@ -114,22 +115,9 @@ static inline void exp_get(struct svc_export *exp)
 {
 	cache_get(&exp->h);
 }
-static inline struct svc_export *
+extern struct svc_export *
 exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
-	 struct cache_req *reqp)
-{
-	struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp);
-	if (ek && !IS_ERR(ek)) {
-		struct svc_export *exp = ek->ek_export;
-		int err;
-		exp_get(exp);
-		expkey_put(&ek->h, &svc_expkey_cache);
-		if ((err = cache_check(&svc_export_cache, &exp->h, reqp)))
-			exp = ERR_PTR(err);
-		return exp;
-	} else
-		return ERR_PTR(PTR_ERR(ek));
-}
+	 struct cache_req *reqp);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From 7d317f2c9f1e9dcf4f632fa98f91d1d4a36c4cae Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:15:01 -0800
Subject: [PATCH] knfsd: Get rid of 'inplace' sunrpc caches

These were an unnecessary wart.  Also only have one 'DefineSimpleCache..'
instead of two.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c                  |  4 ++--
 fs/nfsd/nfs4idmap.c               | 10 ++--------
 include/linux/sunrpc/cache.h      | 28 +++++++++++-----------------
 net/sunrpc/auth_gss/svcauth_gss.c |  4 ++--
 net/sunrpc/svcauth_unix.c         |  2 +-
 5 files changed, 18 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 587829ed651c..c591761a1ad6 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -250,7 +250,7 @@ static inline void svc_expkey_update(struct svc_expkey *new, struct svc_expkey *
 	new->ek_dentry = dget(item->ek_dentry);
 }
 
-static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */
+static DefineSimpleCacheLookup(svc_expkey, svc_expkey)
 
 #define	EXPORT_HASHBITS		8
 #define	EXPORT_HASHMAX		(1<< EXPORT_HASHBITS)
@@ -482,7 +482,7 @@ static inline void svc_export_update(struct svc_export *new, struct svc_export *
 	new->ex_fsid = item->ex_fsid;
 }
 
-static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */
+static DefineSimpleCacheLookup(svc_export, svc_export)
 
 
 struct svc_expkey *
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 13369650cdf9..dea690aa8bb5 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -76,12 +76,6 @@ struct ent {
 	char              authname[IDMAP_NAMESZ];
 };
 
-#define DefineSimpleCacheLookupMap(STRUCT, FUNC)			\
-        DefineCacheLookup(struct STRUCT, h, FUNC##_lookup,		\
-        (struct STRUCT *item, int set), /*no setup */,			\
-	& FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp),	\
-	STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0)
-
 /* Common entry handling */
 
 #define ENT_HASHBITS          8
@@ -264,7 +258,7 @@ out:
 	return error;
 }
 
-static DefineSimpleCacheLookupMap(ent, idtoname);
+static DefineSimpleCacheLookup(ent, idtoname);
 
 /*
  * Name -> ID cache
@@ -390,7 +384,7 @@ out:
 	return (error);
 }
 
-static DefineSimpleCacheLookupMap(ent, nametoid);
+static DefineSimpleCacheLookup(ent, nametoid);
 
 /*
  * Exported API
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index c4e3ea7cf154..405ac14e509a 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -133,14 +133,11 @@ struct cache_deferred_req {
  * If "set" == 0 :
  *    If an entry is found, it is returned
  *    If no entry is found, a new non-VALID entry is created.
- * If "set" == 1 and INPLACE == 0 :
+ * If "set" == 1 :
  *    If no entry is found a new one is inserted with data from "template"
  *    If a non-CACHE_VALID entry is found, it is updated from template using UPDATE
  *    If a CACHE_VALID entry is found, a new entry is swapped in with data
  *       from "template"
- * If set == 1, and INPLACE == 1 :
- *    As above, except that if a CACHE_VALID entry is found, we UPDATE in place
- *       instead of swapping in a new entry.
  *
  * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not
  * run but insteead CACHE_NEGATIVE is set in any new item.
@@ -159,13 +156,8 @@ struct cache_deferred_req {
  * TEST  tests if "tmp" matches "item"
  * INIT copies key information from "item" to "new"
  * UPDATE copies content information from "item" to "tmp"
- * INPLACE is true if updates can happen inplace rather than allocating a new structure
- *
- * WARNING: any substantial changes to this must be reflected in
- *   net/sunrpc/svcauth.c(auth_domain_lookup)
- *  which is a similar routine that is open-coded.
  */
-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE)	\
+#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE)	\
 RTN *FNAME ARGS										\
 {											\
 	RTN *tmp, *new=NULL;								\
@@ -179,13 +171,13 @@ RTN *FNAME ARGS										\
 		tmp = container_of(*hp, RTN, MEMBER);					\
 		if (TEST) { /* found a match */						\
 											\
-			if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \
+			if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new)	\
 				break;							\
 											\
 			if (new)							\
 				{INIT;}							\
 			if (set) {							\
-				if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
+				if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
 				{ /* need to swap in new */				\
 					RTN *t2;					\
 											\
@@ -206,7 +198,7 @@ RTN *FNAME ARGS										\
 			else read_unlock(&(DETAIL)->hash_lock);				\
 			if (set)							\
 				cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \
-			if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0);	\
+			if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0);	\
 			if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL);		\
 			return tmp;							\
 		}									\
@@ -239,10 +231,12 @@ RTN *FNAME ARGS										\
 	return NULL;									\
 }
 
-#define DefineSimpleCacheLookup(STRUCT,INPLACE)	\
-	DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */,	\
-			  & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\
-			  STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE)
+#define DefineSimpleCacheLookup(STRUCT, FUNC)				\
+        DefineCacheLookup(struct STRUCT, h, FUNC##_lookup,		\
+        (struct STRUCT *item, int set), /*no setup */,			\
+	& FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp),	\
+	STRUCT##_init(new, item), STRUCT##_update(tmp, item))
+
 
 #define cache_for_each(pos, detail, index, member) 						\
 	for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ;		\
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 6b073c2e6930..aadb4e8d6aa7 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -259,7 +259,7 @@ static struct cache_detail rsi_cache = {
 	.cache_parse    = rsi_parse,
 };
 
-static DefineSimpleCacheLookup(rsi, 0)
+static DefineSimpleCacheLookup(rsi, rsi)
 
 /*
  * The rpcsec_context cache is used to store a context that is
@@ -446,7 +446,7 @@ static struct cache_detail rsc_cache = {
 	.cache_parse	= rsc_parse,
 };
 
-static DefineSimpleCacheLookup(rsc, 0);
+static DefineSimpleCacheLookup(rsc, rsc);
 
 static struct rsc *
 gss_svc_searchbyctx(struct xdr_netobj *handle)
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 17e8b2a3130c..7ddf068b5b25 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -258,7 +258,7 @@ struct cache_detail ip_map_cache = {
 	.cache_show	= ip_map_show,
 };
 
-static DefineSimpleCacheLookup(ip_map, 0)
+static DefineSimpleCacheLookup(ip_map, ip_map)
 
 
 int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
-- 
cgit v1.2.3


From 15a5f6bd23eddd5b3be80366f364be04fb1c1c99 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:15:02 -0800
Subject: [PATCH] knfsd: Create cache_lookup function instead of using a macro
 to declare one

The C++-like 'template' approach proves to be too ugly and hard to work with.

The old 'template' won't go away until all users are updated.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sunrpc/cache.h | 12 ++++++
 net/sunrpc/cache.c           | 98 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 405ac14e509a..3e17a5ff1dea 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -81,6 +81,11 @@ struct cache_detail {
 					      struct cache_detail *cd,
 					      struct cache_head *h);
 
+	struct cache_head *	(*alloc)(void);
+	int			(*match)(struct cache_head *orig, struct cache_head *new);
+	void			(*init)(struct cache_head *orig, struct cache_head *new);
+	void			(*update)(struct cache_head *orig, struct cache_head *new);
+
 	/* fields below this comment are for internal use
 	 * and should not be touched by cache owners
 	 */
@@ -237,6 +242,13 @@ RTN *FNAME ARGS										\
 	& FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp),	\
 	STRUCT##_init(new, item), STRUCT##_update(tmp, item))
 
+extern struct cache_head *
+sunrpc_cache_lookup(struct cache_detail *detail,
+		    struct cache_head *key, int hash);
+extern struct cache_head *
+sunrpc_cache_update(struct cache_detail *detail,
+		    struct cache_head *new, struct cache_head *old, int hash);
+
 
 #define cache_for_each(pos, detail, index, member) 						\
 	for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ;		\
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 0acccfeeb284..4449dc52edf5 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -47,6 +47,104 @@ void cache_init(struct cache_head *h)
 	h->last_refresh = now;
 }
 
+struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
+				       struct cache_head *key, int hash)
+{
+	struct cache_head **head,  **hp;
+	struct cache_head *new = NULL;
+
+	head = &detail->hash_table[hash];
+
+	read_lock(&detail->hash_lock);
+
+	for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
+		struct cache_head *tmp = *hp;
+		if (detail->match(tmp, key)) {
+			cache_get(tmp);
+			read_unlock(&detail->hash_lock);
+			return tmp;
+		}
+	}
+	read_unlock(&detail->hash_lock);
+	/* Didn't find anything, insert an empty entry */
+
+	new = detail->alloc();
+	if (!new)
+		return NULL;
+	cache_init(new);
+
+	write_lock(&detail->hash_lock);
+
+	/* check if entry appeared while we slept */
+	for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
+		struct cache_head *tmp = *hp;
+		if (detail->match(tmp, key)) {
+			cache_get(tmp);
+			write_unlock(&detail->hash_lock);
+			detail->cache_put(new, detail);
+			return tmp;
+		}
+	}
+	detail->init(new, key);
+	new->next = *head;
+	*head = new;
+	detail->entries++;
+	cache_get(new);
+	write_unlock(&detail->hash_lock);
+
+	return new;
+}
+EXPORT_SYMBOL(sunrpc_cache_lookup);
+
+struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
+				       struct cache_head *new, struct cache_head *old, int hash)
+{
+	/* The 'old' entry is to be replaced by 'new'.
+	 * If 'old' is not VALID, we update it directly,
+	 * otherwise we need to replace it
+	 */
+	struct cache_head **head;
+	struct cache_head *tmp;
+
+	if (!test_bit(CACHE_VALID, &old->flags)) {
+		write_lock(&detail->hash_lock);
+		if (!test_bit(CACHE_VALID, &old->flags)) {
+			if (test_bit(CACHE_NEGATIVE, &new->flags))
+				set_bit(CACHE_NEGATIVE, &old->flags);
+			else
+				detail->update(old, new);
+			/* FIXME cache_fresh should come first */
+			write_unlock(&detail->hash_lock);
+			cache_fresh(detail, old, new->expiry_time);
+			return old;
+		}
+		write_unlock(&detail->hash_lock);
+	}
+	/* We need to insert a new entry */
+	tmp = detail->alloc();
+	if (!tmp) {
+		detail->cache_put(old, detail);
+		return NULL;
+	}
+	cache_init(tmp);
+	detail->init(tmp, old);
+	head = &detail->hash_table[hash];
+
+	write_lock(&detail->hash_lock);
+	if (test_bit(CACHE_NEGATIVE, &new->flags))
+		set_bit(CACHE_NEGATIVE, &tmp->flags);
+	else
+		detail->update(tmp, new);
+	tmp->next = *head;
+	*head = tmp;
+	cache_get(tmp);
+	write_unlock(&detail->hash_lock);
+	cache_fresh(detail, tmp, new->expiry_time);
+	cache_fresh(detail, old, 0);
+	detail->cache_put(old, detail);
+	return tmp;
+}
+EXPORT_SYMBOL(sunrpc_cache_update);
 
 static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h);
 /*
-- 
cgit v1.2.3


From 4d90452cb23b08a9a9dd001010f0ee6b1ee83a45 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:15:07 -0800
Subject: [PATCH] knfsd: Remove DefineCacheLookup

This has been replaced by more traditional code.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sunrpc/cache.h | 113 -------------------------------------------
 1 file changed, 113 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 3e17a5ff1dea..afc481dd02dd 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -128,119 +128,6 @@ struct cache_deferred_req {
 					   int too_many);
 };
 
-/*
- * just like a template in C++, this macro does cache lookup
- * for us.
- * The function is passed some sort of HANDLE from which a cache_detail
- * structure can be determined (via SETUP, DETAIL), a template
- * cache entry (type RTN*), and a "set" flag.  Using the HASHFN and the 
- * TEST, the function will try to find a matching cache entry in the cache.
- * If "set" == 0 :
- *    If an entry is found, it is returned
- *    If no entry is found, a new non-VALID entry is created.
- * If "set" == 1 :
- *    If no entry is found a new one is inserted with data from "template"
- *    If a non-CACHE_VALID entry is found, it is updated from template using UPDATE
- *    If a CACHE_VALID entry is found, a new entry is swapped in with data
- *       from "template"
- *
- * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not
- * run but insteead CACHE_NEGATIVE is set in any new item.
-
- *  In any case, the new entry is returned with a reference count.
- *
- *    
- * RTN is a struct type for a cache entry
- * MEMBER is the member of the cache which is cache_head, which must be first
- * FNAME is the name for the function	
- * ARGS are arguments to function and must contain RTN *item, int set.  May
- *   also contain something to be usedby SETUP or DETAIL to find cache_detail.
- * SETUP  locates the cache detail and makes it available as...
- * DETAIL identifies the cache detail, possibly set up by SETUP
- * HASHFN returns a hash value of the cache entry "item"
- * TEST  tests if "tmp" matches "item"
- * INIT copies key information from "item" to "new"
- * UPDATE copies content information from "item" to "tmp"
- */
-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE)	\
-RTN *FNAME ARGS										\
-{											\
-	RTN *tmp, *new=NULL;								\
-	struct cache_head **hp, **head;							\
-	SETUP;										\
-	head = &(DETAIL)->hash_table[HASHFN];						\
- retry:											\
-	if (set||new) write_lock(&(DETAIL)->hash_lock);					\
-	else read_lock(&(DETAIL)->hash_lock);						\
-	for(hp=head; *hp != NULL; hp = &tmp->MEMBER.next) {				\
-		tmp = container_of(*hp, RTN, MEMBER);					\
-		if (TEST) { /* found a match */						\
-											\
-			if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new)	\
-				break;							\
-											\
-			if (new)							\
-				{INIT;}							\
-			if (set) {							\
-				if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
-				{ /* need to swap in new */				\
-					RTN *t2;					\
-											\
-					new->MEMBER.next = tmp->MEMBER.next;		\
-					*hp = &new->MEMBER;				\
-					tmp->MEMBER.next = NULL;			\
-					t2 = tmp; tmp = new; new = t2;			\
-				}							\
-				if (test_bit(CACHE_NEGATIVE,  &item->MEMBER.flags))	\
-					set_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags);	\
-				else {							\
-					UPDATE;						\
-					clear_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags);	\
-				}							\
-			}								\
-			cache_get(&tmp->MEMBER);					\
-			if (set||new) write_unlock(&(DETAIL)->hash_lock);		\
-			else read_unlock(&(DETAIL)->hash_lock);				\
-			if (set)							\
-				cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \
-			if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0);	\
-			if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL);		\
-			return tmp;							\
-		}									\
-	}										\
-	/* Didn't find anything */							\
-	if (new) {									\
-		INIT;									\
-		new->MEMBER.next = *head;						\
-		*head = &new->MEMBER;							\
-		(DETAIL)->entries ++;							\
-		cache_get(&new->MEMBER);						\
-		if (set) {								\
-			tmp = new;							\
-			if (test_bit(CACHE_NEGATIVE, &item->MEMBER.flags))		\
-				set_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags);		\
-			else {UPDATE;}							\
-		}									\
-	}										\
-	if (set||new) write_unlock(&(DETAIL)->hash_lock);				\
-	else read_unlock(&(DETAIL)->hash_lock);						\
-	if (new && set)									\
-		cache_fresh(DETAIL, &new->MEMBER, item->MEMBER.expiry_time);		\
-	if (new)				       					\
-		return new;								\
-	new = kmalloc(sizeof(*new), GFP_KERNEL);					\
-	if (new) {									\
-		cache_init(&new->MEMBER);						\
-		goto retry;								\
-	}										\
-	return NULL;									\
-}
-
-#define DefineSimpleCacheLookup(STRUCT, FUNC)				\
-        DefineCacheLookup(struct STRUCT, h, FUNC##_lookup,		\
-        (struct STRUCT *item, int set), /*no setup */,			\
-	& FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp),	\
-	STRUCT##_init(new, item), STRUCT##_update(tmp, item))
 
 extern struct cache_head *
 sunrpc_cache_lookup(struct cache_detail *detail,
-- 
cgit v1.2.3


From ebd0cb1af3be2729cc1f574681dfba01fcf458d9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:15:08 -0800
Subject: [PATCH] knfsd: Unexport cache_fresh and fix a small race

Cache_fresh is now only used in cache.c, so unexport it.

Part of cache_fresh (setting CACHE_VALID) should really be done under the
lock, while part (calling cache_revisit_request etc) must be done outside the
lock.  So we split it up appropriately.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sunrpc/cache.h |  2 --
 net/sunrpc/cache.c           | 51 ++++++++++++++++++++++++++------------------
 net/sunrpc/sunrpc_syms.c     |  1 -
 3 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index afc481dd02dd..a37fead1873b 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -165,8 +165,6 @@ static inline int cache_put(struct cache_head *h, struct cache_detail *cd)
 }
 
 extern void cache_init(struct cache_head *h);
-extern void cache_fresh(struct cache_detail *detail,
-			struct cache_head *head, time_t expiry);
 extern int cache_check(struct cache_detail *detail,
 		       struct cache_head *h, struct cache_req *rqstp);
 extern void cache_flush(void);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index b242f491cea9..edcda4fd88e8 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -96,6 +96,27 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 }
 EXPORT_SYMBOL(sunrpc_cache_lookup);
 
+
+static void queue_loose(struct cache_detail *detail, struct cache_head *ch);
+
+static int cache_fresh_locked(struct cache_head *head, time_t expiry)
+{
+	head->expiry_time = expiry;
+	head->last_refresh = get_seconds();
+	return !test_and_set_bit(CACHE_VALID, &head->flags);
+}
+
+static void cache_fresh_unlocked(struct cache_head *head,
+			struct cache_detail *detail, int new)
+{
+	if (new)
+		cache_revisit_request(head);
+	if (test_and_clear_bit(CACHE_PENDING, &head->flags)) {
+		cache_revisit_request(head);
+		queue_loose(detail, head);
+	}
+}
+
 struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 				       struct cache_head *new, struct cache_head *old, int hash)
 {
@@ -105,6 +126,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 	 */
 	struct cache_head **head;
 	struct cache_head *tmp;
+	int is_new;
 
 	if (!test_bit(CACHE_VALID, &old->flags)) {
 		write_lock(&detail->hash_lock);
@@ -113,9 +135,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 				set_bit(CACHE_NEGATIVE, &old->flags);
 			else
 				detail->update(old, new);
-			/* FIXME cache_fresh should come first */
+			is_new = cache_fresh_locked(old, new->expiry_time);
 			write_unlock(&detail->hash_lock);
-			cache_fresh(detail, old, new->expiry_time);
+			cache_fresh_unlocked(old, detail, is_new);
 			return old;
 		}
 		write_unlock(&detail->hash_lock);
@@ -138,9 +160,11 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 	tmp->next = *head;
 	*head = tmp;
 	cache_get(tmp);
+	is_new = cache_fresh_locked(tmp, new->expiry_time);
+	cache_fresh_locked(old, 0);
 	write_unlock(&detail->hash_lock);
-	cache_fresh(detail, tmp, new->expiry_time);
-	cache_fresh(detail, old, 0);
+	cache_fresh_unlocked(tmp, detail, is_new);
+	cache_fresh_unlocked(old, detail, 0);
 	detail->cache_put(old, detail);
 	return tmp;
 }
@@ -192,7 +216,8 @@ int cache_check(struct cache_detail *detail,
 				clear_bit(CACHE_PENDING, &h->flags);
 				if (rv == -EAGAIN) {
 					set_bit(CACHE_NEGATIVE, &h->flags);
-					cache_fresh(detail, h, get_seconds()+CACHE_NEW_EXPIRY);
+					cache_fresh_unlocked(h, detail,
+					     cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY));
 					rv = -ENOENT;
 				}
 				break;
@@ -213,22 +238,6 @@ int cache_check(struct cache_detail *detail,
 	return rv;
 }
 
-static void queue_loose(struct cache_detail *detail, struct cache_head *ch);
-
-void cache_fresh(struct cache_detail *detail,
-		 struct cache_head *head, time_t expiry)
-{
-
-	head->expiry_time = expiry;
-	head->last_refresh = get_seconds();
-	if (!test_and_set_bit(CACHE_VALID, &head->flags))
-		cache_revisit_request(head);
-	if (test_and_clear_bit(CACHE_PENDING, &head->flags)) {
-		cache_revisit_request(head);
-		queue_loose(detail, head);
-	}
-}
-
 /*
  * caches need to be periodically cleaned.
  * For this we maintain a list of cache_detail and
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 40401196e7de..69b8238f3d10 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -105,7 +105,6 @@ EXPORT_SYMBOL(auth_unix_lookup);
 EXPORT_SYMBOL(cache_check);
 EXPORT_SYMBOL(cache_flush);
 EXPORT_SYMBOL(cache_purge);
-EXPORT_SYMBOL(cache_fresh);
 EXPORT_SYMBOL(cache_init);
 EXPORT_SYMBOL(cache_register);
 EXPORT_SYMBOL(cache_unregister);
-- 
cgit v1.2.3


From baab935ff3bdac20c558809da0d8e8f761840219 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:15:09 -0800
Subject: [PATCH] knfsd: Convert sunrpc_cache to use krefs

.. it makes some of the code nicer.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c                  | 51 ++++++++++++++++++---------------------
 fs/nfsd/nfs4idmap.c               | 18 ++++++--------
 fs/nfsd/nfsfh.c                   |  2 +-
 include/linux/nfsd/export.h       |  4 +--
 include/linux/sunrpc/cache.h      | 13 +++++-----
 net/sunrpc/auth_gss/svcauth_gss.c | 28 +++++++++------------
 net/sunrpc/cache.c                | 20 +++++++--------
 net/sunrpc/svcauth_unix.c         | 20 +++++++--------
 8 files changed, 72 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index abd68965822f..cc811a1094cb 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -57,18 +57,17 @@ static int		exp_verify_string(char *cp, int max);
 #define	EXPKEY_HASHMASK		(EXPKEY_HASHMAX -1)
 static struct cache_head *expkey_table[EXPKEY_HASHMAX];
 
-void expkey_put(struct cache_head *item, struct cache_detail *cd)
+void expkey_put(struct kref *ref)
 {
-	if (cache_put(item, cd)) {
-		struct svc_expkey *key = container_of(item, struct svc_expkey, h);
-		if (test_bit(CACHE_VALID, &item->flags) &&
-		    !test_bit(CACHE_NEGATIVE, &item->flags)) {
-			dput(key->ek_dentry);
-			mntput(key->ek_mnt);
-		}
-		auth_domain_put(key->ek_client);
-		kfree(key);
+	struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref);
+
+	if (test_bit(CACHE_VALID, &key->h.flags) &&
+	    !test_bit(CACHE_NEGATIVE, &key->h.flags)) {
+		dput(key->ek_dentry);
+		mntput(key->ek_mnt);
 	}
+	auth_domain_put(key->ek_client);
+	kfree(key);
 }
 
 static void expkey_request(struct cache_detail *cd,
@@ -158,7 +157,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 		set_bit(CACHE_NEGATIVE, &key.h.flags);
 		ek = svc_expkey_update(&key, ek);
 		if (ek)
-			expkey_put(&ek->h, &svc_expkey_cache);
+			cache_put(&ek->h, &svc_expkey_cache);
 		else err = -ENOMEM;
 	} else {
 		struct nameidata nd;
@@ -172,7 +171,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 		
 		ek = svc_expkey_update(&key, ek);
 		if (ek)
-			expkey_put(&ek->h, &svc_expkey_cache);
+			cache_put(&ek->h, &svc_expkey_cache);
 		else
 			err = -ENOMEM;
 		path_release(&nd);
@@ -318,15 +317,13 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
 
 static struct cache_head *export_table[EXPORT_HASHMAX];
 
-void svc_export_put(struct cache_head *item, struct cache_detail *cd)
+static void svc_export_put(struct kref *ref)
 {
-	if (cache_put(item, cd)) {
-		struct svc_export *exp = container_of(item, struct svc_export, h);
-		dput(exp->ex_dentry);
-		mntput(exp->ex_mnt);
-		auth_domain_put(exp->ex_client);
-		kfree(exp);
-	}
+	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
+	dput(exp->ex_dentry);
+	mntput(exp->ex_mnt);
+	auth_domain_put(exp->ex_client);
+	kfree(exp);
 }
 
 static void svc_export_request(struct cache_detail *cd,
@@ -633,7 +630,7 @@ static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv,
 	if (ek)
 		ek = svc_expkey_update(&key,ek);
 	if (ek) {
-		expkey_put(&ek->h, &svc_expkey_cache);
+		cache_put(&ek->h, &svc_expkey_cache);
 		return 0;
 	}
 	return -ENOMEM;
@@ -762,7 +759,7 @@ static void exp_fsid_unhash(struct svc_export *exp)
 	ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid);
 	if (ek && !IS_ERR(ek)) {
 		ek->h.expiry_time = get_seconds()-1;
-		expkey_put(&ek->h, &svc_expkey_cache);
+		cache_put(&ek->h, &svc_expkey_cache);
 	}
 	svc_expkey_cache.nextcheck = get_seconds();
 }
@@ -800,7 +797,7 @@ static void exp_unhash(struct svc_export *exp)
 	ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino);
 	if (ek && !IS_ERR(ek)) {
 		ek->h.expiry_time = get_seconds()-1;
-		expkey_put(&ek->h, &svc_expkey_cache);
+		cache_put(&ek->h, &svc_expkey_cache);
 	}
 	svc_expkey_cache.nextcheck = get_seconds();
 }
@@ -902,7 +899,7 @@ finish:
 	if (exp)
 		exp_put(exp);
 	if (fsid_key && !IS_ERR(fsid_key))
-		expkey_put(&fsid_key->h, &svc_expkey_cache);
+		cache_put(&fsid_key->h, &svc_expkey_cache);
 	if (clp)
 		auth_domain_put(clp);
 	path_release(&nd);
@@ -1030,7 +1027,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
 		return ERR_PTR(PTR_ERR(ek));
 
 	exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp);
-	expkey_put(&ek->h, &svc_expkey_cache);
+	cache_put(&ek->h, &svc_expkey_cache);
 
 	if (!exp || IS_ERR(exp))
 		return ERR_PTR(PTR_ERR(exp));
@@ -1068,7 +1065,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 	else
 		rv = fh_compose(fhp, exp,
 				fsid_key->ek_dentry, NULL);
-	expkey_put(&fsid_key->h, &svc_expkey_cache);
+	cache_put(&fsid_key->h, &svc_expkey_cache);
 	return rv;
 }
 
@@ -1187,7 +1184,7 @@ static int e_show(struct seq_file *m, void *p)
 	cache_get(&exp->h);
 	if (cache_check(&svc_export_cache, &exp->h, NULL))
 		return 0;
-	if (cache_put(&exp->h, &svc_export_cache)) BUG();
+	cache_put(&exp->h, &svc_export_cache);
 	return svc_export_show(m, &svc_export_cache, cp);
 }
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 75cfbb68b205..4b6aa60dfceb 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -96,12 +96,10 @@ ent_init(struct cache_head *cnew, struct cache_head *citm)
 }
 
 static void
-ent_put(struct cache_head *ch, struct cache_detail *cd)
+ent_put(struct kref *ref)
 {
-	if (cache_put(ch, cd)) {
-		struct ent *map = container_of(ch, struct ent, h);
-		kfree(map);
-	}
+	struct ent *map = container_of(ref, struct ent, h.ref);
+	kfree(map);
 }
 
 static struct cache_head *
@@ -270,7 +268,7 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
 	if (res == NULL)
 		goto out;
 
-	ent_put(&res->h, &idtoname_cache);
+	cache_put(&res->h, &idtoname_cache);
 
 	error = 0;
 out:
@@ -433,7 +431,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 	if (res == NULL)
 		goto out;
 
-	ent_put(&res->h, &nametoid_cache);
+	cache_put(&res->h, &nametoid_cache);
 	error = 0;
 out:
 	kfree(buf1);
@@ -562,7 +560,7 @@ do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *),
 		goto out_put;
 	return 0;
 out_put:
-	ent_put(&(*item)->h, detail);
+	cache_put(&(*item)->h, detail);
 out_err:
 	*item = NULL;
 	return ret;
@@ -613,7 +611,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 	if (ret)
 		return ret;
 	*id = item->id;
-	ent_put(&item->h, &nametoid_cache);
+	cache_put(&item->h, &nametoid_cache);
 	return 0;
 }
 
@@ -635,7 +633,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 	ret = strlen(item->name);
 	BUG_ON(ret > IDMAP_NAMESZ);
 	memcpy(name, item->name, ret);
-	ent_put(&item->h, &idtoname_cache);
+	cache_put(&item->h, &idtoname_cache);
 	return ret;
 }
 
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 7a3e397b4ed3..3f2ec2e6d06c 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -506,7 +506,7 @@ fh_put(struct svc_fh *fhp)
 		nfsd_nr_put++;
 	}
 	if (exp) {
-		svc_export_put(&exp->h, &svc_export_cache);
+		cache_put(&exp->h, &svc_export_cache);
 		fhp->fh_export = NULL;
 	}
 	return;
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index d52e0b7ad37b..a6c08a47b25c 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -102,13 +102,11 @@ int			exp_rootfh(struct auth_domain *,
 int			exp_pseudoroot(struct auth_domain *, struct svc_fh *fhp, struct cache_req *creq);
 int			nfserrno(int errno);
 
-extern void expkey_put(struct cache_head *item, struct cache_detail *cd);
-extern void svc_export_put(struct cache_head *item, struct cache_detail *cd);
 extern struct cache_detail svc_export_cache, svc_expkey_cache;
 
 static inline void exp_put(struct svc_export *exp)
 {
-	svc_export_put(&exp->h, &svc_export_cache);
+	cache_put(&exp->h, &svc_export_cache);
 }
 
 static inline void exp_get(struct svc_export *exp)
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index a37fead1873b..ad3f5cbdb770 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -50,7 +50,7 @@ struct cache_head {
 	time_t		last_refresh;   /* If CACHE_PENDING, this is when upcall 
 					 * was sent, else this is when update was received
 					 */
-	atomic_t 	refcnt;
+	struct kref	ref;
 	unsigned long	flags;
 };
 #define	CACHE_VALID	0	/* Entry contains valid data */
@@ -68,8 +68,7 @@ struct cache_detail {
 	atomic_t		inuse; /* active user-space update or lookup */
 
 	char			*name;
-	void			(*cache_put)(struct cache_head *,
-					     struct cache_detail*);
+	void			(*cache_put)(struct kref *);
 
 	void			(*cache_request)(struct cache_detail *cd,
 						 struct cache_head *h,
@@ -151,17 +150,17 @@ extern void cache_clean_deferred(void *owner);
 
 static inline struct cache_head  *cache_get(struct cache_head *h)
 {
-	atomic_inc(&h->refcnt);
+	kref_get(&h->ref);
 	return h;
 }
 
 
-static inline int cache_put(struct cache_head *h, struct cache_detail *cd)
+static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 {
-	if (atomic_read(&h->refcnt) <= 2 &&
+	if (atomic_read(&h->ref.refcount) <= 2 &&
 	    h->expiry_time < cd->nextcheck)
 		cd->nextcheck = h->expiry_time;
-	return atomic_dec_and_test(&h->refcnt);
+	kref_put(&h->ref, cd->cache_put);
 }
 
 extern void cache_init(struct cache_head *h);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 380152603d1e..4d7eb9e704da 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -89,13 +89,11 @@ static void rsi_free(struct rsi *rsii)
 	kfree(rsii->out_token.data);
 }
 
-static void rsi_put(struct cache_head *item, struct cache_detail *cd)
+static void rsi_put(struct kref *ref)
 {
-	struct rsi *rsii = container_of(item, struct rsi, h);
-	if (cache_put(item, cd)) {
-		rsi_free(rsii);
-		kfree(rsii);
-	}
+	struct rsi *rsii = container_of(ref, struct rsi, h.ref);
+	rsi_free(rsii);
+	kfree(rsii);
 }
 
 static inline int rsi_hash(struct rsi *item)
@@ -267,7 +265,7 @@ static int rsi_parse(struct cache_detail *cd,
 out:
 	rsi_free(&rsii);
 	if (rsip)
-		rsi_put(&rsip->h, &rsi_cache);
+		cache_put(&rsip->h, &rsi_cache);
 	else
 		status = -ENOMEM;
 	return status;
@@ -357,14 +355,12 @@ static void rsc_free(struct rsc *rsci)
 		put_group_info(rsci->cred.cr_group_info);
 }
 
-static void rsc_put(struct cache_head *item, struct cache_detail *cd)
+static void rsc_put(struct kref *ref)
 {
-	struct rsc *rsci = container_of(item, struct rsc, h);
+	struct rsc *rsci = container_of(ref, struct rsc, h.ref);
 
-	if (cache_put(item, cd)) {
-		rsc_free(rsci);
-		kfree(rsci);
-	}
+	rsc_free(rsci);
+	kfree(rsci);
 }
 
 static inline int
@@ -509,7 +505,7 @@ static int rsc_parse(struct cache_detail *cd,
 out:
 	rsc_free(&rsci);
 	if (rscp)
-		rsc_put(&rscp->h, &rsc_cache);
+		cache_put(&rscp->h, &rsc_cache);
 	else
 		status = -ENOMEM;
 	return status;
@@ -1076,7 +1072,7 @@ drop:
 	ret = SVC_DROP;
 out:
 	if (rsci)
-		rsc_put(&rsci->h, &rsc_cache);
+		cache_put(&rsci->h, &rsc_cache);
 	return ret;
 }
 
@@ -1168,7 +1164,7 @@ out_err:
 		put_group_info(rqstp->rq_cred.cr_group_info);
 	rqstp->rq_cred.cr_group_info = NULL;
 	if (gsd->rsci)
-		rsc_put(&gsd->rsci->h, &rsc_cache);
+		cache_put(&gsd->rsci->h, &rsc_cache);
 	gsd->rsci = NULL;
 
 	return stat;
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index edcda4fd88e8..dd81e5928172 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -42,7 +42,7 @@ void cache_init(struct cache_head *h)
 	time_t now = get_seconds();
 	h->next = NULL;
 	h->flags = 0;
-	atomic_set(&h->refcnt, 1);
+	kref_init(&h->ref);
 	h->expiry_time = now + CACHE_NEW_EXPIRY;
 	h->last_refresh = now;
 }
@@ -81,7 +81,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
 		if (detail->match(tmp, key)) {
 			cache_get(tmp);
 			write_unlock(&detail->hash_lock);
-			detail->cache_put(new, detail);
+			cache_put(new, detail);
 			return tmp;
 		}
 	}
@@ -145,7 +145,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 	/* We need to insert a new entry */
 	tmp = detail->alloc();
 	if (!tmp) {
-		detail->cache_put(old, detail);
+		cache_put(old, detail);
 		return NULL;
 	}
 	cache_init(tmp);
@@ -165,7 +165,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
 	write_unlock(&detail->hash_lock);
 	cache_fresh_unlocked(tmp, detail, is_new);
 	cache_fresh_unlocked(old, detail, 0);
-	detail->cache_put(old, detail);
+	cache_put(old, detail);
 	return tmp;
 }
 EXPORT_SYMBOL(sunrpc_cache_update);
@@ -234,7 +234,7 @@ int cache_check(struct cache_detail *detail,
 		cache_defer_req(rqstp, h);
 
 	if (rv)
-		detail->cache_put(h, detail);
+		cache_put(h, detail);
 	return rv;
 }
 
@@ -431,7 +431,7 @@ static int cache_clean(void)
 			if (test_and_clear_bit(CACHE_PENDING, &ch->flags))
 				queue_loose(current_detail, ch);
 
-			if (atomic_read(&ch->refcnt) == 1)
+			if (atomic_read(&ch->ref.refcount) == 1)
 				break;
 		}
 		if (ch) {
@@ -446,7 +446,7 @@ static int cache_clean(void)
 			current_index ++;
 		spin_unlock(&cache_list_lock);
 		if (ch)
-			d->cache_put(ch, d);
+			cache_put(ch, d);
 	} else
 		spin_unlock(&cache_list_lock);
 
@@ -723,7 +723,7 @@ cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
 		    !test_bit(CACHE_PENDING, &rq->item->flags)) {
 			list_del(&rq->q.list);
 			spin_unlock(&queue_lock);
-			cd->cache_put(rq->item, cd);
+			cache_put(rq->item, cd);
 			kfree(rq->buf);
 			kfree(rq);
 		} else
@@ -906,7 +906,7 @@ static void queue_loose(struct cache_detail *detail, struct cache_head *ch)
 				continue;
 			list_del(&cr->q.list);
 			spin_unlock(&queue_lock);
-			detail->cache_put(cr->item, detail);
+			cache_put(cr->item, detail);
 			kfree(cr->buf);
 			kfree(cr);
 			return;
@@ -1192,7 +1192,7 @@ static int c_show(struct seq_file *m, void *p)
 
 	ifdebug(CACHE)
 		seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
-			   cp->expiry_time, atomic_read(&cp->refcnt), cp->flags);
+			   cp->expiry_time, atomic_read(&cp->ref.refcount), cp->flags);
 	cache_get(cp);
 	if (cache_check(cd, cp, NULL))
 		/* cache_check does a cache_put on failure */
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 7e38621a20b7..11020c0b7db5 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -84,15 +84,15 @@ struct ip_map {
 };
 static struct cache_head	*ip_table[IP_HASHMAX];
 
-static void ip_map_put(struct cache_head *item, struct cache_detail *cd)
+static void ip_map_put(struct kref *kref)
 {
+	struct cache_head *item = container_of(kref, struct cache_head, ref);
 	struct ip_map *im = container_of(item, struct ip_map,h);
-	if (cache_put(item, cd)) {
-		if (test_bit(CACHE_VALID, &item->flags) &&
-		    !test_bit(CACHE_NEGATIVE, &item->flags))
-			auth_domain_put(&im->m_client->h);
-		kfree(im);
-	}
+
+	if (test_bit(CACHE_VALID, &item->flags) &&
+	    !test_bit(CACHE_NEGATIVE, &item->flags))
+		auth_domain_put(&im->m_client->h);
+	kfree(im);
 }
 
 #if IP_HASHBITS == 8
@@ -315,7 +315,7 @@ static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t ex
 				 hash_ip((unsigned long)ipm->m_addr.s_addr));
 	if (!ch)
 		return -ENOMEM;
-	ip_map_put(ch, &ip_map_cache);
+	cache_put(ch, &ip_map_cache);
 	return 0;
 }
 
@@ -369,7 +369,7 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
 		rv = &ipm->m_client->h;
 		kref_get(&rv->ref);
 	}
-	ip_map_put(&ipm->h, &ip_map_cache);
+	cache_put(&ipm->h, &ip_map_cache);
 	return rv;
 }
 
@@ -403,7 +403,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 		case 0:
 			rqstp->rq_client = &ipm->m_client->h;
 			kref_get(&rqstp->rq_client->ref);
-			ip_map_put(&ipm->h, &ip_map_cache);
+			cache_put(&ipm->h, &ip_map_cache);
 			break;
 	}
 	return SVC_OK;
-- 
cgit v1.2.3


From 74cae61ab45f19a3e8c4d9f53c0e94df129c7915 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 27 Mar 2006 01:15:10 -0800
Subject: [PATCH] fs/nfsd/export.c,net/sunrpc/cache.c: make needlessly global
 code static

We can now make some code static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c             | 13 ++++++++-----
 include/linux/nfsd/export.h  |  5 +----
 include/linux/sunrpc/cache.h |  1 -
 net/sunrpc/cache.c           |  2 +-
 net/sunrpc/sunrpc_syms.c     |  1 -
 5 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index cc811a1094cb..c340be0a3f59 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -57,7 +57,7 @@ static int		exp_verify_string(char *cp, int max);
 #define	EXPKEY_HASHMASK		(EXPKEY_HASHMAX -1)
 static struct cache_head *expkey_table[EXPKEY_HASHMAX];
 
-void expkey_put(struct kref *ref)
+static void expkey_put(struct kref *ref)
 {
 	struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref);
 
@@ -87,6 +87,8 @@ static void expkey_request(struct cache_detail *cd,
 
 static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
 static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
+static struct cache_detail svc_expkey_cache;
+
 static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 {
 	/* client fsidtype fsid [path] */
@@ -255,7 +257,7 @@ static struct cache_head *expkey_alloc(void)
 		return NULL;
 }
 
-struct cache_detail svc_expkey_cache = {
+static struct cache_detail svc_expkey_cache = {
 	.owner		= THIS_MODULE,
 	.hash_size	= EXPKEY_HASHMAX,
 	.hash_table	= expkey_table,
@@ -345,7 +347,8 @@ static void svc_export_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
-struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old);
+static struct svc_export *svc_export_update(struct svc_export *new,
+					    struct svc_export *old);
 static struct svc_export *svc_export_lookup(struct svc_export *);
 
 static int check_export(struct inode *inode, int flags)
@@ -574,7 +577,7 @@ svc_export_lookup(struct svc_export *exp)
 		return NULL;
 }
 
-struct svc_export *
+static struct svc_export *
 svc_export_update(struct svc_export *new, struct svc_export *old)
 {
 	struct cache_head *ch;
@@ -593,7 +596,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old)
 }
 
 
-struct svc_expkey *
+static struct svc_expkey *
 exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
 {
 	struct svc_expkey key, *ek;
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index a6c08a47b25c..d2a8abb5011a 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -86,9 +86,6 @@ void			nfsd_export_shutdown(void);
 void			nfsd_export_flush(void);
 void			exp_readlock(void);
 void			exp_readunlock(void);
-struct svc_expkey *	exp_find_key(struct auth_domain *clp, 
-				     int fsid_type, u32 *fsidv,
-				     struct cache_req *reqp);
 struct svc_export *	exp_get_by_name(struct auth_domain *clp,
 					struct vfsmount *mnt,
 					struct dentry *dentry,
@@ -102,7 +99,7 @@ int			exp_rootfh(struct auth_domain *,
 int			exp_pseudoroot(struct auth_domain *, struct svc_fh *fhp, struct cache_req *creq);
 int			nfserrno(int errno);
 
-extern struct cache_detail svc_export_cache, svc_expkey_cache;
+extern struct cache_detail svc_export_cache;
 
 static inline void exp_put(struct svc_export *exp)
 {
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index ad3f5cbdb770..b5612c958cce 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -163,7 +163,6 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd)
 	kref_put(&h->ref, cd->cache_put);
 }
 
-extern void cache_init(struct cache_head *h);
 extern int cache_check(struct cache_detail *detail,
 		       struct cache_head *h, struct cache_req *rqstp);
 extern void cache_flush(void);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index dd81e5928172..3ac4193a78ed 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -37,7 +37,7 @@
 static void cache_defer_req(struct cache_req *req, struct cache_head *item);
 static void cache_revisit_request(struct cache_head *item);
 
-void cache_init(struct cache_head *h)
+static void cache_init(struct cache_head *h)
 {
 	time_t now = get_seconds();
 	h->next = NULL;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 69b8238f3d10..769114f0f886 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -105,7 +105,6 @@ EXPORT_SYMBOL(auth_unix_lookup);
 EXPORT_SYMBOL(cache_check);
 EXPORT_SYMBOL(cache_flush);
 EXPORT_SYMBOL(cache_purge);
-EXPORT_SYMBOL(cache_init);
 EXPORT_SYMBOL(cache_register);
 EXPORT_SYMBOL(cache_unregister);
 EXPORT_SYMBOL(qword_add);
-- 
cgit v1.2.3


From 1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Mon, 27 Mar 2006 01:15:22 -0800
Subject: [PATCH] sched: new sched domain for representing multi-core

Add a new sched domain for representing multi-core with shared caches
between cores.  Consider a dual package system, each package containing two
cores and with last level cache shared between cores with in a package.  If
there are two runnable processes, with this appended patch those two
processes will be scheduled on different packages.

On such systems, with this patch we have observed 8% perf improvement with
specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2
users).

This new domain will come into play only on multi-core systems with shared
caches.  On other systems, this sched domain will be removed by domain
degeneration code.  This new domain can be also used for implementing power
savings policy (see OLS 2005 CMP kernel scheduler paper for more details..
I will post another patch for power savings policy soon)

Most of the arch/* file changes are for cpu_coregroup_map() implementation.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig                      |  9 +++++
 arch/i386/kernel/cpu/common.c          | 10 +++--
 arch/i386/kernel/cpu/intel_cacheinfo.c | 22 +++++++++-
 arch/i386/kernel/smpboot.c             | 24 +++++++++++
 arch/x86_64/Kconfig                    |  9 +++++
 arch/x86_64/kernel/setup.c             |  3 +-
 arch/x86_64/kernel/smpboot.c           | 24 +++++++++++
 include/asm-i386/processor.h           |  5 +++
 include/asm-i386/topology.h            |  2 +
 include/asm-x86_64/processor.h         |  4 ++
 include/asm-x86_64/smp.h               |  1 +
 include/asm-x86_64/topology.h          |  2 +
 include/linux/topology.h               |  9 +++++
 kernel/sched.c                         | 73 +++++++++++++++++++++++++++++++---
 14 files changed, 186 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f7db71d0b913..f17bd1d2707e 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,15 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 7e3d6b6a4e96..a06a49075f10 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
 void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
-	int junk;
+	int ebx;
 
 	if (have_cpuid_p()) {
 		/* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 		/* Intel-defined flags: level 0x00000001 */
 		if ( c->cpuid_level >= 0x00000001 ) {
 			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
 			c->x86_capability[0] = capability;
 			c->x86_capability[4] = excap;
 			c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 			if (c->x86 >= 0x6)
 				c->x86_model += ((tfms >> 16) & 0xF) << 4;
 			c->x86_mask = tfms & 15;
+#ifdef CONFIG_SMP
+			c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+#else
+			c->apicid = (ebx >> 24) & 0xFF;
+#endif
 		} else {
 			/* Have CPUID level 0 only - unheard of */
 			c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-	c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
 
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index ce61921369e5..7e7fd4e67dd0 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+	unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
+#endif
 
 	if (c->cpuid_level > 3) {
 		static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 					break;
 				    case 2:
 					new_l2 = this_leaf.size/1024;
+					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+					index_msb = get_count_order(num_threads_sharing);
+					l2_id = c->apicid >> index_msb;
 					break;
 				    case 3:
 					new_l3 = this_leaf.size/1024;
+					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+					index_msb = get_count_order(num_threads_sharing);
+					l3_id = c->apicid >> index_msb;
 					break;
 				    default:
 					break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 		if (new_l1i)
 			l1i = new_l1i;
 
-		if (new_l2)
+		if (new_l2) {
 			l2 = new_l2;
+#ifdef CONFIG_SMP
+			cpu_llc_id[cpu] = l2_id;
+#endif
+		}
 
-		if (new_l3)
+		if (new_l3) {
 			l3 = new_l3;
+#ifdef CONFIG_SMP
+			cpu_llc_id[cpu] = l3_id;
+#endif
+		}
 
 		if ( trace )
 			printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 82371d83bfa9..a6969903f2d6 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 /* Core ID of each logical CPU */
 int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
 
 static int cpucount;
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	/*
+	 * For perf, we return last level cache shared map.
+	 * TBD: when power saving sched policy is added, we will return
+	 *      cpu_core_map when power saving policy is enabled
+	 */
+	return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
 				cpu_set(cpu, cpu_core_map[i]);
+				cpu_set(i, c[cpu].llc_shared_map);
+				cpu_set(cpu, c[i].llc_shared_map);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
+	cpu_set(cpu, c[cpu].llc_shared_map);
+
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
 		c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (cpu_llc_id[cpu] != BAD_APICID &&
+		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
+			cpu_set(i, c[cpu].llc_shared_map);
+			cpu_set(cpu, c[i].llc_shared_map);
+		}
 		if (phys_proc_id[cpu] == phys_proc_id[i]) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 45efe0ca88f8..1cb4aa241c8c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -250,6 +250,15 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config NUMA
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a57eec8311a7..d1f3e9272c05 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-	c->apicid = phys_pkg_id(0);
 
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
@@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 			c->x86_capability[2] = cpuid_edx(0x80860001);
 	}
 
+	c->apicid = phys_pkg_id(0);
+
 	/*
 	 * Vendor-specific initialization.  In this section we
 	 * canonicalize the feature flags, meaning if there are
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 66e98659d077..ea48fa638070 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 /* core ID of each logical CPU */
 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
+/* Last level cache ID of each logical CPU */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map __read_mostly;
 
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
 	cpu_set(cpuid, cpu_callin_map);
 }
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	/*
+	 * For perf, we return last level cache shared map.
+	 * TBD: when power saving sched policy is added, we will return
+	 *      cpu_core_map when power saving policy is enabled
+	 */
+	return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
 				cpu_set(cpu, cpu_core_map[i]);
+				cpu_set(i, c[cpu].llc_shared_map);
+				cpu_set(cpu, c[i].llc_shared_map);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
+	cpu_set(cpu, c[cpu].llc_shared_map);
+
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
 		c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (cpu_llc_id[cpu] != BAD_APICID &&
+		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
+			cpu_set(i, c[cpu].llc_shared_map);
+			cpu_set(cpu, c[i].llc_shared_map);
+		}
 		if (phys_proc_id[cpu] == phys_proc_id[i]) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index feca5d961e2b..af4bfd012475 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
 #include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/percpu.h>
+#include <linux/cpumask.h>
 
 /* flag for disabling the tsc */
 extern int tsc_disable;
@@ -67,6 +68,9 @@ struct cpuinfo_x86 {
 	char	pad0;
 	int	x86_power;
 	unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
+#endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
 	unsigned char booted_cores;	/* number of cores as seen by OS */
 	unsigned char apicid;
@@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];
 
 extern	int phys_proc_id[NR_CPUS];
 extern	int cpu_core_id[NR_CPUS];
+extern	int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
 extern void identify_cpu(struct cpuinfo_x86 *);
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa958c6ee83e..b94e5eeef917 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];
 
 #endif /* CONFIG_NUMA */
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 8c8d88c036ed..1aa2cee43344 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -20,6 +20,7 @@
 #include <asm/mmsegment.h>
 #include <asm/percpu.h>
 #include <linux/personality.h>
+#include <linux/cpumask.h>
 
 #define TF_MASK		0x00000100
 #define IF_MASK		0x00000200
@@ -65,6 +66,9 @@ struct cpuinfo_x86 {
         __u32   x86_power; 	
 	__u32   extended_cpuid_level;	/* Max extended CPUID function supported */
 	unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
+#endif
 	__u8	apicid;
 	__u8	booted_cores;	/* number of cores as seen by OS */
 } ____cacheline_aligned;
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 9ccbb2cfd5c0..a4fdaeb5c397 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 extern u8 phys_proc_id[NR_CPUS];
 extern u8 cpu_core_id[NR_CPUS];
+extern u8 cpu_llc_id[NR_CPUS];
 
 #define SMP_TRAMPOLINE_BASE 0x6000
 
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c642f5d9882d..9db54e9d17bb 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -68,4 +68,6 @@ extern int __node_distance(int, int);
 
 #include <asm-generic/topology.h>
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e8eb0040ce3a..a305ae2e44b6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -164,6 +164,15 @@
 	.nr_balance_failed	= 0,			\
 }
 
+#ifdef CONFIG_SCHED_MC
+#ifndef SD_MC_INIT
+/* for now its same as SD_CPU_INIT.
+ * TBD: Tune Domain parameters!
+ */
+#define SD_MC_INIT   SD_CPU_INIT
+#endif
+#endif
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index a96a05d23262..8a8b71b5751b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+	return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+	return cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+	cpumask_t mask = cpu_coregroup_map(cpu);
+	return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
 	return first_cpu(cpu_sibling_map[cpu]);
 #else
 	return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		sd->parent = p;
 		sd->groups = &sched_group_phys[group];
 
+#ifdef CONFIG_SCHED_MC
+		p = sd;
+		sd = &per_cpu(core_domains, i);
+		group = cpu_to_core_group(i);
+		*sd = SD_MC_INIT;
+		sd->span = cpu_coregroup_map(i);
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		sd->groups = &sched_group_core[group];
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	}
 #endif
 
+#ifdef CONFIG_SCHED_MC
+	/* Set up multi-core groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_core_map = cpu_coregroup_map(i);
+		cpus_and(this_core_map, this_core_map, *cpu_map);
+		if (i != first_cpu(this_core_map))
+			continue;
+		init_sched_build_groups(sched_group_core, this_core_map,
+					&cpu_to_core_group);
+	}
+#endif
+
+
 	/* Set up physical groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
+#ifdef CONFIG_SCHED_MC
+		sd = &per_cpu(core_domains, i);
+		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+					    * SCHED_LOAD_SCALE / 10;
+		sd->groups->cpu_power = power;
+
+		sd = &per_cpu(phys_domains, i);
 
+ 		/*
+ 		 * This has to be < 2 * SCHED_LOAD_SCALE
+ 		 * Lets keep it SCHED_LOAD_SCALE, so that
+ 		 * while calculating NUMA group's cpu_power
+ 		 * we can simply do
+ 		 *  numa_group->cpu_power += phys_group->cpu_power;
+ 		 *
+ 		 * See "only add power once for each physical pkg"
+ 		 * comment below
+ 		 */
+ 		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
 		sd = &per_cpu(phys_domains, i);
 		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
 				(cpus_weight(sd->groups->cpumask)-1) / 10;
 		sd->groups->cpu_power = power;
+#endif
 
 #ifdef CONFIG_NUMA
 		sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
 next_sg:
 		for_each_cpu_mask(j, sg->cpumask) {
 			struct sched_domain *sd;
-			int power;
 
 			sd = &per_cpu(phys_domains, j);
 			if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
 				 */
 				continue;
 			}
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
 
-			sg->cpu_power += power;
+			sg->cpu_power += sd->groups->cpu_power;
 		}
 		sg = sg->next;
 		if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+		sd = &per_cpu(core_domains, i);
 #else
 		sd = &per_cpu(phys_domains, i);
 #endif
-- 
cgit v1.2.3


From a117e66ed45ac0569c039ea60bd7a9a61e031858 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:25 -0800
Subject: [PATCH] unify pfn_to_page: generic functions

There are 3 memory models, FLATMEM, DISCONTIGMEM, SPARSEMEM.
Each arch has its own page_to_pfn(), pfn_to_page() for each models.
But most of them can use the same arithmetic.

This patch adds asm-generic/memory_model.h, which includes generic
page_to_pfn(), pfn_to_page() definitions for each memory model.

When CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y, out-of-line functions are
used instead of macro. This is enabled by some archs and  reduces
text size.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/memory_model.h | 77 ++++++++++++++++++++++++++++++++++++++
 include/asm-sparc64/page.h         |  2 +
 include/linux/mmzone.h             | 11 ------
 mm/page_alloc.c                    | 42 +++++++++++++++++++++
 4 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 include/asm-generic/memory_model.h

(limited to 'include/linux')

diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
new file mode 100644
index 000000000000..a7bb4978e808
--- /dev/null
+++ b/include/asm-generic/memory_model.h
@@ -0,0 +1,77 @@
+#ifndef __ASM_MEMORY_MODEL_H
+#define __ASM_MEMORY_MODEL_H
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
+#if defined(CONFIG_FLATMEM)
+
+#ifndef ARCH_PFN_OFFSET
+#define ARCH_PFN_OFFSET		(0UL)
+#endif
+
+#elif defined(CONFIG_DISCONTIGMEM)
+
+#ifndef arch_pfn_to_nid
+#define arch_pfn_to_nid(pfn)	pfn_to_nid(pfn)
+#endif
+
+#ifndef arch_local_page_offset
+#define arch_local_page_offset(pfn, nid)	\
+	((pfn) - NODE_DATA(nid)->node_start_pfn)
+#endif
+
+#endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+struct page;
+/* this is useful when inlined pfn_to_page is too big */
+extern struct page *pfn_to_page(unsigned long pfn);
+extern unsigned long page_to_pfn(struct page *page);
+#else
+/*
+ * supports 3 memory models.
+ */
+#if defined(CONFIG_FLATMEM)
+
+#define pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
+#define page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
+				 ARCH_PFN_OFFSET)
+#elif defined(CONFIG_DISCONTIGMEM)
+
+#define pfn_to_page(pfn)			\
+({	unsigned long __pfn = (pfn);		\
+	unsigned long __nid = arch_pfn_to_nid(pfn);  \
+	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
+})
+
+#define page_to_pfn(pg)			\
+({	struct page *__pg = (pg);		\
+	struct zone *__zone = page_zone(__pg);	\
+	(unsigned long)(__pg - __zone->zone_mem_map) +	\
+	 __zone->zone_start_pfn;			\
+})
+
+#elif defined(CONFIG_SPARSEMEM)
+/*
+ * Note: section's mem_map is encorded to reflect its start_pfn.
+ * section[i].section_mem_map == mem_map's address - start_pfn;
+ */
+#define page_to_pfn(pg)					\
+({	struct page *__pg = (pg);				\
+	int __sec = page_to_section(__pg);			\
+	__pg - __section_mem_map_addr(__nr_to_section(__sec));	\
+})
+
+#define pfn_to_page(pfn)				\
+({	unsigned long __pfn = (pfn);			\
+	struct mem_section *__sec = __pfn_to_section(__pfn);	\
+	__section_mem_map_addr(__sec) + __pfn;		\
+})
+#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif
diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h
index 66fe4ac59fd6..aabb21906724 100644
--- a/include/asm-sparc64/page.h
+++ b/include/asm-sparc64/page.h
@@ -111,6 +111,8 @@ typedef unsigned long pgprot_t;
 				 (_AC(0x0000000070000000,UL)) : \
 				 (_AC(0xfffff80000000000,UL) + (1UL << 32UL)))
 
+#include <asm-generic/memory_model.h>
+
 #endif /* !(__ASSEMBLY__) */
 
 /* to align the pointer to the (next) page boundary */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ebfc238cc243..0c1c0c0cce65 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -602,17 +602,6 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 	return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
-#define pfn_to_page(pfn) 						\
-({ 									\
-	unsigned long __pfn = (pfn);					\
-	__section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn;	\
-})
-#define page_to_pfn(page)						\
-({									\
-	page - __section_mem_map_addr(__nr_to_section(			\
-		page_to_section(page)));				\
-})
-
 static inline int pfn_valid(unsigned long pfn)
 {
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 338a02bb004d..349b328763b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2745,3 +2745,45 @@ void *__init alloc_large_system_hash(const char *tablename,
 
 	return table;
 }
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+/*
+ * pfn <-> page translation. out-of-line version.
+ * (see asm-generic/memory_model.h)
+ */
+#if defined(CONFIG_FLATMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	return mem_map + (pfn - ARCH_PFN_OFFSET);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+	return (page - mem_map) + ARCH_PFN_OFFSET;
+}
+#elif defined(CONFIG_DISCONTIGMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	int nid = arch_pfn_to_nid(pfn);
+	return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	return (page - zone->zone_mem_map) + zone->zone_start_pfn;
+
+}
+#elif defined(CONFIG_SPARSEMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
+}
+
+unsigned long page_to_pfn(struct page *page)
+{
+	long section_id = page_to_section(page);
+	return page - __section_mem_map_addr(__nr_to_section(section_id));
+}
+#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
+EXPORT_SYMBOL(pfn_to_page);
+EXPORT_SYMBOL(page_to_pfn);
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
-- 
cgit v1.2.3


From a0140c1d85637ee5f4ea7c78f066e3611a6a79dc Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:55 -0800
Subject: [PATCH] remove zone_mem_map

This patch removes zone_mem_map.

pfn_to_page uses pgdat, page_to_pfn uses zone.  page_to_pfn can use pgdat
instead of zone, which is only one user of zone_mem_map.  By modifing it,
we can remove zone_mem_map.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/mmzone.h         |  3 +--
 include/asm-generic/memory_model.h | 10 +++++-----
 include/linux/mmzone.h             |  1 -
 mm/page_alloc.c                    |  6 ++----
 4 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h
index c9004398f273..192d80c875b0 100644
--- a/include/asm-alpha/mmzone.h
+++ b/include/asm-alpha/mmzone.h
@@ -83,8 +83,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
 	pte_t pte;                                                           \
 	unsigned long pfn;                                                   \
 									     \
-	pfn = ((unsigned long)((page)-page_zone(page)->zone_mem_map)) << 32; \
-	pfn += page_zone(page)->zone_start_pfn << 32;			     \
+	pfn = page_to_pfn(page) << 32; \
 	pte_val(pte) = pfn | pgprot_val(pgprot);			     \
 									     \
 	pte;								     \
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index a7bb4978e808..0cfb086dd373 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -45,11 +45,11 @@ extern unsigned long page_to_pfn(struct page *page);
 	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
 })
 
-#define page_to_pfn(pg)			\
-({	struct page *__pg = (pg);		\
-	struct zone *__zone = page_zone(__pg);	\
-	(unsigned long)(__pg - __zone->zone_mem_map) +	\
-	 __zone->zone_start_pfn;			\
+#define page_to_pfn(pg)							\
+({	struct page *__pg = (pg);					\
+	struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));	\
+	(unsigned long)(__pg - __pgdat->node_mem_map) +			\
+	 __pgdat->node_start_pfn;					\
 })
 
 #elif defined(CONFIG_SPARSEMEM)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c1c0c0cce65..ace31c515a8c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -225,7 +225,6 @@ struct zone {
 	 * Discontig memory support fields.
 	 */
 	struct pglist_data	*zone_pgdat;
-	struct page		*zone_mem_map;
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 349b328763b7..8dc8f2735d22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2042,7 +2042,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
 	zone_wait_table_init(zone, size);
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
-	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
 	zone->zone_start_pfn = zone_start_pfn;
 
 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
@@ -2768,9 +2767,8 @@ struct page *pfn_to_page(unsigned long pfn)
 }
 unsigned long page_to_pfn(struct page *page)
 {
-	struct zone *zone = page_zone(page);
-	return (page - zone->zone_mem_map) + zone->zone_start_pfn;
-
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+	return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
 }
 #elif defined(CONFIG_SPARSEMEM)
 struct page *pfn_to_page(unsigned long pfn)
-- 
cgit v1.2.3


From 8357f8695d58b50fbf2bd507b4b0fc2cd1e43bd6 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:57 -0800
Subject: [PATCH] define for_each_online_pgdat

This patch defines for_each_online_pgdat() as a replacement of
for_each_pgdat()

Now, online nodes are managed by node_online_map.  But for_each_pgdat()
uses pgdat_link to iterate over all nodes(pgdat).  This means management
structure for online pgdat is duplicated.

I think using node_online_map for for_each_pgdat() is simple and sane
rather ather than pgdat_link.  New macro is named as
for_each_online_pgdat().  Following patch will fix callers of
for_each_pgdat().

The bootmem allocater uses for_each_pgdat() before pgdat initialization.  I
don't think it's sane.  Following patch will fix it.

Signed-off-by: Yasunori Goto     <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h   | 108 +++++++++++++++++++++++++----------------------
 include/linux/nodemask.h |   4 ++
 2 files changed, 61 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ace31c515a8c..96eb08025092 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -13,6 +13,7 @@
 #include <linux/numa.h>
 #include <linux/init.h>
 #include <linux/seqlock.h>
+#include <linux/nodemask.h>
 #include <asm/atomic.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -349,57 +350,6 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
  */
 #define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
 
-/**
- * for_each_pgdat - helper macro to iterate over all nodes
- * @pgdat - pointer to a pg_data_t variable
- *
- * Meant to help with common loops of the form
- * pgdat = pgdat_list;
- * while(pgdat) {
- * 	...
- * 	pgdat = pgdat->pgdat_next;
- * }
- */
-#define for_each_pgdat(pgdat) \
-	for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next)
-
-/*
- * next_zone - helper magic for for_each_zone()
- * Thanks to William Lee Irwin III for this piece of ingenuity.
- */
-static inline struct zone *next_zone(struct zone *zone)
-{
-	pg_data_t *pgdat = zone->zone_pgdat;
-
-	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
-		zone++;
-	else if (pgdat->pgdat_next) {
-		pgdat = pgdat->pgdat_next;
-		zone = pgdat->node_zones;
-	} else
-		zone = NULL;
-
-	return zone;
-}
-
-/**
- * for_each_zone - helper macro to iterate over all memory zones
- * @zone - pointer to struct zone variable
- *
- * The user only needs to declare the zone variable, for_each_zone
- * fills it in. This basically means for_each_zone() is an
- * easier to read version of this piece of code:
- *
- * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
- * 	for (i = 0; i < MAX_NR_ZONES; ++i) {
- * 		struct zone * z = pgdat->node_zones + i;
- * 		...
- * 	}
- * }
- */
-#define for_each_zone(zone) \
-	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
-
 static inline int populated_zone(struct zone *zone)
 {
 	return (!!zone->present_pages);
@@ -471,6 +421,62 @@ extern struct pglist_data contig_page_data;
 
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
+static inline struct pglist_data *first_online_pgdat(void)
+{
+	return NODE_DATA(first_online_node);
+}
+
+static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+{
+	int nid = next_online_node(pgdat->node_id);
+
+	if (nid == MAX_NUMNODES)
+		return NULL;
+	return NODE_DATA(nid);
+}
+
+
+/**
+ * for_each_pgdat - helper macro to iterate over all nodes
+ * @pgdat - pointer to a pg_data_t variable
+ */
+#define for_each_online_pgdat(pgdat)			\
+	for (pgdat = first_online_pgdat();		\
+	     pgdat;					\
+	     pgdat = next_online_pgdat(pgdat))
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ * Thanks to William Lee Irwin III for this piece of ingenuity.
+ */
+static inline struct zone *next_zone(struct zone *zone)
+{
+	pg_data_t *pgdat = zone->zone_pgdat;
+
+	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
+		zone++;
+	else {
+		pgdat = next_online_pgdat(pgdat);
+		if (pgdat)
+			zone = pgdat->node_zones;
+		else
+			zone = NULL;
+	}
+	return zone;
+}
+
+/**
+ * for_each_zone - helper macro to iterate over all memory zones
+ * @zone - pointer to struct zone variable
+ *
+ * The user only needs to declare the zone variable, for_each_zone
+ * fills it in.
+ */
+#define for_each_zone(zone)			        \
+	for (zone = (first_online_pgdat())->node_zones; \
+	     zone;					\
+	     zone = next_zone(zone))
+
 #ifdef CONFIG_SPARSEMEM
 #include <asm/sparsemem.h>
 #endif
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index b959a4525cbd..1a9ef3e627d1 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -350,11 +350,15 @@ extern nodemask_t node_possible_map;
 #define num_possible_nodes()	nodes_weight(node_possible_map)
 #define node_online(node)	node_isset((node), node_online_map)
 #define node_possible(node)	node_isset((node), node_possible_map)
+#define first_online_node	first_node(node_online_map)
+#define next_online_node(nid)	next_node((nid), node_online_map)
 #else
 #define num_online_nodes()	1
 #define num_possible_nodes()	1
 #define node_online(node)	((node) == 0)
 #define node_possible(node)	((node) == 0)
+#define first_online_node	0
+#define next_online_node(nid)	(MAX_NUMNODES)
 #endif
 
 #define any_online_node(mask)			\
-- 
cgit v1.2.3


From 679bc9fbb508a0aac9539b2de747eb5849feb428 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:58 -0800
Subject: [PATCH] for_each_online_pgdat: for_each_bootmem

Add a list_head to bootmem_data_t and make bootmems use it.  bootmem list is
sorted by node_boot_start.

Only nodes against which init_bootmem() is called are linked to the list.
(i386 allocates bootmem only from one node(0) not from all online nodes.)

A summary:
 1. for_each_online_pgdat() traverses all *online* nodes.
 2. alloc_bootmem() allocates memory only from initialized-for-bootmem nodes.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h |  1 +
 mm/bootmem.c            | 39 +++++++++++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 7155452fb4a8..de3eb8d8ae26 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -38,6 +38,7 @@ typedef struct bootmem_data {
 	unsigned long last_pos;
 	unsigned long last_success;	/* Previous allocation point.  To speed
 					 * up searching */
+	struct list_head list;
 } bootmem_data_t;
 
 extern unsigned long __init bootmem_bootmap_pages (unsigned long);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index b55bd39fc5dd..d3e3bd2ffcea 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn);		/* This is exported so
 				 * dma_get_required_mask(), which uses
 				 * it, can be an inline function */
 
+static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
  * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
 
 	return mapsize;
 }
+/*
+ * link bdata in order
+ */
+static void link_bootmem(bootmem_data_t *bdata)
+{
+	bootmem_data_t *ent;
+	if (list_empty(&bdata_list)) {
+		list_add(&bdata->list, &bdata_list);
+		return;
+	}
+	/* insert in order */
+	list_for_each_entry(ent, &bdata_list, list) {
+		if (bdata->node_boot_start < ent->node_boot_start) {
+			list_add_tail(&bdata->list, &ent->list);
+			return;
+		}
+	}
+	list_add_tail(&bdata->list, &bdata_list);
+	return;
+}
+
 
 /*
  * Called once to set up the allocator itself.
@@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long mapsize = ((end - start)+7)/8;
 
-	pgdat->pgdat_next = pgdat_list;
-	pgdat_list = pgdat;
-
 	mapsize = ALIGN(mapsize, sizeof(long));
 	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
 	bdata->node_boot_start = (start << PAGE_SHIFT);
 	bdata->node_low_pfn = end;
+	link_bootmem(bdata);
 
 	/*
 	 * Initially all pages are reserved - setup_arch() has to
@@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void)
 
 void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
-	pg_data_t *pgdat = pgdat_list;
+	bootmem_data_t *bdata;
 	void *ptr;
 
-	for_each_pgdat(pgdat)
-		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
-						 align, goal, 0)))
+	list_for_each_entry(bdata, &bdata_list, list)
+		if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0)))
 			return(ptr);
 
 	/*
@@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne
 
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
 {
-	pg_data_t *pgdat = pgdat_list;
+	bootmem_data_t *bdata;
 	void *ptr;
 
-	for_each_pgdat(pgdat)
-		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+	list_for_each_entry(bdata, &bdata_list, list)
+		if ((ptr = __alloc_bootmem_core(bdata, size,
 						 align, goal, LOW32LIMIT)))
 			return(ptr);
 
-- 
cgit v1.2.3


From ae0f15fb91274e67d78836d38c99ec363df33073 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:16:01 -0800
Subject: [PATCH] for_each_online_pgdat: remove pgdat_list

By using for_each_online_pgdat(), pgdat_list is not necessary now.  This patch
removes it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 3 ---
 mm/page_alloc.c        | 8 ++++----
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 96eb08025092..0d12c3cf1f86 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -307,7 +307,6 @@ typedef struct pglist_data {
 	unsigned long node_spanned_pages; /* total size of physical page
 					     range, including holes */
 	int node_id;
-	struct pglist_data *pgdat_next;
 	wait_queue_head_t kswapd_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
@@ -324,8 +323,6 @@ typedef struct pglist_data {
 
 #include <linux/memory_hotplug.h>
 
-extern struct pglist_data *pgdat_list;
-
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat);
 void get_zone_counts(unsigned long *active, unsigned long *inactive,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ccc3713dd407..dc523a1f270d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,7 +49,6 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
@@ -2169,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos)
 {
 	pg_data_t *pgdat;
 	loff_t node = *pos;
-
-	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+	for (pgdat = first_online_pgdat();
+	     pgdat && node;
+	     pgdat = next_online_pgdat(pgdat))
 		--node;
 
 	return pgdat;
@@ -2181,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
 	pg_data_t *pgdat = (pg_data_t *)arg;
 
 	(*pos)++;
-	return pgdat->pgdat_next;
+	return next_online_pgdat(pgdat);
 }
 
 static void frag_stop(struct seq_file *m, void *arg)
-- 
cgit v1.2.3


From 95144c788dc01b6a0ff2c9c2222e37ffdab358b8 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:16:02 -0800
Subject: [PATCH] uninline zone helpers

Helper functions for for_each_online_pgdat/for_each_zone look too big to be
inlined.  Speed of these helper macro itself is not very important.  (inner
loops are tend to do more work than this)

This patch make helper function to be out-of-lined.

	inline		out-of-line
.text   005c0680        005bf6a0

005c0680 - 005bf6a0 = FE0 = 4Kbytes.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 38 +++-----------------------------------
 mm/Makefile            |  2 +-
 mm/mmzone.c            | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 36 deletions(-)
 create mode 100644 mm/mmzone.c

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d12c3cf1f86..b5c21122c299 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -418,20 +418,9 @@ extern struct pglist_data contig_page_data;
 
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-static inline struct pglist_data *first_online_pgdat(void)
-{
-	return NODE_DATA(first_online_node);
-}
-
-static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
-{
-	int nid = next_online_node(pgdat->node_id);
-
-	if (nid == MAX_NUMNODES)
-		return NULL;
-	return NODE_DATA(nid);
-}
-
+extern struct pglist_data *first_online_pgdat(void);
+extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
+extern struct zone *next_zone(struct zone *zone);
 
 /**
  * for_each_pgdat - helper macro to iterate over all nodes
@@ -441,27 +430,6 @@ static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 	for (pgdat = first_online_pgdat();		\
 	     pgdat;					\
 	     pgdat = next_online_pgdat(pgdat))
-
-/*
- * next_zone - helper magic for for_each_zone()
- * Thanks to William Lee Irwin III for this piece of ingenuity.
- */
-static inline struct zone *next_zone(struct zone *zone)
-{
-	pg_data_t *pgdat = zone->zone_pgdat;
-
-	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
-		zone++;
-	else {
-		pgdat = next_online_pgdat(pgdat);
-		if (pgdat)
-			zone = pgdat->node_zones;
-		else
-			zone = NULL;
-	}
-	return zone;
-}
-
 /**
  * for_each_zone - helper macro to iterate over all memory zones
  * @zone - pointer to struct zone variable
diff --git a/mm/Makefile b/mm/Makefile
index f10c753dce6d..0b8f73f2ed16 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
-			   prio_tree.o util.o $(mmu-y)
+			   prio_tree.o util.o mmzone.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
diff --git a/mm/mmzone.c b/mm/mmzone.c
new file mode 100644
index 000000000000..b022370e612e
--- /dev/null
+++ b/mm/mmzone.c
@@ -0,0 +1,50 @@
+/*
+ * linux/mm/mmzone.c
+ *
+ * management codes for pgdats and zones.
+ */
+
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+
+struct pglist_data *first_online_pgdat(void)
+{
+	return NODE_DATA(first_online_node);
+}
+
+EXPORT_SYMBOL(first_online_pgdat);
+
+struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+{
+	int nid = next_online_node(pgdat->node_id);
+
+	if (nid == MAX_NUMNODES)
+		return NULL;
+	return NODE_DATA(nid);
+}
+EXPORT_SYMBOL(next_online_pgdat);
+
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ */
+struct zone *next_zone(struct zone *zone)
+{
+	pg_data_t *pgdat = zone->zone_pgdat;
+
+	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
+		zone++;
+	else {
+		pgdat = next_online_pgdat(pgdat);
+		if (pgdat)
+			zone = pgdat->node_zones;
+		else
+			zone = NULL;
+	}
+	return zone;
+}
+EXPORT_SYMBOL(next_zone);
+
-- 
cgit v1.2.3


From 22a9835c350782a5c3257343713932af3ac92ee0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Mon, 27 Mar 2006 01:16:04 -0800
Subject: [PATCH] unify PFN_* macros

Just about every architecture defines some macros to do operations on pfns.
 They're all virtually identical.  This patch consolidates all of them.

One minor glitch is that at least i386 uses them in a very skeletal header
file.  To keep away from #include dependency hell, I stuck the new
definitions in a new, isolated header.

Of all of the implementations, sh64 is the only one that varied by a bit.
It used some masks to ensure that any sign-extension got ripped away before
the arithmetic is done.  This has been posted to that sh64 maintainers and
the development list.

Compiles on x86, x86_64, ia64 and ppc64.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/setup.c               | 9 +--------
 arch/alpha/mm/numa.c                    | 4 +---
 arch/arm26/mm/init.c                    | 7 +------
 arch/cris/kernel/setup.c                | 5 +----
 arch/i386/kernel/setup.c                | 1 +
 arch/i386/mm/discontig.c                | 1 +
 arch/m32r/kernel/setup.c                | 1 +
 arch/m32r/mm/discontig.c                | 1 +
 arch/m32r/mm/init.c                     | 1 +
 arch/mips/ite-boards/ivr/init.c         | 3 ---
 arch/mips/ite-boards/qed-4n-s01b/init.c | 3 ---
 arch/mips/kernel/setup.c                | 9 +--------
 arch/mips/mips-boards/generic/memory.c  | 7 ++-----
 arch/mips/mips-boards/sim/sim_mem.c     | 7 ++-----
 arch/mips/mm/init.c                     | 4 +---
 arch/mips/sgi-ip27/ip27-memory.c        | 3 +--
 arch/sh/kernel/setup.c                  | 5 +----
 arch/sh64/kernel/setup.c                | 1 +
 arch/um/kernel/physmem.c                | 3 +--
 include/asm-i386/setup.h                | 4 +---
 include/asm-m32r/setup.h                | 4 ----
 include/asm-sh64/platform.h             | 5 -----
 include/linux/pfn.h                     | 9 +++++++++
 23 files changed, 29 insertions(+), 68 deletions(-)
 create mode 100644 include/linux/pfn.h

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index b4e5f8ff2b25..9402624453c2 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -34,6 +34,7 @@
 #include <linux/root_dev.h>
 #include <linux/initrd.h>
 #include <linux/eisa.h>
+#include <linux/pfn.h>
 #ifdef CONFIG_MAGIC_SYSRQ
 #include <linux/sysrq.h>
 #include <linux/reboot.h>
@@ -241,9 +242,6 @@ reserve_std_resources(void)
 		request_resource(io, standard_io_resources+i);
 }
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)	((x) << PAGE_SHIFT)
 #define PFN_MAX		PFN_DOWN(0x80000000)
 #define for_each_mem_cluster(memdesc, cluster, i)		\
 	for ((cluster) = (memdesc)->cluster, (i) = 0;		\
@@ -472,11 +470,6 @@ page_is_ram(unsigned long pfn)
 	return 0;
 }
 
-#undef PFN_UP
-#undef PFN_DOWN
-#undef PFN_PHYS
-#undef PFN_MAX
-
 void __init
 setup_arch(char **cmdline_p)
 {
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index 6d5251254f68..bf6b65c81bef 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -13,6 +13,7 @@
 #include <linux/bootmem.h>
 #include <linux/swap.h>
 #include <linux/initrd.h>
+#include <linux/pfn.h>
 
 #include <asm/hwrpb.h>
 #include <asm/pgalloc.h>
@@ -27,9 +28,6 @@ bootmem_data_t node_bdata[MAX_NUMNODES];
 #define DBGDCONT(args...)
 #endif
 
-#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
 #define for_each_mem_cluster(memdesc, cluster, i)		\
 	for ((cluster) = (memdesc)->cluster, (i) = 0;		\
 	     (i) < (memdesc)->numclusters; (i)++, (cluster)++)
diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c
index e3ecaa453747..7da8a5205678 100644
--- a/arch/arm26/mm/init.c
+++ b/arch/arm26/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/initrd.h>
 #include <linux/bootmem.h>
 #include <linux/blkdev.h>
+#include <linux/pfn.h>
 
 #include <asm/segment.h>
 #include <asm/mach-types.h>
@@ -101,12 +102,6 @@ struct node_info {
 	int bootmap_pages;
 };
 
-#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-#define PFN_UP(x)	(PAGE_ALIGN(x) >> PAGE_SHIFT)
-#define PFN_SIZE(x)	((x) >> PAGE_SHIFT)
-#define PFN_RANGE(s,e)	PFN_SIZE(PAGE_ALIGN((unsigned long)(e)) - \
-				(((unsigned long)(s)) & PAGE_MASK))
-
 /*
  * FIXME: We really want to avoid allocating the bootmap bitmap
  * over the top of the initrd.  Hopefully, this is located towards
diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c
index 1ba57efff60d..619a6eefd893 100644
--- a/arch/cris/kernel/setup.c
+++ b/arch/cris/kernel/setup.c
@@ -18,6 +18,7 @@
 #include <linux/seq_file.h>
 #include <linux/tty.h>
 #include <linux/utsname.h>
+#include <linux/pfn.h>
 
 #include <asm/setup.h>
 
@@ -88,10 +89,6 @@ setup_arch(char **cmdline_p)
 	init_mm.end_data =   (unsigned long) &_edata;
 	init_mm.brk =        (unsigned long) &_end;
 
-#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
-
 	/* min_low_pfn points to the start of DRAM, start_pfn points
 	 * to the first DRAM pages after the kernel, and max_low_pfn
 	 * to the end of DRAM.
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 6917daa159ab..8c08660b4e5d 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -46,6 +46,7 @@
 #include <linux/kexec.h>
 #include <linux/crash_dump.h>
 #include <linux/dmi.h>
+#include <linux/pfn.h>
 
 #include <video/edid.h>
 
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index c3f3ae95e22d..fe6eb901326e 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -31,6 +31,7 @@
 #include <linux/nodemask.h>
 #include <linux/module.h>
 #include <linux/kexec.h>
+#include <linux/pfn.h>
 
 #include <asm/e820.h>
 #include <asm/setup.h>
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index d742037a7ccb..0d78942b4c76 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -24,6 +24,7 @@
 #include <linux/tty.h>
 #include <linux/cpu.h>
 #include <linux/nodemask.h>
+#include <linux/pfn.h>
 
 #include <asm/processor.h>
 #include <asm/pgtable.h>
diff --git a/arch/m32r/mm/discontig.c b/arch/m32r/mm/discontig.c
index 70c8528a0ad5..cf610a7c5ff0 100644
--- a/arch/m32r/mm/discontig.c
+++ b/arch/m32r/mm/discontig.c
@@ -13,6 +13,7 @@
 #include <linux/initrd.h>
 #include <linux/nodemask.h>
 #include <linux/module.h>
+#include <linux/pfn.h>
 
 #include <asm/setup.h>
 
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 2e0fe199ce38..b71348fec1f4 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -18,6 +18,7 @@
 #include <linux/highmem.h>
 #include <linux/bitops.h>
 #include <linux/nodemask.h>
+#include <linux/pfn.h>
 #include <asm/types.h>
 #include <asm/processor.h>
 #include <asm/page.h>
diff --git a/arch/mips/ite-boards/ivr/init.c b/arch/mips/ite-boards/ivr/init.c
index ea4e1935fec5..b774db035b31 100644
--- a/arch/mips/ite-boards/ivr/init.c
+++ b/arch/mips/ite-boards/ivr/init.c
@@ -45,9 +45,6 @@ extern void  __init prom_init_cmdline(void);
 extern unsigned long __init prom_get_memsize(void);
 extern void __init it8172_init_ram_resource(unsigned long memsize);
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
-
 const char *get_system_type(void)
 {
 	return "Globespan IVR";
diff --git a/arch/mips/ite-boards/qed-4n-s01b/init.c b/arch/mips/ite-boards/qed-4n-s01b/init.c
index 56dca7e0c21d..e8ec8be66a80 100644
--- a/arch/mips/ite-boards/qed-4n-s01b/init.c
+++ b/arch/mips/ite-boards/qed-4n-s01b/init.c
@@ -45,9 +45,6 @@ extern void  __init prom_init_cmdline(void);
 extern unsigned long __init prom_get_memsize(void);
 extern void __init it8172_init_ram_resource(unsigned long memsize);
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
-
 const char *get_system_type(void)
 {
 	return "ITE QED-4N-S01B";
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 0cb3b6097e0e..dcbfd27071f0 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -34,6 +34,7 @@
 #include <linux/highmem.h>
 #include <linux/console.h>
 #include <linux/mmzone.h>
+#include <linux/pfn.h>
 
 #include <asm/addrspace.h>
 #include <asm/bootinfo.h>
@@ -257,10 +258,6 @@ static inline int parse_rd_cmdline(unsigned long* rd_start, unsigned long* rd_en
 	return 0;
 }
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)	((x) << PAGE_SHIFT)
-
 #define MAXMEM		HIGHMEM_START
 #define MAXMEM_PFN	PFN_DOWN(MAXMEM)
 
@@ -493,10 +490,6 @@ static inline void resource_init(void)
 	}
 }
 
-#undef PFN_UP
-#undef PFN_DOWN
-#undef PFN_PHYS
-
 #undef MAXMEM
 #undef MAXMEM_PFN
 
diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c
index ee5e70c95cf3..32c9210373ac 100644
--- a/arch/mips/mips-boards/generic/memory.c
+++ b/arch/mips/mips-boards/generic/memory.c
@@ -49,9 +49,6 @@ static char *mtypes[3] = {
 /* References to section boundaries */
 extern char _end;
 
-#define PFN_ALIGN(x)    (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
-
-
 struct prom_pmemblock * __init prom_getmdesc(void)
 {
 	char *memsize_str;
@@ -109,10 +106,10 @@ struct prom_pmemblock * __init prom_getmdesc(void)
 
 	mdesc[3].type = yamon_dontuse;
 	mdesc[3].base = 0x00100000;
-	mdesc[3].size = CPHYSADDR(PFN_ALIGN(&_end)) - mdesc[3].base;
+	mdesc[3].size = CPHYSADDR(PAGE_ALIGN(&_end)) - mdesc[3].base;
 
 	mdesc[4].type = yamon_free;
-	mdesc[4].base = CPHYSADDR(PFN_ALIGN(&_end));
+	mdesc[4].base = CPHYSADDR(PAGE_ALIGN(&_end));
 	mdesc[4].size = memsize - mdesc[4].base;
 
 	return &mdesc[0];
diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c
index 1ec4e75656bd..e57f737bab10 100644
--- a/arch/mips/mips-boards/sim/sim_mem.c
+++ b/arch/mips/mips-boards/sim/sim_mem.c
@@ -42,9 +42,6 @@ static char *mtypes[3] = {
 /* References to section boundaries */
 extern char _end;
 
-#define PFN_ALIGN(x)    (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
-
-
 struct prom_pmemblock * __init prom_getmdesc(void)
 {
 	unsigned int memsize;
@@ -64,10 +61,10 @@ struct prom_pmemblock * __init prom_getmdesc(void)
 
 	mdesc[2].type = simmem_reserved;
 	mdesc[2].base = 0x00100000;
-	mdesc[2].size = CPHYSADDR(PFN_ALIGN(&_end)) - mdesc[2].base;
+	mdesc[2].size = CPHYSADDR(PAGE_ALIGN(&_end)) - mdesc[2].base;
 
 	mdesc[3].type = simmem_free;
-	mdesc[3].base = CPHYSADDR(PFN_ALIGN(&_end));
+	mdesc[3].base = CPHYSADDR(PAGE_ALIGN(&_end));
 	mdesc[3].size = memsize - mdesc[3].base;
 
 	return &mdesc[0];
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 52f7d59fe612..ad89c442f299 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -25,6 +25,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/proc_fs.h>
+#include <linux/pfn.h>
 
 #include <asm/bootinfo.h>
 #include <asm/cachectl.h>
@@ -177,9 +178,6 @@ void __init paging_init(void)
 	free_area_init(zones_size);
 }
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-
 static inline int page_is_ram(unsigned long pagenr)
 {
 	int i;
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index e0d095daa5ed..6c00dce9f73f 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -19,6 +19,7 @@
 #include <linux/nodemask.h>
 #include <linux/swap.h>
 #include <linux/bootmem.h>
+#include <linux/pfn.h>
 #include <asm/page.h>
 #include <asm/sections.h>
 
@@ -28,8 +29,6 @@
 #include <asm/sn/sn_private.h>
 
 
-#define PFN_UP(x)		(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-
 #define SLOT_PFNSHIFT           (SLOT_SHIFT - PAGE_SHIFT)
 #define PFN_NASIDSHFT           (NASID_SHFT - PAGE_SHIFT)
 
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index c0e79843f580..7ee4ca203616 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -20,6 +20,7 @@
 #include <linux/root_dev.h>
 #include <linux/utsname.h>
 #include <linux/cpu.h>
+#include <linux/pfn.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/sections.h>
@@ -275,10 +276,6 @@ void __init setup_arch(char **cmdline_p)
 
 	sh_mv_setup(cmdline_p);
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)	((x) << PAGE_SHIFT)
-
 	/*
 	 * Find the highest page frame number we have available
 	 */
diff --git a/arch/sh64/kernel/setup.c b/arch/sh64/kernel/setup.c
index c7a7b816a30f..d2711c9c9d13 100644
--- a/arch/sh64/kernel/setup.c
+++ b/arch/sh64/kernel/setup.c
@@ -48,6 +48,7 @@
 #include <linux/root_dev.h>
 #include <linux/cpu.h>
 #include <linux/initrd.h>
+#include <linux/pfn.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c
index 0e65340eee33..0500800df1c1 100644
--- a/arch/um/kernel/physmem.c
+++ b/arch/um/kernel/physmem.c
@@ -9,6 +9,7 @@
 #include "linux/vmalloc.h"
 #include "linux/bootmem.h"
 #include "linux/module.h"
+#include "linux/pfn.h"
 #include "asm/types.h"
 #include "asm/pgtable.h"
 #include "kern_util.h"
@@ -316,8 +317,6 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len,
 	}
 }
 
-#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-
 extern int __syscall_stub_start, __binary_start;
 
 void setup_physmem(unsigned long start, unsigned long reserve_end,
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 826a8ca50ac8..ee941457b55d 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -6,9 +6,7 @@
 #ifndef _i386_SETUP_H
 #define _i386_SETUP_H
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)	((x) << PAGE_SHIFT)
+#include <linux/pfn.h>
 
 /*
  * Reserved space for vmalloc and iomap - defined in asm/page.h
diff --git a/include/asm-m32r/setup.h b/include/asm-m32r/setup.h
index 5f028dc26a9b..52f4fa29abfc 100644
--- a/include/asm-m32r/setup.h
+++ b/include/asm-m32r/setup.h
@@ -24,10 +24,6 @@
 #define RAMDISK_PROMPT_FLAG		(0x8000)
 #define RAMDISK_LOAD_FLAG		(0x4000)
 
-#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
-#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)	((x) << PAGE_SHIFT)
-
 extern unsigned long memory_start;
 extern unsigned long memory_end;
 
diff --git a/include/asm-sh64/platform.h b/include/asm-sh64/platform.h
index 7046a9014027..bd0d9c405a80 100644
--- a/include/asm-sh64/platform.h
+++ b/include/asm-sh64/platform.h
@@ -61,9 +61,4 @@ extern int platform_int_priority[NR_INTC_IRQS];
 #define code_resource (platform_parms.kram_res_p[STANDARD_KRAM_RESOURCES - 2])
 #define data_resource (platform_parms.kram_res_p[STANDARD_KRAM_RESOURCES - 1])
 
-/* Be prepared to 64-bit sign extensions */
-#define PFN_UP(x)       ((((x) + PAGE_SIZE-1) >> PAGE_SHIFT) & 0x000fffff)
-#define PFN_DOWN(x)     (((x) >> PAGE_SHIFT) & 0x000fffff)
-#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
-
 #endif	/* __ASM_SH64_PLATFORM_H */
diff --git a/include/linux/pfn.h b/include/linux/pfn.h
new file mode 100644
index 000000000000..bb01f8b92b56
--- /dev/null
+++ b/include/linux/pfn.h
@@ -0,0 +1,9 @@
+#ifndef _LINUX_PFN_H_
+#define _LINUX_PFN_H_
+
+#define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
+#define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
+#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
+#define PFN_PHYS(x)	((x) << PAGE_SHIFT)
+
+#endif
-- 
cgit v1.2.3


From 0771dfefc9e538f077d0b43b6dec19a5a67d0e70 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:22 -0800
Subject: [PATCH] lightweight robust futexes: core

Add the core infrastructure for robust futexes: structure definitions, the new
syscalls and the do_exit() based cleanup mechanism.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/futex.h   |  95 ++++++++++++++++++++++++++
 include/linux/sched.h   |   3 +
 include/linux/threads.h |   3 +-
 kernel/exit.c           |   3 +
 kernel/futex.c          | 172 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys_ni.c         |   4 ++
 6 files changed, 279 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 10f96c31971e..20face6b798d 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 
+#include <linux/sched.h>
+
 /* Second argument to futex syscall */
 
 
@@ -11,10 +13,103 @@
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ */
+
+/*
+ * Per-lock list entry - embedded in user-space locks, somewhere close
+ * to the futex field. (Note: user-space uses a double-linked list to
+ * achieve O(1) list add and remove, but the kernel only needs to know
+ * about the forward link)
+ *
+ * NOTE: this structure is part of the syscall ABI, and must not be
+ * changed.
+ */
+struct robust_list {
+	struct robust_list __user *next;
+};
+
+/*
+ * Per-thread list head:
+ *
+ * NOTE: this structure is part of the syscall ABI, and must only be
+ * changed if the change is first communicated with the glibc folks.
+ * (When an incompatible change is done, we'll increase the structure
+ *  size, which glibc will detect)
+ */
+struct robust_list_head {
+	/*
+	 * The head of the list. Points back to itself if empty:
+	 */
+	struct robust_list list;
+
+	/*
+	 * This relative offset is set by user-space, it gives the kernel
+	 * the relative position of the futex field to examine. This way
+	 * we keep userspace flexible, to freely shape its data-structure,
+	 * without hardcoding any particular offset into the kernel:
+	 */
+	long futex_offset;
+
+	/*
+	 * The death of the thread may race with userspace setting
+	 * up a lock's links. So to handle this race, userspace first
+	 * sets this field to the address of the to-be-taken lock,
+	 * then does the lock acquire, and then adds itself to the
+	 * list, and then clears this field. Hence the kernel will
+	 * always have full knowledge of all locks that the thread
+	 * _might_ have taken. We check the owner TID in any case,
+	 * so only truly owned locks will be handled.
+	 */
+	struct robust_list __user *list_op_pending;
+};
+
+/*
+ * Are there any waiters for this robust futex:
+ */
+#define FUTEX_WAITERS		0x80000000
+
+/*
+ * The kernel signals via this bit that a thread holding a futex
+ * has exited without unlocking the futex. The kernel also does
+ * a FUTEX_WAKE on such futexes, after setting the bit, to wake
+ * up any possible waiters:
+ */
+#define FUTEX_OWNER_DIED	0x40000000
+
+/*
+ * Reserved bit:
+ */
+#define FUTEX_OWNER_PENDING	0x20000000
+
+/*
+ * The rest of the robust-futex field is for the TID:
+ */
+#define FUTEX_TID_MASK		0x1fffffff
+
+/*
+ * A limit of one million locks held per thread (!) ought to be enough
+ * for some time. This also protects against a deliberately circular
+ * list. Not worth introducing an rlimit for this:
+ */
+#define ROBUST_LIST_LIMIT	1048576
+
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
+extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
+
+#ifdef CONFIG_FUTEX
+extern void exit_robust_list(struct task_struct *curr);
+#else
+static inline void exit_robust_list(struct task_struct *curr)
+{
+}
+#endif
+
 #define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
 #define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
 #define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 036d14d2bf90..fd4848f2d750 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@
 #include <linux/topology.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
+#include <linux/futex.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -872,6 +873,8 @@ struct task_struct {
 	int cpuset_mems_generation;
 	int cpuset_mem_spread_rotor;
 #endif
+	struct robust_list_head __user *robust_list;
+
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
 };
diff --git a/include/linux/threads.h b/include/linux/threads.h
index b59738ac6197..e646bcdf2614 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -28,7 +28,8 @@
 #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
 
 /*
- * A maximum of 4 million PIDs should be enough for a while:
+ * A maximum of 4 million PIDs should be enough for a while.
+ * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
  */
 #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
 	(sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
diff --git a/kernel/exit.c b/kernel/exit.c
index 8037405e136e..aecb48ca7370 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
+#include <linux/futex.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -852,6 +853,8 @@ fastcall NORET_TYPE void do_exit(long code)
 		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
+	if (unlikely(tsk->robust_list))
+		exit_robust_list(tsk);
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f978032..feb724b2554e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
  *  Removed page pinning, fix privately mapped COW pages and other cleanups
  *  (C) Copyright 2003, 2004 Jamie Lokier
  *
+ *  Robust futex support started by Ingo Molnar
+ *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,174 @@ error:
 	goto out;
 }
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list - set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+asmlinkage long
+sys_set_robust_list(struct robust_list_head __user *head,
+		    size_t len)
+{
+	/*
+	 * The kernel knows only one size for now:
+	 */
+	if (unlikely(len != sizeof(*head)))
+		return -EINVAL;
+
+	current->robust_list = head;
+
+	return 0;
+}
+
+/**
+ * sys_get_robust_list - get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+asmlinkage long
+sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+		    size_t __user *len_ptr)
+{
+	struct robust_list_head *head;
+	unsigned long ret;
+
+	if (!pid)
+		head = current->robust_list;
+	else {
+		struct task_struct *p;
+
+		ret = -ESRCH;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (!p)
+			goto err_unlock;
+		ret = -EPERM;
+		if ((current->euid != p->euid) && (current->euid != p->uid) &&
+				!capable(CAP_SYS_PTRACE))
+			goto err_unlock;
+		head = p->robust_list;
+		read_unlock(&tasklist_lock);
+	}
+
+	if (put_user(sizeof(*head), len_ptr))
+		return -EFAULT;
+	return put_user(head, head_ptr);
+
+err_unlock:
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
+{
+	unsigned int futex_val;
+
+repeat:
+	if (get_user(futex_val, uaddr))
+		return -1;
+
+	if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
+		/*
+		 * Ok, this dying thread is truly holding a futex
+		 * of interest. Set the OWNER_DIED bit atomically
+		 * via cmpxchg, and if the value had FUTEX_WAITERS
+		 * set, wake up a waiter (if any). (We have to do a
+		 * futex_wake() even if OWNER_DIED is already set -
+		 * to handle the rare but possible case of recursive
+		 * thread-death.) The rest of the cleanup is done in
+		 * userspace.
+		 */
+		if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
+					 futex_val | FUTEX_OWNER_DIED) !=
+								   futex_val)
+			goto repeat;
+
+		if (futex_val & FUTEX_WAITERS)
+			futex_wake((unsigned long)uaddr, 1);
+	}
+	return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void exit_robust_list(struct task_struct *curr)
+{
+	struct robust_list_head __user *head = curr->robust_list;
+	struct robust_list __user *entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT;
+	unsigned long futex_offset;
+
+	/*
+	 * Fetch the list head (which was registered earlier, via
+	 * sys_set_robust_list()):
+	 */
+	if (get_user(entry, &head->list.next))
+		return;
+	/*
+	 * Fetch the relative futex offset:
+	 */
+	if (get_user(futex_offset, &head->futex_offset))
+		return;
+	/*
+	 * Fetch any possibly pending lock-add first, and handle it
+	 * if it exists:
+	 */
+	if (get_user(pending, &head->list_op_pending))
+		return;
+	if (pending)
+		handle_futex_death((void *)pending + futex_offset, curr);
+
+	while (entry != &head->list) {
+		/*
+		 * A pending lock might already be on the list, so
+		 * dont process it twice:
+		 */
+		if (entry != pending)
+			if (handle_futex_death((void *)entry + futex_offset,
+						curr))
+				return;
+
+		/*
+		 * Fetch the next entry in the list:
+		 */
+		if (get_user(entry, &entry->next))
+			return;
+		/*
+		 * Avoid excessively long or circular lists:
+		 */
+		if (!--limit)
+			break;
+
+		cond_resched();
+	}
+}
+
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 		unsigned long uaddr2, int val2, int val3)
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6b1..d82864c4a617 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
-- 
cgit v1.2.3


From 34f192c6527f20c47ccec239e7d51a27691b93fc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:24 -0800
Subject: [PATCH] lightweight robust futexes: compat

32-bit syscall compatibility support.  (This patch also moves all futex
related compat functionality into kernel/futex_compat.c.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compat.h |  18 +++++++
 include/linux/sched.h  |   3 ++
 kernel/Makefile        |   3 ++
 kernel/compat.c        |  23 --------
 kernel/exit.c          |   5 ++
 kernel/futex_compat.c  | 142 +++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 171 insertions(+), 23 deletions(-)
 create mode 100644 kernel/futex_compat.c

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 24d659cdbafe..6d3a654be1ae 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -147,6 +147,24 @@ typedef struct compat_sigevent {
 	} _sigev_un;
 } compat_sigevent_t;
 
+struct compat_robust_list {
+	compat_uptr_t			next;
+};
+
+struct compat_robust_list_head {
+	struct compat_robust_list	list;
+	compat_long_t			futex_offset;
+	compat_uptr_t			list_op_pending;
+};
+
+extern void compat_exit_robust_list(struct task_struct *curr);
+
+asmlinkage long
+compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+			   compat_size_t len);
+asmlinkage long
+compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
+			   compat_size_t __user *len_ptr);
 
 long compat_sys_semctl(int first, int second, int third, void __user *uptr);
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fd4848f2d750..20b4f0372e44 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -874,6 +874,9 @@ struct task_struct {
 	int cpuset_mem_spread_rotor;
 #endif
 	struct robust_list_head __user *robust_list;
+#ifdef CONFIG_COMPAT
+	struct compat_robust_list_head __user *compat_robust_list;
+#endif
 
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
diff --git a/kernel/Makefile b/kernel/Makefile
index ff1c11dc12cf..58908f9d156a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/compat.c b/kernel/compat.c
index b9bdd1271f44..c1601a84f8d8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,7 +17,6 @@
 #include <linux/time.h>
 #include <linux/signal.h>
 #include <linux/sched.h>	/* for MAX_SCHEDULE_TIMEOUT */
-#include <linux/futex.h>	/* for FUTEX_WAIT */
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/security.h>
@@ -239,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
 	return ret;
 }
 
-#ifdef CONFIG_FUTEX
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
-		struct compat_timespec __user *utime, u32 __user *uaddr2,
-		int val3)
-{
-	struct timespec t;
-	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
-	int val2 = 0;
-
-	if ((op == FUTEX_WAIT) && utime) {
-		if (get_compat_timespec(&t, utime))
-			return -EFAULT;
-		timeout = timespec_to_jiffies(&t) + 1;
-	}
-	if (op >= FUTEX_REQUEUE)
-		val2 = (int) (unsigned long) utime;
-
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
-}
-#endif
-
 asmlinkage long compat_sys_setrlimit(unsigned int resource,
 		struct compat_rlimit __user *rlim)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index aecb48ca7370..a8c7efc7a681 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,6 +32,7 @@
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
 #include <linux/futex.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -855,6 +856,10 @@ fastcall NORET_TYPE void do_exit(long code)
 	}
 	if (unlikely(tsk->robust_list))
 		exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+	if (unlikely(tsk->compat_robust_list))
+		compat_exit_robust_list(tsk);
+#endif
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 000000000000..c153559ef289
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,142 @@
+/*
+ * linux/kernel/futex_compat.c
+ *
+ * Futex compatibililty routines.
+ *
+ * Copyright 2006, Red Hat, Inc., Ingo Molnar
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/futex.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+	struct compat_robust_list_head __user *head = curr->compat_robust_list;
+	struct robust_list __user *entry, *pending;
+	compat_uptr_t uentry, upending;
+	unsigned int limit = ROBUST_LIST_LIMIT;
+	compat_long_t futex_offset;
+
+	/*
+	 * Fetch the list head (which was registered earlier, via
+	 * sys_set_robust_list()):
+	 */
+	if (get_user(uentry, &head->list.next))
+		return;
+	entry = compat_ptr(uentry);
+	/*
+	 * Fetch the relative futex offset:
+	 */
+	if (get_user(futex_offset, &head->futex_offset))
+		return;
+	/*
+	 * Fetch any possibly pending lock-add first, and handle it
+	 * if it exists:
+	 */
+	if (get_user(upending, &head->list_op_pending))
+		return;
+	pending = compat_ptr(upending);
+	if (upending)
+		handle_futex_death((void *)pending + futex_offset, curr);
+
+	while (compat_ptr(uentry) != &head->list) {
+		/*
+		 * A pending lock might already be on the list, so
+		 * dont process it twice:
+		 */
+		if (entry != pending)
+			if (handle_futex_death((void *)entry + futex_offset,
+						curr))
+				return;
+
+		/*
+		 * Fetch the next entry in the list:
+		 */
+		if (get_user(uentry, (compat_uptr_t *)&entry->next))
+			return;
+		entry = compat_ptr(uentry);
+		/*
+		 * Avoid excessively long or circular lists:
+		 */
+		if (!--limit)
+			break;
+
+		cond_resched();
+	}
+}
+
+asmlinkage long
+compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+			   compat_size_t len)
+{
+	if (unlikely(len != sizeof(*head)))
+		return -EINVAL;
+
+	current->compat_robust_list = head;
+
+	return 0;
+}
+
+asmlinkage long
+compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
+			   compat_size_t __user *len_ptr)
+{
+	struct compat_robust_list_head *head;
+	unsigned long ret;
+
+	if (!pid)
+		head = current->compat_robust_list;
+	else {
+		struct task_struct *p;
+
+		ret = -ESRCH;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (!p)
+			goto err_unlock;
+		ret = -EPERM;
+		if ((current->euid != p->euid) && (current->euid != p->uid) &&
+				!capable(CAP_SYS_PTRACE))
+			goto err_unlock;
+		head = p->compat_robust_list;
+		read_unlock(&tasklist_lock);
+	}
+
+	if (put_user(sizeof(*head), len_ptr))
+		return -EFAULT;
+	return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
+		struct compat_timespec __user *utime, u32 __user *uaddr2,
+		int val3)
+{
+	struct timespec t;
+	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+	int val2 = 0;
+
+	if ((op == FUTEX_WAIT) && utime) {
+		if (get_compat_timespec(&t, utime))
+			return -EFAULT;
+		timeout = timespec_to_jiffies(&t) + 1;
+	}
+	if (op >= FUTEX_REQUEUE)
+		val2 = (int) (unsigned long) utime;
+
+	return do_futex((unsigned long)uaddr, op, val, timeout,
+			(unsigned long)uaddr2, val2, val3);
+}
-- 
cgit v1.2.3


From 8f17d3a5049d32392b79925c73a0cf99ce6d5af0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:27 -0800
Subject: [PATCH] lightweight robust futexes updates

- fix: initialize the robust list(s) to NULL in copy_process.

- doc update

- cleanup: rename _inuser to _inatomic

- __user cleanups and other small cleanups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/robust-futex-ABI.txt |  2 --
 Documentation/robust-futexes.txt   |  2 +-
 include/asm-frv/futex.h            |  2 +-
 include/asm-generic/futex.h        |  2 +-
 include/asm-i386/futex.h           |  2 +-
 include/asm-mips/futex.h           |  2 +-
 include/asm-powerpc/futex.h        |  2 +-
 include/asm-x86_64/futex.h         |  2 +-
 include/linux/futex.h              |  2 +-
 kernel/fork.c                      |  5 ++++-
 kernel/futex.c                     | 20 +++++++++-----------
 kernel/futex_compat.c              |  7 +++----
 12 files changed, 24 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt
index def5d8735286..8529a17ffaa1 100644
--- a/Documentation/robust-futex-ABI.txt
+++ b/Documentation/robust-futex-ABI.txt
@@ -142,8 +142,6 @@ On insertion:
     of the 'lock word', to the linked list starting at 'head', and
  4) clear the 'list_op_pending' word.
 
-	XXX I am particularly unsure of the following -pj XXX
-
 On removal:
  1) set the 'list_op_pending' word to the address of the 'lock word'
     to be removed,
diff --git a/Documentation/robust-futexes.txt b/Documentation/robust-futexes.txt
index 7aecc67b1361..df82d75245a0 100644
--- a/Documentation/robust-futexes.txt
+++ b/Documentation/robust-futexes.txt
@@ -213,6 +213,6 @@ robust-mutex testcases.
 All other architectures should build just fine too - but they wont have
 the new syscalls yet.
 
-Architectures need to implement the new futex_atomic_cmpxchg_inuser()
+Architectures need to implement the new futex_atomic_cmpxchg_inatomic()
 inline function before writing up the syscalls (that function returns
 -ENOSYS right now).
diff --git a/include/asm-frv/futex.h b/include/asm-frv/futex.h
index 9a0e9026ba5e..08b3d1da3583 100644
--- a/include/asm-frv/futex.h
+++ b/include/asm-frv/futex.h
@@ -10,7 +10,7 @@
 extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr);
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index 514bd401cd7e..df893c160318 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -50,7 +50,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-i386/futex.h b/include/asm-i386/futex.h
index 41184a31885c..7b8ceefd010f 100644
--- a/include/asm-i386/futex.h
+++ b/include/asm-i386/futex.h
@@ -105,7 +105,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
diff --git a/include/asm-mips/futex.h b/include/asm-mips/futex.h
index c5fb2d6d918a..a554089991f2 100644
--- a/include/asm-mips/futex.h
+++ b/include/asm-mips/futex.h
@@ -100,7 +100,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-powerpc/futex.h b/include/asm-powerpc/futex.h
index 80ed9854e42b..f1b3c00bc1ce 100644
--- a/include/asm-powerpc/futex.h
+++ b/include/asm-powerpc/futex.h
@@ -82,7 +82,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-x86_64/futex.h b/include/asm-x86_64/futex.h
index 7d9eb1a84546..9804bf07b092 100644
--- a/include/asm-x86_64/futex.h
+++ b/include/asm-x86_64/futex.h
@@ -95,7 +95,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 20face6b798d..55fff96ae859 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -100,7 +100,7 @@ long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
-extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
+extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
diff --git a/kernel/fork.c b/kernel/fork.c
index e0a2b449dea6..c49bd193b058 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,7 +1061,10 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
-
+	p->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+	p->compat_robust_list = NULL;
+#endif
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/futex.c b/kernel/futex.c
index feb724b2554e..9c9b2b6b22dd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -913,15 +913,15 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	unsigned int futex_val;
+	u32 uval;
 
-repeat:
-	if (get_user(futex_val, uaddr))
+retry:
+	if (get_user(uval, uaddr))
 		return -1;
 
-	if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
+	if ((uval & FUTEX_TID_MASK) == curr->pid) {
 		/*
 		 * Ok, this dying thread is truly holding a futex
 		 * of interest. Set the OWNER_DIED bit atomically
@@ -932,12 +932,11 @@ repeat:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
-					 futex_val | FUTEX_OWNER_DIED) !=
-								   futex_val)
-			goto repeat;
+		if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
+					 uval | FUTEX_OWNER_DIED) != uval)
+			goto retry;
 
-		if (futex_val & FUTEX_WAITERS)
+		if (uval & FUTEX_WAITERS)
 			futex_wake((unsigned long)uaddr, 1);
 	}
 	return 0;
@@ -985,7 +984,6 @@ void exit_robust_list(struct task_struct *curr)
 			if (handle_futex_death((void *)entry + futex_offset,
 						curr))
 				return;
-
 		/*
 		 * Fetch the next entry in the list:
 		 */
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index c153559ef289..9c077cf9aa84 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -121,9 +121,9 @@ err_unlock:
 	return ret;
 }
 
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 		struct compat_timespec __user *utime, u32 __user *uaddr2,
-		int val3)
+		u32 val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -137,6 +137,5 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
 	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
-- 
cgit v1.2.3


From 76b81e2b0e2241accebcc68e126bc5ab958661b9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:28 -0800
Subject: [PATCH] lightweight robust futexes updates 2

futex.h updates:

- get rid of FUTEX_OWNER_PENDING - it's not used
- reduce ROBUST_LIST_LIMIT to a saner value

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/futex.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 55fff96ae859..966a5b3da439 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -79,22 +79,16 @@ struct robust_list_head {
  */
 #define FUTEX_OWNER_DIED	0x40000000
 
-/*
- * Reserved bit:
- */
-#define FUTEX_OWNER_PENDING	0x20000000
-
 /*
  * The rest of the robust-futex field is for the TID:
  */
-#define FUTEX_TID_MASK		0x1fffffff
+#define FUTEX_TID_MASK		0x3fffffff
 
 /*
- * A limit of one million locks held per thread (!) ought to be enough
- * for some time. This also protects against a deliberately circular
- * list. Not worth introducing an rlimit for this:
+ * This limit protects against a deliberately circular list.
+ * (Not worth introducing an rlimit for it)
  */
-#define ROBUST_LIST_LIMIT	1048576
+#define ROBUST_LIST_LIMIT	2048
 
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
-- 
cgit v1.2.3


From e041c683412d5bf44dc2b109053e3b837b71742d Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 27 Mar 2006 01:16:30 -0800
Subject: [PATCH] Notifier chain update: API changes

The kernel's implementation of notifier chains is unsafe.  There is no
protection against entries being added to or removed from a chain while the
chain is in use.  The issues were discussed in this thread:

    http://marc.theaimsgroup.com/?l=linux-kernel&m=113018709002036&w=2

We noticed that notifier chains in the kernel fall into two basic usage
classes:

	"Blocking" chains are always called from a process context
	and the callout routines are allowed to sleep;

	"Atomic" chains can be called from an atomic context and
	the callout routines are not allowed to sleep.

We decided to codify this distinction and make it part of the API.  Therefore
this set of patches introduces three new, parallel APIs: one for blocking
notifiers, one for atomic notifiers, and one for "raw" notifiers (which is
really just the old API under a new name).  New kinds of data structures are
used for the heads of the chains, and new routines are defined for
registration, unregistration, and calling a chain.  The three APIs are
explained in include/linux/notifier.h and their implementation is in
kernel/sys.c.

With atomic and blocking chains, the implementation guarantees that the chain
links will not be corrupted and that chain callers will not get messed up by
entries being added or removed.  For raw chains the implementation provides no
guarantees at all; users of this API must provide their own protections.  (The
idea was that situations may come up where the assumptions of the atomic and
blocking APIs are not appropriate, so it should be possible for users to
handle these things in their own way.)

There are some limitations, which should not be too hard to live with.  For
atomic/blocking chains, registration and unregistration must always be done in
a process context since the chain is protected by a mutex/rwsem.  Also, a
callout routine for a non-raw chain must not try to register or unregister
entries on its own chain.  (This did happen in a couple of places and the code
had to be changed to avoid it.)

Since atomic chains may be called from within an NMI handler, they cannot use
spinlocks for synchronization.  Instead we use RCU.  The overhead falls almost
entirely in the unregister routine, which is okay since unregistration is much
less frequent that calling a chain.

Here is the list of chains that we adjusted and their classifications.  None
of them use the raw API, so for the moment it is only a placeholder.

  ATOMIC CHAINS
  -------------
arch/i386/kernel/traps.c:		i386die_chain
arch/ia64/kernel/traps.c:		ia64die_chain
arch/powerpc/kernel/traps.c:		powerpc_die_chain
arch/sparc64/kernel/traps.c:		sparc64die_chain
arch/x86_64/kernel/traps.c:		die_chain
drivers/char/ipmi/ipmi_si_intf.c:	xaction_notifier_list
kernel/panic.c:				panic_notifier_list
kernel/profile.c:			task_free_notifier
net/bluetooth/hci_core.c:		hci_notifier
net/ipv4/netfilter/ip_conntrack_core.c:	ip_conntrack_chain
net/ipv4/netfilter/ip_conntrack_core.c:	ip_conntrack_expect_chain
net/ipv6/addrconf.c:			inet6addr_chain
net/netfilter/nf_conntrack_core.c:	nf_conntrack_chain
net/netfilter/nf_conntrack_core.c:	nf_conntrack_expect_chain
net/netlink/af_netlink.c:		netlink_chain

  BLOCKING CHAINS
  ---------------
arch/powerpc/platforms/pseries/reconfig.c:	pSeries_reconfig_chain
arch/s390/kernel/process.c:		idle_chain
arch/x86_64/kernel/process.c		idle_notifier
drivers/base/memory.c:			memory_chain
drivers/cpufreq/cpufreq.c		cpufreq_policy_notifier_list
drivers/cpufreq/cpufreq.c		cpufreq_transition_notifier_list
drivers/macintosh/adb.c:		adb_client_list
drivers/macintosh/via-pmu.c		sleep_notifier_list
drivers/macintosh/via-pmu68k.c		sleep_notifier_list
drivers/macintosh/windfarm_core.c	wf_client_list
drivers/usb/core/notify.c		usb_notifier_list
drivers/video/fbmem.c			fb_notifier_list
kernel/cpu.c				cpu_chain
kernel/module.c				module_notify_list
kernel/profile.c			munmap_notifier
kernel/profile.c			task_exit_notifier
kernel/sys.c				reboot_notifier_list
net/core/dev.c				netdev_chain
net/decnet/dn_dev.c:			dnaddr_chain
net/ipv4/devinet.c:			inetaddr_chain

It's possible that some of these classifications are wrong.  If they are,
please let us know or submit a patch to fix them.  Note that any chain that
gets called very frequently should be atomic, because the rwsem read-locking
used for blocking chains is very likely to incur cache misses on SMP systems.
(However, if the chain's callout routines may sleep then the chain cannot be
atomic.)

The patch set was written by Alan Stern and Chandra Seetharaman, incorporating
material written by Keith Owens and suggestions from Paul McKenney and Andrew
Morton.

[jes@sgi.com: restructure the notifier chain initialization macros]
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/setup.c                   |   5 +-
 arch/arm/mach-omap1/board-netstar.c         |   2 +-
 arch/arm/mach-omap1/board-voiceblue.c       |   2 +-
 arch/i386/kernel/traps.c                    |  17 +-
 arch/ia64/kernel/traps.c                    |   6 +-
 arch/mips/lasat/setup.c                     |   3 +-
 arch/mips/sgi-ip22/ip22-reset.c             |   2 +-
 arch/mips/sgi-ip32/ip32-reset.c             |   2 +-
 arch/parisc/kernel/pdc_chassis.c            |   3 +-
 arch/powerpc/kernel/setup_64.c              |   3 +-
 arch/powerpc/kernel/traps.c                 |  16 +-
 arch/powerpc/platforms/pseries/reconfig.c   |  10 +-
 arch/ppc/platforms/prep_setup.c             |   2 +-
 arch/s390/kernel/process.c                  |  11 +-
 arch/sparc64/kernel/traps.c                 |  17 +-
 arch/um/drivers/mconsole_kern.c             |   3 +-
 arch/um/kernel/um_arch.c                    |   3 +-
 arch/x86_64/kernel/process.c                |  17 +-
 arch/x86_64/kernel/traps.c                  |  18 +-
 arch/xtensa/platform-iss/setup.c            |   2 +-
 drivers/base/memory.c                       |   8 +-
 drivers/char/ipmi/ipmi_msghandler.c         |   4 +-
 drivers/char/ipmi/ipmi_si_intf.c            |   7 +-
 drivers/char/ipmi/ipmi_watchdog.c           |   6 +-
 drivers/cpufreq/cpufreq.c                   |  61 +++---
 drivers/firmware/dcdbas.c                   |  19 +-
 drivers/macintosh/adb.c                     |  11 +-
 drivers/macintosh/adbhid.c                  |   3 +-
 drivers/macintosh/via-pmu.c                 |   2 +-
 drivers/macintosh/via-pmu68k.c              |   7 +-
 drivers/macintosh/windfarm_core.c           |   8 +-
 drivers/misc/ibmasm/heartbeat.c             |   5 +-
 drivers/net/bonding/bond_main.c             |   2 +-
 drivers/parisc/led.c                        |  14 +-
 drivers/parisc/power.c                      |   6 +-
 drivers/scsi/gdth.c                         |   9 +-
 drivers/usb/core/notify.c                   |  65 +-----
 drivers/video/fbmem.c                       |  31 +--
 include/asm-i386/kdebug.h                   |  10 +-
 include/asm-ia64/kdebug.h                   |   4 +-
 include/asm-powerpc/kdebug.h                |  12 +-
 include/asm-sparc64/kdebug.h                |  11 +-
 include/asm-x86_64/kdebug.h                 |  23 +-
 include/linux/adb.h                         |   2 +-
 include/linux/kernel.h                      |   2 +-
 include/linux/memory.h                      |   1 -
 include/linux/netfilter_ipv4/ip_conntrack.h |  17 +-
 include/linux/notifier.h                    |  96 +++++++-
 include/net/netfilter/nf_conntrack.h        |  17 +-
 kernel/cpu.c                                |  29 +--
 kernel/module.c                             |  20 +-
 kernel/panic.c                              |   4 +-
 kernel/profile.c                            |  53 ++---
 kernel/softlockup.c                         |   2 +-
 kernel/sys.c                                | 327 ++++++++++++++++++++++------
 net/bluetooth/hci_core.c                    |   8 +-
 net/core/dev.c                              |  42 ++--
 net/decnet/dn_dev.c                         |  10 +-
 net/ipv4/devinet.c                          |  16 +-
 net/ipv4/netfilter/ip_conntrack_core.c      |   6 +-
 net/ipv6/addrconf.c                         |  10 +-
 net/netfilter/nf_conntrack_core.c           |   6 +-
 net/netlink/af_netlink.c                    |   9 +-
 63 files changed, 677 insertions(+), 472 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 9402624453c2..dd8769670596 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -43,7 +43,7 @@
 #include <asm/setup.h>
 #include <asm/io.h>
 
-extern struct notifier_block *panic_notifier_list;
+extern struct atomic_notifier_head panic_notifier_list;
 static int alpha_panic_event(struct notifier_block *, unsigned long, void *);
 static struct notifier_block alpha_panic_block = {
 	alpha_panic_event,
@@ -500,7 +500,8 @@ setup_arch(char **cmdline_p)
 	}
 
 	/* Register a call for panic conditions. */
-	notifier_chain_register(&panic_notifier_list, &alpha_panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&alpha_panic_block);
 
 #ifdef CONFIG_ALPHA_GENERIC
 	/* Assume that we've booted from SRM if we haven't booted from MILO.
diff --git a/arch/arm/mach-omap1/board-netstar.c b/arch/arm/mach-omap1/board-netstar.c
index 60d5f8a3339c..7520e602d7a2 100644
--- a/arch/arm/mach-omap1/board-netstar.c
+++ b/arch/arm/mach-omap1/board-netstar.c
@@ -141,7 +141,7 @@ static int __init netstar_late_init(void)
 	/* TODO: Setup front panel switch here */
 
 	/* Setup panic notifier */
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	return 0;
 }
diff --git a/arch/arm/mach-omap1/board-voiceblue.c b/arch/arm/mach-omap1/board-voiceblue.c
index bfd5fdd1a875..52e4a9d69642 100644
--- a/arch/arm/mach-omap1/board-voiceblue.c
+++ b/arch/arm/mach-omap1/board-voiceblue.c
@@ -235,7 +235,7 @@ static struct notifier_block panic_block = {
 static int __init voiceblue_setup(void)
 {
 	/* Setup panic notifier */
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	return 0;
 }
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 4624f8ca2459..6b63a5aa1e46 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -92,22 +92,21 @@ asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
 static int kstack_depth_to_print = 24;
-struct notifier_block *i386die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(i386die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-
 	vmalloc_sync_all();
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&i386die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&i386die_chain, nb);
 }
 EXPORT_SYMBOL(register_die_notifier);
 
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&i386die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier);
+
 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 {
 	return	p > (void *)tinfo &&
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index dabd6c32641e..7c1ddc8ac443 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -30,19 +30,19 @@ extern spinlock_t timerlist_lock;
 fpswa_interface_t *fpswa_interface;
 EXPORT_SYMBOL(fpswa_interface);
 
-struct notifier_block *ia64die_chain;
+ATOMIC_NOTIFIER_HEAD(ia64die_chain);
 
 int
 register_die_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&ia64die_chain, nb);
+	return atomic_notifier_chain_register(&ia64die_chain, nb);
 }
 EXPORT_SYMBOL_GPL(register_die_notifier);
 
 int
 unregister_die_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&ia64die_chain, nb);
+	return atomic_notifier_chain_unregister(&ia64die_chain, nb);
 }
 EXPORT_SYMBOL_GPL(unregister_die_notifier);
 
diff --git a/arch/mips/lasat/setup.c b/arch/mips/lasat/setup.c
index 83eb08b7a072..e9e9a89c6741 100644
--- a/arch/mips/lasat/setup.c
+++ b/arch/mips/lasat/setup.c
@@ -165,7 +165,8 @@ void __init plat_setup(void)
 
 	/* Set up panic notifier */
 	for (i = 0; i < sizeof(lasat_panic_block) / sizeof(struct notifier_block); i++)
-		notifier_chain_register(&panic_notifier_list, &lasat_panic_block[i]);
+		atomic_notifier_chain_register(&panic_notifier_list,
+				&lasat_panic_block[i]);
 
 	lasat_reboot_setup();
 
diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c
index 92a3b3c15ed3..a9c58e067b53 100644
--- a/arch/mips/sgi-ip22/ip22-reset.c
+++ b/arch/mips/sgi-ip22/ip22-reset.c
@@ -238,7 +238,7 @@ static int __init reboot_setup(void)
 	request_irq(SGI_PANEL_IRQ, panel_int, 0, "Front Panel", NULL);
 	init_timer(&blink_timer);
 	blink_timer.function = blink_timeout;
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	return 0;
 }
diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c
index 0c948008b023..ab9d9cef089e 100644
--- a/arch/mips/sgi-ip32/ip32-reset.c
+++ b/arch/mips/sgi-ip32/ip32-reset.c
@@ -193,7 +193,7 @@ static __init int ip32_reboot_setup(void)
 
 	init_timer(&blink_timer);
 	blink_timer.function = blink_timeout;
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	request_irq(MACEISA_RTC_IRQ, ip32_rtc_int, 0, "rtc", NULL);
 
diff --git a/arch/parisc/kernel/pdc_chassis.c b/arch/parisc/kernel/pdc_chassis.c
index 2a01fe1bdc98..0cea6958f427 100644
--- a/arch/parisc/kernel/pdc_chassis.c
+++ b/arch/parisc/kernel/pdc_chassis.c
@@ -150,7 +150,8 @@ void __init parisc_pdc_chassis_init(void)
 
 		if (handle) {
 			/* initialize panic notifier chain */
-			notifier_chain_register(&panic_notifier_list, &pdc_chassis_panic_block);
+			atomic_notifier_chain_register(&panic_notifier_list,
+					&pdc_chassis_panic_block);
 
 			/* initialize reboot notifier chain */
 			register_reboot_notifier(&pdc_chassis_reboot_block);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 2f3fdad35594..e20c1fae3423 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -579,7 +579,8 @@ void __init setup_arch(char **cmdline_p)
 	panic_timeout = 180;
 
 	if (ppc_md.panic)
-		notifier_chain_register(&panic_notifier_list, &ppc64_panic_block);
+		atomic_notifier_chain_register(&panic_notifier_list,
+				&ppc64_panic_block);
 
 	init_mm.start_code = PAGE_OFFSET;
 	init_mm.end_code = (unsigned long) _etext;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 98660aedeeb7..9763faab6739 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -74,19 +74,19 @@ EXPORT_SYMBOL(__debugger_dabr_match);
 EXPORT_SYMBOL(__debugger_fault_handler);
 #endif
 
-struct notifier_block *powerpc_die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(powerpc_die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
+	return atomic_notifier_chain_register(&powerpc_die_chain, nb);
+}
+EXPORT_SYMBOL(register_die_notifier);
 
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&powerpc_die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&powerpc_die_chain, nb);
 }
+EXPORT_SYMBOL(unregister_die_notifier);
 
 /*
  * Trap & Exception support
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index 86cfa6ecdcf3..5ad90676567a 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -94,16 +94,16 @@ static struct device_node *derive_parent(const char *path)
 	return parent;
 }
 
-static struct notifier_block *pSeries_reconfig_chain;
+static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
 
 int pSeries_reconfig_notifier_register(struct notifier_block *nb)
 {
-	return notifier_chain_register(&pSeries_reconfig_chain, nb);
+	return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb);
 }
 
 void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
 {
-	notifier_chain_unregister(&pSeries_reconfig_chain, nb);
+	blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb);
 }
 
 static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
@@ -131,7 +131,7 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist
 		goto out_err;
 	}
 
-	err = notifier_call_chain(&pSeries_reconfig_chain,
+	err = blocking_notifier_call_chain(&pSeries_reconfig_chain,
 				  PSERIES_RECONFIG_ADD, np);
 	if (err == NOTIFY_BAD) {
 		printk(KERN_ERR "Failed to add device node %s\n", path);
@@ -171,7 +171,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np)
 
 	remove_node_proc_entries(np);
 
-	notifier_call_chain(&pSeries_reconfig_chain,
+	blocking_notifier_call_chain(&pSeries_reconfig_chain,
 			    PSERIES_RECONFIG_REMOVE, np);
 	of_detach_node(np);
 
diff --git a/arch/ppc/platforms/prep_setup.c b/arch/ppc/platforms/prep_setup.c
index a0fc628ffb1e..d95c05d9824d 100644
--- a/arch/ppc/platforms/prep_setup.c
+++ b/arch/ppc/platforms/prep_setup.c
@@ -736,7 +736,7 @@ ibm_statusled_progress(char *s, unsigned short hex)
 		hex = 0xfff;
 		if (!notifier_installed) {
 			++notifier_installed;
-			notifier_chain_register(&panic_notifier_list,
+			atomic_notifier_chain_register(&panic_notifier_list,
 						&ibm_statusled_block);
 		}
 	}
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 99182a415fe7..4a0f5a1551ea 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -76,17 +76,17 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * Need to know about CPUs going idle?
  */
-static struct notifier_block *idle_chain;
+static ATOMIC_NOTIFIER_HEAD(idle_chain);
 
 int register_idle_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&idle_chain, nb);
+	return atomic_notifier_chain_register(&idle_chain, nb);
 }
 EXPORT_SYMBOL(register_idle_notifier);
 
 int unregister_idle_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&idle_chain, nb);
+	return atomic_notifier_chain_unregister(&idle_chain, nb);
 }
 EXPORT_SYMBOL(unregister_idle_notifier);
 
@@ -95,7 +95,7 @@ void do_monitor_call(struct pt_regs *regs, long interruption_code)
 	/* disable monitor call class 0 */
 	__ctl_clear_bit(8, 15);
 
-	notifier_call_chain(&idle_chain, CPU_NOT_IDLE,
+	atomic_notifier_call_chain(&idle_chain, CPU_NOT_IDLE,
 			    (void *)(long) smp_processor_id());
 }
 
@@ -116,7 +116,8 @@ static void default_idle(void)
 		return;
 	}
 
-	rc = notifier_call_chain(&idle_chain, CPU_IDLE, (void *)(long) cpu);
+	rc = atomic_notifier_call_chain(&idle_chain,
+			CPU_IDLE, (void *)(long) cpu);
 	if (rc != NOTIFY_OK && rc != NOTIFY_DONE)
 		BUG();
 	if (rc != NOTIFY_OK) {
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index df612e4f75f9..ff090bb9734b 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -43,18 +43,19 @@
 #include <linux/kmod.h>
 #endif
 
-struct notifier_block *sparc64die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(sparc64die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&sparc64die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&sparc64die_chain, nb);
 }
+EXPORT_SYMBOL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&sparc64die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier);
 
 /* When an irrecoverable trap occurs at tl > 0, the trap entry
  * code logs the trap state registers at every level in the trap
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 54388d10bcf9..1488816588ea 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -762,7 +762,8 @@ static struct notifier_block panic_exit_notifier = {
 
 static int add_notifier(void)
 {
-	notifier_chain_register(&panic_notifier_list, &panic_exit_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&panic_exit_notifier);
 	return(0);
 }
 
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index bb1c87211ac1..7d51dd7201c3 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -477,7 +477,8 @@ static struct notifier_block panic_exit_notifier = {
 
 void __init setup_arch(char **cmdline_p)
 {
-	notifier_chain_register(&panic_notifier_list, &panic_exit_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&panic_exit_notifier);
 	paging_init();
         strlcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
  	*cmdline_p = command_line;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 0370720515f1..70dd8e5c6889 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -66,24 +66,17 @@ EXPORT_SYMBOL(boot_option_idle_override);
 void (*pm_idle)(void);
 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
-static struct notifier_block *idle_notifier;
-static DEFINE_SPINLOCK(idle_notifier_lock);
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 
 void idle_notifier_register(struct notifier_block *n)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&idle_notifier_lock, flags);
-	notifier_chain_register(&idle_notifier, n);
-	spin_unlock_irqrestore(&idle_notifier_lock, flags);
+	atomic_notifier_chain_register(&idle_notifier, n);
 }
 EXPORT_SYMBOL_GPL(idle_notifier_register);
 
 void idle_notifier_unregister(struct notifier_block *n)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&idle_notifier_lock, flags);
-	notifier_chain_unregister(&idle_notifier, n);
-	spin_unlock_irqrestore(&idle_notifier_lock, flags);
+	atomic_notifier_chain_unregister(&idle_notifier, n);
 }
 EXPORT_SYMBOL(idle_notifier_unregister);
 
@@ -93,13 +86,13 @@ static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
 void enter_idle(void)
 {
 	__get_cpu_var(idle_state) = CPU_IDLE;
-	notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 }
 
 static void __exit_idle(void)
 {
 	__get_cpu_var(idle_state) = CPU_NOT_IDLE;
-	notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
 
 /* Called from interrupts to signify idle end */
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 7b148309c529..edaa9fe654dc 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -69,20 +69,20 @@ asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 
-struct notifier_block *die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-
 	vmalloc_sync_all();
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&die_chain, nb);
 }
+EXPORT_SYMBOL(unregister_die_notifier);
 
 static inline void conditional_sti(struct pt_regs *regs)
 {
diff --git a/arch/xtensa/platform-iss/setup.c b/arch/xtensa/platform-iss/setup.c
index 2e6dcbf0cc04..23790a5610e2 100644
--- a/arch/xtensa/platform-iss/setup.c
+++ b/arch/xtensa/platform-iss/setup.c
@@ -108,5 +108,5 @@ static struct notifier_block iss_panic_block = {
 
 void __init platform_setup(char **p_cmdline)
 {
-	notifier_chain_register(&panic_notifier_list, &iss_panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &iss_panic_block);
 }
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 105a0d61eb1f..dd547af4681a 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -47,16 +47,16 @@ static struct kset_uevent_ops memory_uevent_ops = {
 	.uevent		= memory_uevent,
 };
 
-static struct notifier_block *memory_chain;
+static BLOCKING_NOTIFIER_HEAD(memory_chain);
 
 int register_memory_notifier(struct notifier_block *nb)
 {
-        return notifier_chain_register(&memory_chain, nb);
+        return blocking_notifier_chain_register(&memory_chain, nb);
 }
 
 void unregister_memory_notifier(struct notifier_block *nb)
 {
-        notifier_chain_unregister(&memory_chain, nb);
+        blocking_notifier_chain_unregister(&memory_chain, nb);
 }
 
 /*
@@ -140,7 +140,7 @@ static ssize_t show_mem_state(struct sys_device *dev, char *buf)
 
 static inline int memory_notify(unsigned long val, void *v)
 {
-	return notifier_call_chain(&memory_chain, val, v);
+	return blocking_notifier_call_chain(&memory_chain, val, v);
 }
 
 /*
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index b8fb87c6c29f..40eb005b9d77 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -3744,7 +3744,7 @@ static int ipmi_init_msghandler(void)
 	ipmi_timer.expires = jiffies + IPMI_TIMEOUT_JIFFIES;
 	add_timer(&ipmi_timer);
 
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	initialized = 1;
 
@@ -3764,7 +3764,7 @@ static __exit void cleanup_ipmi(void)
 	if (!initialized)
 		return;
 
-	notifier_chain_unregister(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_unregister(&panic_notifier_list, &panic_block);
 
 	/* This can't be called if any interfaces exist, so no worry about
 	   shutting down the interfaces. */
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 12f858dc9994..35fbd4d8ed4b 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -237,10 +237,10 @@ struct smi_info
 
 static int try_smi_init(struct smi_info *smi);
 
-static struct notifier_block *xaction_notifier_list;
+static ATOMIC_NOTIFIER_HEAD(xaction_notifier_list);
 static int register_xaction_notifier(struct notifier_block * nb)
 {
-	return notifier_chain_register(&xaction_notifier_list, nb);
+	return atomic_notifier_chain_register(&xaction_notifier_list, nb);
 }
 
 static void si_restart_short_timer(struct smi_info *smi_info);
@@ -302,7 +302,8 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info)
 		do_gettimeofday(&t);
 		printk("**Start2: %d.%9.9d\n", t.tv_sec, t.tv_usec);
 #endif
-		err = notifier_call_chain(&xaction_notifier_list, 0, smi_info);
+		err = atomic_notifier_call_chain(&xaction_notifier_list,
+				0, smi_info);
 		if (err & NOTIFY_STOP_MASK) {
 			rv = SI_SM_CALL_WITHOUT_DELAY;
 			goto out;
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 616539310d9a..7ece9f3c8f70 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -1158,7 +1158,8 @@ static int __init ipmi_wdog_init(void)
 	}
 
 	register_reboot_notifier(&wdog_reboot_notifier);
-	notifier_chain_register(&panic_notifier_list, &wdog_panic_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&wdog_panic_notifier);
 
 	printk(KERN_INFO PFX "driver initialized\n");
 
@@ -1176,7 +1177,8 @@ static __exit void ipmi_unregister_watchdog(void)
 		release_nmi(&ipmi_nmi_handler);
 #endif
 
-	notifier_chain_unregister(&panic_notifier_list, &wdog_panic_notifier);
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+			&wdog_panic_notifier);
 	unregister_reboot_notifier(&wdog_reboot_notifier);
 
 	if (! watchdog_user)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index aed80e6aec6d..9b6ae7dc8b8a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -52,9 +52,8 @@ static void handle_update(void *data);
  * changes to devices when the CPU clock speed changes.
  * The mutex locks both lists.
  */
-static struct notifier_block *cpufreq_policy_notifier_list;
-static struct notifier_block *cpufreq_transition_notifier_list;
-static DECLARE_RWSEM (cpufreq_notifier_rwsem);
+static BLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list);
+static BLOCKING_NOTIFIER_HEAD(cpufreq_transition_notifier_list);
 
 
 static LIST_HEAD(cpufreq_governor_list);
@@ -247,8 +246,6 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
 	dprintk("notification %u of frequency transition to %u kHz\n",
 		state, freqs->new);
 
-	down_read(&cpufreq_notifier_rwsem);
-
 	policy = cpufreq_cpu_data[freqs->cpu];
 	switch (state) {
 
@@ -266,20 +263,19 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
 				freqs->old = policy->cur;
 			}
 		}
-		notifier_call_chain(&cpufreq_transition_notifier_list,
-					CPUFREQ_PRECHANGE, freqs);
+		blocking_notifier_call_chain(&cpufreq_transition_notifier_list,
+				CPUFREQ_PRECHANGE, freqs);
 		adjust_jiffies(CPUFREQ_PRECHANGE, freqs);
 		break;
 
 	case CPUFREQ_POSTCHANGE:
 		adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
-		notifier_call_chain(&cpufreq_transition_notifier_list,
-					CPUFREQ_POSTCHANGE, freqs);
+		blocking_notifier_call_chain(&cpufreq_transition_notifier_list,
+				CPUFREQ_POSTCHANGE, freqs);
 		if (likely(policy) && likely(policy->cpu == freqs->cpu))
 			policy->cur = freqs->new;
 		break;
 	}
-	up_read(&cpufreq_notifier_rwsem);
 }
 EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
 
@@ -1007,7 +1003,7 @@ static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
 		freqs.old = cpu_policy->cur;
 		freqs.new = cur_freq;
 
-		notifier_call_chain(&cpufreq_transition_notifier_list,
+		blocking_notifier_call_chain(&cpufreq_transition_notifier_list,
 				    CPUFREQ_SUSPENDCHANGE, &freqs);
 		adjust_jiffies(CPUFREQ_SUSPENDCHANGE, &freqs);
 
@@ -1088,7 +1084,8 @@ static int cpufreq_resume(struct sys_device * sysdev)
 			freqs.old = cpu_policy->cur;
 			freqs.new = cur_freq;
 
-			notifier_call_chain(&cpufreq_transition_notifier_list,
+			blocking_notifier_call_chain(
+					&cpufreq_transition_notifier_list,
 					CPUFREQ_RESUMECHANGE, &freqs);
 			adjust_jiffies(CPUFREQ_RESUMECHANGE, &freqs);
 
@@ -1125,24 +1122,24 @@ static struct sysdev_driver cpufreq_sysdev_driver = {
  *      changes in cpufreq policy.
  *
  *	This function may sleep, and has the same return conditions as
- *	notifier_chain_register.
+ *	blocking_notifier_chain_register.
  */
 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list)
 {
 	int ret;
 
-	down_write(&cpufreq_notifier_rwsem);
 	switch (list) {
 	case CPUFREQ_TRANSITION_NOTIFIER:
-		ret = notifier_chain_register(&cpufreq_transition_notifier_list, nb);
+		ret = blocking_notifier_chain_register(
+				&cpufreq_transition_notifier_list, nb);
 		break;
 	case CPUFREQ_POLICY_NOTIFIER:
-		ret = notifier_chain_register(&cpufreq_policy_notifier_list, nb);
+		ret = blocking_notifier_chain_register(
+				&cpufreq_policy_notifier_list, nb);
 		break;
 	default:
 		ret = -EINVAL;
 	}
-	up_write(&cpufreq_notifier_rwsem);
 
 	return ret;
 }
@@ -1157,24 +1154,24 @@ EXPORT_SYMBOL(cpufreq_register_notifier);
  *	Remove a driver from the CPU frequency notifier list.
  *
  *	This function may sleep, and has the same return conditions as
- *	notifier_chain_unregister.
+ *	blocking_notifier_chain_unregister.
  */
 int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list)
 {
 	int ret;
 
-	down_write(&cpufreq_notifier_rwsem);
 	switch (list) {
 	case CPUFREQ_TRANSITION_NOTIFIER:
-		ret = notifier_chain_unregister(&cpufreq_transition_notifier_list, nb);
+		ret = blocking_notifier_chain_unregister(
+				&cpufreq_transition_notifier_list, nb);
 		break;
 	case CPUFREQ_POLICY_NOTIFIER:
-		ret = notifier_chain_unregister(&cpufreq_policy_notifier_list, nb);
+		ret = blocking_notifier_chain_unregister(
+				&cpufreq_policy_notifier_list, nb);
 		break;
 	default:
 		ret = -EINVAL;
 	}
-	up_write(&cpufreq_notifier_rwsem);
 
 	return ret;
 }
@@ -1346,29 +1343,23 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_poli
 	if (ret)
 		goto error_out;
 
-	down_read(&cpufreq_notifier_rwsem);
-
 	/* adjust if necessary - all reasons */
-	notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_ADJUST,
-			    policy);
+	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+			CPUFREQ_ADJUST, policy);
 
 	/* adjust if necessary - hardware incompatibility*/
-	notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_INCOMPATIBLE,
-			    policy);
+	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+			CPUFREQ_INCOMPATIBLE, policy);
 
 	/* verify the cpu speed can be set within this limit,
 	   which might be different to the first one */
 	ret = cpufreq_driver->verify(policy);
-	if (ret) {
-		up_read(&cpufreq_notifier_rwsem);
+	if (ret)
 		goto error_out;
-	}
 
 	/* notification of the new policy */
-	notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY,
-			    policy);
-
-	up_read(&cpufreq_notifier_rwsem);
+	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+			CPUFREQ_NOTIFY, policy);
 
 	data->min = policy->min;
 	data->max = policy->max;
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index d6543fc4a923..339f405ff708 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -484,26 +484,15 @@ static void dcdbas_host_control(void)
 static int dcdbas_reboot_notify(struct notifier_block *nb, unsigned long code,
 				void *unused)
 {
-	static unsigned int notify_cnt = 0;
-
 	switch (code) {
 	case SYS_DOWN:
 	case SYS_HALT:
 	case SYS_POWER_OFF:
 		if (host_control_on_shutdown) {
 			/* firmware is going to perform host control action */
-			if (++notify_cnt == 2) {
-				printk(KERN_WARNING
-				       "Please wait for shutdown "
-				       "action to complete...\n");
-				dcdbas_host_control();
-			}
-			/*
-			 * register again and initiate the host control
-			 * action on the second notification to allow
-			 * everyone that registered to be notified
-			 */
-			register_reboot_notifier(nb);
+			printk(KERN_WARNING "Please wait for shutdown "
+			       "action to complete...\n");
+			dcdbas_host_control();
 		}
 		break;
 	}
@@ -514,7 +503,7 @@ static int dcdbas_reboot_notify(struct notifier_block *nb, unsigned long code,
 static struct notifier_block dcdbas_reboot_nb = {
 	.notifier_call = dcdbas_reboot_notify,
 	.next = NULL,
-	.priority = 0
+	.priority = INT_MIN
 };
 
 static DCDBAS_BIN_ATTR_RW(smi_data);
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index d2ead1776c16..34fcabac5fdb 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -80,7 +80,7 @@ static struct adb_driver *adb_driver_list[] = {
 static struct class *adb_dev_class;
 
 struct adb_driver *adb_controller;
-struct notifier_block *adb_client_list = NULL;
+BLOCKING_NOTIFIER_HEAD(adb_client_list);
 static int adb_got_sleep;
 static int adb_inited;
 static pid_t adb_probe_task_pid;
@@ -354,7 +354,8 @@ adb_notify_sleep(struct pmu_sleep_notifier *self, int when)
 		/* Stop autopoll */
 		if (adb_controller->autopoll)
 			adb_controller->autopoll(0);
-		ret = notifier_call_chain(&adb_client_list, ADB_MSG_POWERDOWN, NULL);
+		ret = blocking_notifier_call_chain(&adb_client_list,
+				ADB_MSG_POWERDOWN, NULL);
 		if (ret & NOTIFY_STOP_MASK) {
 			up(&adb_probe_mutex);
 			return PBOOK_SLEEP_REFUSE;
@@ -391,7 +392,8 @@ do_adb_reset_bus(void)
 	if (adb_controller->autopoll)
 		adb_controller->autopoll(0);
 
-	nret = notifier_call_chain(&adb_client_list, ADB_MSG_PRE_RESET, NULL);
+	nret = blocking_notifier_call_chain(&adb_client_list,
+			ADB_MSG_PRE_RESET, NULL);
 	if (nret & NOTIFY_STOP_MASK) {
 		if (adb_controller->autopoll)
 			adb_controller->autopoll(autopoll_devs);
@@ -426,7 +428,8 @@ do_adb_reset_bus(void)
 	}
 	up(&adb_handler_sem);
 
-	nret = notifier_call_chain(&adb_client_list, ADB_MSG_POST_RESET, NULL);
+	nret = blocking_notifier_call_chain(&adb_client_list,
+			ADB_MSG_POST_RESET, NULL);
 	if (nret & NOTIFY_STOP_MASK)
 		return -EBUSY;
 	
diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index c0b46bceb5df..f5779a73184d 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -1214,7 +1214,8 @@ static int __init adbhid_init(void)
 
 	adbhid_probe();
 
-	notifier_chain_register(&adb_client_list, &adbhid_adb_notifier);
+	blocking_notifier_chain_register(&adb_client_list,
+			&adbhid_adb_notifier);
 
 	return 0;
 }
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 4f5f3abc9cb3..0b5ff553e39a 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -187,7 +187,7 @@ extern int disable_kernel_backlight;
 
 int __fake_sleep;
 int asleep;
-struct notifier_block *sleep_notifier_list;
+BLOCKING_NOTIFIER_HEAD(sleep_notifier_list);
 
 #ifdef CONFIG_ADB
 static int adb_dev_map = 0;
diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c
index f08e52f2107b..35b70323e7e3 100644
--- a/drivers/macintosh/via-pmu68k.c
+++ b/drivers/macintosh/via-pmu68k.c
@@ -102,7 +102,7 @@ static int pmu_kind = PMU_UNKNOWN;
 static int pmu_fully_inited = 0;
 
 int asleep;
-struct notifier_block *sleep_notifier_list;
+BLOCKING_NOTIFIER_HEAD(sleep_notifier_list);
 
 static int pmu_probe(void);
 static int pmu_init(void);
@@ -913,7 +913,8 @@ int powerbook_sleep(void)
 	struct adb_request sleep_req;
 
 	/* Notify device drivers */
-	ret = notifier_call_chain(&sleep_notifier_list, PBOOK_SLEEP, NULL);
+	ret = blocking_notifier_call_chain(&sleep_notifier_list,
+			PBOOK_SLEEP, NULL);
 	if (ret & NOTIFY_STOP_MASK)
 		return -EBUSY;
 
@@ -984,7 +985,7 @@ int powerbook_sleep(void)
 			enable_irq(i);
 
 	/* Notify drivers */
-	notifier_call_chain(&sleep_notifier_list, PBOOK_WAKE, NULL);
+	blocking_notifier_call_chain(&sleep_notifier_list, PBOOK_WAKE, NULL);
 
 	/* reenable ADB autopoll */
 	pmu_adb_autopoll(adb_dev_map);
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index 6c0ba04bc57a..ab3faa702d58 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -52,7 +52,7 @@
 static LIST_HEAD(wf_controls);
 static LIST_HEAD(wf_sensors);
 static DEFINE_MUTEX(wf_lock);
-static struct notifier_block *wf_client_list;
+static BLOCKING_NOTIFIER_HEAD(wf_client_list);
 static int wf_client_count;
 static unsigned int wf_overtemp;
 static unsigned int wf_overtemp_counter;
@@ -68,7 +68,7 @@ static struct platform_device wf_platform_device = {
 
 static inline void wf_notify(int event, void *param)
 {
-	notifier_call_chain(&wf_client_list, event, param);
+	blocking_notifier_call_chain(&wf_client_list, event, param);
 }
 
 int wf_critical_overtemp(void)
@@ -398,7 +398,7 @@ int wf_register_client(struct notifier_block *nb)
 	struct wf_sensor *sr;
 
 	mutex_lock(&wf_lock);
-	rc = notifier_chain_register(&wf_client_list, nb);
+	rc = blocking_notifier_chain_register(&wf_client_list, nb);
 	if (rc != 0)
 		goto bail;
 	wf_client_count++;
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(wf_register_client);
 int wf_unregister_client(struct notifier_block *nb)
 {
 	mutex_lock(&wf_lock);
-	notifier_chain_unregister(&wf_client_list, nb);
+	blocking_notifier_chain_unregister(&wf_client_list, nb);
 	wf_client_count++;
 	if (wf_client_count == 0)
 		wf_stop_thread();
diff --git a/drivers/misc/ibmasm/heartbeat.c b/drivers/misc/ibmasm/heartbeat.c
index f295401fac21..7fd7a43e38de 100644
--- a/drivers/misc/ibmasm/heartbeat.c
+++ b/drivers/misc/ibmasm/heartbeat.c
@@ -52,12 +52,13 @@ static struct notifier_block panic_notifier = { panic_happened, NULL, 1 };
 
 void ibmasm_register_panic_notifier(void)
 {
-	notifier_chain_register(&panic_notifier_list, &panic_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_notifier);
 }
 
 void ibmasm_unregister_panic_notifier(void)
 {
-	notifier_chain_unregister(&panic_notifier_list, &panic_notifier);
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+			&panic_notifier);
 }
 
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2d0ac169a86c..f13a539dc169 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3159,7 +3159,7 @@ static int bond_slave_netdev_event(unsigned long event, struct net_device *slave
  * bond_netdev_event: handle netdev notifier chain events.
  *
  * This function receives events for the netdev chain.  The caller (an
- * ioctl handler calling notifier_call_chain) holds the necessary
+ * ioctl handler calling blocking_notifier_call_chain) holds the necessary
  * locks for us to safely manipulate the slave devices (RTNL lock,
  * dev_probe_lock).
  */
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 3627a2d7f79f..298f2ddb2c17 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -499,11 +499,16 @@ static int led_halt(struct notifier_block *, unsigned long, void *);
 static struct notifier_block led_notifier = {
 	.notifier_call = led_halt,
 };
+static int notifier_disabled = 0;
 
 static int led_halt(struct notifier_block *nb, unsigned long event, void *buf) 
 {
 	char *txt;
-	
+
+	if (notifier_disabled)
+		return NOTIFY_OK;
+
+	notifier_disabled = 1;
 	switch (event) {
 	case SYS_RESTART:	txt = "SYSTEM RESTART";
 				break;
@@ -527,7 +532,6 @@ static int led_halt(struct notifier_block *nb, unsigned long event, void *buf)
 		if (led_func_ptr)
 			led_func_ptr(0xff); /* turn all LEDs ON */
 	
-	unregister_reboot_notifier(&led_notifier);
 	return NOTIFY_OK;
 }
 
@@ -758,6 +762,12 @@ not_found:
 	return 1;
 }
 
+static void __exit led_exit(void)
+{
+	unregister_reboot_notifier(&led_notifier);
+	return;
+}
+
 #ifdef CONFIG_PROC_FS
 module_init(led_create_procfs)
 #endif
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index 54b2b7f20b96..0bcab83b4080 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -251,7 +251,8 @@ static int __init power_init(void)
 	}
 
 	/* Register a call for panic conditions. */
-	notifier_chain_register(&panic_notifier_list, &parisc_panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&parisc_panic_block);
 
 	tasklet_enable(&power_tasklet);
 
@@ -264,7 +265,8 @@ static void __exit power_exit(void)
 		return;
 
 	tasklet_disable(&power_tasklet);
-	notifier_chain_unregister(&panic_notifier_list, &parisc_panic_block);
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+			&parisc_panic_block);
 	power_tasklet.func = NULL;
 	pdc_soft_power_button(0);
 }
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index 62e3cda859af..7f7013e80a88 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -671,7 +671,7 @@ static struct file_operations gdth_fops = {
 static struct notifier_block gdth_notifier = {
     gdth_halt, NULL, 0
 };
-
+static int notifier_disabled = 0;
 
 static void gdth_delay(int milliseconds)
 {
@@ -4595,13 +4595,13 @@ static int __init gdth_detect(struct scsi_host_template *shtp)
         add_timer(&gdth_timer);
 #endif
         major = register_chrdev(0,"gdth",&gdth_fops);
+        notifier_disabled = 0;
         register_reboot_notifier(&gdth_notifier);
     }
     gdth_polling = FALSE;
     return gdth_ctr_vcount;
 }
 
-
 static int gdth_release(struct Scsi_Host *shp)
 {
     int hanum;
@@ -5632,10 +5632,14 @@ static int gdth_halt(struct notifier_block *nb, ulong event, void *buf)
     char            cmnd[MAX_COMMAND_SIZE];   
 #endif
 
+    if (notifier_disabled)
+    	return NOTIFY_OK;
+
     TRACE2(("gdth_halt() event %d\n",(int)event));
     if (event != SYS_RESTART && event != SYS_HALT && event != SYS_POWER_OFF)
         return NOTIFY_DONE;
 
+    notifier_disabled = 1;
     printk("GDT-HA: Flushing all host drives .. ");
     for (hanum = 0; hanum < gdth_ctr_count; ++hanum) {
         gdth_flush(hanum);
@@ -5679,7 +5683,6 @@ static int gdth_halt(struct notifier_block *nb, ulong event, void *buf)
 #ifdef GDTH_STATISTICS
     del_timer(&gdth_timer);
 #endif
-    unregister_reboot_notifier(&gdth_notifier);
     return NOTIFY_OK;
 }
 
diff --git a/drivers/usb/core/notify.c b/drivers/usb/core/notify.c
index 4b55285de9a0..fe0ed54fa0ae 100644
--- a/drivers/usb/core/notify.c
+++ b/drivers/usb/core/notify.c
@@ -16,57 +16,7 @@
 #include <linux/mutex.h>
 #include "usb.h"
 
-
-static struct notifier_block *usb_notifier_list;
-static DEFINE_MUTEX(usb_notifier_lock);
-
-static void usb_notifier_chain_register(struct notifier_block **list,
-					struct notifier_block *n)
-{
-	mutex_lock(&usb_notifier_lock);
-	while (*list) {
-		if (n->priority > (*list)->priority)
-			break;
-		list = &((*list)->next);
-	}
-	n->next = *list;
-	*list = n;
-	mutex_unlock(&usb_notifier_lock);
-}
-
-static void usb_notifier_chain_unregister(struct notifier_block **nl,
-				   struct notifier_block *n)
-{
-	mutex_lock(&usb_notifier_lock);
-	while ((*nl)!=NULL) {
-		if ((*nl)==n) {
-			*nl = n->next;
-			goto exit;
-		}
-		nl=&((*nl)->next);
-	}
-exit:
-	mutex_unlock(&usb_notifier_lock);
-}
-
-static int usb_notifier_call_chain(struct notifier_block **n,
-				   unsigned long val, void *v)
-{
-	int ret=NOTIFY_DONE;
-	struct notifier_block *nb = *n;
-
-	mutex_lock(&usb_notifier_lock);
-	while (nb) {
-		ret = nb->notifier_call(nb,val,v);
-		if (ret&NOTIFY_STOP_MASK) {
-			goto exit;
-		}
-		nb = nb->next;
-	}
-exit:
-	mutex_unlock(&usb_notifier_lock);
-	return ret;
-}
+static BLOCKING_NOTIFIER_HEAD(usb_notifier_list);
 
 /**
  * usb_register_notify - register a notifier callback whenever a usb change happens
@@ -76,7 +26,7 @@ exit:
  */
 void usb_register_notify(struct notifier_block *nb)
 {
-	usb_notifier_chain_register(&usb_notifier_list, nb);
+	blocking_notifier_chain_register(&usb_notifier_list, nb);
 }
 EXPORT_SYMBOL_GPL(usb_register_notify);
 
@@ -89,27 +39,28 @@ EXPORT_SYMBOL_GPL(usb_register_notify);
  */
 void usb_unregister_notify(struct notifier_block *nb)
 {
-	usb_notifier_chain_unregister(&usb_notifier_list, nb);
+	blocking_notifier_chain_unregister(&usb_notifier_list, nb);
 }
 EXPORT_SYMBOL_GPL(usb_unregister_notify);
 
 
 void usb_notify_add_device(struct usb_device *udev)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev);
+	blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev);
 }
 
 void usb_notify_remove_device(struct usb_device *udev)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_DEVICE_REMOVE, udev);
+	blocking_notifier_call_chain(&usb_notifier_list,
+			USB_DEVICE_REMOVE, udev);
 }
 
 void usb_notify_add_bus(struct usb_bus *ubus)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus);
+	blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus);
 }
 
 void usb_notify_remove_bus(struct usb_bus *ubus)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus);
+	blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus);
 }
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 07d882b14396..b1a8dca76430 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -55,7 +55,7 @@
 
 #define FBPIXMAPSIZE	(1024 * 8)
 
-static struct notifier_block *fb_notifier_list;
+static BLOCKING_NOTIFIER_HEAD(fb_notifier_list);
 struct fb_info *registered_fb[FB_MAX];
 int num_registered_fb;
 
@@ -784,7 +784,7 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 
 		    event.info = info;
 		    event.data = &mode1;
-		    ret = notifier_call_chain(&fb_notifier_list,
+		    ret = blocking_notifier_call_chain(&fb_notifier_list,
 					      FB_EVENT_MODE_DELETE, &event);
 		}
 
@@ -830,8 +830,8 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 
 				info->flags &= ~FBINFO_MISC_USEREVENT;
 				event.info = info;
-				notifier_call_chain(&fb_notifier_list, evnt,
-						    &event);
+				blocking_notifier_call_chain(&fb_notifier_list,
+						evnt, &event);
 			}
 		}
 	}
@@ -854,7 +854,8 @@ fb_blank(struct fb_info *info, int blank)
 
 		event.info = info;
 		event.data = &blank;
-		notifier_call_chain(&fb_notifier_list, FB_EVENT_BLANK, &event);
+		blocking_notifier_call_chain(&fb_notifier_list,
+				FB_EVENT_BLANK, &event);
 	}
 
  	return ret;
@@ -925,7 +926,7 @@ fb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 		con2fb.framebuffer = -1;
 		event.info = info;
 		event.data = &con2fb;
-		notifier_call_chain(&fb_notifier_list,
+		blocking_notifier_call_chain(&fb_notifier_list,
 				    FB_EVENT_GET_CONSOLE_MAP, &event);
 		return copy_to_user(argp, &con2fb,
 				    sizeof(con2fb)) ? -EFAULT : 0;
@@ -944,7 +945,7 @@ fb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 		    return -EINVAL;
 		event.info = info;
 		event.data = &con2fb;
-		return notifier_call_chain(&fb_notifier_list,
+		return blocking_notifier_call_chain(&fb_notifier_list,
 					   FB_EVENT_SET_CONSOLE_MAP,
 					   &event);
 	case FBIOBLANK:
@@ -1324,7 +1325,7 @@ register_framebuffer(struct fb_info *fb_info)
 	devfs_mk_cdev(MKDEV(FB_MAJOR, i),
 			S_IFCHR | S_IRUGO | S_IWUGO, "fb/%d", i);
 	event.info = fb_info;
-	notifier_call_chain(&fb_notifier_list,
+	blocking_notifier_call_chain(&fb_notifier_list,
 			    FB_EVENT_FB_REGISTERED, &event);
 	return 0;
 }
@@ -1366,7 +1367,7 @@ unregister_framebuffer(struct fb_info *fb_info)
  */
 int fb_register_client(struct notifier_block *nb)
 {
-	return notifier_chain_register(&fb_notifier_list, nb);
+	return blocking_notifier_chain_register(&fb_notifier_list, nb);
 }
 
 /**
@@ -1375,7 +1376,7 @@ int fb_register_client(struct notifier_block *nb)
  */
 int fb_unregister_client(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&fb_notifier_list, nb);
+	return blocking_notifier_chain_unregister(&fb_notifier_list, nb);
 }
 
 /**
@@ -1393,11 +1394,13 @@ void fb_set_suspend(struct fb_info *info, int state)
 
 	event.info = info;
 	if (state) {
-		notifier_call_chain(&fb_notifier_list, FB_EVENT_SUSPEND, &event);
+		blocking_notifier_call_chain(&fb_notifier_list,
+				FB_EVENT_SUSPEND, &event);
 		info->state = FBINFO_STATE_SUSPENDED;
 	} else {
 		info->state = FBINFO_STATE_RUNNING;
-		notifier_call_chain(&fb_notifier_list, FB_EVENT_RESUME, &event);
+		blocking_notifier_call_chain(&fb_notifier_list,
+				FB_EVENT_RESUME, &event);
 	}
 }
 
@@ -1469,7 +1472,7 @@ int fb_new_modelist(struct fb_info *info)
 
 	if (!list_empty(&info->modelist)) {
 		event.info = info;
-		err = notifier_call_chain(&fb_notifier_list,
+		err = blocking_notifier_call_chain(&fb_notifier_list,
 					   FB_EVENT_NEW_MODELIST,
 					   &event);
 	}
@@ -1495,7 +1498,7 @@ int fb_con_duit(struct fb_info *info, int event, void *data)
 	evnt.info = info;
 	evnt.data = data;
 
-	return notifier_call_chain(&fb_notifier_list, event, &evnt);
+	return blocking_notifier_call_chain(&fb_notifier_list, event, &evnt);
 }
 EXPORT_SYMBOL(fb_con_duit);
 
diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h
index 316138e89910..96d0828ce096 100644
--- a/include/asm-i386/kdebug.h
+++ b/include/asm-i386/kdebug.h
@@ -17,11 +17,9 @@ struct die_args {
 	int signr;
 };
 
-/* Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_sched - then free.
-  */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *i386die_chain;
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head i386die_chain;
 
 
 /* Grossly misnamed. */
@@ -51,7 +49,7 @@ static inline int notify_die(enum die_val val, const char *str,
 		.trapnr = trap,
 		.signr = sig
 	};
-	return notifier_call_chain(&i386die_chain, val, &args);
+	return atomic_notifier_call_chain(&i386die_chain, val, &args);
 }
 
 #endif
diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h
index 8b01a083dde6..218c458ab60c 100644
--- a/include/asm-ia64/kdebug.h
+++ b/include/asm-ia64/kdebug.h
@@ -40,7 +40,7 @@ struct die_args {
 
 extern int register_die_notifier(struct notifier_block *);
 extern int unregister_die_notifier(struct notifier_block *);
-extern struct notifier_block *ia64die_chain;
+extern struct atomic_notifier_head ia64die_chain;
 
 enum die_val {
 	DIE_BREAK = 1,
@@ -81,7 +81,7 @@ static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs,
 		.signr  = sig
 	};
 
-	return notifier_call_chain(&ia64die_chain, val, &args);
+	return atomic_notifier_call_chain(&ia64die_chain, val, &args);
 }
 
 #endif
diff --git a/include/asm-powerpc/kdebug.h b/include/asm-powerpc/kdebug.h
index 7c16265568e0..c01786ab5fa6 100644
--- a/include/asm-powerpc/kdebug.h
+++ b/include/asm-powerpc/kdebug.h
@@ -16,13 +16,9 @@ struct die_args {
 	int signr;
 };
 
-/*
-   Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_sched -
-   then free.
- */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *powerpc_die_chain;
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head powerpc_die_chain;
 
 /* Grossly misnamed. */
 enum die_val {
@@ -37,7 +33,7 @@ enum die_val {
 static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
 {
 	struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
-	return notifier_call_chain(&powerpc_die_chain, val, &args);
+	return atomic_notifier_call_chain(&powerpc_die_chain, val, &args);
 }
 
 #endif /* __KERNEL__ */
diff --git a/include/asm-sparc64/kdebug.h b/include/asm-sparc64/kdebug.h
index 6321f5a0198d..4040d127ac3e 100644
--- a/include/asm-sparc64/kdebug.h
+++ b/include/asm-sparc64/kdebug.h
@@ -15,12 +15,9 @@ struct die_args {
 	int signr;
 };
 
-/* Note - you should never unregister because that can race with NMIs.
- * If you really want to do it first unregister - then synchronize_sched
- * - then free.
- */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *sparc64die_chain;
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head sparc64die_chain;
 
 extern void bad_trap(struct pt_regs *, long);
 
@@ -46,7 +43,7 @@ static inline int notify_die(enum die_val val,char *str, struct pt_regs *regs,
 				 .trapnr	= trap,
 				 .signr		= sig };
 
-	return notifier_call_chain(&sparc64die_chain, val, &args);
+	return atomic_notifier_call_chain(&sparc64die_chain, val, &args);
 }
 
 #endif
diff --git a/include/asm-x86_64/kdebug.h b/include/asm-x86_64/kdebug.h
index b9ed4c0c8783..cf795631d9b4 100644
--- a/include/asm-x86_64/kdebug.h
+++ b/include/asm-x86_64/kdebug.h
@@ -5,21 +5,20 @@
 
 struct pt_regs;
 
-struct die_args { 
+struct die_args {
 	struct pt_regs *regs;
 	const char *str;
-	long err; 
+	long err;
 	int trapnr;
 	int signr;
-}; 
+};
+
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head die_chain;
 
-/* Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_sched - then free.
-  */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *die_chain;
 /* Grossly misnamed. */
-enum die_val { 
+enum die_val {
 	DIE_OOPS = 1,
 	DIE_INT3,
 	DIE_DEBUG,
@@ -33,8 +32,8 @@ enum die_val {
 	DIE_CALL,
 	DIE_NMI_IPI,
 	DIE_PAGE_FAULT,
-}; 
-	
+};
+
 static inline int notify_die(enum die_val val, const char *str,
 			struct pt_regs *regs, long err, int trap, int sig)
 {
@@ -45,7 +44,7 @@ static inline int notify_die(enum die_val val, const char *str,
 		.trapnr = trap,
 		.signr = sig
 	};
-	return notifier_call_chain(&die_chain, val, &args); 
+	return atomic_notifier_call_chain(&die_chain, val, &args);
 } 
 
 extern int printk_address(unsigned long address);
diff --git a/include/linux/adb.h b/include/linux/adb.h
index e9fdc63483c7..b7305b178279 100644
--- a/include/linux/adb.h
+++ b/include/linux/adb.h
@@ -85,7 +85,7 @@ enum adb_message {
     ADB_MSG_POST_RESET	/* Called after resetting the bus (re-do init & register) */
 };
 extern struct adb_driver *adb_controller;
-extern struct notifier_block *adb_client_list;
+extern struct blocking_notifier_head adb_client_list;
 
 int adb_request(struct adb_request *req, void (*done)(struct adb_request *),
 		int flags, int nbytes, ...);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 03d6cfaa5b8a..a3720f973ea5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -87,7 +87,7 @@ extern int cond_resched(void);
 		(__x < 0) ? -__x : __x;		\
 	})
 
-extern struct notifier_block *panic_notifier_list;
+extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
 	__attribute__ ((NORET_AND format (printf, 1, 2)));
diff --git a/include/linux/memory.h b/include/linux/memory.h
index e251dc43d0f5..8f04143ca363 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -77,7 +77,6 @@ extern int remove_memory_block(unsigned long, struct mem_section *, int);
 
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 
-struct notifier_block;
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index f32d75c4f4cf..d54d7b278e96 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -308,29 +308,30 @@ DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
 #define CONNTRACK_ECACHE(x)	(__get_cpu_var(ip_conntrack_ecache).x)
  
-extern struct notifier_block *ip_conntrack_chain;
-extern struct notifier_block *ip_conntrack_expect_chain;
+extern struct atomic_notifier_head ip_conntrack_chain;
+extern struct atomic_notifier_head ip_conntrack_expect_chain;
 
 static inline int ip_conntrack_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&ip_conntrack_chain, nb);
+	return atomic_notifier_chain_register(&ip_conntrack_chain, nb);
 }
 
 static inline int ip_conntrack_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&ip_conntrack_chain, nb);
+	return atomic_notifier_chain_unregister(&ip_conntrack_chain, nb);
 }
 
 static inline int 
 ip_conntrack_expect_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&ip_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_register(&ip_conntrack_expect_chain, nb);
 }
 
 static inline int
 ip_conntrack_expect_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&ip_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_unregister(&ip_conntrack_expect_chain,
+			nb);
 }
 
 extern void ip_ct_deliver_cached_events(const struct ip_conntrack *ct);
@@ -355,14 +356,14 @@ static inline void ip_conntrack_event(enum ip_conntrack_events event,
 				      struct ip_conntrack *ct)
 {
 	if (is_confirmed(ct) && !is_dying(ct))
-		notifier_call_chain(&ip_conntrack_chain, event, ct);
+		atomic_notifier_call_chain(&ip_conntrack_chain, event, ct);
 }
 
 static inline void 
 ip_conntrack_expect_event(enum ip_conntrack_expect_events event,
 			  struct ip_conntrack_expect *exp)
 {
-	notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
+	atomic_notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
 }
 #else /* CONFIG_IP_NF_CONNTRACK_EVENTS */
 static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, 
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 5937dd6053c3..51dbab9710c7 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -10,25 +10,107 @@
 #ifndef _LINUX_NOTIFIER_H
 #define _LINUX_NOTIFIER_H
 #include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
 
-struct notifier_block
-{
-	int (*notifier_call)(struct notifier_block *self, unsigned long, void *);
+/*
+ * Notifier chains are of three types:
+ *
+ *	Atomic notifier chains: Chain callbacks run in interrupt/atomic
+ *		context. Callouts are not allowed to block.
+ *	Blocking notifier chains: Chain callbacks run in process context.
+ *		Callouts are allowed to block.
+ *	Raw notifier chains: There are no restrictions on callbacks,
+ *		registration, or unregistration.  All locking and protection
+ *		must be provided by the caller.
+ *
+ * atomic_notifier_chain_register() may be called from an atomic context,
+ * but blocking_notifier_chain_register() must be called from a process
+ * context.  Ditto for the corresponding _unregister() routines.
+ *
+ * atomic_notifier_chain_unregister() and blocking_notifier_chain_unregister()
+ * _must not_ be called from within the call chain.
+ */
+
+struct notifier_block {
+	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
 	struct notifier_block *next;
 	int priority;
 };
 
+struct atomic_notifier_head {
+	spinlock_t lock;
+	struct notifier_block *head;
+};
+
+struct blocking_notifier_head {
+	struct rw_semaphore rwsem;
+	struct notifier_block *head;
+};
+
+struct raw_notifier_head {
+	struct notifier_block *head;
+};
+
+#define ATOMIC_INIT_NOTIFIER_HEAD(name) do {	\
+		spin_lock_init(&(name)->lock);	\
+		(name)->head = NULL;		\
+	} while (0)
+#define BLOCKING_INIT_NOTIFIER_HEAD(name) do {	\
+		init_rwsem(&(name)->rwsem);	\
+		(name)->head = NULL;		\
+	} while (0)
+#define RAW_INIT_NOTIFIER_HEAD(name) do {	\
+		(name)->head = NULL;		\
+	} while (0)
+
+#define ATOMIC_NOTIFIER_INIT(name) {				\
+		.lock = SPIN_LOCK_UNLOCKED,			\
+		.head = NULL }
+#define BLOCKING_NOTIFIER_INIT(name) {				\
+		.rwsem = __RWSEM_INITIALIZER((name).rwsem),	\
+		.head = NULL }
+#define RAW_NOTIFIER_INIT(name)	{				\
+		.head = NULL }
+
+#define ATOMIC_NOTIFIER_HEAD(name)				\
+	struct atomic_notifier_head name =			\
+		ATOMIC_NOTIFIER_INIT(name)
+#define BLOCKING_NOTIFIER_HEAD(name)				\
+	struct blocking_notifier_head name =			\
+		BLOCKING_NOTIFIER_INIT(name)
+#define RAW_NOTIFIER_HEAD(name)					\
+	struct raw_notifier_head name =				\
+		RAW_NOTIFIER_INIT(name)
 
 #ifdef __KERNEL__
 
-extern int notifier_chain_register(struct notifier_block **list, struct notifier_block *n);
-extern int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n);
-extern int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v);
+extern int atomic_notifier_chain_register(struct atomic_notifier_head *,
+		struct notifier_block *);
+extern int blocking_notifier_chain_register(struct blocking_notifier_head *,
+		struct notifier_block *);
+extern int raw_notifier_chain_register(struct raw_notifier_head *,
+		struct notifier_block *);
+
+extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *,
+		struct notifier_block *);
+extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *,
+		struct notifier_block *);
+extern int raw_notifier_chain_unregister(struct raw_notifier_head *,
+		struct notifier_block *);
+
+extern int atomic_notifier_call_chain(struct atomic_notifier_head *,
+		unsigned long val, void *v);
+extern int blocking_notifier_call_chain(struct blocking_notifier_head *,
+		unsigned long val, void *v);
+extern int raw_notifier_call_chain(struct raw_notifier_head *,
+		unsigned long val, void *v);
 
 #define NOTIFY_DONE		0x0000		/* Don't care */
 #define NOTIFY_OK		0x0001		/* Suits me */
 #define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
-#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)	/* Bad/Veto action	*/
+#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)
+						/* Bad/Veto action */
 /*
  * Clean way to return from the notifier and stop further calls.
  */
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index b6f0905a4ee2..916013ca4a5c 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -300,29 +300,30 @@ DECLARE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
 
 #define CONNTRACK_ECACHE(x)	(__get_cpu_var(nf_conntrack_ecache).x)
 
-extern struct notifier_block *nf_conntrack_chain;
-extern struct notifier_block *nf_conntrack_expect_chain;
+extern struct atomic_notifier_head nf_conntrack_chain;
+extern struct atomic_notifier_head nf_conntrack_expect_chain;
 
 static inline int nf_conntrack_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&nf_conntrack_chain, nb);
+	return atomic_notifier_chain_register(&nf_conntrack_chain, nb);
 }
 
 static inline int nf_conntrack_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&nf_conntrack_chain, nb);
+	return atomic_notifier_chain_unregister(&nf_conntrack_chain, nb);
 }
 
 static inline int
 nf_conntrack_expect_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&nf_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_register(&nf_conntrack_expect_chain, nb);
 }
 
 static inline int
 nf_conntrack_expect_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&nf_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_unregister(&nf_conntrack_expect_chain,
+			nb);
 }
 
 extern void nf_ct_deliver_cached_events(const struct nf_conn *ct);
@@ -347,14 +348,14 @@ static inline void nf_conntrack_event(enum ip_conntrack_events event,
 				      struct nf_conn *ct)
 {
 	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct))
-		notifier_call_chain(&nf_conntrack_chain, event, ct);
+		atomic_notifier_call_chain(&nf_conntrack_chain, event, ct);
 }
 
 static inline void
 nf_conntrack_expect_event(enum ip_conntrack_expect_events event,
 			  struct nf_conntrack_expect *exp)
 {
-	notifier_call_chain(&nf_conntrack_expect_chain, event, exp);
+	atomic_notifier_call_chain(&nf_conntrack_expect_chain, event, exp);
 }
 #else /* CONFIG_NF_CONNTRACK_EVENTS */
 static inline void nf_conntrack_event_cache(enum ip_conntrack_events event,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8be22bd80933..fe2b8d0bfe4c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
 /* This protects CPUs going up and down... */
 static DECLARE_MUTEX(cpucontrol);
 
-static struct notifier_block *cpu_chain;
+static BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
 static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
-	int ret;
-
-	if ((ret = lock_cpu_hotplug_interruptible()) != 0)
-		return ret;
-	ret = notifier_chain_register(&cpu_chain, nb);
-	unlock_cpu_hotplug();
-	return ret;
+	return blocking_notifier_chain_register(&cpu_chain, nb);
 }
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
-	lock_cpu_hotplug();
-	notifier_chain_unregister(&cpu_chain, nb);
-	unlock_cpu_hotplug();
+	blocking_notifier_chain_unregister(&cpu_chain, nb);
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
 		goto out;
 	}
 
-	err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
 	if (IS_ERR(p)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+		if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
 				(void *)(long)cpu) == NOTIFY_BAD)
 			BUG();
 
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
 	put_cpu();
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
-	if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
-	    == NOTIFY_BAD)
+	if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
+			(void *)(long)cpu) == NOTIFY_BAD)
 		BUG();
 
 	check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
 		goto out;
 	}
 
-	ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
 				__FUNCTION__, cpu);
@@ -226,11 +218,12 @@ int __devinit cpu_up(unsigned int cpu)
 	BUG_ON(!cpu_online(cpu));
 
 	/* Now call notifier in preparation. */
-	notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+	blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
 
 out_notify:
 	if (ret != 0)
-		notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
+		blocking_notifier_call_chain(&cpu_chain,
+				CPU_UP_CANCELED, hcpu);
 out:
 	unlock_cpu_hotplug();
 	return ret;
diff --git a/kernel/module.c b/kernel/module.c
index ddfe45ac2fd1..4fafd58038a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -64,26 +64,17 @@ static DEFINE_SPINLOCK(modlist_lock);
 static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
-static DEFINE_MUTEX(notify_mutex);
-static struct notifier_block * module_notify_list;
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
 int register_module_notifier(struct notifier_block * nb)
 {
-	int err;
-	mutex_lock(&notify_mutex);
-	err = notifier_chain_register(&module_notify_list, nb);
-	mutex_unlock(&notify_mutex);
-	return err;
+	return blocking_notifier_chain_register(&module_notify_list, nb);
 }
 EXPORT_SYMBOL(register_module_notifier);
 
 int unregister_module_notifier(struct notifier_block * nb)
 {
-	int err;
-	mutex_lock(&notify_mutex);
-	err = notifier_chain_unregister(&module_notify_list, nb);
-	mutex_unlock(&notify_mutex);
-	return err;
+	return blocking_notifier_chain_unregister(&module_notify_list, nb);
 }
 EXPORT_SYMBOL(unregister_module_notifier);
 
@@ -1816,9 +1807,8 @@ sys_init_module(void __user *umod,
 	/* Drop lock so they can recurse */
 	mutex_unlock(&module_mutex);
 
-	mutex_lock(&notify_mutex);
-	notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
-	mutex_unlock(&notify_mutex);
+	blocking_notifier_call_chain(&module_notify_list,
+			MODULE_STATE_COMING, mod);
 
 	/* Start the module */
 	if (mod->init != NULL)
diff --git a/kernel/panic.c b/kernel/panic.c
index acd95adddb93..f895c7c01d5b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -29,7 +29,7 @@ static DEFINE_SPINLOCK(pause_on_oops_lock);
 int panic_timeout;
 EXPORT_SYMBOL(panic_timeout);
 
-struct notifier_block *panic_notifier_list;
+ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
 EXPORT_SYMBOL(panic_notifier_list);
 
@@ -97,7 +97,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	smp_send_stop();
 #endif
 
-	notifier_call_chain(&panic_notifier_list, 0, buf);
+	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	if (!panic_blink)
 		panic_blink = no_blink;
diff --git a/kernel/profile.c b/kernel/profile.c
index ad81f799a9b4..5a730fdb1a2c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -87,72 +87,52 @@ void __init profile_init(void)
  
 #ifdef CONFIG_PROFILING
  
-static DECLARE_RWSEM(profile_rwsem);
-static DEFINE_RWLOCK(handoff_lock);
-static struct notifier_block * task_exit_notifier;
-static struct notifier_block * task_free_notifier;
-static struct notifier_block * munmap_notifier;
+static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
  
 void profile_task_exit(struct task_struct * task)
 {
-	down_read(&profile_rwsem);
-	notifier_call_chain(&task_exit_notifier, 0, task);
-	up_read(&profile_rwsem);
+	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
 }
  
 int profile_handoff_task(struct task_struct * task)
 {
 	int ret;
-	read_lock(&handoff_lock);
-	ret = notifier_call_chain(&task_free_notifier, 0, task);
-	read_unlock(&handoff_lock);
+	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
 	return (ret == NOTIFY_OK) ? 1 : 0;
 }
 
 void profile_munmap(unsigned long addr)
 {
-	down_read(&profile_rwsem);
-	notifier_call_chain(&munmap_notifier, 0, (void *)addr);
-	up_read(&profile_rwsem);
+	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
 }
 
 int task_handoff_register(struct notifier_block * n)
 {
-	int err = -EINVAL;
-
-	write_lock(&handoff_lock);
-	err = notifier_chain_register(&task_free_notifier, n);
-	write_unlock(&handoff_lock);
-	return err;
+	return atomic_notifier_chain_register(&task_free_notifier, n);
 }
 
 int task_handoff_unregister(struct notifier_block * n)
 {
-	int err = -EINVAL;
-
-	write_lock(&handoff_lock);
-	err = notifier_chain_unregister(&task_free_notifier, n);
-	write_unlock(&handoff_lock);
-	return err;
+	return atomic_notifier_chain_unregister(&task_free_notifier, n);
 }
 
 int profile_event_register(enum profile_type type, struct notifier_block * n)
 {
 	int err = -EINVAL;
  
-	down_write(&profile_rwsem);
- 
 	switch (type) {
 		case PROFILE_TASK_EXIT:
-			err = notifier_chain_register(&task_exit_notifier, n);
+			err = blocking_notifier_chain_register(
+					&task_exit_notifier, n);
 			break;
 		case PROFILE_MUNMAP:
-			err = notifier_chain_register(&munmap_notifier, n);
+			err = blocking_notifier_chain_register(
+					&munmap_notifier, n);
 			break;
 	}
  
-	up_write(&profile_rwsem);
- 
 	return err;
 }
 
@@ -161,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
 {
 	int err = -EINVAL;
  
-	down_write(&profile_rwsem);
- 
 	switch (type) {
 		case PROFILE_TASK_EXIT:
-			err = notifier_chain_unregister(&task_exit_notifier, n);
+			err = blocking_notifier_chain_unregister(
+					&task_exit_notifier, n);
 			break;
 		case PROFILE_MUNMAP:
-			err = notifier_chain_unregister(&munmap_notifier, n);
+			err = blocking_notifier_chain_unregister(
+					&munmap_notifier, n);
 			break;
 	}
 
-	up_write(&profile_rwsem);
 	return err;
 }
 
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9b3d5847ed8..ced91e1ff564 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -152,5 +152,5 @@ __init void spawn_softlockup_task(void)
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index 38bc73ede2ba..c93d37f71aef 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
  *	and the like. 
  */
 
-static struct notifier_block *reboot_notifier_list;
-static DEFINE_RWLOCK(notifier_lock);
+static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+
+/*
+ *	Notifier chain core routines.  The exported routines below
+ *	are layered on top of these, with appropriate locking added.
+ */
+
+static int notifier_chain_register(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	while ((*nl) != NULL) {
+		if (n->priority > (*nl)->priority)
+			break;
+		nl = &((*nl)->next);
+	}
+	n->next = *nl;
+	rcu_assign_pointer(*nl, n);
+	return 0;
+}
+
+static int notifier_chain_unregister(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	while ((*nl) != NULL) {
+		if ((*nl) == n) {
+			rcu_assign_pointer(*nl, n->next);
+			return 0;
+		}
+		nl = &((*nl)->next);
+	}
+	return -ENOENT;
+}
+
+static int __kprobes notifier_call_chain(struct notifier_block **nl,
+		unsigned long val, void *v)
+{
+	int ret = NOTIFY_DONE;
+	struct notifier_block *nb;
+
+	nb = rcu_dereference(*nl);
+	while (nb) {
+		ret = nb->notifier_call(nb, val, v);
+		if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+			break;
+		nb = rcu_dereference(nb->next);
+	}
+	return ret;
+}
+
+/*
+ *	Atomic notifier chain routines.  Registration and unregistration
+ *	use a mutex, and call_chain is synchronized by RCU (no locks).
+ */
 
 /**
- *	notifier_chain_register	- Add notifier to a notifier chain
- *	@list: Pointer to root list pointer
+ *	atomic_notifier_chain_register - Add notifier to an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
  *	@n: New entry in notifier chain
  *
- *	Adds a notifier to a notifier chain.
+ *	Adds a notifier to an atomic notifier chain.
  *
  *	Currently always returns zero.
  */
+
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+		struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_register(&nh->head, n);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
+
+/**
+ *	atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from an atomic notifier chain.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+		struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_unregister(&nh->head, n);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	synchronize_rcu();
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+
+/**
+ *	atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in an atomic context, so they must not block.
+ *	This routine uses RCU to synchronize with changes to the chain.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
  
-int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+		unsigned long val, void *v)
 {
-	write_lock(&notifier_lock);
-	while(*list)
-	{
-		if(n->priority > (*list)->priority)
-			break;
-		list= &((*list)->next);
-	}
-	n->next = *list;
-	*list=n;
-	write_unlock(&notifier_lock);
-	return 0;
+	int ret;
+
+	rcu_read_lock();
+	ret = notifier_call_chain(&nh->head, val, v);
+	rcu_read_unlock();
+	return ret;
 }
 
-EXPORT_SYMBOL(notifier_chain_register);
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+
+/*
+ *	Blocking notifier chain routines.  All access to the chain is
+ *	synchronized by an rwsem.
+ */
 
 /**
- *	notifier_chain_unregister - Remove notifier from a notifier chain
- *	@nl: Pointer to root list pointer
+ *	blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
  *	@n: New entry in notifier chain
  *
- *	Removes a notifier from a notifier chain.
+ *	Adds a notifier to a blocking notifier chain.
+ *	Must be called in process context.
  *
- *	Returns zero on success, or %-ENOENT on failure.
+ *	Currently always returns zero.
  */
  
-int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
 {
-	write_lock(&notifier_lock);
-	while((*nl)!=NULL)
-	{
-		if((*nl)==n)
-		{
-			*nl=n->next;
-			write_unlock(&notifier_lock);
-			return 0;
-		}
-		nl=&((*nl)->next);
-	}
-	write_unlock(&notifier_lock);
-	return -ENOENT;
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call down_write().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_register(&nh->head, n);
+
+	down_write(&nh->rwsem);
+	ret = notifier_chain_register(&nh->head, n);
+	up_write(&nh->rwsem);
+	return ret;
 }
 
-EXPORT_SYMBOL(notifier_chain_unregister);
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
 
 /**
- *	notifier_call_chain - Call functions in a notifier chain
- *	@n: Pointer to root pointer of notifier chain
+ *	blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from a blocking notifier chain.
+ *	Must be called from process context.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
+{
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call down_write().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_unregister(&nh->head, n);
+
+	down_write(&nh->rwsem);
+	ret = notifier_chain_unregister(&nh->head, n);
+	up_write(&nh->rwsem);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+
+/**
+ *	blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
  *
- *	Calls each function in a notifier chain in turn.
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in a process context, so they are allowed to block.
  *
- *	If the return value of the notifier can be and'd
- *	with %NOTIFY_STOP_MASK, then notifier_call_chain
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
  *	will return immediately, with the return value of
  *	the notifier function which halted execution.
- *	Otherwise, the return value is the return value
+ *	Otherwise the return value is the return value
  *	of the last notifier function called.
  */
  
-int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+		unsigned long val, void *v)
 {
-	int ret=NOTIFY_DONE;
-	struct notifier_block *nb = *n;
+	int ret;
 
-	while(nb)
-	{
-		ret=nb->notifier_call(nb,val,v);
-		if(ret&NOTIFY_STOP_MASK)
-		{
-			return ret;
-		}
-		nb=nb->next;
-	}
+	down_read(&nh->rwsem);
+	ret = notifier_call_chain(&nh->head, val, v);
+	up_read(&nh->rwsem);
 	return ret;
 }
 
-EXPORT_SYMBOL(notifier_call_chain);
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
+
+/*
+ *	Raw notifier chain routines.  There is no protection;
+ *	the caller must provide it.  Use at your own risk!
+ */
+
+/**
+ *	raw_notifier_chain_register - Add notifier to a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to a raw notifier chain.
+ *	All locking must be provided by the caller.
+ *
+ *	Currently always returns zero.
+ */
+
+int raw_notifier_chain_register(struct raw_notifier_head *nh,
+		struct notifier_block *n)
+{
+	return notifier_chain_register(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
+
+/**
+ *	raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from a raw notifier chain.
+ *	All locking must be provided by the caller.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+		struct notifier_block *n)
+{
+	return notifier_chain_unregister(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+
+/**
+ *	raw_notifier_call_chain - Call functions in a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in an undefined context.
+ *	All locking must be provided by the caller.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then raw_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
+
+int raw_notifier_call_chain(struct raw_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	return notifier_call_chain(&nh->head, val, v);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
 
 /**
  *	register_reboot_notifier - Register function to be called at reboot time
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
  *	Registers a function with the list of functions
  *	to be called at reboot time.
  *
- *	Currently always returns zero, as notifier_chain_register
+ *	Currently always returns zero, as blocking_notifier_chain_register
  *	always returns zero.
  */
  
 int register_reboot_notifier(struct notifier_block * nb)
 {
-	return notifier_chain_register(&reboot_notifier_list, nb);
+	return blocking_notifier_chain_register(&reboot_notifier_list, nb);
 }
 
 EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,7 +424,7 @@ EXPORT_SYMBOL(register_reboot_notifier);
  
 int unregister_reboot_notifier(struct notifier_block * nb)
 {
-	return notifier_chain_unregister(&reboot_notifier_list, nb);
+	return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
 }
 
 EXPORT_SYMBOL(unregister_reboot_notifier);
@@ -380,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
 
 void kernel_restart_prepare(char *cmd)
 {
-	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
 	device_shutdown();
 }
@@ -430,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
 
 void kernel_shutdown_prepare(enum system_states state)
 {
-	notifier_call_chain(&reboot_notifier_list,
+	blocking_notifier_call_chain(&reboot_notifier_list,
 		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
 	system_state = state;
 	device_shutdown();
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 9106354c781e..a49a6975092d 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -73,23 +73,23 @@ DEFINE_RWLOCK(hci_cb_list_lock);
 struct hci_proto *hci_proto[HCI_MAX_PROTO];
 
 /* HCI notifiers list */
-static struct notifier_block *hci_notifier;
+static ATOMIC_NOTIFIER_HEAD(hci_notifier);
 
 /* ---- HCI notifications ---- */
 
 int hci_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&hci_notifier, nb);
+	return atomic_notifier_chain_register(&hci_notifier, nb);
 }
 
 int hci_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&hci_notifier, nb);
+	return atomic_notifier_chain_unregister(&hci_notifier, nb);
 }
 
 static void hci_notify(struct hci_dev *hdev, int event)
 {
-	notifier_call_chain(&hci_notifier, event, hdev);
+	atomic_notifier_call_chain(&hci_notifier, event, hdev);
 }
 
 /* ---- HCI requests ---- */
diff --git a/net/core/dev.c b/net/core/dev.c
index 8e1dc3051222..a3ab11f34153 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -193,7 +193,7 @@ static inline struct hlist_head *dev_index_hash(int ifindex)
  *	Our notifier list
  */
 
-static struct notifier_block *netdev_chain;
+static BLOCKING_NOTIFIER_HEAD(netdev_chain);
 
 /*
  *	Device drivers call our routines to queue packets here. We empty the
@@ -736,7 +736,8 @@ int dev_change_name(struct net_device *dev, char *newname)
 	if (!err) {
 		hlist_del(&dev->name_hlist);
 		hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGENAME, dev);
 	}
 
 	return err;
@@ -750,7 +751,7 @@ int dev_change_name(struct net_device *dev, char *newname)
  */
 void netdev_features_change(struct net_device *dev)
 {
-	notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
 }
 EXPORT_SYMBOL(netdev_features_change);
 
@@ -765,7 +766,8 @@ EXPORT_SYMBOL(netdev_features_change);
 void netdev_state_change(struct net_device *dev)
 {
 	if (dev->flags & IFF_UP) {
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGE, dev);
 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 	}
 }
@@ -862,7 +864,7 @@ int dev_open(struct net_device *dev)
 		/*
 		 *	... and announce new interface.
 		 */
-		notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
+		blocking_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 	}
 	return ret;
 }
@@ -885,7 +887,7 @@ int dev_close(struct net_device *dev)
 	 *	Tell people we are going down, so that they can
 	 *	prepare to death, when device is still operating.
 	 */
-	notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 
 	dev_deactivate(dev);
 
@@ -922,7 +924,7 @@ int dev_close(struct net_device *dev)
 	/*
 	 * Tell people we are down
 	 */
-	notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 
 	return 0;
 }
@@ -953,7 +955,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
 	int err;
 
 	rtnl_lock();
-	err = notifier_chain_register(&netdev_chain, nb);
+	err = blocking_notifier_chain_register(&netdev_chain, nb);
 	if (!err) {
 		for (dev = dev_base; dev; dev = dev->next) {
 			nb->notifier_call(nb, NETDEV_REGISTER, dev);
@@ -981,7 +983,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	int err;
 
 	rtnl_lock();
-	err = notifier_chain_unregister(&netdev_chain, nb);
+	err = blocking_notifier_chain_unregister(&netdev_chain, nb);
 	rtnl_unlock();
 	return err;
 }
@@ -992,12 +994,12 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
  *      @v:   pointer passed unmodified to notifier function
  *
  *	Call all network notifier blocks.  Parameters and return value
- *	are as for notifier_call_chain().
+ *	are as for blocking_notifier_call_chain().
  */
 
 int call_netdevice_notifiers(unsigned long val, void *v)
 {
-	return notifier_call_chain(&netdev_chain, val, v);
+	return blocking_notifier_call_chain(&netdev_chain, val, v);
 }
 
 /* When > 0 there are consumers of rx skb time stamps */
@@ -2242,7 +2244,8 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 	if (dev->flags & IFF_UP &&
 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
 					  IFF_VOLATILE)))
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGE, dev);
 
 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
@@ -2286,8 +2289,8 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	else
 		dev->mtu = new_mtu;
 	if (!err && dev->flags & IFF_UP)
-		notifier_call_chain(&netdev_chain,
-				    NETDEV_CHANGEMTU, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGEMTU, dev);
 	return err;
 }
 
@@ -2303,7 +2306,8 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 		return -ENODEV;
 	err = dev->set_mac_address(dev, sa);
 	if (!err)
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGEADDR, dev);
 	return err;
 }
 
@@ -2359,7 +2363,7 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
 				return -EINVAL;
 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
-			notifier_call_chain(&netdev_chain,
+			blocking_notifier_call_chain(&netdev_chain,
 					    NETDEV_CHANGEADDR, dev);
 			return 0;
 
@@ -2813,7 +2817,7 @@ int register_netdevice(struct net_device *dev)
 	write_unlock_bh(&dev_base_lock);
 
 	/* Notify protocols, that a new device appeared. */
-	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
 
 	/* Finish registration after unlock */
 	net_set_todo(dev);
@@ -2892,7 +2896,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 			rtnl_lock();
 
 			/* Rebroadcast unregister notification */
-			notifier_call_chain(&netdev_chain,
+			blocking_notifier_call_chain(&netdev_chain,
 					    NETDEV_UNREGISTER, dev);
 
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
@@ -3148,7 +3152,7 @@ int unregister_netdevice(struct net_device *dev)
 	/* Notify protocols, that we are about to destroy
 	   this device. They should clean all the things.
 	*/
-	notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
 	
 	/*
 	 *	Flush the multicast chain
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index cc7b9d9255ef..d2ae9893ca17 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -68,7 +68,7 @@ __le16 decnet_address = 0;
 
 static DEFINE_RWLOCK(dndev_lock);
 static struct net_device *decnet_default_device;
-static struct notifier_block *dnaddr_chain;
+static BLOCKING_NOTIFIER_HEAD(dnaddr_chain);
 
 static struct dn_dev *dn_dev_create(struct net_device *dev, int *err);
 static void dn_dev_delete(struct net_device *dev);
@@ -446,7 +446,7 @@ static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr **ifap, int de
 	}
 
 	rtmsg_ifa(RTM_DELADDR, ifa1);
-	notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1);
+	blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1);
 	if (destroy) {
 		dn_dev_free_ifa(ifa1);
 
@@ -481,7 +481,7 @@ static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa)
 	dn_db->ifa_list = ifa;
 
 	rtmsg_ifa(RTM_NEWADDR, ifa);
-	notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
+	blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
 }
@@ -1285,12 +1285,12 @@ void dn_dev_devices_on(void)
 
 int register_dnaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&dnaddr_chain, nb);
+	return blocking_notifier_chain_register(&dnaddr_chain, nb);
 }
 
 int unregister_dnaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&dnaddr_chain, nb);
+	return blocking_notifier_chain_unregister(&dnaddr_chain, nb);
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 44fdf1413e2c..81c2f7885292 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -81,7 +81,7 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
 
 static void rtmsg_ifa(int event, struct in_ifaddr *);
 
-static struct notifier_block *inetaddr_chain;
+static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
 static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			 int destroy);
 #ifdef CONFIG_SYSCTL
@@ -267,7 +267,8 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 				*ifap1 = ifa->ifa_next;
 
 				rtmsg_ifa(RTM_DELADDR, ifa);
-				notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
+				blocking_notifier_call_chain(&inetaddr_chain,
+						NETDEV_DOWN, ifa);
 				inet_free_ifa(ifa);
 			} else {
 				promote = ifa;
@@ -291,7 +292,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	   So that, this order is correct.
 	 */
 	rtmsg_ifa(RTM_DELADDR, ifa1);
-	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
 
@@ -303,7 +304,8 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
 		rtmsg_ifa(RTM_NEWADDR, promote);
-		notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote);
+		blocking_notifier_call_chain(&inetaddr_chain,
+				NETDEV_UP, promote);
 		for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
 			if (ifa1->ifa_mask != ifa->ifa_mask ||
 			    !inet_ifa_match(ifa1->ifa_address, ifa))
@@ -366,7 +368,7 @@ static int inet_insert_ifa(struct in_ifaddr *ifa)
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
 	rtmsg_ifa(RTM_NEWADDR, ifa);
-	notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
 }
@@ -938,12 +940,12 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
 
 int register_inetaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&inetaddr_chain, nb);
+	return blocking_notifier_chain_register(&inetaddr_chain, nb);
 }
 
 int unregister_inetaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&inetaddr_chain, nb);
+	return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
 }
 
 /* Rename ifa_labels for a device name change. Make some effort to preserve existing
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 9e34034729a6..ceaabc18202b 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -80,8 +80,8 @@ static int ip_conntrack_vmalloc;
 static unsigned int ip_conntrack_next_id;
 static unsigned int ip_conntrack_expect_next_id;
 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-struct notifier_block *ip_conntrack_chain;
-struct notifier_block *ip_conntrack_expect_chain;
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
 
 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
@@ -92,7 +92,7 @@ __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
 {
 	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
 	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
-		notifier_call_chain(&ip_conntrack_chain, ecache->events,
+		atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
 				    ecache->ct);
 	ecache->events = 0;
 	ip_conntrack_put(ecache->ct);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 01c62a0d3742..445006ee4522 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -143,7 +143,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 				struct prefix_info *pinfo);
 static int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev);
 
-static struct notifier_block *inet6addr_chain;
+static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
 
 struct ipv6_devconf ipv6_devconf = {
 	.forwarding		= 0,
@@ -593,7 +593,7 @@ out2:
 	read_unlock_bh(&addrconf_lock);
 
 	if (likely(err == 0))
-		notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
+		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
 	else {
 		kfree(ifa);
 		ifa = ERR_PTR(err);
@@ -688,7 +688,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
 	ipv6_ifa_notify(RTM_DELADDR, ifp);
 
-	notifier_call_chain(&inet6addr_chain,NETDEV_DOWN,ifp);
+	atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp);
 
 	addrconf_del_timer(ifp);
 
@@ -3767,12 +3767,12 @@ static void addrconf_sysctl_unregister(struct ipv6_devconf *p)
 
 int register_inet6addr_notifier(struct notifier_block *nb)
 {
-        return notifier_chain_register(&inet6addr_chain, nb);
+        return atomic_notifier_chain_register(&inet6addr_chain, nb);
 }
 
 int unregister_inet6addr_notifier(struct notifier_block *nb)
 {
-        return notifier_chain_unregister(&inet6addr_chain,nb);
+        return atomic_notifier_chain_unregister(&inet6addr_chain,nb);
 }
 
 /*
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0ae281d9bfc3..56389c83557c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -90,8 +90,8 @@ static int nf_conntrack_vmalloc;
 static unsigned int nf_conntrack_next_id;
 static unsigned int nf_conntrack_expect_next_id;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-struct notifier_block *nf_conntrack_chain;
-struct notifier_block *nf_conntrack_expect_chain;
+ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
+ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
 
 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
 
@@ -103,7 +103,7 @@ __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
 	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
 	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
 	    && ecache->events)
-		notifier_call_chain(&nf_conntrack_chain, ecache->events,
+		atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
 				    ecache->ct);
 
 	ecache->events = 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d00a9034cb5f..2a233ffcf618 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -123,7 +123,7 @@ static void netlink_destroy_callback(struct netlink_callback *cb);
 static DEFINE_RWLOCK(nl_table_lock);
 static atomic_t nl_table_users = ATOMIC_INIT(0);
 
-static struct notifier_block *netlink_chain;
+static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
 static u32 netlink_group_mask(u32 group)
 {
@@ -469,7 +469,8 @@ static int netlink_release(struct socket *sock)
 						.protocol = sk->sk_protocol,
 						.pid = nlk->pid,
 					  };
-		notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n);
+		atomic_notifier_call_chain(&netlink_chain,
+				NETLINK_URELEASE, &n);
 	}	
 
 	if (nlk->module)
@@ -1695,12 +1696,12 @@ static struct file_operations netlink_seq_fops = {
 
 int netlink_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&netlink_chain, nb);
+	return atomic_notifier_chain_register(&netlink_chain, nb);
 }
 
 int netlink_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&netlink_chain, nb);
+	return atomic_notifier_chain_unregister(&netlink_chain, nb);
 }
                 
 static const struct proto_ops netlink_ops = {
-- 
cgit v1.2.3


From c58411e95d7f5062dedd1a3064af4d359da1e633 Mon Sep 17 00:00:00 2001
From: Alessandro Zummo <a.zummo@towertech.it>
Date: Mon, 27 Mar 2006 01:16:34 -0800
Subject: [PATCH] RTC Subsystem: library functions

RTC and date/time related functions.

Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/Kconfig       |   2 +
 drivers/Makefile      |   1 +
 drivers/rtc/Kconfig   |   6 +++
 drivers/rtc/Makefile  |   5 +++
 drivers/rtc/rtc-lib.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtc.h   |   5 +++
 6 files changed, 120 insertions(+)
 create mode 100644 drivers/rtc/Kconfig
 create mode 100644 drivers/rtc/Makefile
 create mode 100644 drivers/rtc/rtc-lib.c

(limited to 'include/linux')

diff --git a/drivers/Kconfig b/drivers/Kconfig
index bddf431bbb72..9f5c0da57c90 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -70,4 +70,6 @@ source "drivers/sn/Kconfig"
 
 source "drivers/edac/Kconfig"
 
+source "drivers/rtc/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 5c69b86db624..424955274e60 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_USB_GADGET)	+= usb/gadget/
 obj-$(CONFIG_GAMEPORT)		+= input/gameport/
 obj-$(CONFIG_INPUT)		+= input/
 obj-$(CONFIG_I2O)		+= message/
+obj-$(CONFIG_RTC_LIB)		+= rtc/
 obj-$(CONFIG_I2C)		+= i2c/
 obj-$(CONFIG_W1)		+= w1/
 obj-$(CONFIG_HWMON)		+= hwmon/
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
new file mode 100644
index 000000000000..15df7c130fa6
--- /dev/null
+++ b/drivers/rtc/Kconfig
@@ -0,0 +1,6 @@
+#
+# RTC class/drivers configuration
+#
+
+config RTC_LIB
+	tristate
\ No newline at end of file
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
new file mode 100644
index 000000000000..eb9ad77c3e95
--- /dev/null
+++ b/drivers/rtc/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for RTC class/drivers.
+#
+
+obj-$(CONFIG_RTC_LIB)	+= rtc-lib.o
diff --git a/drivers/rtc/rtc-lib.c b/drivers/rtc/rtc-lib.c
new file mode 100644
index 000000000000..cfedc1d28ee1
--- /dev/null
+++ b/drivers/rtc/rtc-lib.c
@@ -0,0 +1,101 @@
+/*
+ * rtc and date/time utility functions
+ *
+ * Copyright (C) 2005-06 Tower Technologies
+ * Author: Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * based on arch/arm/common/rtctime.c and other bits
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#include <linux/module.h>
+#include <linux/rtc.h>
+
+static const unsigned char rtc_days_in_month[] = {
+	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+#define LEAPS_THRU_END_OF(y) ((y)/4 - (y)/100 + (y)/400)
+#define LEAP_YEAR(year) ((!(year % 4) && (year % 100)) || !(year % 400))
+
+int rtc_month_days(unsigned int month, unsigned int year)
+{
+	return rtc_days_in_month[month] + (LEAP_YEAR(year) && month == 1);
+}
+EXPORT_SYMBOL(rtc_month_days);
+
+/*
+ * Convert seconds since 01-01-1970 00:00:00 to Gregorian date.
+ */
+void rtc_time_to_tm(unsigned long time, struct rtc_time *tm)
+{
+	register int days, month, year;
+
+	days = time / 86400;
+	time -= days * 86400;
+
+	/* day of the week, 1970-01-01 was a Thursday */
+	tm->tm_wday = (days + 4) % 7;
+
+	year = 1970 + days / 365;
+	days -= (year - 1970) * 365
+		+ LEAPS_THRU_END_OF(year - 1)
+		- LEAPS_THRU_END_OF(1970 - 1);
+	if (days < 0) {
+		year -= 1;
+		days += 365 + LEAP_YEAR(year);
+	}
+	tm->tm_year = year - 1900;
+	tm->tm_yday = days + 1;
+
+	for (month = 0; month < 11; month++) {
+		int newdays;
+
+		newdays = days - rtc_month_days(month, year);
+		if (newdays < 0)
+			break;
+		days = newdays;
+	}
+	tm->tm_mon = month;
+	tm->tm_mday = days + 1;
+
+	tm->tm_hour = time / 3600;
+	time -= tm->tm_hour * 3600;
+	tm->tm_min = time / 60;
+	tm->tm_sec = time - tm->tm_min * 60;
+}
+EXPORT_SYMBOL(rtc_time_to_tm);
+
+/*
+ * Does the rtc_time represent a valid date/time?
+ */
+int rtc_valid_tm(struct rtc_time *tm)
+{
+	if (tm->tm_year < 70
+		|| tm->tm_mon >= 12
+		|| tm->tm_mday < 1
+		|| tm->tm_mday > rtc_month_days(tm->tm_mon, tm->tm_year + 1900)
+		|| tm->tm_hour >= 24
+		|| tm->tm_min >= 60
+		|| tm->tm_sec >= 60)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(rtc_valid_tm);
+
+/*
+ * Convert Gregorian date to seconds since 01-01-1970 00:00:00.
+ */
+int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time)
+{
+	*time = mktime(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
+			tm->tm_hour, tm->tm_min, tm->tm_sec);
+	return 0;
+}
+EXPORT_SYMBOL(rtc_tm_to_time);
+
+MODULE_LICENSE("GPL");
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index b739ac1f7ca0..8454337c7058 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -95,6 +95,11 @@ struct rtc_pll_info {
 
 #include <linux/interrupt.h>
 
+extern int rtc_month_days(unsigned int month, unsigned int year);
+extern int rtc_valid_tm(struct rtc_time *tm);
+extern int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time);
+extern void rtc_time_to_tm(unsigned long time, struct rtc_time *tm);
+
 typedef struct rtc_task {
 	void (*func)(void *private_data);
 	void *private_data;
-- 
cgit v1.2.3


From 0c86edc0d4970649f39748c4ce4f2895f728468f Mon Sep 17 00:00:00 2001
From: Alessandro Zummo <a.zummo@towertech.it>
Date: Mon, 27 Mar 2006 01:16:37 -0800
Subject: [PATCH] RTC subsystem: class

Add the basic RTC subsystem infrastructure to the kernel.

rtc/class.c - registration facilities for RTC drivers
rtc/interface.c - kernel/rtc interface functions
rtc/hctosys.c - snippet of code that copies hw clock to sw clock
		at bootup, if configured to do so.

Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 CREDITS                 |   5 +-
 MAINTAINERS             |   6 ++
 drivers/rtc/Kconfig     |  44 +++++++-
 drivers/rtc/Makefile    |   5 +-
 drivers/rtc/class.c     | 145 +++++++++++++++++++++++++
 drivers/rtc/hctosys.c   |  69 ++++++++++++
 drivers/rtc/interface.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtc.h     |  87 +++++++++++++++
 8 files changed, 633 insertions(+), 5 deletions(-)
 create mode 100644 drivers/rtc/class.c
 create mode 100644 drivers/rtc/hctosys.c
 create mode 100644 drivers/rtc/interface.c

(limited to 'include/linux')

diff --git a/CREDITS b/CREDITS
index c6d69bf10e15..35850d882c34 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3741,10 +3741,11 @@ D: Mylex DAC960 PCI RAID driver
 D: Miscellaneous kernel fixes
 
 N: Alessandro Zummo
-E: azummo@ita.flashnet.it
-W: http://freepage.logicom.it/azummo/
+E: a.zummo@towertech.it
 D: CMI8330 support is sb_card.c
 D: ISAPnP fixes in sb_card.c
+D: ZyXEL omni.net lcd plus driver
+D: RTC subsystem
 S: Italy
 
 N: Marc Zyngier
diff --git a/MAINTAINERS b/MAINTAINERS
index f27846734b06..e5b051f0e27e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2233,6 +2233,12 @@ M:	p_gortmaker@yahoo.com
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 
+REAL TIME CLOCK (RTC) SUBSYSTEM
+P:	Alessandro Zummo
+M:	a.zummo@towertech.it
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 REISERFS FILE SYSTEM
 P:	Hans Reiser
 M:	reiserfs-dev@namesys.com
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 15df7c130fa6..a256f67b78e4 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1,6 +1,46 @@
-#
+\#
 # RTC class/drivers configuration
 #
 
+menu "Real Time Clock"
+
 config RTC_LIB
-	tristate
\ No newline at end of file
+	tristate
+
+config RTC_CLASS
+	tristate "RTC class"
+	depends on EXPERIMENTAL
+	default n
+	select RTC_LIB
+	help
+	  Generic RTC class support. If you say yes here, you will
+ 	  be allowed to plug one or more RTCs to your system. You will
+	  probably want to enable one of more of the interfaces below.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-class.
+
+config RTC_HCTOSYS
+	bool "Set system time from RTC on startup"
+	depends on RTC_CLASS = y
+	default y
+	help
+	  If you say yes here, the system time will be set using
+	  the value read from the specified RTC device. This is useful
+	  in order to avoid unnecessary fschk runs.
+
+config RTC_HCTOSYS_DEVICE
+	string "The RTC to read the time from"
+	depends on RTC_HCTOSYS = y
+	default "rtc0"
+	help
+	  The RTC device that will be used as the source for
+	  the system time, usually rtc0.
+
+comment "RTC interfaces"
+	depends on RTC_CLASS
+
+comment "RTC drivers"
+	depends on RTC_CLASS
+
+endmenu
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index eb9ad77c3e95..7b87f3710dff 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -2,4 +2,7 @@
 # Makefile for RTC class/drivers.
 #
 
-obj-$(CONFIG_RTC_LIB)	+= rtc-lib.o
+obj-$(CONFIG_RTC_LIB)		+= rtc-lib.o
+obj-$(CONFIG_RTC_HCTOSYS)	+= hctosys.o
+obj-$(CONFIG_RTC_CLASS)		+= rtc-core.o
+rtc-core-y			:= class.o interface.o
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
new file mode 100644
index 000000000000..8533936d50d8
--- /dev/null
+++ b/drivers/rtc/class.c
@@ -0,0 +1,145 @@
+/*
+ * RTC subsystem, base class
+ *
+ * Copyright (C) 2005 Tower Technologies
+ * Author: Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * class skeleton from drivers/hwmon/hwmon.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#include <linux/module.h>
+#include <linux/rtc.h>
+#include <linux/kdev_t.h>
+#include <linux/idr.h>
+
+static DEFINE_IDR(rtc_idr);
+static DEFINE_MUTEX(idr_lock);
+struct class *rtc_class;
+
+static void rtc_device_release(struct class_device *class_dev)
+{
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+	mutex_lock(&idr_lock);
+	idr_remove(&rtc_idr, rtc->id);
+	mutex_unlock(&idr_lock);
+	kfree(rtc);
+}
+
+/**
+ * rtc_device_register - register w/ RTC class
+ * @dev: the device to register
+ *
+ * rtc_device_unregister() must be called when the class device is no
+ * longer needed.
+ *
+ * Returns the pointer to the new struct class device.
+ */
+struct rtc_device *rtc_device_register(const char *name, struct device *dev,
+					struct rtc_class_ops *ops,
+					struct module *owner)
+{
+	struct rtc_device *rtc;
+	int id, err;
+
+	if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) {
+		err = -ENOMEM;
+		goto exit;
+	}
+
+
+	mutex_lock(&idr_lock);
+	err = idr_get_new(&rtc_idr, NULL, &id);
+	mutex_unlock(&idr_lock);
+
+	if (err < 0)
+		goto exit;
+
+	id = id & MAX_ID_MASK;
+
+	rtc = kzalloc(sizeof(struct rtc_device), GFP_KERNEL);
+	if (rtc == NULL) {
+		err = -ENOMEM;
+		goto exit_idr;
+	}
+
+	rtc->id = id;
+	rtc->ops = ops;
+	rtc->owner = owner;
+	rtc->class_dev.dev = dev;
+	rtc->class_dev.class = rtc_class;
+	rtc->class_dev.release = rtc_device_release;
+
+	mutex_init(&rtc->ops_lock);
+	spin_lock_init(&rtc->irq_lock);
+	spin_lock_init(&rtc->irq_task_lock);
+
+	strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE);
+	snprintf(rtc->class_dev.class_id, BUS_ID_SIZE, "rtc%d", id);
+
+	err = class_device_register(&rtc->class_dev);
+	if (err)
+		goto exit_kfree;
+
+	dev_info(dev, "rtc core: registered %s as %s\n",
+			rtc->name, rtc->class_dev.class_id);
+
+	return rtc;
+
+exit_kfree:
+	kfree(rtc);
+
+exit_idr:
+	idr_remove(&rtc_idr, id);
+
+exit:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(rtc_device_register);
+
+
+/**
+ * rtc_device_unregister - removes the previously registered RTC class device
+ *
+ * @rtc: the RTC class device to destroy
+ */
+void rtc_device_unregister(struct rtc_device *rtc)
+{
+	mutex_lock(&rtc->ops_lock);
+	rtc->ops = NULL;
+	mutex_unlock(&rtc->ops_lock);
+	class_device_unregister(&rtc->class_dev);
+}
+EXPORT_SYMBOL_GPL(rtc_device_unregister);
+
+int rtc_interface_register(struct class_interface *intf)
+{
+	intf->class = rtc_class;
+	return class_interface_register(intf);
+}
+EXPORT_SYMBOL_GPL(rtc_interface_register);
+
+static int __init rtc_init(void)
+{
+	rtc_class = class_create(THIS_MODULE, "rtc");
+	if (IS_ERR(rtc_class)) {
+		printk(KERN_ERR "%s: couldn't create class\n", __FILE__);
+		return PTR_ERR(rtc_class);
+	}
+	return 0;
+}
+
+static void __exit rtc_exit(void)
+{
+	class_destroy(rtc_class);
+}
+
+module_init(rtc_init);
+module_exit(rtc_exit);
+
+MODULE_AUTHOR("Alessandro Zummo <a.zummo@towerteh.it>");
+MODULE_DESCRIPTION("RTC class support");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rtc/hctosys.c b/drivers/rtc/hctosys.c
new file mode 100644
index 000000000000..d02fe9a0001f
--- /dev/null
+++ b/drivers/rtc/hctosys.c
@@ -0,0 +1,69 @@
+/*
+ * RTC subsystem, initialize system time on startup
+ *
+ * Copyright (C) 2005 Tower Technologies
+ * Author: Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#include <linux/rtc.h>
+
+/* IMPORTANT: the RTC only stores whole seconds. It is arbitrary
+ * whether it stores the most close value or the value with partial
+ * seconds truncated. However, it is important that we use it to store
+ * the truncated value. This is because otherwise it is necessary,
+ * in an rtc sync function, to read both xtime.tv_sec and
+ * xtime.tv_nsec. On some processors (i.e. ARM), an atomic read
+ * of >32bits is not possible. So storing the most close value would
+ * slow down the sync API. So here we have the truncated value and
+ * the best guess is to add 0.5s.
+ */
+
+static int __init rtc_hctosys(void)
+{
+	int err;
+	struct rtc_time tm;
+	struct class_device *class_dev = rtc_class_open(CONFIG_RTC_HCTOSYS_DEVICE);
+
+	if (class_dev == NULL) {
+		printk("%s: unable to open rtc device (%s)\n",
+			__FILE__, CONFIG_RTC_HCTOSYS_DEVICE);
+		return -ENODEV;
+	}
+
+	err = rtc_read_time(class_dev, &tm);
+	if (err == 0) {
+		err = rtc_valid_tm(&tm);
+		if (err == 0) {
+			struct timespec tv;
+
+			tv.tv_nsec = NSEC_PER_SEC >> 1;
+
+			rtc_tm_to_time(&tm, &tv.tv_sec);
+
+			do_settimeofday(&tv);
+
+			dev_info(class_dev->dev,
+				"setting the system clock to "
+				"%d-%02d-%02d %02d:%02d:%02d (%u)\n",
+				tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+				tm.tm_hour, tm.tm_min, tm.tm_sec,
+				(unsigned int) tv.tv_sec);
+		}
+		else
+			dev_err(class_dev->dev,
+				"hctosys: invalid date/time\n");
+	}
+	else
+		dev_err(class_dev->dev,
+			"hctosys: unable to read the hardware clock\n");
+
+	rtc_class_close(class_dev);
+
+	return 0;
+}
+
+late_initcall(rtc_hctosys);
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
new file mode 100644
index 000000000000..56e490709b87
--- /dev/null
+++ b/drivers/rtc/interface.c
@@ -0,0 +1,277 @@
+/*
+ * RTC subsystem, interface functions
+ *
+ * Copyright (C) 2005 Tower Technologies
+ * Author: Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * based on arch/arm/common/rtctime.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#include <linux/rtc.h>
+
+int rtc_read_time(struct class_device *class_dev, struct rtc_time *tm)
+{
+	int err;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return -EBUSY;
+
+	if (!rtc->ops)
+		err = -ENODEV;
+	else if (!rtc->ops->read_time)
+		err = -EINVAL;
+	else {
+		memset(tm, 0, sizeof(struct rtc_time));
+		err = rtc->ops->read_time(class_dev->dev, tm);
+	}
+
+	mutex_unlock(&rtc->ops_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_read_time);
+
+int rtc_set_time(struct class_device *class_dev, struct rtc_time *tm)
+{
+	int err;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	err = rtc_valid_tm(tm);
+	if (err != 0)
+		return err;
+
+	err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return -EBUSY;
+
+	if (!rtc->ops)
+		err = -ENODEV;
+	else if (!rtc->ops->set_time)
+		err = -EINVAL;
+	else
+		err = rtc->ops->set_time(class_dev->dev, tm);
+
+	mutex_unlock(&rtc->ops_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_set_time);
+
+int rtc_set_mmss(struct class_device *class_dev, unsigned long secs)
+{
+	int err;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return -EBUSY;
+
+	if (!rtc->ops)
+		err = -ENODEV;
+	else if (rtc->ops->set_mmss)
+		err = rtc->ops->set_mmss(class_dev->dev, secs);
+	else if (rtc->ops->read_time && rtc->ops->set_time) {
+		struct rtc_time new, old;
+
+		err = rtc->ops->read_time(class_dev->dev, &old);
+		if (err == 0) {
+			rtc_time_to_tm(secs, &new);
+
+			/*
+			 * avoid writing when we're going to change the day of
+			 * the month. We will retry in the next minute. This
+			 * basically means that if the RTC must not drift
+			 * by more than 1 minute in 11 minutes.
+			 */
+			if (!((old.tm_hour == 23 && old.tm_min == 59) ||
+				(new.tm_hour == 23 && new.tm_min == 59)))
+				err = rtc->ops->set_time(class_dev->dev, &new);
+		}
+	}
+	else
+		err = -EINVAL;
+
+	mutex_unlock(&rtc->ops_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_set_mmss);
+
+int rtc_read_alarm(struct class_device *class_dev, struct rtc_wkalrm *alarm)
+{
+	int err;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return -EBUSY;
+
+	if (rtc->ops == NULL)
+		err = -ENODEV;
+	else if (!rtc->ops->read_alarm)
+		err = -EINVAL;
+	else {
+		memset(alarm, 0, sizeof(struct rtc_wkalrm));
+		err = rtc->ops->read_alarm(class_dev->dev, alarm);
+	}
+
+	mutex_unlock(&rtc->ops_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_read_alarm);
+
+int rtc_set_alarm(struct class_device *class_dev, struct rtc_wkalrm *alarm)
+{
+	int err;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return -EBUSY;
+
+	if (!rtc->ops)
+		err = -ENODEV;
+	else if (!rtc->ops->set_alarm)
+		err = -EINVAL;
+	else
+		err = rtc->ops->set_alarm(class_dev->dev, alarm);
+
+	mutex_unlock(&rtc->ops_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_set_alarm);
+
+void rtc_update_irq(struct class_device *class_dev,
+		unsigned long num, unsigned long events)
+{
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	spin_lock(&rtc->irq_lock);
+	rtc->irq_data = (rtc->irq_data + (num << 8)) | events;
+	spin_unlock(&rtc->irq_lock);
+
+	spin_lock(&rtc->irq_task_lock);
+	if (rtc->irq_task)
+		rtc->irq_task->func(rtc->irq_task->private_data);
+	spin_unlock(&rtc->irq_task_lock);
+
+	wake_up_interruptible(&rtc->irq_queue);
+	kill_fasync(&rtc->async_queue, SIGIO, POLL_IN);
+}
+EXPORT_SYMBOL_GPL(rtc_update_irq);
+
+struct class_device *rtc_class_open(char *name)
+{
+	struct class_device *class_dev = NULL,
+				*class_dev_tmp;
+
+	down(&rtc_class->sem);
+	list_for_each_entry(class_dev_tmp, &rtc_class->children, node) {
+		if (strncmp(class_dev_tmp->class_id, name, BUS_ID_SIZE) == 0) {
+			class_dev = class_dev_tmp;
+			break;
+		}
+	}
+
+	if (class_dev) {
+		if (!try_module_get(to_rtc_device(class_dev)->owner))
+			class_dev = NULL;
+	}
+	up(&rtc_class->sem);
+
+	return class_dev;
+}
+EXPORT_SYMBOL_GPL(rtc_class_open);
+
+void rtc_class_close(struct class_device *class_dev)
+{
+	module_put(to_rtc_device(class_dev)->owner);
+}
+EXPORT_SYMBOL_GPL(rtc_class_close);
+
+int rtc_irq_register(struct class_device *class_dev, struct rtc_task *task)
+{
+	int retval = -EBUSY;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	if (task == NULL || task->func == NULL)
+		return -EINVAL;
+
+	spin_lock(&rtc->irq_task_lock);
+	if (rtc->irq_task == NULL) {
+		rtc->irq_task = task;
+		retval = 0;
+	}
+	spin_unlock(&rtc->irq_task_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(rtc_irq_register);
+
+void rtc_irq_unregister(struct class_device *class_dev, struct rtc_task *task)
+{
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	spin_lock(&rtc->irq_task_lock);
+	if (rtc->irq_task == task)
+		rtc->irq_task = NULL;
+	spin_unlock(&rtc->irq_task_lock);
+}
+EXPORT_SYMBOL_GPL(rtc_irq_unregister);
+
+int rtc_irq_set_state(struct class_device *class_dev, struct rtc_task *task, int enabled)
+{
+	int err = 0;
+	unsigned long flags;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	spin_lock_irqsave(&rtc->irq_task_lock, flags);
+	if (rtc->irq_task != task)
+		err = -ENXIO;
+	spin_unlock_irqrestore(&rtc->irq_task_lock, flags);
+
+	if (err == 0)
+		err = rtc->ops->irq_set_state(class_dev->dev, enabled);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_irq_set_state);
+
+int rtc_irq_set_freq(struct class_device *class_dev, struct rtc_task *task, int freq)
+{
+	int err = 0, tmp = 0;
+	unsigned long flags;
+	struct rtc_device *rtc = to_rtc_device(class_dev);
+
+	/* allowed range is 2-8192 */
+	if (freq < 2 || freq > 8192)
+		return -EINVAL;
+/*
+	FIXME: this does not belong here, will move where appropriate
+	at a later stage. It cannot hurt right now, trust me :)
+	if ((freq > rtc_max_user_freq) && (!capable(CAP_SYS_RESOURCE)))
+		return -EACCES;
+*/
+	/* check if freq is a power of 2 */
+	while (freq > (1 << tmp))
+		tmp++;
+
+	if (freq != (1 << tmp))
+		return -EINVAL;
+
+	spin_lock_irqsave(&rtc->irq_task_lock, flags);
+	if (rtc->irq_task != task)
+		err = -ENXIO;
+	spin_unlock_irqrestore(&rtc->irq_task_lock, flags);
+
+	if (err == 0) {
+		err = rtc->ops->irq_set_freq(class_dev->dev, freq);
+		if (err == 0)
+			rtc->irq_freq = freq;
+	}
+	return err;
+}
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 8454337c7058..ab61cd1199f2 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -91,6 +91,12 @@ struct rtc_pll_info {
 #define RTC_PLL_GET	_IOR('p', 0x11, struct rtc_pll_info)  /* Get PLL correction */
 #define RTC_PLL_SET	_IOW('p', 0x12, struct rtc_pll_info)  /* Set PLL correction */
 
+/* interrupt flags */
+#define RTC_IRQF 0x80 /* any of the following is active */
+#define RTC_PF 0x40
+#define RTC_AF 0x20
+#define RTC_UF 0x10
+
 #ifdef __KERNEL__
 
 #include <linux/interrupt.h>
@@ -100,6 +106,87 @@ extern int rtc_valid_tm(struct rtc_time *tm);
 extern int rtc_tm_to_time(struct rtc_time *tm, unsigned long *time);
 extern void rtc_time_to_tm(unsigned long time, struct rtc_time *tm);
 
+#include <linux/device.h>
+#include <linux/seq_file.h>
+#include <linux/cdev.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+
+extern struct class *rtc_class;
+
+struct rtc_class_ops {
+	int (*open)(struct device *);
+	void (*release)(struct device *);
+	int (*ioctl)(struct device *, unsigned int, unsigned long);
+	int (*read_time)(struct device *, struct rtc_time *);
+	int (*set_time)(struct device *, struct rtc_time *);
+	int (*read_alarm)(struct device *, struct rtc_wkalrm *);
+	int (*set_alarm)(struct device *, struct rtc_wkalrm *);
+	int (*proc)(struct device *, struct seq_file *);
+	int (*set_mmss)(struct device *, unsigned long secs);
+	int (*irq_set_state)(struct device *, int enabled);
+	int (*irq_set_freq)(struct device *, int freq);
+	int (*read_callback)(struct device *, int data);
+};
+
+#define RTC_DEVICE_NAME_SIZE 20
+struct rtc_task;
+
+struct rtc_device
+{
+	struct class_device class_dev;
+	struct module *owner;
+
+	int id;
+	char name[RTC_DEVICE_NAME_SIZE];
+
+	struct rtc_class_ops *ops;
+	struct mutex ops_lock;
+
+	struct class_device *rtc_dev;
+	struct cdev char_dev;
+	struct mutex char_lock;
+
+	unsigned long irq_data;
+	spinlock_t irq_lock;
+	wait_queue_head_t irq_queue;
+	struct fasync_struct *async_queue;
+
+	struct rtc_task *irq_task;
+	spinlock_t irq_task_lock;
+	int irq_freq;
+};
+#define to_rtc_device(d) container_of(d, struct rtc_device, class_dev)
+
+extern struct rtc_device *rtc_device_register(const char *name,
+					struct device *dev,
+					struct rtc_class_ops *ops,
+					struct module *owner);
+extern void rtc_device_unregister(struct rtc_device *rdev);
+extern int rtc_interface_register(struct class_interface *intf);
+
+extern int rtc_read_time(struct class_device *class_dev, struct rtc_time *tm);
+extern int rtc_set_time(struct class_device *class_dev, struct rtc_time *tm);
+extern int rtc_set_mmss(struct class_device *class_dev, unsigned long secs);
+extern int rtc_read_alarm(struct class_device *class_dev,
+			struct rtc_wkalrm *alrm);
+extern int rtc_set_alarm(struct class_device *class_dev,
+				struct rtc_wkalrm *alrm);
+extern void rtc_update_irq(struct class_device *class_dev,
+			unsigned long num, unsigned long events);
+
+extern struct class_device *rtc_class_open(char *name);
+extern void rtc_class_close(struct class_device *class_dev);
+
+extern int rtc_irq_register(struct class_device *class_dev,
+				struct rtc_task *task);
+extern void rtc_irq_unregister(struct class_device *class_dev,
+				struct rtc_task *task);
+extern int rtc_irq_set_state(struct class_device *class_dev,
+				struct rtc_task *task, int enabled);
+extern int rtc_irq_set_freq(struct class_device *class_dev,
+				struct rtc_task *task, int freq);
+
 typedef struct rtc_task {
 	void (*func)(void *private_data);
 	void *private_data;
-- 
cgit v1.2.3


From 6fc7f10cee28c7fa190920fefda8c696d5bf3074 Mon Sep 17 00:00:00 2001
From: Alessandro Zummo <a.zummo@towertech.it>
Date: Mon, 27 Mar 2006 01:16:37 -0800
Subject: [PATCH] RTC subsystem: I2C cleanup

This patch, completely optional, removes from drivers/i2c/chips all the
drivers that are implemented in the new RTC subsystem.

It should be noted that none of the current driver is actually integrated,
i.e.  usable without further patches.

Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/i2c/chips/Kconfig   |  18 --
 drivers/i2c/chips/Makefile  |   2 -
 drivers/i2c/chips/rtc8564.c | 385 ------------------------
 drivers/i2c/chips/rtc8564.h |  78 -----
 drivers/i2c/chips/x1205.c   | 698 --------------------------------------------
 include/linux/x1205.h       |  31 --
 6 files changed, 1212 deletions(-)
 delete mode 100644 drivers/i2c/chips/rtc8564.c
 delete mode 100644 drivers/i2c/chips/rtc8564.h
 delete mode 100644 drivers/i2c/chips/x1205.c
 delete mode 100644 include/linux/x1205.h

(limited to 'include/linux')

diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig
index f9fae28f5612..7aa5c38f0855 100644
--- a/drivers/i2c/chips/Kconfig
+++ b/drivers/i2c/chips/Kconfig
@@ -65,15 +65,6 @@ config SENSORS_PCF8591
 	  This driver can also be built as a module.  If so, the module
 	  will be called pcf8591.
 
-config SENSORS_RTC8564
-	tristate "Epson 8564 RTC chip"
-	depends on I2C && EXPERIMENTAL
-	help
-	  If you say yes here you get support for the Epson 8564 RTC chip.
-
-	  This driver can also be built as a module.  If so, the module
-	  will be called i2c-rtc8564.
-
 config ISP1301_OMAP
 	tristate "Philips ISP1301 with OMAP OTG"
 	depends on I2C && ARCH_OMAP_OTG
@@ -126,13 +117,4 @@ config SENSORS_MAX6875
 	  This driver can also be built as a module.  If so, the module
 	  will be called max6875.
 
-config RTC_X1205_I2C
-	tristate "Xicor X1205 RTC chip"
-	depends on I2C && EXPERIMENTAL
-	help
-	  If you say yes here you get support for the Xicor X1205 RTC chip.
-
-	  This driver can also be built as a module. If so, the module
-	  will be called x1205.
-
 endmenu
diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile
index 46178b57b1f1..779868ef2e26 100644
--- a/drivers/i2c/chips/Makefile
+++ b/drivers/i2c/chips/Makefile
@@ -10,10 +10,8 @@ obj-$(CONFIG_SENSORS_M41T00)	+= m41t00.o
 obj-$(CONFIG_SENSORS_PCA9539)	+= pca9539.o
 obj-$(CONFIG_SENSORS_PCF8574)	+= pcf8574.o
 obj-$(CONFIG_SENSORS_PCF8591)	+= pcf8591.o
-obj-$(CONFIG_SENSORS_RTC8564)	+= rtc8564.o
 obj-$(CONFIG_ISP1301_OMAP)	+= isp1301_omap.o
 obj-$(CONFIG_TPS65010)		+= tps65010.o
-obj-$(CONFIG_RTC_X1205_I2C)	+= x1205.o
 
 ifeq ($(CONFIG_I2C_DEBUG_CHIP),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/i2c/chips/rtc8564.c b/drivers/i2c/chips/rtc8564.c
deleted file mode 100644
index 0d8699b3f488..000000000000
--- a/drivers/i2c/chips/rtc8564.c
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- *  linux/drivers/i2c/chips/rtc8564.c
- *
- *  Copyright (C) 2002-2004 Stefan Eletzhofer
- *
- *	based on linux/drivers/acron/char/pcf8583.c
- *  Copyright (C) 2000 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Driver for system3's EPSON RTC 8564 chip
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/bcd.h>
-#include <linux/i2c.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/rtc.h>		/* get the user-level API */
-#include <linux/init.h>
-
-#include "rtc8564.h"
-
-#ifdef DEBUG
-# define _DBG(x, fmt, args...) do{ if (debug>=x) printk(KERN_DEBUG"%s: " fmt "\n", __FUNCTION__, ##args); } while(0);
-#else
-# define _DBG(x, fmt, args...) do { } while(0);
-#endif
-
-#define _DBGRTCTM(x, rtctm) if (debug>=x) printk("%s: secs=%d, mins=%d, hours=%d, mday=%d, " \
-			"mon=%d, year=%d, wday=%d VL=%d\n", __FUNCTION__, \
-			(rtctm).secs, (rtctm).mins, (rtctm).hours, (rtctm).mday, \
-			(rtctm).mon, (rtctm).year, (rtctm).wday, (rtctm).vl);
-
-struct rtc8564_data {
-	struct i2c_client client;
-	u16 ctrl;
-};
-
-static inline u8 _rtc8564_ctrl1(struct i2c_client *client)
-{
-	struct rtc8564_data *data = i2c_get_clientdata(client);
-	return data->ctrl & 0xff;
-}
-static inline u8 _rtc8564_ctrl2(struct i2c_client *client)
-{
-	struct rtc8564_data *data = i2c_get_clientdata(client);
-	return (data->ctrl & 0xff00) >> 8;
-}
-
-#define CTRL1(c) _rtc8564_ctrl1(c)
-#define CTRL2(c) _rtc8564_ctrl2(c)
-
-static int debug;
-module_param(debug, int, S_IRUGO | S_IWUSR);
-
-static struct i2c_driver rtc8564_driver;
-
-static unsigned short ignore[] = { I2C_CLIENT_END };
-static unsigned short normal_addr[] = { 0x51, I2C_CLIENT_END };
-
-static struct i2c_client_address_data addr_data = {
-	.normal_i2c		= normal_addr,
-	.probe			= ignore,
-	.ignore			= ignore,
-};
-
-static int rtc8564_read_mem(struct i2c_client *client, struct mem *mem);
-static int rtc8564_write_mem(struct i2c_client *client, struct mem *mem);
-
-static int rtc8564_read(struct i2c_client *client, unsigned char adr,
-			unsigned char *buf, unsigned char len)
-{
-	int ret = -EIO;
-	unsigned char addr[1] = { adr };
-	struct i2c_msg msgs[2] = {
-		{client->addr, 0, 1, addr},
-		{client->addr, I2C_M_RD, len, buf}
-	};
-
-	_DBG(1, "client=%p, adr=%d, buf=%p, len=%d", client, adr, buf, len);
-
-	if (!buf) {
-		ret = -EINVAL;
-		goto done;
-	}
-
-	ret = i2c_transfer(client->adapter, msgs, 2);
-	if (ret == 2) {
-		ret = 0;
-	}
-
-done:
-	return ret;
-}
-
-static int rtc8564_write(struct i2c_client *client, unsigned char adr,
-			 unsigned char *data, unsigned char len)
-{
-	int ret = 0;
-	unsigned char _data[16];
-	struct i2c_msg wr;
-	int i;
-
-	if (!data || len > 15) {
-		ret = -EINVAL;
-		goto done;
-	}
-
-	_DBG(1, "client=%p, adr=%d, buf=%p, len=%d", client, adr, data, len);
-
-	_data[0] = adr;
-	for (i = 0; i < len; i++) {
-		_data[i + 1] = data[i];
-		_DBG(5, "data[%d] = 0x%02x (%d)", i, data[i], data[i]);
-	}
-
-	wr.addr = client->addr;
-	wr.flags = 0;
-	wr.len = len + 1;
-	wr.buf = _data;
-
-	ret = i2c_transfer(client->adapter, &wr, 1);
-	if (ret == 1) {
-		ret = 0;
-	}
-
-done:
-	return ret;
-}
-
-static int rtc8564_attach(struct i2c_adapter *adap, int addr, int kind)
-{
-	int ret;
-	struct i2c_client *new_client;
-	struct rtc8564_data *d;
-	unsigned char data[10];
-	unsigned char ad[1] = { 0 };
-	struct i2c_msg ctrl_wr[1] = {
-		{addr, 0, 2, data}
-	};
-	struct i2c_msg ctrl_rd[2] = {
-		{addr, 0, 1, ad},
-		{addr, I2C_M_RD, 2, data}
-	};
-
-	d = kzalloc(sizeof(struct rtc8564_data), GFP_KERNEL);
-	if (!d) {
-		ret = -ENOMEM;
-		goto done;
-	}
-	new_client = &d->client;
-
-	strlcpy(new_client->name, "RTC8564", I2C_NAME_SIZE);
-	i2c_set_clientdata(new_client, d);
-	new_client->addr = addr;
-	new_client->adapter = adap;
-	new_client->driver = &rtc8564_driver;
-
-	_DBG(1, "client=%p", new_client);
-
-	/* init ctrl1 reg */
-	data[0] = 0;
-	data[1] = 0;
-	ret = i2c_transfer(new_client->adapter, ctrl_wr, 1);
-	if (ret != 1) {
-		printk(KERN_INFO "rtc8564: cant init ctrl1\n");
-		ret = -ENODEV;
-		goto done;
-	}
-
-	/* read back ctrl1 and ctrl2 */
-	ret = i2c_transfer(new_client->adapter, ctrl_rd, 2);
-	if (ret != 2) {
-		printk(KERN_INFO "rtc8564: cant read ctrl\n");
-		ret = -ENODEV;
-		goto done;
-	}
-
-	d->ctrl = data[0] | (data[1] << 8);
-
-	_DBG(1, "RTC8564_REG_CTRL1=%02x, RTC8564_REG_CTRL2=%02x",
-	     data[0], data[1]);
-
-	ret = i2c_attach_client(new_client);
-done:
-	if (ret) {
-		kfree(d);
-	}
-	return ret;
-}
-
-static int rtc8564_probe(struct i2c_adapter *adap)
-{
-	return i2c_probe(adap, &addr_data, rtc8564_attach);
-}
-
-static int rtc8564_detach(struct i2c_client *client)
-{
-	i2c_detach_client(client);
-	kfree(i2c_get_clientdata(client));
-	return 0;
-}
-
-static int rtc8564_get_datetime(struct i2c_client *client, struct rtc_tm *dt)
-{
-	int ret = -EIO;
-	unsigned char buf[15];
-
-	_DBG(1, "client=%p, dt=%p", client, dt);
-
-	if (!dt)
-		return -EINVAL;
-
-	memset(buf, 0, sizeof(buf));
-
-	ret = rtc8564_read(client, 0, buf, 15);
-	if (ret)
-		return ret;
-
-	/* century stored in minute alarm reg */
-	dt->year = BCD2BIN(buf[RTC8564_REG_YEAR]);
-	dt->year += 100 * BCD2BIN(buf[RTC8564_REG_AL_MIN] & 0x3f);
-	dt->mday = BCD2BIN(buf[RTC8564_REG_DAY] & 0x3f);
-	dt->wday = BCD2BIN(buf[RTC8564_REG_WDAY] & 7);
-	dt->mon = BCD2BIN(buf[RTC8564_REG_MON_CENT] & 0x1f);
-
-	dt->secs = BCD2BIN(buf[RTC8564_REG_SEC] & 0x7f);
-	dt->vl = (buf[RTC8564_REG_SEC] & 0x80) == 0x80;
-	dt->mins = BCD2BIN(buf[RTC8564_REG_MIN] & 0x7f);
-	dt->hours = BCD2BIN(buf[RTC8564_REG_HR] & 0x3f);
-
-	_DBGRTCTM(2, *dt);
-
-	return 0;
-}
-
-static int
-rtc8564_set_datetime(struct i2c_client *client, struct rtc_tm *dt, int datetoo)
-{
-	int ret, len = 5;
-	unsigned char buf[15];
-
-	_DBG(1, "client=%p, dt=%p", client, dt);
-
-	if (!dt)
-		return -EINVAL;
-
-	_DBGRTCTM(2, *dt);
-
-	buf[RTC8564_REG_CTRL1] = CTRL1(client) | RTC8564_CTRL1_STOP;
-	buf[RTC8564_REG_CTRL2] = CTRL2(client);
-	buf[RTC8564_REG_SEC] = BIN2BCD(dt->secs);
-	buf[RTC8564_REG_MIN] = BIN2BCD(dt->mins);
-	buf[RTC8564_REG_HR] = BIN2BCD(dt->hours);
-
-	if (datetoo) {
-		len += 5;
-		buf[RTC8564_REG_DAY] = BIN2BCD(dt->mday);
-		buf[RTC8564_REG_WDAY] = BIN2BCD(dt->wday);
-		buf[RTC8564_REG_MON_CENT] = BIN2BCD(dt->mon) & 0x1f;
-		/* century stored in minute alarm reg */
-		buf[RTC8564_REG_YEAR] = BIN2BCD(dt->year % 100);
-		buf[RTC8564_REG_AL_MIN] = BIN2BCD(dt->year / 100);
-	}
-
-	ret = rtc8564_write(client, 0, buf, len);
-	if (ret) {
-		_DBG(1, "error writing data! %d", ret);
-	}
-
-	buf[RTC8564_REG_CTRL1] = CTRL1(client);
-	ret = rtc8564_write(client, 0, buf, 1);
-	if (ret) {
-		_DBG(1, "error writing data! %d", ret);
-	}
-
-	return ret;
-}
-
-static int rtc8564_get_ctrl(struct i2c_client *client, unsigned int *ctrl)
-{
-	struct rtc8564_data *data = i2c_get_clientdata(client);
-
-	if (!ctrl)
-		return -1;
-
-	*ctrl = data->ctrl;
-	return 0;
-}
-
-static int rtc8564_set_ctrl(struct i2c_client *client, unsigned int *ctrl)
-{
-	struct rtc8564_data *data = i2c_get_clientdata(client);
-	unsigned char buf[2];
-
-	if (!ctrl)
-		return -1;
-
-	buf[0] = *ctrl & 0xff;
-	buf[1] = (*ctrl & 0xff00) >> 8;
-	data->ctrl = *ctrl;
-
-	return rtc8564_write(client, 0, buf, 2);
-}
-
-static int rtc8564_read_mem(struct i2c_client *client, struct mem *mem)
-{
-
-	if (!mem)
-		return -EINVAL;
-
-	return rtc8564_read(client, mem->loc, mem->data, mem->nr);
-}
-
-static int rtc8564_write_mem(struct i2c_client *client, struct mem *mem)
-{
-
-	if (!mem)
-		return -EINVAL;
-
-	return rtc8564_write(client, mem->loc, mem->data, mem->nr);
-}
-
-static int
-rtc8564_command(struct i2c_client *client, unsigned int cmd, void *arg)
-{
-
-	_DBG(1, "cmd=%d", cmd);
-
-	switch (cmd) {
-	case RTC_GETDATETIME:
-		return rtc8564_get_datetime(client, arg);
-
-	case RTC_SETTIME:
-		return rtc8564_set_datetime(client, arg, 0);
-
-	case RTC_SETDATETIME:
-		return rtc8564_set_datetime(client, arg, 1);
-
-	case RTC_GETCTRL:
-		return rtc8564_get_ctrl(client, arg);
-
-	case RTC_SETCTRL:
-		return rtc8564_set_ctrl(client, arg);
-
-	case MEM_READ:
-		return rtc8564_read_mem(client, arg);
-
-	case MEM_WRITE:
-		return rtc8564_write_mem(client, arg);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-static struct i2c_driver rtc8564_driver = {
-	.driver = {
-		.name	= "RTC8564",
-	},
-	.id		= I2C_DRIVERID_RTC8564,
-	.attach_adapter = rtc8564_probe,
-	.detach_client	= rtc8564_detach,
-	.command	= rtc8564_command
-};
-
-static __init int rtc8564_init(void)
-{
-	return i2c_add_driver(&rtc8564_driver);
-}
-
-static __exit void rtc8564_exit(void)
-{
-	i2c_del_driver(&rtc8564_driver);
-}
-
-MODULE_AUTHOR("Stefan Eletzhofer <Stefan.Eletzhofer@eletztrick.de>");
-MODULE_DESCRIPTION("EPSON RTC8564 Driver");
-MODULE_LICENSE("GPL");
-
-module_init(rtc8564_init);
-module_exit(rtc8564_exit);
diff --git a/drivers/i2c/chips/rtc8564.h b/drivers/i2c/chips/rtc8564.h
deleted file mode 100644
index e5342d10b8fa..000000000000
--- a/drivers/i2c/chips/rtc8564.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- *  linux/drivers/i2c/chips/rtc8564.h
- *
- *  Copyright (C) 2002-2004 Stefan Eletzhofer
- *
- *	based on linux/drivers/acron/char/pcf8583.h
- *  Copyright (C) 2000 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-struct rtc_tm {
-	unsigned char	secs;
-	unsigned char	mins;
-	unsigned char	hours;
-	unsigned char	mday;
-	unsigned char	mon;
-	unsigned short	year; /* xxxx 4 digits :) */
-	unsigned char	wday;
-	unsigned char	vl;
-};
-
-struct mem {
-	unsigned int	loc;
-	unsigned int	nr;
-	unsigned char	*data;
-};
-
-#define RTC_GETDATETIME	0
-#define RTC_SETTIME	1
-#define RTC_SETDATETIME	2
-#define RTC_GETCTRL	3
-#define RTC_SETCTRL	4
-#define MEM_READ	5
-#define MEM_WRITE	6
-
-#define RTC8564_REG_CTRL1		0x0 /* T  0 S 0 | T 0 0 0 */
-#define RTC8564_REG_CTRL2		0x1 /* 0  0 0 TI/TP | AF TF AIE TIE */
-#define RTC8564_REG_SEC			0x2 /* VL 4 2 1 | 8 4 2 1 */
-#define RTC8564_REG_MIN			0x3 /* x  4 2 1 | 8 4 2 1 */
-#define RTC8564_REG_HR			0x4 /* x  x 2 1 | 8 4 2 1 */
-#define RTC8564_REG_DAY			0x5 /* x  x 2 1 | 8 4 2 1 */
-#define RTC8564_REG_WDAY		0x6 /* x  x x x | x 4 2 1 */
-#define RTC8564_REG_MON_CENT	0x7 /* C  x x 1 | 8 4 2 1 */
-#define RTC8564_REG_YEAR		0x8 /* 8  4 2 1 | 8 4 2 1 */
-#define RTC8564_REG_AL_MIN		0x9 /* AE 4 2 1 | 8 4 2 1 */
-#define RTC8564_REG_AL_HR		0xa /* AE 4 2 1 | 8 4 2 1 */
-#define RTC8564_REG_AL_DAY		0xb /* AE x 2 1 | 8 4 2 1 */
-#define RTC8564_REG_AL_WDAY		0xc /* AE x x x | x 4 2 1 */
-#define RTC8564_REG_CLKOUT		0xd /* FE x x x | x x FD1 FD0 */
-#define RTC8564_REG_TCTL		0xe /* TE x x x | x x FD1 FD0 */
-#define RTC8564_REG_TIMER		0xf /* 8 bit binary */
-
-/* Control reg */
-#define RTC8564_CTRL1_TEST1		(1<<3)
-#define RTC8564_CTRL1_STOP		(1<<5)
-#define RTC8564_CTRL1_TEST2		(1<<7)
-
-#define RTC8564_CTRL2_TIE		(1<<0)
-#define RTC8564_CTRL2_AIE		(1<<1)
-#define RTC8564_CTRL2_TF		(1<<2)
-#define RTC8564_CTRL2_AF		(1<<3)
-#define RTC8564_CTRL2_TI_TP		(1<<4)
-
-/* CLKOUT frequencies */
-#define RTC8564_FD_32768HZ		(0x0)
-#define RTC8564_FD_1024HZ		(0x1)
-#define RTC8564_FD_32			(0x2)
-#define RTC8564_FD_1HZ			(0x3)
-
-/* Timer CTRL */
-#define RTC8564_TD_4096HZ		(0x0)
-#define RTC8564_TD_64HZ			(0x1)
-#define RTC8564_TD_1HZ			(0x2)
-#define RTC8564_TD_1_60HZ		(0x3)
-
-#define I2C_DRIVERID_RTC8564 0xf000
diff --git a/drivers/i2c/chips/x1205.c b/drivers/i2c/chips/x1205.c
deleted file mode 100644
index 245fffa92dbd..000000000000
--- a/drivers/i2c/chips/x1205.c
+++ /dev/null
@@ -1,698 +0,0 @@
-/*
- *  x1205.c - An i2c driver for the Xicor X1205 RTC
- *  Copyright 2004 Karen Spearel
- *  Copyright 2005 Alessandro Zummo
- *
- *  please send all reports to:
- *	kas11 at tampabay dot rr dot com
- *      a dot zummo at towertech dot it
- *
- *  based on the other drivers in this same directory.
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/i2c.h>
-#include <linux/string.h>
-#include <linux/bcd.h>
-#include <linux/rtc.h>
-#include <linux/list.h>
-
-#include <linux/x1205.h>
-
-#define DRV_VERSION "0.9.9"
-
-/* Addresses to scan: none. This chip is located at
- * 0x6f and uses a two bytes register addressing.
- * Two bytes need to be written to read a single register,
- * while most other chips just require one and take the second
- * one as the data to be written. To prevent corrupting
- * unknown chips, the user must explicitely set the probe parameter.
- */
-
-static unsigned short normal_i2c[] = { I2C_CLIENT_END };
-
-/* Insmod parameters */
-I2C_CLIENT_INSMOD;
-I2C_CLIENT_MODULE_PARM(hctosys,
-	"Set the system time from the hardware clock upon initialization");
-
-/* offsets into CCR area */
-
-#define CCR_SEC			0
-#define CCR_MIN			1
-#define CCR_HOUR		2
-#define CCR_MDAY		3
-#define CCR_MONTH		4
-#define CCR_YEAR		5
-#define CCR_WDAY		6
-#define CCR_Y2K			7
-
-#define X1205_REG_SR		0x3F	/* status register */
-#define X1205_REG_Y2K		0x37
-#define X1205_REG_DW		0x36
-#define X1205_REG_YR		0x35
-#define X1205_REG_MO		0x34
-#define X1205_REG_DT		0x33
-#define X1205_REG_HR		0x32
-#define X1205_REG_MN		0x31
-#define X1205_REG_SC		0x30
-#define X1205_REG_DTR		0x13
-#define X1205_REG_ATR		0x12
-#define X1205_REG_INT		0x11
-#define X1205_REG_0		0x10
-#define X1205_REG_Y2K1		0x0F
-#define X1205_REG_DWA1		0x0E
-#define X1205_REG_YRA1		0x0D
-#define X1205_REG_MOA1		0x0C
-#define X1205_REG_DTA1		0x0B
-#define X1205_REG_HRA1		0x0A
-#define X1205_REG_MNA1		0x09
-#define X1205_REG_SCA1		0x08
-#define X1205_REG_Y2K0		0x07
-#define X1205_REG_DWA0		0x06
-#define X1205_REG_YRA0		0x05
-#define X1205_REG_MOA0		0x04
-#define X1205_REG_DTA0		0x03
-#define X1205_REG_HRA0		0x02
-#define X1205_REG_MNA0		0x01
-#define X1205_REG_SCA0		0x00
-
-#define X1205_CCR_BASE		0x30	/* Base address of CCR */
-#define X1205_ALM0_BASE		0x00	/* Base address of ALARM0 */
-
-#define X1205_SR_RTCF		0x01	/* Clock failure */
-#define X1205_SR_WEL		0x02	/* Write Enable Latch */
-#define X1205_SR_RWEL		0x04	/* Register Write Enable */
-
-#define X1205_DTR_DTR0		0x01
-#define X1205_DTR_DTR1		0x02
-#define X1205_DTR_DTR2		0x04
-
-#define X1205_HR_MIL		0x80	/* Set in ccr.hour for 24 hr mode */
-
-/* Prototypes */
-static int x1205_attach(struct i2c_adapter *adapter);
-static int x1205_detach(struct i2c_client *client);
-static int x1205_probe(struct i2c_adapter *adapter, int address, int kind);
-static int x1205_command(struct i2c_client *client, unsigned int cmd,
-	void *arg);
-
-static struct i2c_driver x1205_driver = {
-	.driver = {
-		.name	= "x1205",
-	},
-	.attach_adapter = &x1205_attach,
-	.detach_client	= &x1205_detach,
-};
-
-struct x1205_data {
-	struct i2c_client client;
-	struct list_head list;
-	unsigned int epoch;
-};
-
-static const unsigned char days_in_mo[] =
-	{ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
-
-static LIST_HEAD(x1205_clients);
-
-/* Workaround until the I2C subsytem will allow to send
- * commands to a specific client. This function will send the command
- * to the first client.
- */
-int x1205_do_command(unsigned int cmd, void *arg)
-{
-	struct list_head *walk;
-	struct list_head *tmp;
-	struct x1205_data *data;
-
-	list_for_each_safe(walk, tmp, &x1205_clients) {
-		data = list_entry(walk, struct x1205_data, list);
-		return x1205_command(&data->client, cmd, arg);
-	}
-
-	return -ENODEV;
-}
-
-#define is_leap(year) \
-	((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0))
-
-/* make sure the rtc_time values are in bounds */
-static int x1205_validate_tm(struct rtc_time *tm)
-{
-	int year = tm->tm_year + 1900;
-
-	if ((tm->tm_year < 70) || (tm->tm_year > 255))
-		return -EINVAL;
-
-	if ((tm->tm_mon > 11) || (tm->tm_mday == 0))
-		return -EINVAL;
-
-	if (tm->tm_mday > days_in_mo[tm->tm_mon]
-		+ ((tm->tm_mon == 1) && is_leap(year)))
-		return -EINVAL;
-
-	if ((tm->tm_hour >= 24) || (tm->tm_min >= 60) || (tm->tm_sec >= 60))
-		return -EINVAL;
-
-	return 0;
-}
-
-/*
- * In the routines that deal directly with the x1205 hardware, we use
- * rtc_time -- month 0-11, hour 0-23, yr = calendar year-epoch
- * Epoch is initialized as 2000. Time is set to UTC.
- */
-static int x1205_get_datetime(struct i2c_client *client, struct rtc_time *tm,
-				u8 reg_base)
-{
-	unsigned char dt_addr[2] = { 0, reg_base };
-	static unsigned char sr_addr[2] = { 0, X1205_REG_SR };
-
-	unsigned char buf[8], sr;
-
-	struct i2c_msg msgs[] = {
-		{ client->addr, 0, 2, sr_addr },	/* setup read ptr */
-		{ client->addr, I2C_M_RD, 1, &sr }, 	/* read status */
-		{ client->addr, 0, 2, dt_addr },	/* setup read ptr */
-		{ client->addr, I2C_M_RD, 8, buf },	/* read date */
-	};
-
-	struct x1205_data *data = i2c_get_clientdata(client);
-
-	/* read status register */
-	if ((i2c_transfer(client->adapter, &msgs[0], 2)) != 2) {
-		dev_err(&client->dev, "%s: read error\n", __FUNCTION__);
-		return -EIO;
-	}
-
-	/* check for battery failure */
-	if (sr & X1205_SR_RTCF) {
-		dev_warn(&client->dev,
-			"Clock had a power failure, you must set the date.\n");
-		return -EINVAL;
-	}
-
-	/* read date registers */
-	if ((i2c_transfer(client->adapter, &msgs[2], 2)) != 2) {
-		dev_err(&client->dev, "%s: read error\n", __FUNCTION__);
-		return -EIO;
-	}
-
-	dev_dbg(&client->dev,
-		"%s: raw read data - sec=%02x, min=%02x, hr=%02x, "
-		"mday=%02x, mon=%02x, year=%02x, wday=%02x, y2k=%02x\n",
-		__FUNCTION__,
-		buf[0], buf[1], buf[2], buf[3],
-		buf[4], buf[5], buf[6], buf[7]);
-
-	tm->tm_sec = BCD2BIN(buf[CCR_SEC]);
-	tm->tm_min = BCD2BIN(buf[CCR_MIN]);
-	tm->tm_hour = BCD2BIN(buf[CCR_HOUR] & 0x3F); /* hr is 0-23 */
-	tm->tm_mday = BCD2BIN(buf[CCR_MDAY]);
-	tm->tm_mon = BCD2BIN(buf[CCR_MONTH]);
-	data->epoch = BCD2BIN(buf[CCR_Y2K]) * 100;
-	tm->tm_year = BCD2BIN(buf[CCR_YEAR]) + data->epoch - 1900;
-	tm->tm_wday = buf[CCR_WDAY];
-
-	dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, "
-		"mday=%d, mon=%d, year=%d, wday=%d\n",
-		__FUNCTION__,
-		tm->tm_sec, tm->tm_min, tm->tm_hour,
-		tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
-
-	return 0;
-}
-
-static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm,
-				int datetoo, u8 reg_base)
-{
-	int i, err, xfer;
-
-	unsigned char buf[8];
-
-	static const unsigned char wel[3] = { 0, X1205_REG_SR,
-						X1205_SR_WEL };
-
-	static const unsigned char rwel[3] = { 0, X1205_REG_SR,
-						X1205_SR_WEL | X1205_SR_RWEL };
-
-	static const unsigned char diswe[3] = { 0, X1205_REG_SR, 0 };
-
-	struct x1205_data *data = i2c_get_clientdata(client);
-
-	/* check if all values in the tm struct are correct */
-	if ((err = x1205_validate_tm(tm)) < 0)
-		return err;
-
-	dev_dbg(&client->dev, "%s: secs=%d, mins=%d, hours=%d, "
-		"mday=%d, mon=%d, year=%d, wday=%d\n",
-		__FUNCTION__,
-		tm->tm_sec, tm->tm_min, tm->tm_hour,
-		tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
-
-	buf[CCR_SEC] = BIN2BCD(tm->tm_sec);
-	buf[CCR_MIN] = BIN2BCD(tm->tm_min);
-
-	/* set hour and 24hr bit */
-	buf[CCR_HOUR] = BIN2BCD(tm->tm_hour) | X1205_HR_MIL;
-
-	/* should we also set the date? */
-	if (datetoo) {
-		buf[CCR_MDAY] = BIN2BCD(tm->tm_mday);
-
-		/* month, 0 - 11 */
-		buf[CCR_MONTH] = BIN2BCD(tm->tm_mon);
-
-		/* year, since 1900 */
-		buf[CCR_YEAR] = BIN2BCD(tm->tm_year + 1900 - data->epoch);
-		buf[CCR_WDAY] = tm->tm_wday & 0x07;
-		buf[CCR_Y2K] = BIN2BCD(data->epoch / 100);
-	}
-
-	/* this sequence is required to unlock the chip */
-	xfer = i2c_master_send(client, wel, 3);
-	if (xfer != 3) {
-		dev_err(&client->dev, "%s: wel - %d\n", __FUNCTION__, xfer);
-		return -EIO;
-	}
-
-	xfer = i2c_master_send(client, rwel, 3);
-	if (xfer != 3) {
-		dev_err(&client->dev, "%s: rwel - %d\n", __FUNCTION__, xfer);
-		return -EIO;
-	}
-
-	/* write register's data */
-	for (i = 0; i < (datetoo ? 8 : 3); i++) {
-		unsigned char rdata[3] = { 0, reg_base + i, buf[i] };
-
-		xfer = i2c_master_send(client, rdata, 3);
-		if (xfer != 3) {
-			dev_err(&client->dev,
-				"%s: xfer=%d addr=%02x, data=%02x\n",
-				__FUNCTION__,
-				 xfer, rdata[1], rdata[2]);
-			return -EIO;
-		}
-	};
-
-	/* disable further writes */
-	xfer = i2c_master_send(client, diswe, 3);
-	if (xfer != 3) {
-		dev_err(&client->dev, "%s: diswe - %d\n", __FUNCTION__, xfer);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static int x1205_get_dtrim(struct i2c_client *client, int *trim)
-{
-	unsigned char dtr;
-	static unsigned char dtr_addr[2] = { 0, X1205_REG_DTR };
-
-	struct i2c_msg msgs[] = {
-		{ client->addr, 0, 2, dtr_addr },	/* setup read ptr */
-		{ client->addr, I2C_M_RD, 1, &dtr }, 	/* read dtr */
-	};
-
-	/* read dtr register */
-	if ((i2c_transfer(client->adapter, &msgs[0], 2)) != 2) {
-		dev_err(&client->dev, "%s: read error\n", __FUNCTION__);
-		return -EIO;
-	}
-
-	dev_dbg(&client->dev, "%s: raw dtr=%x\n", __FUNCTION__, dtr);
-
-	*trim = 0;
-
-	if (dtr & X1205_DTR_DTR0)
-		*trim += 20;
-
-	if (dtr & X1205_DTR_DTR1)
-		*trim += 10;
-
-	if (dtr & X1205_DTR_DTR2)
-		*trim = -*trim;
-
-	return 0;
-}
-
-static int x1205_get_atrim(struct i2c_client *client, int *trim)
-{
-	s8 atr;
-	static unsigned char atr_addr[2] = { 0, X1205_REG_ATR };
-
-	struct i2c_msg msgs[] = {
-		{ client->addr, 0, 2, atr_addr },	/* setup read ptr */
-		{ client->addr, I2C_M_RD, 1, &atr }, 	/* read atr */
-	};
-
-	/* read atr register */
-	if ((i2c_transfer(client->adapter, &msgs[0], 2)) != 2) {
-		dev_err(&client->dev, "%s: read error\n", __FUNCTION__);
-		return -EIO;
-	}
-
-	dev_dbg(&client->dev, "%s: raw atr=%x\n", __FUNCTION__, atr);
-
-	/* atr is a two's complement value on 6 bits,
-	 * perform sign extension. The formula is
-	 * Catr = (atr * 0.25pF) + 11.00pF.
-	 */
-	if (atr & 0x20)
-		atr |= 0xC0;
-
-	dev_dbg(&client->dev, "%s: raw atr=%x (%d)\n", __FUNCTION__, atr, atr);
-
-	*trim = (atr * 250) + 11000;
-
-	dev_dbg(&client->dev, "%s: real=%d\n", __FUNCTION__, *trim);
-
-	return 0;
-}
-
-static int x1205_hctosys(struct i2c_client *client)
-{
-	int err;
-
-	struct rtc_time tm;
-	struct timespec tv;
-
-	err = x1205_command(client, X1205_CMD_GETDATETIME, &tm);
-
-	if (err) {
-		dev_err(&client->dev,
-			"Unable to set the system clock\n");
-		return err;
-	}
-
-	/* IMPORTANT: the RTC only stores whole seconds. It is arbitrary
-	 * whether it stores the most close value or the value with partial
-	 * seconds truncated. However, it is important that we use it to store
-	 * the truncated value. This is because otherwise it is necessary,
-	 * in an rtc sync function, to read both xtime.tv_sec and
-	 * xtime.tv_nsec. On some processors (i.e. ARM), an atomic read
-	 * of >32bits is not possible. So storing the most close value would
-	 * slow down the sync API. So here we have the truncated value and
-	 * the best guess is to add 0.5s.
-	 */
-
-	tv.tv_nsec = NSEC_PER_SEC >> 1;
-
-	/* WARNING: this is not the C library 'mktime' call, it is a built in
-	 * inline function from include/linux/time.h.  It expects (requires)
-	 * the month to be in the range 1-12
-	 */
-
-	tv.tv_sec  = mktime(tm.tm_year + 1900, tm.tm_mon + 1,
-				tm.tm_mday, tm.tm_hour,
-				tm.tm_min, tm.tm_sec);
-
-	do_settimeofday(&tv);
-
-	dev_info(&client->dev,
-		"setting the system clock to %d-%d-%d %d:%d:%d\n",
-		tm.tm_year + 1900, tm.tm_mon + 1,
-		tm.tm_mday, tm.tm_hour, tm.tm_min,
-		tm.tm_sec);
-
-	return 0;
-}
-
-struct x1205_limit
-{
-	unsigned char reg;
-	unsigned char mask;
-	unsigned char min;
-	unsigned char max;
-};
-
-static int x1205_validate_client(struct i2c_client *client)
-{
-	int i, xfer;
-
-	/* Probe array. We will read the register at the specified
-	 * address and check if the given bits are zero.
-	 */
-	static const unsigned char probe_zero_pattern[] = {
-		/* register, mask */
-		X1205_REG_SR,	0x18,
-		X1205_REG_DTR,	0xF8,
-		X1205_REG_ATR,	0xC0,
-		X1205_REG_INT,	0x18,
-		X1205_REG_0,	0xFF,
-	};
-
-	static const struct x1205_limit probe_limits_pattern[] = {
-		/* register, mask, min, max */
-		{ X1205_REG_Y2K,	0xFF,	19,	20	},
-		{ X1205_REG_DW,		0xFF,	0,	6	},
-		{ X1205_REG_YR,		0xFF,	0,	99	},
-		{ X1205_REG_MO,		0xFF,	0,	12	},
-		{ X1205_REG_DT,		0xFF,	0,	31	},
-		{ X1205_REG_HR,		0x7F,	0,	23	},
-		{ X1205_REG_MN,		0xFF,	0,	59	},
-		{ X1205_REG_SC,		0xFF,	0,	59	},
-		{ X1205_REG_Y2K1,	0xFF,	19,	20	},
-		{ X1205_REG_Y2K0,	0xFF,	19,	20	},
-	};
-
-	/* check that registers have bits a 0 where expected */
-	for (i = 0; i < ARRAY_SIZE(probe_zero_pattern); i += 2) {
-		unsigned char buf;
-
-		unsigned char addr[2] = { 0, probe_zero_pattern[i] };
-
-		struct i2c_msg msgs[2] = {
-			{ client->addr, 0, 2, addr },
-			{ client->addr, I2C_M_RD, 1, &buf },
-		};
-
-		xfer = i2c_transfer(client->adapter, msgs, 2);
-		if (xfer != 2) {
-			dev_err(&client->adapter->dev,
-				"%s: could not read register %x\n",
-				__FUNCTION__, addr[1]);
-
-			return -EIO;
-		}
-
-		if ((buf & probe_zero_pattern[i+1]) != 0) {
-			dev_err(&client->adapter->dev,
-				"%s: register=%02x, zero pattern=%d, value=%x\n",
-				__FUNCTION__, addr[1], i, buf);
-
-			return -ENODEV;
-		}
-	}
-
-	/* check limits (only registers with bcd values) */
-	for (i = 0; i < ARRAY_SIZE(probe_limits_pattern); i++) {
-		unsigned char reg, value;
-
-		unsigned char addr[2] = { 0, probe_limits_pattern[i].reg };
-
-		struct i2c_msg msgs[2] = {
-			{ client->addr, 0, 2, addr },
-			{ client->addr, I2C_M_RD, 1, &reg },
-		};
-
-		xfer = i2c_transfer(client->adapter, msgs, 2);
-
-		if (xfer != 2) {
-			dev_err(&client->adapter->dev,
-				"%s: could not read register %x\n",
-				__FUNCTION__, addr[1]);
-
-			return -EIO;
-		}
-
-		value = BCD2BIN(reg & probe_limits_pattern[i].mask);
-
-		if (value > probe_limits_pattern[i].max ||
-			value < probe_limits_pattern[i].min) {
-			dev_dbg(&client->adapter->dev,
-				"%s: register=%x, lim pattern=%d, value=%d\n",
-				__FUNCTION__, addr[1], i, value);
-
-			return -ENODEV;
-		}
-	}
-
-	return 0;
-}
-
-static int x1205_attach(struct i2c_adapter *adapter)
-{
-	dev_dbg(&adapter->dev, "%s\n", __FUNCTION__);
-
-	return i2c_probe(adapter, &addr_data, x1205_probe);
-}
-
-int x1205_direct_attach(int adapter_id,
-	struct i2c_client_address_data *address_data)
-{
-	int err;
-	struct i2c_adapter *adapter = i2c_get_adapter(adapter_id);
-
-	if (adapter) {
-		err = i2c_probe(adapter,
-			address_data, x1205_probe);
-
-		i2c_put_adapter(adapter);
-
-		return err;
-	}
-
-	return -ENODEV;
-}
-
-static int x1205_probe(struct i2c_adapter *adapter, int address, int kind)
-{
-	struct i2c_client *client;
-	struct x1205_data *data;
-
-	int err = 0;
-
-	dev_dbg(&adapter->dev, "%s\n", __FUNCTION__);
-
-	if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) {
-		err = -ENODEV;
-		goto exit;
-	}
-
-	if (!(data = kzalloc(sizeof(struct x1205_data), GFP_KERNEL))) {
-		err = -ENOMEM;
-		goto exit;
-	}
-
-	/* Initialize our structures */
-	data->epoch = 2000;
-
-	client = &data->client;
-	client->addr = address;
-	client->driver = &x1205_driver;
-	client->adapter	= adapter;
-
-	strlcpy(client->name, "x1205", I2C_NAME_SIZE);
-
-	i2c_set_clientdata(client, data);
-
-	/* Verify the chip is really an X1205 */
-	if (kind < 0) {
-		if (x1205_validate_client(client) < 0) {
-			err = -ENODEV;
-			goto exit_kfree;
-		}
-	}
-
-	/* Inform the i2c layer */
-	if ((err = i2c_attach_client(client)))
-		goto exit_kfree;
-
-	list_add(&data->list, &x1205_clients);
-
-	dev_info(&client->dev, "chip found, driver version " DRV_VERSION "\n");
-
-	/* If requested, set the system time */
-	if (hctosys)
-		x1205_hctosys(client);
-
-	return 0;
-
-exit_kfree:
-	kfree(data);
-
-exit:
-	return err;
-}
-
-static int x1205_detach(struct i2c_client *client)
-{
-	int err;
-	struct x1205_data *data = i2c_get_clientdata(client);
-
-	dev_dbg(&client->dev, "%s\n", __FUNCTION__);
-
-	if ((err = i2c_detach_client(client)))
-		return err;
-
-	list_del(&data->list);
-
-	kfree(data);
-
-	return 0;
-}
-
-static int x1205_command(struct i2c_client *client, unsigned int cmd,
-	void *param)
-{
-	if (param == NULL)
-		return -EINVAL;
-
-	if (!capable(CAP_SYS_TIME))
-		return -EACCES;
-
-	dev_dbg(&client->dev, "%s: cmd=%d\n", __FUNCTION__, cmd);
-
-	switch (cmd) {
-	case X1205_CMD_GETDATETIME:
-		return x1205_get_datetime(client, param, X1205_CCR_BASE);
-
-	case X1205_CMD_SETTIME:
-		return x1205_set_datetime(client, param, 0,
-				X1205_CCR_BASE);
-
-	case X1205_CMD_SETDATETIME:
-		return x1205_set_datetime(client, param, 1,
-				X1205_CCR_BASE);
-
-	case X1205_CMD_GETALARM:
-		return x1205_get_datetime(client, param, X1205_ALM0_BASE);
-
-	case X1205_CMD_SETALARM:
-		return x1205_set_datetime(client, param, 1,
-				X1205_ALM0_BASE);
-
-	case X1205_CMD_GETDTRIM:
-		return x1205_get_dtrim(client, param);
-
-	case X1205_CMD_GETATRIM:
-		return x1205_get_atrim(client, param);
-
-	default:
-		return -EINVAL;
-	}
-}
-
-static int __init x1205_init(void)
-{
-	return i2c_add_driver(&x1205_driver);
-}
-
-static void __exit x1205_exit(void)
-{
-	i2c_del_driver(&x1205_driver);
-}
-
-MODULE_AUTHOR(
-	"Karen Spearel <kas11@tampabay.rr.com>, "
-	"Alessandro Zummo <a.zummo@towertech.it>");
-MODULE_DESCRIPTION("Xicor X1205 RTC driver");
-MODULE_LICENSE("GPL");
-MODULE_VERSION(DRV_VERSION);
-
-EXPORT_SYMBOL_GPL(x1205_do_command);
-EXPORT_SYMBOL_GPL(x1205_direct_attach);
-
-module_init(x1205_init);
-module_exit(x1205_exit);
diff --git a/include/linux/x1205.h b/include/linux/x1205.h
deleted file mode 100644
index 64fd3af894a5..000000000000
--- a/include/linux/x1205.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- *  x1205.h - defines for drivers/i2c/chips/x1205.c
- *  Copyright 2004 Karen Spearel
- *  Copyright 2005 Alessandro Zummo
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- */
-
-#ifndef __LINUX_X1205_H__
-#define __LINUX_X1205_H__
-
-/* commands */
-
-#define X1205_CMD_GETDATETIME	0
-#define X1205_CMD_SETTIME	1
-#define X1205_CMD_SETDATETIME	2
-#define X1205_CMD_GETALARM	3
-#define X1205_CMD_SETALARM	4
-#define X1205_CMD_GETDTRIM	5
-#define X1205_CMD_SETDTRIM	6
-#define X1205_CMD_GETATRIM	7
-#define X1205_CMD_SETATRIM	8
-
-extern int x1205_do_command(unsigned int cmd, void *arg);
-extern int x1205_direct_attach(int adapter_id,
-	struct i2c_client_address_data *address_data);
-
-#endif /* __LINUX_X1205_H__ */
-- 
cgit v1.2.3


From f7f3682fb2f8bc8a9c912baeea15454416ca1972 Mon Sep 17 00:00:00 2001
From: Alessandro Zummo <a.zummo@towertech.it>
Date: Mon, 27 Mar 2006 01:16:38 -0800
Subject: [PATCH] RTC subsystem: I2C driver ids

This patch adds the I2C driver ids to i2c-id.h in preparation of the I2C
direct probing method.

This is kept separate so that it can be integrated to

Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/i2c-id.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 679b46a6a565..c8b81f419fd8 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -108,6 +108,10 @@
 #define I2C_DRIVERID_UPD64083	78	/* upd64083 video processor	*/
 #define I2C_DRIVERID_UPD64031A	79	/* upd64031a video processor	*/
 #define I2C_DRIVERID_SAA717X	80	/* saa717x video encoder	*/
+#define I2C_DRIVERID_DS1672	81	/* Dallas/Maxim DS1672 RTC	*/
+#define I2C_DRIVERID_X1205	82	/* Xicor/Intersil X1205 RTC	*/
+#define I2C_DRIVERID_PCF8563	83	/* Philips PCF8563 RTC		*/
+#define I2C_DRIVERID_RS5C372	84	/* Ricoh RS5C372 RTC		*/
 
 #define I2C_DRIVERID_I2CDEV	900
 #define I2C_DRIVERID_ARP        902    /* SMBus ARP Client              */
-- 
cgit v1.2.3


From 1d98af87270cc08bb8251e004b9dc63cc838f24b Mon Sep 17 00:00:00 2001
From: Alessandro Zummo <a.zummo@towertech.it>
Date: Mon, 27 Mar 2006 01:16:47 -0800
Subject: [PATCH] RTC subsystem: M48T86 driver

Add a driver for the ST M48T86 / Dallas DS12887 RTC.

This is a platform driver.  The platform device must provide I/O routines to
access the RTC.

Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/rtc/Kconfig      |  10 +++
 drivers/rtc/Makefile     |   1 +
 drivers/rtc/rtc-m48t86.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/m48t86.h   |  16 ++++
 4 files changed, 236 insertions(+)
 create mode 100644 drivers/rtc/rtc-m48t86.c
 create mode 100644 include/linux/m48t86.h

(limited to 'include/linux')

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 166232a3f56b..929dd8090578 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -117,6 +117,16 @@ config RTC_DRV_RS5C372
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-rs5c372.
 
+config RTC_DRV_M48T86
+	tristate "ST M48T86/Dallas DS12887"
+	depends on RTC_CLASS
+	help
+	  If you say Y here you will get support for the
+	  ST M48T86 and Dallas DS12887 RTC chips.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-m48t86.
+
 config RTC_DRV_EP93XX
 	tristate "Cirrus Logic EP93XX"
 	depends on RTC_CLASS && ARCH_EP93XX
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 56820d18161e..8d4c7fe88d58 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -16,5 +16,6 @@ obj-$(CONFIG_RTC_DRV_TEST)	+= rtc-test.o
 obj-$(CONFIG_RTC_DRV_DS1672)	+= rtc-ds1672.o
 obj-$(CONFIG_RTC_DRV_PCF8563)	+= rtc-pcf8563.o
 obj-$(CONFIG_RTC_DRV_RS5C372)	+= rtc-rs5c372.o
+obj-$(CONFIG_RTC_DRV_M48T86)	+= rtc-m48t86.o
 obj-$(CONFIG_RTC_DRV_EP93XX)	+= rtc-ep93xx.o
 obj-$(CONFIG_RTC_DRV_SA1100)	+= rtc-sa1100.o
diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c
new file mode 100644
index 000000000000..db445c872b1b
--- /dev/null
+++ b/drivers/rtc/rtc-m48t86.c
@@ -0,0 +1,209 @@
+/*
+ * ST M48T86 / Dallas DS12887 RTC driver
+ * Copyright (c) 2006 Tower Technologies
+ *
+ * Author: Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This drivers only supports the clock running in BCD and 24H mode.
+ * If it will be ever adapted to binary and 12H mode, care must be taken
+ * to not introduce bugs.
+ */
+
+#include <linux/module.h>
+#include <linux/rtc.h>
+#include <linux/platform_device.h>
+#include <linux/m48t86.h>
+#include <linux/bcd.h>
+
+#define M48T86_REG_SEC		0x00
+#define M48T86_REG_SECALRM	0x01
+#define M48T86_REG_MIN		0x02
+#define M48T86_REG_MINALRM	0x03
+#define M48T86_REG_HOUR	0x04
+#define M48T86_REG_HOURALRM	0x05
+#define M48T86_REG_DOW		0x06 /* 1 = sunday */
+#define M48T86_REG_DOM		0x07
+#define M48T86_REG_MONTH	0x08 /* 1 - 12 */
+#define M48T86_REG_YEAR		0x09 /* 0 - 99 */
+#define M48T86_REG_A		0x0A
+#define M48T86_REG_B		0x0B
+#define M48T86_REG_C		0x0C
+#define M48T86_REG_D		0x0D
+
+#define M48T86_REG_B_H24	(1 << 1)
+#define M48T86_REG_B_DM		(1 << 2)
+#define M48T86_REG_B_SET	(1 << 7)
+#define M48T86_REG_D_VRT	(1 << 7)
+
+#define DRV_VERSION "0.1"
+
+
+static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+	unsigned char reg;
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t86_ops *ops = pdev->dev.platform_data;
+
+	reg = ops->readb(M48T86_REG_B);
+
+	if (reg & M48T86_REG_B_DM) {
+		/* data (binary) mode */
+		tm->tm_sec	= ops->readb(M48T86_REG_SEC);
+		tm->tm_min	= ops->readb(M48T86_REG_MIN);
+		tm->tm_hour	= ops->readb(M48T86_REG_HOUR) & 0x3F;
+		tm->tm_mday	= ops->readb(M48T86_REG_DOM);
+		/* tm_mon is 0-11 */
+		tm->tm_mon	= ops->readb(M48T86_REG_MONTH) - 1;
+		tm->tm_year	= ops->readb(M48T86_REG_YEAR) + 100;
+		tm->tm_wday	= ops->readb(M48T86_REG_DOW);
+	} else {
+		/* bcd mode */
+		tm->tm_sec	= BCD2BIN(ops->readb(M48T86_REG_SEC));
+		tm->tm_min	= BCD2BIN(ops->readb(M48T86_REG_MIN));
+		tm->tm_hour	= BCD2BIN(ops->readb(M48T86_REG_HOUR) & 0x3F);
+		tm->tm_mday	= BCD2BIN(ops->readb(M48T86_REG_DOM));
+		/* tm_mon is 0-11 */
+		tm->tm_mon	= BCD2BIN(ops->readb(M48T86_REG_MONTH)) - 1;
+		tm->tm_year	= BCD2BIN(ops->readb(M48T86_REG_YEAR)) + 100;
+		tm->tm_wday	= BCD2BIN(ops->readb(M48T86_REG_DOW));
+	}
+
+	/* correct the hour if the clock is in 12h mode */
+	if (!(reg & M48T86_REG_B_H24))
+		if (ops->readb(M48T86_REG_HOUR) & 0x80)
+			tm->tm_hour += 12;
+
+	return 0;
+}
+
+static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+	unsigned char reg;
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t86_ops *ops = pdev->dev.platform_data;
+
+	reg = ops->readb(M48T86_REG_B);
+
+	/* update flag and 24h mode */
+	reg |= M48T86_REG_B_SET | M48T86_REG_B_H24;
+	ops->writeb(reg, M48T86_REG_B);
+
+	if (reg & M48T86_REG_B_DM) {
+		/* data (binary) mode */
+		ops->writeb(tm->tm_sec, M48T86_REG_SEC);
+		ops->writeb(tm->tm_min, M48T86_REG_MIN);
+		ops->writeb(tm->tm_hour, M48T86_REG_HOUR);
+		ops->writeb(tm->tm_mday, M48T86_REG_DOM);
+		ops->writeb(tm->tm_mon + 1, M48T86_REG_MONTH);
+		ops->writeb(tm->tm_year % 100, M48T86_REG_YEAR);
+		ops->writeb(tm->tm_wday, M48T86_REG_DOW);
+	} else {
+		/* bcd mode */
+		ops->writeb(BIN2BCD(tm->tm_sec), M48T86_REG_SEC);
+		ops->writeb(BIN2BCD(tm->tm_min), M48T86_REG_MIN);
+		ops->writeb(BIN2BCD(tm->tm_hour), M48T86_REG_HOUR);
+		ops->writeb(BIN2BCD(tm->tm_mday), M48T86_REG_DOM);
+		ops->writeb(BIN2BCD(tm->tm_mon + 1), M48T86_REG_MONTH);
+		ops->writeb(BIN2BCD(tm->tm_year % 100), M48T86_REG_YEAR);
+		ops->writeb(BIN2BCD(tm->tm_wday), M48T86_REG_DOW);
+	}
+
+	/* update ended */
+	reg &= ~M48T86_REG_B_SET;
+	ops->writeb(reg, M48T86_REG_B);
+
+	return 0;
+}
+
+static int m48t86_rtc_proc(struct device *dev, struct seq_file *seq)
+{
+	unsigned char reg;
+	struct platform_device *pdev = to_platform_device(dev);
+	struct m48t86_ops *ops = pdev->dev.platform_data;
+
+	reg = ops->readb(M48T86_REG_B);
+
+	seq_printf(seq, "24hr\t\t: %s\n",
+		 (reg & M48T86_REG_B_H24) ? "yes" : "no");
+
+	seq_printf(seq, "mode\t\t: %s\n",
+		 (reg & M48T86_REG_B_DM) ? "binary" : "bcd");
+
+	reg = ops->readb(M48T86_REG_D);
+
+	seq_printf(seq, "battery\t\t: %s\n",
+		 (reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+
+	return 0;
+}
+
+static struct rtc_class_ops m48t86_rtc_ops = {
+	.read_time	= m48t86_rtc_read_time,
+	.set_time	= m48t86_rtc_set_time,
+	.proc		= m48t86_rtc_proc,
+};
+
+static int __devinit m48t86_rtc_probe(struct platform_device *dev)
+{
+	unsigned char reg;
+	struct m48t86_ops *ops = dev->dev.platform_data;
+	struct rtc_device *rtc = rtc_device_register("m48t86",
+				&dev->dev, &m48t86_rtc_ops, THIS_MODULE);
+
+	if (IS_ERR(rtc)) {
+		dev_err(&dev->dev, "unable to register\n");
+		return PTR_ERR(rtc);
+	}
+
+	platform_set_drvdata(dev, rtc);
+
+	/* read battery status */
+	reg = ops->readb(M48T86_REG_D);
+	dev_info(&dev->dev, "battery %s\n",
+		(reg & M48T86_REG_D_VRT) ? "ok" : "exhausted");
+
+	return 0;
+}
+
+static int __devexit m48t86_rtc_remove(struct platform_device *dev)
+{
+	struct rtc_device *rtc = platform_get_drvdata(dev);
+
+ 	if (rtc)
+		rtc_device_unregister(rtc);
+
+	platform_set_drvdata(dev, NULL);
+
+	return 0;
+}
+
+static struct platform_driver m48t86_rtc_platform_driver = {
+	.driver		= {
+		.name	= "rtc-m48t86",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= m48t86_rtc_probe,
+	.remove		= __devexit_p(m48t86_rtc_remove),
+};
+
+static int __init m48t86_rtc_init(void)
+{
+	return platform_driver_register(&m48t86_rtc_platform_driver);
+}
+
+static void __exit m48t86_rtc_exit(void)
+{
+	platform_driver_unregister(&m48t86_rtc_platform_driver);
+}
+
+MODULE_AUTHOR("Alessandro Zummo <a.zummo@towertech.it>");
+MODULE_DESCRIPTION("M48T86 RTC driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
+
+module_init(m48t86_rtc_init);
+module_exit(m48t86_rtc_exit);
diff --git a/include/linux/m48t86.h b/include/linux/m48t86.h
new file mode 100644
index 000000000000..9065199319d0
--- /dev/null
+++ b/include/linux/m48t86.h
@@ -0,0 +1,16 @@
+/*
+ * ST M48T86 / Dallas DS12887 RTC driver
+ * Copyright (c) 2006 Tower Technologies
+ *
+ * Author: Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+struct m48t86_ops
+{
+	void (*writeb)(unsigned char value, unsigned long addr);
+	unsigned char (*readb)(unsigned long addr);
+};
-- 
cgit v1.2.3


From ed49843b897da9969e349c279ffc832efcb93213 Mon Sep 17 00:00:00 2001
From: Pavel Roskin <proski@gnu.org>
Date: Mon, 27 Mar 2006 01:17:36 -0800
Subject: [PATCH] Add ID for Quadro NVS280

Quadro NVS280 is a dual-head PCIe card with PCI ID 10de:00fd and subsystem ID
10de:0215.

Signed-off-by: Pavel Roskin <proski@gnu.org>
Signed-off-by: Antonino Daplas <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/video/nvidia/nvidia.c | 2 ++
 include/linux/pci_ids.h       | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index 018dd8174c7d..6d3e4890cb43 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -297,6 +297,8 @@ static struct pci_device_id nvidiafb_pci_tbl[] = {
 	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_GT,
 	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{PCI_VENDOR_ID_NVIDIA, PCIE_DEVICE_ID_NVIDIA_QUADRO_NVS280,
+	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{PCI_VENDOR_ID_NVIDIA, 0x0252,
 	 PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{PCI_VENDOR_ID_NVIDIA, 0x0313,
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 6f080ae59286..02f6cf20b141 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1052,6 +1052,7 @@
 #define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6600_ALT2 0x00f2
 #define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6200_ALT1 0x00f3
 #define PCIE_DEVICE_ID_NVIDIA_GEFORCE_6800_GT   0x00f9
+#define PCIE_DEVICE_ID_NVIDIA_QUADRO_NVS280	0x00fd
 #define PCI_DEVICE_ID_NVIDIA_GEFORCE_SDR	0x0100
 #define PCI_DEVICE_ID_NVIDIA_GEFORCE_DDR	0x0101
 #define PCI_DEVICE_ID_NVIDIA_QUADRO		0x0103
-- 
cgit v1.2.3


From 969429b504ae866d3f8b1cafd68a2c099e305093 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:17:49 -0800
Subject: [PATCH] dm: make sure QUEUE_FLAG_CLUSTER is set properly

This flag should be set for a virtual device iff it is set for all
underlying devices.

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-table.c         | 9 +++++++++
 include/linux/device-mapper.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 907b08ddb783..9558a4acec12 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -97,6 +97,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs,
 
 	lhs->seg_boundary_mask =
 		min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
+
+	lhs->no_cluster |= rhs->no_cluster;
 }
 
 /*
@@ -523,6 +525,8 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
 		rs->seg_boundary_mask =
 			min_not_zero(rs->seg_boundary_mask,
 				     q->seg_boundary_mask);
+
+		rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	}
 
 	return r;
@@ -832,6 +836,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	q->hardsect_size = t->limits.hardsect_size;
 	q->max_segment_size = t->limits.max_segment_size;
 	q->seg_boundary_mask = t->limits.seg_boundary_mask;
+	if (t->limits.no_cluster)
+		q->queue_flags &= ~(1 << QUEUE_FLAG_CLUSTER);
+	else
+		q->queue_flags |= (1 << QUEUE_FLAG_CLUSTER);
+
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 51e0e95a421a..aee10b2ea4c6 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -97,6 +97,7 @@ struct io_restrictions {
 	unsigned short		hardsect_size;
 	unsigned int		max_segment_size;
 	unsigned long		seg_boundary_mask;
+	unsigned char		no_cluster; /* inverted so that 0 is default */
 };
 
 struct dm_target {
-- 
cgit v1.2.3


From 3ac51e741a46af7a20f55e79d3e3aeaa93c6c544 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Mon, 27 Mar 2006 01:17:54 -0800
Subject: [PATCH] dm store geometry

Allow drive geometry to be stored with a new DM_DEV_SET_GEOMETRY ioctl.
Device-mapper will now respond to HDIO_GETGEO.  If the geometry information is
not available, zero will be returned for all of the parameters.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-ioctl.c        | 52 +++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.c              | 46 +++++++++++++++++++++++++++++++++++++++
 drivers/md/dm.h              |  7 ++++++
 include/linux/compat_ioctl.h |  2 ++
 include/linux/dm-ioctl.h     | 17 +++++++++++++--
 5 files changed, 121 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 65826bdac00c..8edd6435414d 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/dm-ioctl.h>
+#include <linux/hdreg.h>
 
 #include <asm/uaccess.h>
 
@@ -700,6 +701,54 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 	return dm_hash_rename(param->name, new_name);
 }
 
+static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
+{
+	int r = -EINVAL, x;
+	struct mapped_device *md;
+	struct hd_geometry geometry;
+	unsigned long indata[4];
+	char *geostr = (char *) param + param->data_start;
+
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	if (geostr < (char *) (param + 1) ||
+	    invalid_str(geostr, (void *) param + param_size)) {
+		DMWARN("Invalid geometry supplied.");
+		goto out;
+	}
+
+	x = sscanf(geostr, "%lu %lu %lu %lu", indata,
+		   indata + 1, indata + 2, indata + 3);
+
+	if (x != 4) {
+		DMWARN("Unable to interpret geometry settings.");
+		goto out;
+	}
+
+	if (indata[0] > 65535 || indata[1] > 255 ||
+	    indata[2] > 255 || indata[3] > ULONG_MAX) {
+		DMWARN("Geometry exceeds range limits.");
+		goto out;
+	}
+
+	geometry.cylinders = indata[0];
+	geometry.heads = indata[1];
+	geometry.sectors = indata[2];
+	geometry.start = indata[3];
+
+	r = dm_set_geometry(md, &geometry);
+	if (!r)
+		r = __dev_status(md, param);
+
+	param->data_size = 0;
+
+out:
+	dm_put(md);
+	return r;
+}
+
 static int do_suspend(struct dm_ioctl *param)
 {
 	int r = 0;
@@ -1234,7 +1283,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd)
 
 		{DM_LIST_VERSIONS_CMD, list_versions},
 
-		{DM_TARGET_MSG_CMD, target_message}
+		{DM_TARGET_MSG_CMD, target_message},
+		{DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry}
 	};
 
 	return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b99df48cffed..973e63d530ae 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -17,6 +17,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
@@ -102,6 +103,9 @@ struct mapped_device {
 	 */
 	struct super_block *frozen_sb;
 	struct block_device *suspended_bdev;
+
+	/* forced geometry settings */
+	struct hd_geometry geometry;
 };
 
 #define MIN_IOS 256
@@ -227,6 +231,13 @@ static int dm_blk_close(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct mapped_device *md = bdev->bd_disk->private_data;
+
+	return dm_get_geometry(md, geo);
+}
+
 static inline struct dm_io *alloc_io(struct mapped_device *md)
 {
 	return mempool_alloc(md->io_pool, GFP_NOIO);
@@ -313,6 +324,33 @@ struct dm_table *dm_get_table(struct mapped_device *md)
 	return t;
 }
 
+/*
+ * Get the geometry associated with a dm device
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+	*geo = md->geometry;
+
+	return 0;
+}
+
+/*
+ * Set the geometry of a device.
+ */
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
+
+	if (geo->start > sz) {
+		DMWARN("Start sector is beyond the geometry limits.");
+		return -EINVAL;
+	}
+
+	md->geometry = *geo;
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * CRUD START:
  *   A more elegant soln is in the works that uses the queue
@@ -906,6 +944,13 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 	sector_t size;
 
 	size = dm_table_get_size(t);
+
+	/*
+	 * Wipe any geometry if the size of the table changed.
+	 */
+	if (size != get_capacity(md->disk))
+		memset(&md->geometry, 0, sizeof(md->geometry));
+
 	__set_size(md, size);
 	if (size == 0)
 		return 0;
@@ -1261,6 +1306,7 @@ int dm_suspended(struct mapped_device *md)
 static struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,
 	.release = dm_blk_close,
+	.getgeo = dm_blk_getgeo,
 	.owner = THIS_MODULE
 };
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 79e800051a10..fd90bc8f9e45 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/blkdev.h>
+#include <linux/hdreg.h>
 
 #define DM_NAME "device-mapper"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
@@ -85,6 +86,12 @@ int dm_wait_event(struct mapped_device *md, int event_nr);
 struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
 
+/*
+ * Geometry functions.
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
+
 /*-----------------------------------------------------------------
  * Functions for manipulating a table.  Tables are also reference
  * counted.
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index efb518f16bb3..89ab677cb993 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -140,6 +140,7 @@ COMPATIBLE_IOCTL(DM_TABLE_DEPS_32)
 COMPATIBLE_IOCTL(DM_TABLE_STATUS_32)
 COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32)
 COMPATIBLE_IOCTL(DM_TARGET_MSG_32)
+COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY_32)
 COMPATIBLE_IOCTL(DM_VERSION)
 COMPATIBLE_IOCTL(DM_REMOVE_ALL)
 COMPATIBLE_IOCTL(DM_LIST_DEVICES)
@@ -155,6 +156,7 @@ COMPATIBLE_IOCTL(DM_TABLE_DEPS)
 COMPATIBLE_IOCTL(DM_TABLE_STATUS)
 COMPATIBLE_IOCTL(DM_LIST_VERSIONS)
 COMPATIBLE_IOCTL(DM_TARGET_MSG)
+COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY)
 /* Big K */
 COMPATIBLE_IOCTL(PIO_FONT)
 COMPATIBLE_IOCTL(GIO_FONT)
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index fa75ba0d635e..c67c6786612a 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -80,6 +80,16 @@
  *
  * DM_TARGET_MSG:
  * Pass a message string to the target at a specific offset of a device.
+ *
+ * DM_DEV_SET_GEOMETRY:
+ * Set the geometry of a device by passing in a string in this format:
+ *
+ * "cylinders heads sectors_per_track start_sector"
+ *
+ * Beware that CHS geometry is nearly obsolete and only provided
+ * for compatibility with dm devices that can be booted by a PC
+ * BIOS.  See struct hd_geometry for range limits.  Also note that
+ * the geometry is erased if the device size changes.
  */
 
 /*
@@ -218,6 +228,7 @@ enum {
 	/* Added later */
 	DM_LIST_VERSIONS_CMD,
 	DM_TARGET_MSG_CMD,
+	DM_DEV_SET_GEOMETRY_CMD
 };
 
 /*
@@ -247,6 +258,7 @@ typedef char ioctl_struct[308];
 #define DM_TABLE_STATUS_32  _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct)
 #define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct)
 #define DM_TARGET_MSG_32    _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct)
+#define DM_DEV_SET_GEOMETRY_32	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, ioctl_struct)
 #endif
 
 #define DM_IOCTL 0xfd
@@ -270,11 +282,12 @@ typedef char ioctl_struct[308];
 #define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl)
 
 #define DM_TARGET_MSG	 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl)
+#define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	5
+#define DM_VERSION_MINOR	6
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2005-10-04)"
+#define DM_VERSION_EXTRA	"-ioctl (2006-02-17)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From 6a4d44c1f1108d6c9e8850e8cf166aaba0e56eae Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Mon, 27 Mar 2006 01:17:55 -0800
Subject: [PATCH] dm/md dependency tree in sysfs: holders/slaves subdirectory

Creating "slaves" and "holders" directories in /sys/block/<disk> and
creating "holders" directory under /sys/block/<disk>/<partition>

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/partitions/check.c | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/genhd.h |  7 +++++++
 2 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f924f459bdb8..60523cea7136 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -297,6 +297,30 @@ struct kobj_type ktype_part = {
 	.sysfs_ops	= &part_sysfs_ops,
 };
 
+#ifdef CONFIG_SYSFS
+static inline void partition_sysfs_add_subdir(struct hd_struct *p)
+{
+	struct kobject *k;
+
+	k = kobject_get(&p->kobj);
+	p->holder_dir = kobject_add_dir(k, "holders");
+	kobject_put(k);
+}
+
+static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
+{
+	struct kobject *k;
+
+	k = kobject_get(&disk->kobj);
+	disk->holder_dir = kobject_add_dir(k, "holders");
+	disk->slave_dir = kobject_add_dir(k, "slaves");
+	kobject_put(k);
+}
+#else
+#define partition_sysfs_add_subdir(x)	do { } while (0)
+#define disk_sysfs_add_subdirs(x)	do { } while (0)
+#endif
+
 void delete_partition(struct gendisk *disk, int part)
 {
 	struct hd_struct *p = disk->part[part-1];
@@ -310,6 +334,10 @@ void delete_partition(struct gendisk *disk, int part)
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
 	devfs_remove("%s/part%d", disk->devfs_name, part);
+#ifdef CONFIG_SYSFS
+	if (p->holder_dir)
+		kobject_unregister(p->holder_dir);
+#endif
 	kobject_unregister(&p->kobj);
 }
 
@@ -337,6 +365,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
 	p->kobj.parent = &disk->kobj;
 	p->kobj.ktype = &ktype_part;
 	kobject_register(&p->kobj);
+	partition_sysfs_add_subdir(p);
 	disk->part[part-1] = p;
 }
 
@@ -383,6 +412,7 @@ void register_disk(struct gendisk *disk)
 	if ((err = kobject_add(&disk->kobj)))
 		return;
 	disk_sysfs_symlinks(disk);
+ 	disk_sysfs_add_subdirs(disk);
 	kobject_uevent(&disk->kobj, KOBJ_ADD);
 
 	/* No minors to use for partitions */
@@ -483,6 +513,12 @@ void del_gendisk(struct gendisk *disk)
 
 	devfs_remove_disk(disk);
 
+#ifdef CONFIG_SYSFS
+	if (disk->holder_dir)
+		kobject_unregister(disk->holder_dir);
+	if (disk->slave_dir)
+		kobject_unregister(disk->slave_dir);
+#endif
 	if (disk->driverfs_dev) {
 		char *disk_name = make_block_name(disk);
 		sysfs_remove_link(&disk->kobj, "device");
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index fd647fde5ec1..eea61cc8fac1 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -78,6 +78,9 @@ struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
 	struct kobject kobj;
+#ifdef CONFIG_SYSFS
+	struct kobject *holder_dir;
+#endif
 	unsigned ios[2], sectors[2];	/* READs and WRITEs */
 	int policy, partno;
 };
@@ -114,6 +117,10 @@ struct gendisk {
 	int number;			/* more of the same */
 	struct device *driverfs_dev;
 	struct kobject kobj;
+#ifdef CONFIG_SYSFS
+	struct kobject *holder_dir;
+	struct kobject *slave_dir;
+#endif
 
 	struct timer_rand_state *random;
 	int policy;
-- 
cgit v1.2.3


From 100873687d81d4ce7b1299b447d33e87ba1e9583 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 27 Mar 2006 01:17:56 -0800
Subject: [PATCH]
 dm-md-dependency-tree-in-sysfs-holders-slaves-subdirectory-tidy

Remove all the CONFIG_SYSFS stuff.  That's supposed to all be implemented up
in header files.

Yes, the CONFIG_SYSFS=n data structures will be a little larger than
necessary, but that's a tradeoff we can decide to make.

Cc: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/partitions/check.c | 9 ---------
 include/linux/genhd.h | 4 ----
 2 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 60523cea7136..af0cb4b9e784 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -297,7 +297,6 @@ struct kobj_type ktype_part = {
 	.sysfs_ops	= &part_sysfs_ops,
 };
 
-#ifdef CONFIG_SYSFS
 static inline void partition_sysfs_add_subdir(struct hd_struct *p)
 {
 	struct kobject *k;
@@ -316,10 +315,6 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 	disk->slave_dir = kobject_add_dir(k, "slaves");
 	kobject_put(k);
 }
-#else
-#define partition_sysfs_add_subdir(x)	do { } while (0)
-#define disk_sysfs_add_subdirs(x)	do { } while (0)
-#endif
 
 void delete_partition(struct gendisk *disk, int part)
 {
@@ -334,10 +329,8 @@ void delete_partition(struct gendisk *disk, int part)
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
 	devfs_remove("%s/part%d", disk->devfs_name, part);
-#ifdef CONFIG_SYSFS
 	if (p->holder_dir)
 		kobject_unregister(p->holder_dir);
-#endif
 	kobject_unregister(&p->kobj);
 }
 
@@ -513,12 +506,10 @@ void del_gendisk(struct gendisk *disk)
 
 	devfs_remove_disk(disk);
 
-#ifdef CONFIG_SYSFS
 	if (disk->holder_dir)
 		kobject_unregister(disk->holder_dir);
 	if (disk->slave_dir)
 		kobject_unregister(disk->slave_dir);
-#endif
 	if (disk->driverfs_dev) {
 		char *disk_name = make_block_name(disk);
 		sysfs_remove_link(&disk->kobj, "device");
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index eea61cc8fac1..bd7db861041e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -78,9 +78,7 @@ struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
 	struct kobject kobj;
-#ifdef CONFIG_SYSFS
 	struct kobject *holder_dir;
-#endif
 	unsigned ios[2], sectors[2];	/* READs and WRITEs */
 	int policy, partno;
 };
@@ -117,10 +115,8 @@ struct gendisk {
 	int number;			/* more of the same */
 	struct device *driverfs_dev;
 	struct kobject kobj;
-#ifdef CONFIG_SYSFS
 	struct kobject *holder_dir;
 	struct kobject *slave_dir;
-#endif
 
 	struct timer_rand_state *random;
 	int policy;
-- 
cgit v1.2.3


From 641dc636b0475582e48584340b774bd1e90d40d9 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Mon, 27 Mar 2006 01:17:57 -0800
Subject: [PATCH] dm/md dependency tree in sysfs: bd_claim_by_kobject

Adding bd_claim_by_kobject() function which takes kobject as additional
signature of holder device and creates sysfs symlinks between holder device
and claimed device.  bd_release_from_kobject() is a counterpart of
bd_claim_by_kobject.

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/block_dev.c     | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  10 ++
 2 files changed, 307 insertions(+)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5983d42df015..3f36df7e037c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -266,6 +266,9 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		mutex_init(&bdev->bd_mount_mutex);
 		INIT_LIST_HEAD(&bdev->bd_inodes);
 		INIT_LIST_HEAD(&bdev->bd_list);
+#ifdef CONFIG_SYSFS
+		INIT_LIST_HEAD(&bdev->bd_holder_list);
+#endif
 		inode_init_once(&ei->vfs_inode);
 	}
 }
@@ -490,6 +493,300 @@ void bd_release(struct block_device *bdev)
 
 EXPORT_SYMBOL(bd_release);
 
+#ifdef CONFIG_SYSFS
+/*
+ * Functions for bd_claim_by_kobject / bd_release_from_kobject
+ *
+ *     If a kobject is passed to bd_claim_by_kobject()
+ *     and the kobject has a parent directory,
+ *     following symlinks are created:
+ *        o from the kobject to the claimed bdev
+ *        o from "holders" directory of the bdev to the parent of the kobject
+ *     bd_release_from_kobject() removes these symlinks.
+ *
+ *     Example:
+ *        If /dev/dm-0 maps to /dev/sda, kobject corresponding to
+ *        /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
+ *           /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
+ */
+
+static struct kobject *bdev_get_kobj(struct block_device *bdev)
+{
+	if (bdev->bd_contains != bdev)
+		return kobject_get(&bdev->bd_part->kobj);
+	else
+		return kobject_get(&bdev->bd_disk->kobj);
+}
+
+static struct kobject *bdev_get_holder(struct block_device *bdev)
+{
+	if (bdev->bd_contains != bdev)
+		return kobject_get(bdev->bd_part->holder_dir);
+	else
+		return kobject_get(bdev->bd_disk->holder_dir);
+}
+
+static void add_symlink(struct kobject *from, struct kobject *to)
+{
+	if (!from || !to)
+		return;
+	sysfs_create_link(from, to, kobject_name(to));
+}
+
+static void del_symlink(struct kobject *from, struct kobject *to)
+{
+	if (!from || !to)
+		return;
+	sysfs_remove_link(from, kobject_name(to));
+}
+
+/*
+ * 'struct bd_holder' contains pointers to kobjects symlinked by
+ * bd_claim_by_kobject.
+ * It's connected to bd_holder_list which is protected by bdev->bd_sem.
+ */
+struct bd_holder {
+	struct list_head list;	/* chain of holders of the bdev */
+	int count;		/* references from the holder */
+	struct kobject *sdir;	/* holder object, e.g. "/block/dm-0/slaves" */
+	struct kobject *hdev;	/* e.g. "/block/dm-0" */
+	struct kobject *hdir;	/* e.g. "/block/sda/holders" */
+	struct kobject *sdev;	/* e.g. "/block/sda" */
+};
+
+/*
+ * Get references of related kobjects at once.
+ * Returns 1 on success. 0 on failure.
+ *
+ * Should call bd_holder_release_dirs() after successful use.
+ */
+static int bd_holder_grab_dirs(struct block_device *bdev,
+			struct bd_holder *bo)
+{
+	if (!bdev || !bo)
+		return 0;
+
+	bo->sdir = kobject_get(bo->sdir);
+	if (!bo->sdir)
+		return 0;
+
+	bo->hdev = kobject_get(bo->sdir->parent);
+	if (!bo->hdev)
+		goto fail_put_sdir;
+
+	bo->sdev = bdev_get_kobj(bdev);
+	if (!bo->sdev)
+		goto fail_put_hdev;
+
+	bo->hdir = bdev_get_holder(bdev);
+	if (!bo->hdir)
+		goto fail_put_sdev;
+
+	return 1;
+
+fail_put_sdev:
+	kobject_put(bo->sdev);
+fail_put_hdev:
+	kobject_put(bo->hdev);
+fail_put_sdir:
+	kobject_put(bo->sdir);
+
+	return 0;
+}
+
+/* Put references of related kobjects at once. */
+static void bd_holder_release_dirs(struct bd_holder *bo)
+{
+	kobject_put(bo->hdir);
+	kobject_put(bo->sdev);
+	kobject_put(bo->hdev);
+	kobject_put(bo->sdir);
+}
+
+static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
+{
+	struct bd_holder *bo;
+
+	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
+	if (!bo)
+		return NULL;
+
+	bo->count = 1;
+	bo->sdir = kobj;
+
+	return bo;
+}
+
+static void free_bd_holder(struct bd_holder *bo)
+{
+	kfree(bo);
+}
+
+/**
+ * add_bd_holder - create sysfs symlinks for bd_claim() relationship
+ *
+ * @bdev:	block device to be bd_claimed
+ * @bo:		preallocated and initialized by alloc_bd_holder()
+ *
+ * If there is no matching entry with @bo in @bdev->bd_holder_list,
+ * add @bo to the list, create symlinks.
+ *
+ * Returns 1 if @bo was added to the list.
+ * Returns 0 if @bo wasn't used by any reason and should be freed.
+ */
+static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
+{
+	struct bd_holder *tmp;
+
+	if (!bo)
+		return 0;
+
+	list_for_each_entry(tmp, &bdev->bd_holder_list, list) {
+		if (tmp->sdir == bo->sdir) {
+			tmp->count++;
+			return 0;
+		}
+	}
+
+	if (!bd_holder_grab_dirs(bdev, bo))
+		return 0;
+
+	add_symlink(bo->sdir, bo->sdev);
+	add_symlink(bo->hdir, bo->hdev);
+	list_add_tail(&bo->list, &bdev->bd_holder_list);
+	return 1;
+}
+
+/**
+ * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
+ *
+ * @bdev:	block device to be bd_claimed
+ * @kobj:	holder's kobject
+ *
+ * If there is matching entry with @kobj in @bdev->bd_holder_list
+ * and no other bd_claim() from the same kobject,
+ * remove the struct bd_holder from the list, delete symlinks for it.
+ *
+ * Returns a pointer to the struct bd_holder when it's removed from the list
+ * and ready to be freed.
+ * Returns NULL if matching claim isn't found or there is other bd_claim()
+ * by the same kobject.
+ */
+static struct bd_holder *del_bd_holder(struct block_device *bdev,
+					struct kobject *kobj)
+{
+	struct bd_holder *bo;
+
+	list_for_each_entry(bo, &bdev->bd_holder_list, list) {
+		if (bo->sdir == kobj) {
+			bo->count--;
+			BUG_ON(bo->count < 0);
+			if (!bo->count) {
+				list_del(&bo->list);
+				del_symlink(bo->sdir, bo->sdev);
+				del_symlink(bo->hdir, bo->hdev);
+				bd_holder_release_dirs(bo);
+				return bo;
+			}
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * bd_claim_by_kobject - bd_claim() with additional kobject signature
+ *
+ * @bdev:	block device to be claimed
+ * @holder:	holder's signature
+ * @kobj:	holder's kobject
+ *
+ * Do bd_claim() and if it succeeds, create sysfs symlinks between
+ * the bdev and the holder's kobject.
+ * Use bd_release_from_kobject() when relesing the claimed bdev.
+ *
+ * Returns 0 on success. (same as bd_claim())
+ * Returns errno on failure.
+ */
+static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
+				struct kobject *kobj)
+{
+	int res;
+	struct bd_holder *bo;
+
+	if (!kobj)
+		return -EINVAL;
+
+	bo = alloc_bd_holder(kobj);
+	if (!bo)
+		return -ENOMEM;
+
+	down(&bdev->bd_sem);
+	res = bd_claim(bdev, holder);
+	if (res || !add_bd_holder(bdev, bo))
+		free_bd_holder(bo);
+	up(&bdev->bd_sem);
+
+	return res;
+}
+
+/**
+ * bd_release_from_kobject - bd_release() with additional kobject signature
+ *
+ * @bdev:	block device to be released
+ * @kobj:	holder's kobject
+ *
+ * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
+ */
+static void bd_release_from_kobject(struct block_device *bdev,
+					struct kobject *kobj)
+{
+	struct bd_holder *bo;
+
+	if (!kobj)
+		return;
+
+	down(&bdev->bd_sem);
+	bd_release(bdev);
+	if ((bo = del_bd_holder(bdev, kobj)))
+		free_bd_holder(bo);
+	up(&bdev->bd_sem);
+}
+
+/**
+ * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
+ *
+ * @bdev:	block device to be claimed
+ * @holder:	holder's signature
+ * @disk:	holder's gendisk
+ *
+ * Call bd_claim_by_kobject() with getting @disk->slave_dir.
+ */
+int bd_claim_by_disk(struct block_device *bdev, void *holder,
+			struct gendisk *disk)
+{
+	return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
+}
+EXPORT_SYMBOL_GPL(bd_claim_by_disk);
+
+/**
+ * bd_release_from_disk - wrapper function for bd_release_from_kobject()
+ *
+ * @bdev:	block device to be claimed
+ * @disk:	holder's gendisk
+ *
+ * Call bd_release_from_kobject() and put @disk->slave_dir.
+ */
+void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
+{
+	bd_release_from_kobject(bdev, disk->slave_dir);
+	kobject_put(disk->slave_dir);
+}
+EXPORT_SYMBOL_GPL(bd_release_from_disk);
+#endif
+
 /*
  * Tries to open block device by device number.  Use it ONLY if you
  * really do not have anything better - i.e. when you are behind a
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9d9674946956..680d913350e7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -410,6 +410,9 @@ struct block_device {
 	struct list_head	bd_inodes;
 	void *			bd_holder;
 	int			bd_holders;
+#ifdef CONFIG_SYSFS
+	struct list_head	bd_holder_list;
+#endif
 	struct block_device *	bd_contains;
 	unsigned		bd_block_size;
 	struct hd_struct *	bd_part;
@@ -1399,6 +1402,13 @@ extern int blkdev_get(struct block_device *, mode_t, unsigned);
 extern int blkdev_put(struct block_device *);
 extern int bd_claim(struct block_device *, void *);
 extern void bd_release(struct block_device *);
+#ifdef CONFIG_SYSFS
+extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *);
+extern void bd_release_from_disk(struct block_device *, struct gendisk *);
+#else
+#define bd_claim_by_disk(bdev, holder, disk)	bd_claim(bdev, holder)
+#define bd_release_from_disk(bdev, disk)	bd_release(bdev)
+#endif
 
 /* fs/char_dev.c */
 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
-- 
cgit v1.2.3


From b55e6bfcd23cb2f7249095050c649f7aea813f9f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:06 -0800
Subject: [PATCH] md: Split disks array out of raid5 conf structure so it is
 easier to grow

The remainder of this batch implements raid5 reshaping.  Currently the only
shape change that is supported is added a device, but it is envisioned that
changing the chunksize and layout will also be supported, as well as changing
the level (e.g.  1->5, 5->6).

The reshape process naturally has to move all of the data in the array, and so
should be used with caution.  It is believed to work, and some testing does
support this, but wider testing would be great for increasing my confidence.

You will need a version of mdadm newer than 2.3.1 to make use of raid5 growth.
 This is because mdadm need to take a copy of a 'critical section' at the
start of the array incase there is a crash at an awkward moment.  On restart,
mdadm will restore the critical section and allow reshape to continue.

I hope to release a 2.4-pre by early next week - it still needs a little more
polishing.

This patch:

Previously the array of disk information was included in the raid5 'conf'
structure which was allocated to an appropriate size.  This makes it awkward
to change the size of that array.  So we split it off into a separate
kmalloced array which will require a little extra indexing, but is much easier
to grow.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c         | 10 +++++++---
 drivers/md/raid6main.c     | 10 +++++++---
 include/linux/raid/raid5.h |  2 +-
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2dba305daf3c..03f31379cebb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1822,11 +1822,13 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
-	mddev->private = kzalloc(sizeof (raid5_conf_t)
-				 + mddev->raid_disks * sizeof(struct disk_info),
-				 GFP_KERNEL);
+	mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
+	conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
+			      GFP_KERNEL);
+	if (!conf->disks)
+		goto abort;
 
 	conf->mddev = mddev;
 
@@ -1966,6 +1968,7 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid5_conf(conf);
+		kfree(conf->disks);
 		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
@@ -1986,6 +1989,7 @@ static int stop(mddev_t *mddev)
 	kfree(conf->stripe_hashtbl);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
+	kfree(conf->disks);
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index cd477ebf2ee4..c7632f6cc487 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -2006,11 +2006,14 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
-	mddev->private = kzalloc(sizeof (raid6_conf_t)
-				 + mddev->raid_disks * sizeof(struct disk_info),
-				 GFP_KERNEL);
+	mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
+	conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
+				 GFP_KERNEL);
+	if (!conf->disks)
+		goto abort;
+
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -2158,6 +2161,7 @@ abort:
 		print_raid6_conf(conf);
 		safe_put_page(conf->spare_page);
 		kfree(conf->stripe_hashtbl);
+		kfree(conf->disks);
 		kfree(conf);
 	}
 	mddev->private = NULL;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 394da8207b34..94dbdd406f12 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -240,7 +240,7 @@ struct raid5_private_data {
 							 * waiting for 25% to be free
 							 */        
 	spinlock_t		device_lock;
-	struct disk_info	disks[0];
+	struct disk_info	*disks;
 };
 
 typedef struct raid5_private_data raid5_conf_t;
-- 
cgit v1.2.3


From ad01c9e3752f4ba4f3d99c89b7370fa4983a25b5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:07 -0800
Subject: [PATCH] md: Allow stripes to be expanded in preparation for expanding
 an array

Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache
data structure.

This requires allocating new stripes in a new kmem_cache.  If this succeeds,
we copy cache pages over and release the old stripes and kmem_cache.

We then allocate new pages.  If that fails, we leave the stripe cache at it's
new size.  It isn't worth the effort to shrink it back again.

Unfortuanately this means we need two kmem_cache names as we, for a short
period of time, we have two kmem_caches.  So they are raid5/%s and
raid5/%s-alt

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c            |   2 +-
 drivers/md/raid5.c         | 131 +++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid6main.c     |   4 +-
 include/linux/raid/raid5.h |   9 +++-
 4 files changed, 137 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a3ecaf8ed30a..c7b7656f9aa5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2775,7 +2775,6 @@ static void autorun_array(mddev_t *mddev)
  */
 static void autorun_devices(int part)
 {
-	struct list_head candidates;
 	struct list_head *tmp;
 	mdk_rdev_t *rdev0, *rdev;
 	mddev_t *mddev;
@@ -2784,6 +2783,7 @@ static void autorun_devices(int part)
 	printk(KERN_INFO "md: autorun ...\n");
 	while (!list_empty(&pending_raid_disks)) {
 		dev_t dev;
+		LIST_HEAD(candidates);
 		rdev0 = list_entry(pending_raid_disks.next,
 					 mdk_rdev_t, same_set);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 03f31379cebb..6c20b44509d8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -313,20 +313,143 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 	kmem_cache_t *sc;
 	int devs = conf->raid_disks;
 
-	sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev));
-
-	sc = kmem_cache_create(conf->cache_name, 
+	sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
+	sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
+	conf->active_name = 0;
+	sc = kmem_cache_create(conf->cache_name[conf->active_name],
 			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
 			       0, 0, NULL, NULL);
 	if (!sc)
 		return 1;
 	conf->slab_cache = sc;
+	conf->pool_size = devs;
 	while (num--) {
 		if (!grow_one_stripe(conf))
 			return 1;
 	}
 	return 0;
 }
+static int resize_stripes(raid5_conf_t *conf, int newsize)
+{
+	/* Make all the stripes able to hold 'newsize' devices.
+	 * New slots in each stripe get 'page' set to a new page.
+	 *
+	 * This happens in stages:
+	 * 1/ create a new kmem_cache and allocate the required number of
+	 *    stripe_heads.
+	 * 2/ gather all the old stripe_heads and tranfer the pages across
+	 *    to the new stripe_heads.  This will have the side effect of
+	 *    freezing the array as once all stripe_heads have been collected,
+	 *    no IO will be possible.  Old stripe heads are freed once their
+	 *    pages have been transferred over, and the old kmem_cache is
+	 *    freed when all stripes are done.
+	 * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
+	 *    we simple return a failre status - no need to clean anything up.
+	 * 4/ allocate new pages for the new slots in the new stripe_heads.
+	 *    If this fails, we don't bother trying the shrink the
+	 *    stripe_heads down again, we just leave them as they are.
+	 *    As each stripe_head is processed the new one is released into
+	 *    active service.
+	 *
+	 * Once step2 is started, we cannot afford to wait for a write,
+	 * so we use GFP_NOIO allocations.
+	 */
+	struct stripe_head *osh, *nsh;
+	LIST_HEAD(newstripes);
+	struct disk_info *ndisks;
+	int err = 0;
+	kmem_cache_t *sc;
+	int i;
+
+	if (newsize <= conf->pool_size)
+		return 0; /* never bother to shrink */
+
+	/* Step 1 */
+	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
+			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
+			       0, 0, NULL, NULL);
+	if (!sc)
+		return -ENOMEM;
+
+	for (i = conf->max_nr_stripes; i; i--) {
+		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
+		if (!nsh)
+			break;
+
+		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
+
+		nsh->raid_conf = conf;
+		spin_lock_init(&nsh->lock);
+
+		list_add(&nsh->lru, &newstripes);
+	}
+	if (i) {
+		/* didn't get enough, give up */
+		while (!list_empty(&newstripes)) {
+			nsh = list_entry(newstripes.next, struct stripe_head, lru);
+			list_del(&nsh->lru);
+			kmem_cache_free(sc, nsh);
+		}
+		kmem_cache_destroy(sc);
+		return -ENOMEM;
+	}
+	/* Step 2 - Must use GFP_NOIO now.
+	 * OK, we have enough stripes, start collecting inactive
+	 * stripes and copying them over
+	 */
+	list_for_each_entry(nsh, &newstripes, lru) {
+		spin_lock_irq(&conf->device_lock);
+		wait_event_lock_irq(conf->wait_for_stripe,
+				    !list_empty(&conf->inactive_list),
+				    conf->device_lock,
+				    unplug_slaves(conf->mddev);
+			);
+		osh = get_free_stripe(conf);
+		spin_unlock_irq(&conf->device_lock);
+		atomic_set(&nsh->count, 1);
+		for(i=0; i<conf->pool_size; i++)
+			nsh->dev[i].page = osh->dev[i].page;
+		for( ; i<newsize; i++)
+			nsh->dev[i].page = NULL;
+		kmem_cache_free(conf->slab_cache, osh);
+	}
+	kmem_cache_destroy(conf->slab_cache);
+
+	/* Step 3.
+	 * At this point, we are holding all the stripes so the array
+	 * is completely stalled, so now is a good time to resize
+	 * conf->disks.
+	 */
+	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
+	if (ndisks) {
+		for (i=0; i<conf->raid_disks; i++)
+			ndisks[i] = conf->disks[i];
+		kfree(conf->disks);
+		conf->disks = ndisks;
+	} else
+		err = -ENOMEM;
+
+	/* Step 4, return new stripes to service */
+	while(!list_empty(&newstripes)) {
+		nsh = list_entry(newstripes.next, struct stripe_head, lru);
+		list_del_init(&nsh->lru);
+		for (i=conf->raid_disks; i < newsize; i++)
+			if (nsh->dev[i].page == NULL) {
+				struct page *p = alloc_page(GFP_NOIO);
+				nsh->dev[i].page = p;
+				if (!p)
+					err = -ENOMEM;
+			}
+		release_stripe(nsh);
+	}
+	/* critical section pass, GFP_NOIO no longer needed */
+
+	conf->slab_cache = sc;
+	conf->active_name = 1-conf->active_name;
+	conf->pool_size = newsize;
+	return err;
+}
+
 
 static int drop_one_stripe(raid5_conf_t *conf)
 {
@@ -339,7 +462,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
 		return 0;
 	if (atomic_read(&sh->count))
 		BUG();
-	shrink_buffers(sh, conf->raid_disks);
+	shrink_buffers(sh, conf->pool_size);
 	kmem_cache_free(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	return 1;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index c7632f6cc487..6df4930fddec 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num)
 	kmem_cache_t *sc;
 	int devs = conf->raid_disks;
 
-	sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev));
+	sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
 
-	sc = kmem_cache_create(conf->cache_name,
+	sc = kmem_cache_create(conf->cache_name[0],
 			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
 			       0, 0, NULL, NULL);
 	if (!sc)
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 94dbdd406f12..b7b2653af7bb 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -216,7 +216,11 @@ struct raid5_private_data {
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 
-	char			cache_name[20];
+	/* unfortunately we need two cache names as we temporarily have
+	 * two caches.
+	 */
+	int			active_name;
+	char			cache_name[2][20];
 	kmem_cache_t		*slab_cache; /* for allocating stripes */
 
 	int			seq_flush, seq_write;
@@ -238,7 +242,8 @@ struct raid5_private_data {
 	wait_queue_head_t	wait_for_overlap;
 	int			inactive_blocked;	/* release of inactive stripes blocked,
 							 * waiting for 25% to be free
-							 */        
+							 */
+	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
 };
-- 
cgit v1.2.3


From 7ecaa1e6a1ad69862e9980b6c777e11f26c4782d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:08 -0800
Subject: [PATCH] md: Infrastructure to allow normal IO to continue while array
 is expanding

We need to allow that different stripes are of different effective sizes, and
use the appropriate size.  Also, when a stripe is being expanded, we must
block any IO attempts until the stripe is stable again.

Key elements in this change are:
 - each stripe_head gets a 'disk' field which is part of the key,
   thus there can sometimes be two stripe heads of the same area of
   the array, but covering different numbers of devices.  One of these
   will be marked STRIPE_EXPANDING and so won't accept new requests.
 - conf->expand_progress tracks how the expansion is progressing and
   is used to determine whether the target part of the array has been
   expanded yet or not.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c         | 88 ++++++++++++++++++++++++++++++----------------
 include/linux/raid/raid5.h |  6 ++++
 2 files changed, 64 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6c20b44509d8..7a6df515b008 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -178,10 +178,10 @@ static int grow_buffers(struct stripe_head *sh, int num)
 
 static void raid5_build_block (struct stripe_head *sh, int i);
 
-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks, i;
+	int i;
 
 	if (atomic_read(&sh->count) != 0)
 		BUG();
@@ -198,7 +198,9 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
 	sh->pd_idx = pd_idx;
 	sh->state = 0;
 
-	for (i=disks; i--; ) {
+	sh->disks = disks;
+
+	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->toread || dev->towrite || dev->written ||
@@ -215,7 +217,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
 	insert_hash(conf, sh);
 }
 
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
 {
 	struct stripe_head *sh;
 	struct hlist_node *hn;
@@ -223,7 +225,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 	CHECK_DEVLOCK();
 	PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
 	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
-		if (sh->sector == sector)
+		if (sh->sector == sector && sh->disks == disks)
 			return sh;
 	PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
 	return NULL;
@@ -232,8 +234,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(request_queue_t *q);
 
-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
-					     int pd_idx, int noblock) 
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
+					     int pd_idx, int noblock)
 {
 	struct stripe_head *sh;
 
@@ -245,7 +247,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 		wait_event_lock_irq(conf->wait_for_stripe,
 				    conf->quiesce == 0,
 				    conf->device_lock, /* nothing */);
-		sh = __find_stripe(conf, sector);
+		sh = __find_stripe(conf, sector, disks);
 		if (!sh) {
 			if (!conf->inactive_blocked)
 				sh = get_free_stripe(conf);
@@ -263,7 +265,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 					);
 				conf->inactive_blocked = 0;
 			} else
-				init_stripe(sh, sector, pd_idx);
+				init_stripe(sh, sector, pd_idx, disks);
 		} else {
 			if (atomic_read(&sh->count)) {
 				if (!list_empty(&sh->lru))
@@ -300,6 +302,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
+	sh->disks = conf->raid_disks;
 	/* we just created an active stripe so... */
 	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
@@ -483,7 +486,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
 {
  	struct stripe_head *sh = bi->bi_private;
 	raid5_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks, i;
+	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 
 	if (bi->bi_size)
@@ -581,7 +584,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
 {
  	struct stripe_head *sh = bi->bi_private;
 	raid5_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks, i;
+	int disks = sh->disks, i;
 	unsigned long flags;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 
@@ -735,7 +738,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 static sector_t compute_blocknr(struct stripe_head *sh, int i)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
+	int raid_disks = sh->disks, data_disks = raid_disks - 1;
 	sector_t new_sector = sh->sector, check;
 	int sectors_per_chunk = conf->chunk_size >> 9;
 	sector_t stripe;
@@ -836,8 +839,7 @@ static void copy_data(int frombio, struct bio *bio,
 
 static void compute_block(struct stripe_head *sh, int dd_idx)
 {
-	raid5_conf_t *conf = sh->raid_conf;
-	int i, count, disks = conf->raid_disks;
+	int i, count, disks = sh->disks;
 	void *ptr[MAX_XOR_BLOCKS], *p;
 
 	PRINTK("compute_block, stripe %llu, idx %d\n", 
@@ -867,7 +869,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
 static void compute_parity(struct stripe_head *sh, int method)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
+	int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
 	void *ptr[MAX_XOR_BLOCKS];
 	struct bio *chosen;
 
@@ -1055,7 +1057,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 static void handle_stripe(struct stripe_head *sh)
 {
 	raid5_conf_t *conf = sh->raid_conf;
-	int disks = conf->raid_disks;
+	int disks = sh->disks;
 	struct bio *return_bi= NULL;
 	struct bio *bi;
 	int i;
@@ -1649,12 +1651,10 @@ static inline void raid5_plug_device(raid5_conf_t *conf)
 	spin_unlock_irq(&conf->device_lock);
 }
 
-static int make_request (request_queue_t *q, struct bio * bi)
+static int make_request(request_queue_t *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	const unsigned int raid_disks = conf->raid_disks;
-	const unsigned int data_disks = raid_disks - 1;
 	unsigned int dd_idx, pd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
@@ -1678,20 +1678,48 @@ static int make_request (request_queue_t *q, struct bio * bi)
 
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
+		int disks;
 		
-		new_sector = raid5_compute_sector(logical_sector,
-						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);
-
+	retry:
+		if (likely(conf->expand_progress == MaxSector))
+			disks = conf->raid_disks;
+		else {
+			spin_lock_irq(&conf->device_lock);
+			disks = conf->raid_disks;
+			if (logical_sector >= conf->expand_progress)
+				disks = conf->previous_raid_disks;
+			spin_unlock_irq(&conf->device_lock);
+		}
+ 		new_sector = raid5_compute_sector(logical_sector, disks, disks - 1,
+						  &dd_idx, &pd_idx, conf);
 		PRINTK("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector, 
 			(unsigned long long)logical_sector);
 
-	retry:
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
+		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
 		if (sh) {
-			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-				/* Add failed due to overlap.  Flush everything
+			if (unlikely(conf->expand_progress != MaxSector)) {
+				/* expansion might have moved on while waiting for a
+				 * stripe, so we much do the range check again.
+				 */
+				int must_retry = 0;
+				spin_lock_irq(&conf->device_lock);
+				if (logical_sector <  conf->expand_progress &&
+				    disks == conf->previous_raid_disks)
+					/* mismatch, need to try again */
+					must_retry = 1;
+				spin_unlock_irq(&conf->device_lock);
+				if (must_retry) {
+					release_stripe(sh);
+					goto retry;
+				}
+			}
+
+			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+				/* Stripe is busy expanding or
+				 * add failed due to overlap.  Flush everything
 				 * and wait a while
 				 */
 				raid5_unplug_device(mddev->queue);
@@ -1703,7 +1731,6 @@ static int make_request (request_queue_t *q, struct bio * bi)
 			raid5_plug_device(conf);
 			handle_stripe(sh);
 			release_stripe(sh);
-
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1779,9 +1806,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 	first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
 		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
-	sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
+	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
 	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
+		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access 
 		 */
@@ -1998,6 +2025,7 @@ static int run(mddev_t *mddev)
 	conf->level = mddev->level;
 	conf->algorithm = mddev->layout;
 	conf->max_nr_stripes = NR_STRIPES;
+	conf->expand_progress = MaxSector;
 
 	/* device size must be a multiple of chunk size */
 	mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -2128,7 +2156,7 @@ static void print_sh (struct stripe_head *sh)
 	printk("sh %llu,  count %d.\n",
 		(unsigned long long)sh->sector, atomic_read(&sh->count));
 	printk("sh %llu, ", (unsigned long long)sh->sector);
-	for (i = 0; i < sh->raid_conf->raid_disks; i++) {
+	for (i = 0; i < sh->disks; i++) {
 		printk("(cache%d: %p %ld) ", 
 			i, sh->dev[i].page, sh->dev[i].flags);
 	}
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index b7b2653af7bb..6fa274aea2a0 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -135,6 +135,7 @@ struct stripe_head {
 	atomic_t		count;			/* nr of active thread/requests */
 	spinlock_t		lock;
 	int			bm_seq;	/* sequence number for bitmap flushes */
+	int			disks;			/* disks in stripe */
 	struct r5dev {
 		struct bio	req;
 		struct bio_vec	vec;
@@ -174,6 +175,7 @@ struct stripe_head {
 #define	STRIPE_DELAYED		6
 #define	STRIPE_DEGRADED		7
 #define	STRIPE_BIT_DELAY	8
+#define	STRIPE_EXPANDING	9
 
 /*
  * Plugging:
@@ -211,6 +213,10 @@ struct raid5_private_data {
 	int			raid_disks, working_disks, failed_disks;
 	int			max_nr_stripes;
 
+	/* used during an expand */
+	sector_t		expand_progress;	/* MaxSector when no expand happening */
+	int			previous_raid_disks;
+
 	struct list_head	handle_list; /* stripes needing handling */
 	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
-- 
cgit v1.2.3


From ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:09 -0800
Subject: [PATCH] md: Core of raid5 resize process

This patch provides the core of the resize/expand process.

sync_request notices if a 'reshape' is happening and acts accordingly.

It allocated new stripe_heads for the next chunk-wide-stripe in the target
geometry, marking them STRIPE_EXPANDING.

Then it finds which stripe heads in the old geometry can provide data needed
by these and marks them STRIPE_EXPAND_SOURCE.  This causes stripe_handle to
read all blocks on those stripes.

Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any that are
needed are copied into the corresponding STRIPE_EXPANDING stripe_head.  Once a
STRIPE_EXPANDING stripe_head is full, it is marks STRIPE_EXPAND_READY and then
is written out and released.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c            |  14 +++-
 drivers/md/raid5.c         | 185 +++++++++++++++++++++++++++++++++++++++------
 include/linux/raid/md_k.h  |   4 +
 include/linux/raid/raid5.h |   4 +-
 4 files changed, 181 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c7b7656f9aa5..8e65986bc63f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2165,7 +2165,9 @@ action_show(mddev_t *mddev, char *page)
 	char *type = "idle";
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 	    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
-		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+			type = "reshape";
+		else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 				type = "resync";
 			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
@@ -4088,8 +4090,10 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 		seq_printf(seq, "] ");
 	}
 	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
+		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
+		    "reshape" :
 		      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
-		       "resync" : "recovery"),
+		       "resync" : "recovery")),
 		      per_milli/10, per_milli % 10,
 		   (unsigned long long) resync,
 		   (unsigned long long) max_blocks);
@@ -4543,7 +4547,9 @@ static void md_do_sync(mddev_t *mddev)
 		 */
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
-	} else
+	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		max_sectors = mddev->size << 1;
+	else
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
 
@@ -4679,6 +4685,8 @@ static void md_do_sync(mddev_t *mddev)
 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
 	    mddev->curr_resync > 2 &&
 	    mddev->curr_resync >= mddev->recovery_cp) {
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7a6df515b008..56cba8d3e398 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -93,11 +93,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
 					md_wakeup_thread(conf->mddev->thread);
 			}
-			list_add_tail(&sh->lru, &conf->inactive_list);
 			atomic_dec(&conf->active_stripes);
-			if (!conf->inactive_blocked ||
-			    atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4))
+			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+				list_add_tail(&sh->lru, &conf->inactive_list);
 				wake_up(&conf->wait_for_stripe);
+			}
 		}
 	}
 }
@@ -273,9 +273,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 			} else {
 				if (!test_bit(STRIPE_HANDLE, &sh->state))
 					atomic_inc(&conf->active_stripes);
-				if (list_empty(&sh->lru))
-					BUG();
-				list_del_init(&sh->lru);
+				if (!list_empty(&sh->lru))
+					list_del_init(&sh->lru);
 			}
 		}
 	} while (sh == NULL);
@@ -1035,6 +1034,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	return 0;
 }
 
+static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+{
+	int sectors_per_chunk = conf->chunk_size >> 9;
+	sector_t x = stripe;
+	int pd_idx, dd_idx;
+	int chunk_offset = sector_div(x, sectors_per_chunk);
+	stripe = x;
+	raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
+			     + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
+	return pd_idx;
+}
+
 
 /*
  * handle_stripe - do things to a stripe.
@@ -1061,7 +1072,7 @@ static void handle_stripe(struct stripe_head *sh)
 	struct bio *return_bi= NULL;
 	struct bio *bi;
 	int i;
-	int syncing;
+	int syncing, expanding, expanded;
 	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
 	int non_overwrite = 0;
 	int failed_num=0;
@@ -1076,6 +1087,8 @@ static void handle_stripe(struct stripe_head *sh)
 	clear_bit(STRIPE_DELAYED, &sh->state);
 
 	syncing = test_bit(STRIPE_SYNCING, &sh->state);
+	expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
 	/* Now to look around and see what can be done */
 
 	rcu_read_lock();
@@ -1268,13 +1281,14 @@ static void handle_stripe(struct stripe_head *sh)
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
 	 */
-	if (to_read || non_overwrite || (syncing && (uptodate < disks))) {
+	if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
 		for (i=disks; i--;) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
 			    (dev->toread ||
 			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
 			     syncing ||
+			     expanding ||
 			     (failed && (sh->dev[failed_num].toread ||
 					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
 				    )
@@ -1464,13 +1478,76 @@ static void handle_stripe(struct stripe_head *sh)
 			set_bit(R5_Wantwrite, &dev->flags);
 			set_bit(R5_ReWrite, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
+			locked++;
 		} else {
 			/* let's read it back */
 			set_bit(R5_Wantread, &dev->flags);
 			set_bit(R5_LOCKED, &dev->flags);
+			locked++;
 		}
 	}
 
+	if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+		/* Need to write out all blocks after computing parity */
+		sh->disks = conf->raid_disks;
+		sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
+		compute_parity(sh, RECONSTRUCT_WRITE);
+		for (i= conf->raid_disks; i--;) {
+			set_bit(R5_LOCKED, &sh->dev[i].flags);
+			locked++;
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+		}
+		clear_bit(STRIPE_EXPANDING, &sh->state);
+	} else if (expanded) {
+		clear_bit(STRIPE_EXPAND_READY, &sh->state);
+		wake_up(&conf->wait_for_overlap);
+		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
+	}
+
+	if (expanding && locked == 0) {
+		/* We have read all the blocks in this stripe and now we need to
+		 * copy some of them into a target stripe for expand.
+		 */
+		clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+		for (i=0; i< sh->disks; i++)
+			if (i != sh->pd_idx) {
+				int dd_idx, pd_idx, j;
+				struct stripe_head *sh2;
+
+				sector_t bn = compute_blocknr(sh, i);
+				sector_t s = raid5_compute_sector(bn, conf->raid_disks,
+								  conf->raid_disks-1,
+								  &dd_idx, &pd_idx, conf);
+				sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
+				if (sh2 == NULL)
+					/* so far only the early blocks of this stripe
+					 * have been requested.  When later blocks
+					 * get requested, we will try again
+					 */
+					continue;
+				if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+				   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
+					/* must have already done this block */
+					release_stripe(sh2);
+					continue;
+				}
+				memcpy(page_address(sh2->dev[dd_idx].page),
+				       page_address(sh->dev[i].page),
+				       STRIPE_SIZE);
+				set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
+				set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+				for (j=0; j<conf->raid_disks; j++)
+					if (j != sh2->pd_idx &&
+					    !test_bit(R5_Expanded, &sh2->dev[j].flags))
+						break;
+				if (j == conf->raid_disks) {
+					set_bit(STRIPE_EXPAND_READY, &sh2->state);
+					set_bit(STRIPE_HANDLE, &sh2->state);
+				}
+				release_stripe(sh2);
+			}
+	}
+
 	spin_unlock(&sh->lock);
 
 	while ((bi=return_bi)) {
@@ -1509,7 +1586,7 @@ static void handle_stripe(struct stripe_head *sh)
 		rcu_read_unlock();
  
 		if (rdev) {
-			if (syncing)
+			if (syncing || expanding || expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 
 			bi->bi_bdev = rdev->bdev;
@@ -1757,12 +1834,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int sectors_per_chunk = conf->chunk_size >> 9;
-	sector_t x;
-	unsigned long stripe;
-	int chunk_offset;
-	int dd_idx, pd_idx;
-	sector_t first_sector;
+	int pd_idx;
+	sector_t first_sector, last_sector;
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
 	sector_t max_sector = mddev->size << 1;
@@ -1781,6 +1854,80 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 		return 0;
 	}
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+		/* reshaping is quite different to recovery/resync so it is
+		 * handled quite separately ... here.
+		 *
+		 * On each call to sync_request, we gather one chunk worth of
+		 * destination stripes and flag them as expanding.
+		 * Then we find all the source stripes and request reads.
+		 * As the reads complete, handle_stripe will copy the data
+		 * into the destination stripe and release that stripe.
+		 */
+		int i;
+		int dd_idx;
+		for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+			int j;
+			int skipped = 0;
+			pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
+			sh = get_active_stripe(conf, sector_nr+i,
+					       conf->raid_disks, pd_idx, 0);
+			set_bit(STRIPE_EXPANDING, &sh->state);
+			/* If any of this stripe is beyond the end of the old
+			 * array, then we need to zero those blocks
+			 */
+			for (j=sh->disks; j--;) {
+				sector_t s;
+				if (j == sh->pd_idx)
+					continue;
+				s = compute_blocknr(sh, j);
+				if (s < (mddev->array_size<<1)) {
+					skipped = 1;
+					continue;
+				}
+				memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+				set_bit(R5_Expanded, &sh->dev[j].flags);
+				set_bit(R5_UPTODATE, &sh->dev[j].flags);
+			}
+			if (!skipped) {
+				set_bit(STRIPE_EXPAND_READY, &sh->state);
+				set_bit(STRIPE_HANDLE, &sh->state);
+			}
+			release_stripe(sh);
+		}
+		spin_lock_irq(&conf->device_lock);
+		conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+		spin_unlock_irq(&conf->device_lock);
+		/* Ok, those stripe are ready. We can start scheduling
+		 * reads on the source stripes.
+		 * The source stripes are determined by mapping the first and last
+		 * block on the destination stripes.
+		 */
+		raid_disks = conf->previous_raid_disks;
+		data_disks = raid_disks - 1;
+		first_sector =
+			raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+					     raid_disks, data_disks,
+					     &dd_idx, &pd_idx, conf);
+		last_sector =
+			raid5_compute_sector((sector_nr+conf->chunk_size/512)
+					       *(conf->raid_disks-1) -1,
+					     raid_disks, data_disks,
+					     &dd_idx, &pd_idx, conf);
+		if (last_sector >= (mddev->size<<1))
+			last_sector = (mddev->size<<1)-1;
+		while (first_sector <= last_sector) {
+			pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+			sh = get_active_stripe(conf, first_sector,
+					       conf->previous_raid_disks, pd_idx, 0);
+			set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			release_stripe(sh);
+			first_sector += STRIPE_SECTORS;
+		}
+		return conf->chunk_size>>9;
+	}
 	/* if there is 1 or more failed drives and we are trying
 	 * to resync, then assert that we are finished, because there is
 	 * nothing we can do.
@@ -1799,13 +1946,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
 	}
 
-	x = sector_nr;
-	chunk_offset = sector_div(x, sectors_per_chunk);
-	stripe = x;
-	BUG_ON(x != stripe);
-
-	first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
-		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
 	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
 	if (sh == NULL) {
 		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 617b9506c760..4e26ef2cacca 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -157,6 +157,9 @@ struct mddev_s
 	 * DONE:     thread is done and is waiting to be reaped
 	 * REQUEST:  user-space has requested a sync (used with SYNC)
 	 * CHECK:    user-space request for for check-only, no repair
+	 * RESHAPE:  A reshape is happening
+	 *
+	 * If neither SYNC or RESHAPE are set, then it is a recovery.
 	 */
 #define	MD_RECOVERY_RUNNING	0
 #define	MD_RECOVERY_SYNC	1
@@ -166,6 +169,7 @@ struct mddev_s
 #define	MD_RECOVERY_NEEDED	5
 #define	MD_RECOVERY_REQUESTED	6
 #define	MD_RECOVERY_CHECK	7
+#define MD_RECOVERY_RESHAPE	8
 	unsigned long			recovery;
 
 	int				in_sync;	/* know to not need resync */
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 6fa274aea2a0..55c738d50508 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -157,6 +157,7 @@ struct stripe_head {
 #define	R5_ReadError	8	/* seen a read error here recently */
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
 
+#define	R5_Expanded	10	/* This block now has post-expand data */
 /*
  * Write method
  */
@@ -176,7 +177,8 @@ struct stripe_head {
 #define	STRIPE_DEGRADED		7
 #define	STRIPE_BIT_DELAY	8
 #define	STRIPE_EXPANDING	9
-
+#define	STRIPE_EXPAND_SOURCE	10
+#define	STRIPE_EXPAND_READY	11
 /*
  * Plugging:
  *
-- 
cgit v1.2.3


From 292695531ae4019bb15deedc121b218d1908b648 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:10 -0800
Subject: [PATCH] md: Final stages of raid5 expand code

This patch adds raid5_reshape and end_reshape which will start and finish the
reshape processes.

raid5_reshape is only enabled in CONFIG_MD_RAID5_RESHAPE is set, to discourage
accidental use.

Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry.

and Make sure that you have backups, just in case.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/Kconfig      |  26 ++++++++++
 drivers/md/md.c         |   6 ++-
 drivers/md/raid5.c      | 123 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/raid/md.h |   3 +-
 4 files changed, 154 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac43f98062fd..fd2aae150ccc 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -127,6 +127,32 @@ config MD_RAID5
 
 	  If unsure, say Y.
 
+config MD_RAID5_RESHAPE
+	bool "Support adding drives to a raid-5 array (experimental)"
+	depends on MD_RAID5 && EXPERIMENTAL
+	---help---
+	  A RAID-5 set can be expanded by adding extra drives. This
+	  requires "restriping" the array which means (almost) every
+	  block must be written to a different place.
+
+          This option allows such restriping to be done while the array
+	  is online.  However it is still EXPERIMENTAL code.  It should
+	  work, but please be sure that you have backups.
+
+	  You will need a version of mdadm newer than 2.3.1.   During the
+	  early stage of reshape there is a critical section where live data
+	  is being over-written.  A crash during this time needs extra care
+	  for recovery.  The newer mdadm takes a copy of the data in the
+	  critical section and will restore it, if necessary, after a crash.
+
+	  The mdadm usage is e.g.
+	       mdadm --grow /dev/md1 --raid-disks=6
+	  to grow '/dev/md1' to having 6 disks.
+
+	  Note: The array can only be expanded, not contracted.
+	  There should be enough spares already present to make the new
+	  array workable.
+
 config MD_RAID6
 	tristate "RAID-6 mode"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e65986bc63f..d169bc964676 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -158,11 +158,12 @@ static int start_readonly;
  */
 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
-static void md_new_event(mddev_t *mddev)
+void md_new_event(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
 }
+EXPORT_SYMBOL_GPL(md_new_event);
 
 /*
  * Enables to iterate over all existing md arrays
@@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
-static void md_do_sync(mddev_t *mddev)
+void md_do_sync(mddev_t *mddev)
 {
 	mddev_t *mddev2;
 	unsigned int currspeed = 0,
@@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev)
 	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 }
+EXPORT_SYMBOL_GPL(md_do_sync);
 
 
 /*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 56cba8d3e398..b29135acb1d9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num)
 	}
 	return 0;
 }
+
+#ifdef CONFIG_MD_RAID5_RESHAPE
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
 	/* Make all the stripes able to hold 'newsize' devices.
@@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	conf->pool_size = newsize;
 	return err;
 }
-
+#endif
 
 static int drop_one_stripe(raid5_conf_t *conf)
 {
@@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	return 0;
 }
 
+static void end_reshape(raid5_conf_t *conf);
+
 static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
 {
 	int sectors_per_chunk = conf->chunk_size >> 9;
@@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
 		unplug_slaves(mddev);
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+			end_reshape(conf);
+			return 0;
+		}
 
 		if (mddev->curr_resync < max_sector) /* aborted */
 			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	return 0;
 }
 
+#ifdef CONFIG_MD_RAID5_RESHAPE
+static int raid5_reshape(mddev_t *mddev, int raid_disks)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int err;
+	mdk_rdev_t *rdev;
+	struct list_head *rtmp;
+	int spares = 0;
+	int added_devices = 0;
+
+	if (mddev->degraded ||
+	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+	if (conf->raid_disks > raid_disks)
+		return -EINVAL; /* Cannot shrink array yet */
+	if (conf->raid_disks == raid_disks)
+		return 0; /* nothing to do */
+
+	/* Can only proceed if there are plenty of stripe_heads.
+	 * We need a minimum of one full stripe,, and for sensible progress
+	 * it is best to have about 4 times that.
+	 * If we require 4 times, then the default 256 4K stripe_heads will
+	 * allow for chunk sizes up to 256K, which is probably OK.
+	 * If the chunk size is greater, user-space should request more
+	 * stripe_heads first.
+	 */
+	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
+		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
+		       (mddev->chunk_size / STRIPE_SIZE)*4);
+		return -ENOSPC;
+	}
+
+	ITERATE_RDEV(mddev, rdev, rtmp)
+		if (rdev->raid_disk < 0 &&
+		    !test_bit(Faulty, &rdev->flags))
+			spares++;
+	if (conf->raid_disks + spares < raid_disks-1)
+		/* Not enough devices even to make a degraded array
+		 * of that size
+		 */
+		return -EINVAL;
+
+	err = resize_stripes(conf, raid_disks);
+	if (err)
+		return err;
+
+	spin_lock_irq(&conf->device_lock);
+	conf->previous_raid_disks = conf->raid_disks;
+	mddev->raid_disks = conf->raid_disks = raid_disks;
+	conf->expand_progress = 0;
+	spin_unlock_irq(&conf->device_lock);
+
+	/* Add some new drives, as many as will fit.
+	 * We know there are enough to make the newly sized array work.
+	 */
+	ITERATE_RDEV(mddev, rdev, rtmp)
+		if (rdev->raid_disk < 0 &&
+		    !test_bit(Faulty, &rdev->flags)) {
+			if (raid5_add_disk(mddev, rdev)) {
+				char nm[20];
+				set_bit(In_sync, &rdev->flags);
+				conf->working_disks++;
+				added_devices++;
+				sprintf(nm, "rd%d", rdev->raid_disk);
+				sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+			} else
+				break;
+		}
+
+	mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
+	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+						"%s_reshape");
+	if (!mddev->sync_thread) {
+		mddev->recovery = 0;
+		spin_lock_irq(&conf->device_lock);
+		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		conf->expand_progress = MaxSector;
+		spin_unlock_irq(&conf->device_lock);
+		return -EAGAIN;
+	}
+	md_wakeup_thread(mddev->sync_thread);
+	md_new_event(mddev);
+	return 0;
+}
+#endif
+
+static void end_reshape(raid5_conf_t *conf)
+{
+	struct block_device *bdev;
+
+	conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1);
+	set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+	conf->mddev->changed = 1;
+
+	bdev = bdget_disk(conf->mddev->gendisk, 0);
+	if (bdev) {
+		mutex_lock(&bdev->bd_inode->i_mutex);
+		i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+		mutex_unlock(&bdev->bd_inode->i_mutex);
+		bdput(bdev);
+	}
+	spin_lock_irq(&conf->device_lock);
+	conf->expand_progress = MaxSector;
+	spin_unlock_irq(&conf->device_lock);
+}
+
 static void raid5_quiesce(mddev_t *mddev, int state)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality =
 	.spare_active	= raid5_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+	.reshape	= raid5_reshape,
+#endif
 	.quiesce	= raid5_quiesce,
 };
 
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index b6e0bcad84e1..9c77cde5a795 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 extern void md_super_wait(mddev_t *mddev);
 extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 			struct page *page, int rw);
-
+extern void md_do_sync(mddev_t *mddev);
+extern void md_new_event(mddev_t *mddev);
 
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
-- 
cgit v1.2.3


From f67055780caac6a99f43834795c43acf99eba6a6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:11 -0800
Subject: [PATCH] md: Checkpoint and allow restart of raid5 reshape

We allow the superblock to record an 'old' and a 'new' geometry, and a
position where any conversion is up to.  The geometry allows for changing
chunksize, layout and level as well as number of devices.

When using verion-0.90 superblock, we convert the version to 0.91 while the
conversion is happening so that an old kernel will refuse the assemble the
array.  For version-1, we use a feature bit for the same effect.

When starting an array we check for an incomplete reshape and restart the
reshape process if needed.  If the reshape stopped at an awkward time (like
when updating the first stripe) we refuse to assemble the array, and let
user-space worry about it.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c            |  69 +++++++++++++++++++++-
 drivers/md/raid1.c         |   5 ++
 drivers/md/raid5.c         | 140 ++++++++++++++++++++++++++++++++++++++-------
 include/linux/raid/md.h    |   2 +
 include/linux/raid/md_k.h  |   8 +++
 include/linux/raid/md_p.h  |  32 ++++++++++-
 include/linux/raid/raid5.h |   1 +
 7 files changed, 231 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d169bc964676..b9dfdfccdb78 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -662,7 +662,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	}
 
 	if (sb->major_version != 0 ||
-	    sb->minor_version != 90) {
+	    sb->minor_version < 90 ||
+	    sb->minor_version > 91) {
 		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
 			sb->major_version, sb->minor_version,
 			b);
@@ -747,6 +748,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
+		if (mddev->minor_version >= 91) {
+			mddev->reshape_position = sb->reshape_position;
+			mddev->delta_disks = sb->delta_disks;
+			mddev->new_level = sb->new_level;
+			mddev->new_layout = sb->new_layout;
+			mddev->new_chunk = sb->new_chunk;
+		} else {
+			mddev->reshape_position = MaxSector;
+			mddev->delta_disks = 0;
+			mddev->new_level = mddev->level;
+			mddev->new_layout = mddev->layout;
+			mddev->new_chunk = mddev->chunk_size;
+		}
+
 		if (sb->state & (1<<MD_SB_CLEAN))
 			mddev->recovery_cp = MaxSector;
 		else {
@@ -841,7 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->md_magic = MD_SB_MAGIC;
 	sb->major_version = mddev->major_version;
-	sb->minor_version = mddev->minor_version;
 	sb->patch_version = mddev->patch_version;
 	sb->gvalid_words  = 0; /* ignored */
 	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
@@ -860,6 +874,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->events_hi = (mddev->events>>32);
 	sb->events_lo = (u32)mddev->events;
 
+	if (mddev->reshape_position == MaxSector)
+		sb->minor_version = 90;
+	else {
+		sb->minor_version = 91;
+		sb->reshape_position = mddev->reshape_position;
+		sb->new_level = mddev->new_level;
+		sb->delta_disks = mddev->delta_disks;
+		sb->new_layout = mddev->new_layout;
+		sb->new_chunk = mddev->new_chunk;
+	}
+	mddev->minor_version = sb->minor_version;
 	if (mddev->in_sync)
 	{
 		sb->recovery_cp = mddev->recovery_cp;
@@ -1104,6 +1129,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			}
 			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
 		}
+		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+			mddev->new_level = le32_to_cpu(sb->new_level);
+			mddev->new_layout = le32_to_cpu(sb->new_layout);
+			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+		} else {
+			mddev->reshape_position = MaxSector;
+			mddev->delta_disks = 0;
+			mddev->new_level = mddev->level;
+			mddev->new_layout = mddev->layout;
+			mddev->new_chunk = mddev->chunk_size;
+		}
+
 	} else if (mddev->pers == NULL) {
 		/* Insist of good event counter while assembling */
 		__u64 ev1 = le64_to_cpu(sb->events);
@@ -1175,6 +1214,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
 	}
+	if (mddev->reshape_position != MaxSector) {
+		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+		sb->new_layout = cpu_to_le32(mddev->new_layout);
+		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+		sb->new_level = cpu_to_le32(mddev->new_level);
+		sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+	}
 
 	max_dev = 0;
 	ITERATE_RDEV(mddev,rdev2,tmp)
@@ -1497,7 +1544,7 @@ static void sync_sbs(mddev_t * mddev)
 	}
 }
 
-static void md_update_sb(mddev_t * mddev)
+void md_update_sb(mddev_t * mddev)
 {
 	int err;
 	struct list_head *tmp;
@@ -1574,6 +1621,7 @@ repeat:
 	wake_up(&mddev->sb_wait);
 
 }
+EXPORT_SYMBOL_GPL(md_update_sb);
 
 /* words written to sysfs files may, or my not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -2545,6 +2593,14 @@ static int do_md_run(mddev_t * mddev)
 	mddev->level = pers->level;
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
+	if (mddev->reshape_position != MaxSector &&
+	    pers->reshape == NULL) {
+		/* This personality cannot handle reshaping... */
+		mddev->pers = NULL;
+		module_put(pers->owner);
+		return -EINVAL;
+	}
+
 	mddev->recovery = 0;
 	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
 	mddev->barriers_work = 1;
@@ -3433,11 +3489,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 	mddev->bitmap_offset = 0;
 
+	mddev->reshape_position = MaxSector;
+
 	/*
 	 * Generate a 128 bit UUID
 	 */
 	get_random_bytes(mddev->uuid, 16);
 
+	mddev->new_level = mddev->level;
+	mddev->new_chunk = mddev->chunk_size;
+	mddev->new_layout = mddev->layout;
+	mddev->delta_disks = 0;
+
 	return 0;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5d88329e3c7a..b65b8cfbdf30 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1789,6 +1789,11 @@ static int run(mddev_t *mddev)
 		       mdname(mddev), mddev->level);
 		goto out;
 	}
+	if (mddev->reshape_position != MaxSector) {
+		printk("raid1: %s: reshape_position set but not supported\n",
+		       mdname(mddev));
+		goto out;
+	}
 	/*
 	 * copy the already verified devices into our private RAID1
 	 * bookkeeping area. [whatever we allocate in run(),
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b29135acb1d9..20ae32d67e21 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -22,6 +22,7 @@
 #include <linux/raid/raid5.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
+#include <linux/kthread.h>
 #include <asm/atomic.h>
 
 #include <linux/raid/bitmap.h>
@@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh)
 		clear_bit(STRIPE_EXPANDING, &sh->state);
 	} else if (expanded) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
+		atomic_dec(&conf->reshape_stripes);
 		wake_up(&conf->wait_for_overlap);
 		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 	}
@@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		 */
 		int i;
 		int dd_idx;
+
+		if (sector_nr == 0 &&
+		    conf->expand_progress != 0) {
+			/* restarting in the middle, skip the initial sectors */
+			sector_nr = conf->expand_progress;
+			sector_div(sector_nr, conf->raid_disks-1);
+			*skipped = 1;
+			return sector_nr;
+		}
+
+		/* Cannot proceed until we've updated the superblock... */
+		wait_event(conf->wait_for_overlap,
+			   atomic_read(&conf->reshape_stripes)==0);
+		mddev->reshape_position = conf->expand_progress;
+
+		mddev->sb_dirty = 1;
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+			kthread_should_stop());
+
 		for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
 			int j;
 			int skipped = 0;
@@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 			sh = get_active_stripe(conf, sector_nr+i,
 					       conf->raid_disks, pd_idx, 0);
 			set_bit(STRIPE_EXPANDING, &sh->state);
+			atomic_inc(&conf->reshape_stripes);
 			/* If any of this stripe is beyond the end of the old
 			 * array, then we need to zero those blocks
 			 */
@@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
+	if (mddev->reshape_position != MaxSector) {
+		/* Check that we can continue the reshape.
+		 * Currently only disks can change, it must
+		 * increase, and we must be past the point where
+		 * a stripe over-writes itself
+		 */
+		sector_t here_new, here_old;
+		int old_disks;
+
+		if (mddev->new_level != mddev->level ||
+		    mddev->new_layout != mddev->layout ||
+		    mddev->new_chunk != mddev->chunk_size) {
+			printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		if (mddev->delta_disks <= 0) {
+			printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		old_disks = mddev->raid_disks - mddev->delta_disks;
+		/* reshape_position must be on a new-stripe boundary, and one
+		 * further up in new geometry must map after here in old geometry.
+		 */
+		here_new = mddev->reshape_position;
+		if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
+			printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+			return -EINVAL;
+		}
+		/* here_new is the stripe we will write to */
+		here_old = mddev->reshape_position;
+		sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
+		/* here_old is the first stripe that we might need to read from */
+		if (here_new >= here_old) {
+			/* Reading from the same stripe as writing to - bad */
+			printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+			return -EINVAL;
+		}
+		printk(KERN_INFO "raid5: reshape will continue\n");
+		/* OK, we should be able to continue; */
+	}
+
+
 	mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
-	conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
+	if (mddev->reshape_position == MaxSector) {
+		conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
+	} else {
+		conf->raid_disks = mddev->raid_disks;
+		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
+	}
+
+	conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
 	if (!conf->disks)
 		goto abort;
@@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev)
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		raid_disk = rdev->raid_disk;
-		if (raid_disk >= mddev->raid_disks
+		if (raid_disk >= conf->raid_disks
 		    || raid_disk < 0)
 			continue;
 		disk = conf->disks + raid_disk;
@@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev)
 		}
 	}
 
-	conf->raid_disks = mddev->raid_disks;
 	/*
 	 * 0 for a fully functional array, 1 for a degraded array.
 	 */
@@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev)
 	conf->level = mddev->level;
 	conf->algorithm = mddev->layout;
 	conf->max_nr_stripes = NR_STRIPES;
-	conf->expand_progress = MaxSector;
+	conf->expand_progress = mddev->reshape_position;
 
 	/* device size must be a multiple of chunk size */
 	mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev)
 
 	print_raid5_conf(conf);
 
+	if (conf->expand_progress != MaxSector) {
+		printk("...ok start reshape thread\n");
+		atomic_set(&conf->reshape_stripes, 0);
+		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+							"%s_reshape");
+		/* FIXME if md_register_thread fails?? */
+		md_wakeup_thread(mddev->sync_thread);
+
+	}
+
 	/* read-ahead size must cover two whole stripes, which is
 	 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
 	 */
@@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->unplug_fn = raid5_unplug_device;
 	mddev->queue->issue_flush_fn = raid5_issue_flush;
+	mddev->array_size =  mddev->size * (conf->previous_raid_disks - 1);
 
-	mddev->array_size =  mddev->size * (mddev->raid_disks - 1);
 	return 0;
 abort:
 	if (conf) {
@@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	/*
 	 * find the disk ...
 	 */
-	for (disk=0; disk < mddev->raid_disks; disk++)
+	for (disk=0; disk < conf->raid_disks; disk++)
 		if ((p=conf->disks + disk)->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
@@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
 	if (err)
 		return err;
 
+	atomic_set(&conf->reshape_stripes, 0);
 	spin_lock_irq(&conf->device_lock);
 	conf->previous_raid_disks = conf->raid_disks;
-	mddev->raid_disks = conf->raid_disks = raid_disks;
+	conf->raid_disks = raid_disks;
 	conf->expand_progress = 0;
 	spin_unlock_irq(&conf->device_lock);
 
@@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
 		}
 
 	mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
+	mddev->new_chunk = mddev->chunk_size;
+	mddev->new_layout = mddev->layout;
+	mddev->new_level = mddev->level;
+	mddev->raid_disks = raid_disks;
+	mddev->delta_disks = raid_disks - conf->previous_raid_disks;
+	mddev->reshape_position = 0;
+	mddev->sb_dirty = 1;
+
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		mddev->delta_disks = 0;
 		conf->expand_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		return -EAGAIN;
@@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf)
 {
 	struct block_device *bdev;
 
-	conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1);
-	set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
-	conf->mddev->changed = 1;
-
-	bdev = bdget_disk(conf->mddev->gendisk, 0);
-	if (bdev) {
-		mutex_lock(&bdev->bd_inode->i_mutex);
-		i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
-		mutex_unlock(&bdev->bd_inode->i_mutex);
-		bdput(bdev);
+	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+		conf->mddev->changed = 1;
+
+		bdev = bdget_disk(conf->mddev->gendisk, 0);
+		if (bdev) {
+			mutex_lock(&bdev->bd_inode->i_mutex);
+			i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
+			bdput(bdev);
+		}
+		spin_lock_irq(&conf->device_lock);
+		conf->expand_progress = MaxSector;
+		spin_unlock_irq(&conf->device_lock);
+		conf->mddev->reshape_position = MaxSector;
 	}
-	spin_lock_irq(&conf->device_lock);
-	conf->expand_progress = MaxSector;
-	spin_unlock_irq(&conf->device_lock);
 }
 
 static void raid5_quiesce(mddev_t *mddev, int state)
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 9c77cde5a795..66b44e5e0d6e 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,6 +95,8 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
 
+extern void md_update_sb(mddev_t * mddev);
+
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
 #endif 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 4e26ef2cacca..1a6f9f2f6282 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -132,6 +132,14 @@ struct mddev_s
 
 	char				uuid[16];
 
+	/* If the array is being reshaped, we need to record the
+	 * new shape and an indication of where we are up to.
+	 * This is written to the superblock.
+	 * If reshape_position is MaxSector, then no reshape is happening (yet).
+	 */
+	sector_t			reshape_position;
+	int				delta_disks, new_level, new_layout, new_chunk;
+
 	struct mdk_thread_s		*thread;	/* management thread */
 	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
 	sector_t			curr_resync;	/* blocks scheduled */
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index c100fa5d4bfa..774e1acfb8c4 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -102,6 +102,18 @@ typedef struct mdp_device_descriptor_s {
 #define MD_SB_ERRORS		1
 
 #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
+
+/*
+ * Notes:
+ * - if an array is being reshaped (restriped) in order to change the
+ *   the number of active devices in the array, 'raid_disks' will be
+ *   the larger of the old and new numbers.  'delta_disks' will
+ *   be the "new - old".  So if +ve, raid_disks is the new value, and
+ *   "raid_disks-delta_disks" is the old.  If -ve, raid_disks is the
+ *   old value and "raid_disks+delta_disks" is the new (smaller) value.
+ */
+
+
 typedef struct mdp_superblock_s {
 	/*
 	 * Constant generic information
@@ -146,7 +158,13 @@ typedef struct mdp_superblock_s {
 	__u32 cp_events_hi;	/* 10 high-order of checkpoint update count   */
 #endif
 	__u32 recovery_cp;	/* 11 recovery checkpoint sector count	      */
-	__u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12];
+	/* There are only valid for minor_version > 90 */
+	__u64 reshape_position;	/* 12,13 next address in array-space for reshape */
+	__u32 new_level;	/* 14 new level we are reshaping to	      */
+	__u32 delta_disks;	/* 15 change in number of raid_disks	      */
+	__u32 new_layout;	/* 16 new layout			      */
+	__u32 new_chunk;	/* 17 new chunk size (bytes)		      */
+	__u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18];
 
 	/*
 	 * Personality information
@@ -207,7 +225,14 @@ struct mdp_superblock_1 {
 				 * NOTE: signed, so bitmap can be before superblock
 				 * only meaningful of feature_map[0] is set.
 				 */
-	__u8	pad1[128-100];	/* set to 0 when written */
+
+	/* These are only valid with feature bit '4' */
+	__u64	reshape_position;	/* next address in array-space for reshape */
+	__u32	new_level;	/* new level we are reshaping to		*/
+	__u32	delta_disks;	/* change in number of raid_disks		*/
+	__u32	new_layout;	/* new layout					*/
+	__u32	new_chunk;	/* new chunk size (bytes)			*/
+	__u8	pad1[128-124];	/* set to 0 when written */
 
 	/* constant this-device information - 64 bytes */
 	__u64	data_offset;	/* sector start of data, often 0 */
@@ -240,8 +265,9 @@ struct mdp_superblock_1 {
 
 /* feature_map bits */
 #define MD_FEATURE_BITMAP_OFFSET	1
+#define	MD_FEATURE_RESHAPE_ACTIVE	4
 
-#define	MD_FEATURE_ALL			1
+#define	MD_FEATURE_ALL			5
 
 #endif 
 
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 55c738d50508..abcdf0d0658a 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -224,6 +224,7 @@ struct raid5_private_data {
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 
+	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
 	/* unfortunately we need two cache names as we temporarily have
 	 * two caches.
 	 */
-- 
cgit v1.2.3


From b578d55fdd80140f657130abd85aebeb345755fb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:12 -0800
Subject: [PATCH] md: Only checkpoint expansion progress occasionally

Instead of checkpointing at each stripe, only checkpoint when a new write
would overwrite uncheckpointed data.  Block any write to the uncheckpointed
area.  Arbitrarily checkpoint at least every 3Meg.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/raid5.c         | 53 ++++++++++++++++++++++++++++++++++++----------
 include/linux/raid/raid5.h |  3 +++
 2 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 20ae32d67e21..089a32604305 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1762,8 +1762,9 @@ static int make_request(request_queue_t *q, struct bio * bi)
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int disks;
-		
+
 	retry:
+		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		if (likely(conf->expand_progress == MaxSector))
 			disks = conf->raid_disks;
 		else {
@@ -1771,6 +1772,13 @@ static int make_request(request_queue_t *q, struct bio * bi)
 			disks = conf->raid_disks;
 			if (logical_sector >= conf->expand_progress)
 				disks = conf->previous_raid_disks;
+			else {
+				if (logical_sector >= conf->expand_lo) {
+					spin_unlock_irq(&conf->device_lock);
+					schedule();
+					goto retry;
+				}
+			}
 			spin_unlock_irq(&conf->device_lock);
 		}
  		new_sector = raid5_compute_sector(logical_sector, disks, disks - 1,
@@ -1779,7 +1787,6 @@ static int make_request(request_queue_t *q, struct bio * bi)
 			(unsigned long long)new_sector, 
 			(unsigned long long)logical_sector);
 
-		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
 		if (sh) {
 			if (unlikely(conf->expand_progress != MaxSector)) {
@@ -1877,6 +1884,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		 */
 		int i;
 		int dd_idx;
+		sector_t writepos, safepos, gap;
 
 		if (sector_nr == 0 &&
 		    conf->expand_progress != 0) {
@@ -1887,15 +1895,36 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 			return sector_nr;
 		}
 
-		/* Cannot proceed until we've updated the superblock... */
-		wait_event(conf->wait_for_overlap,
-			   atomic_read(&conf->reshape_stripes)==0);
-		mddev->reshape_position = conf->expand_progress;
-
-		mddev->sb_dirty = 1;
-		md_wakeup_thread(mddev->thread);
-		wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
-			kthread_should_stop());
+		/* we update the metadata when there is more than 3Meg
+		 * in the block range (that is rather arbitrary, should
+		 * probably be time based) or when the data about to be
+		 * copied would over-write the source of the data at
+		 * the front of the range.
+		 * i.e. one new_stripe forward from expand_progress new_maps
+		 * to after where expand_lo old_maps to
+		 */
+		writepos = conf->expand_progress +
+			conf->chunk_size/512*(conf->raid_disks-1);
+		sector_div(writepos, conf->raid_disks-1);
+		safepos = conf->expand_lo;
+		sector_div(safepos, conf->previous_raid_disks-1);
+		gap = conf->expand_progress - conf->expand_lo;
+
+		if (writepos >= safepos ||
+		    gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+			/* Cannot proceed until we've updated the superblock... */
+			wait_event(conf->wait_for_overlap,
+				   atomic_read(&conf->reshape_stripes)==0);
+			mddev->reshape_position = conf->expand_progress;
+			mddev->sb_dirty = 1;
+			md_wakeup_thread(mddev->thread);
+			wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+				   kthread_should_stop());
+			spin_lock_irq(&conf->device_lock);
+			conf->expand_lo = mddev->reshape_position;
+			spin_unlock_irq(&conf->device_lock);
+			wake_up(&conf->wait_for_overlap);
+		}
 
 		for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
 			int j;
@@ -2322,6 +2351,7 @@ static int run(mddev_t *mddev)
 
 	if (conf->expand_progress != MaxSector) {
 		printk("...ok start reshape thread\n");
+		conf->expand_lo = conf->expand_progress;
 		atomic_set(&conf->reshape_stripes, 0);
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2610,6 +2640,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks = raid_disks;
 	conf->expand_progress = 0;
+	conf->expand_lo = 0;
 	spin_unlock_irq(&conf->device_lock);
 
 	/* Add some new drives, as many as will fit.
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index abcdf0d0658a..914af667044f 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -217,6 +217,9 @@ struct raid5_private_data {
 
 	/* used during an expand */
 	sector_t		expand_progress;	/* MaxSector when no expand happening */
+	sector_t		expand_lo; /* from here up to expand_progress it out-of-bounds
+					    * as we haven't flushed the metadata yet
+					    */
 	int			previous_raid_disks;
 
 	struct list_head	handle_list; /* stripes needing handling */
-- 
cgit v1.2.3


From 63c70c4f3a30e77e6f445bd16eff7934a031ebd3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:13 -0800
Subject: [PATCH] md: Split reshape handler in check_reshape and start_reshape

check_reshape checks validity and does things that can be done instantly -
like adding devices to raid1.  start_reshape initiates a restriping process to
convert the whole array.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c           | 10 ++++----
 drivers/md/raid1.c        | 19 ++++++++++++---
 drivers/md/raid5.c        | 60 +++++++++++++++++++++++++++--------------------
 include/linux/raid/md_k.h |  3 ++-
 4 files changed, 58 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b9dfdfccdb78..1a637676a930 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2594,7 +2594,7 @@ static int do_md_run(mddev_t * mddev)
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
 	if (mddev->reshape_position != MaxSector &&
-	    pers->reshape == NULL) {
+	    pers->start_reshape == NULL) {
 		/* This personality cannot handle reshaping... */
 		mddev->pers = NULL;
 		module_put(pers->owner);
@@ -3556,14 +3556,16 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
 {
 	int rv;
 	/* change the number of raid disks */
-	if (mddev->pers->reshape == NULL)
+	if (mddev->pers->check_reshape == NULL)
 		return -EINVAL;
 	if (raid_disks <= 0 ||
 	    raid_disks >= mddev->max_disks)
 		return -EINVAL;
-	if (mddev->sync_thread)
+	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
 		return -EBUSY;
-	rv = mddev->pers->reshape(mddev, raid_disks);
+	mddev->delta_disks = raid_disks - mddev->raid_disks;
+
+	rv = mddev->pers->check_reshape(mddev);
 	return rv;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b65b8cfbdf30..04418e10d1dc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1976,7 +1976,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	return 0;
 }
 
-static int raid1_reshape(mddev_t *mddev, int raid_disks)
+static int raid1_reshape(mddev_t *mddev)
 {
 	/* We need to:
 	 * 1/ resize the r1bio_pool
@@ -1993,10 +1993,22 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 	struct pool_info *newpoolinfo;
 	mirror_info_t *newmirrors;
 	conf_t *conf = mddev_to_conf(mddev);
-	int cnt;
+	int cnt, raid_disks;
 
 	int d, d2;
 
+	/* Cannot change chunk_size, layout, or level */
+	if (mddev->chunk_size != mddev->new_chunk ||
+	    mddev->layout != mddev->new_layout ||
+	    mddev->level != mddev->new_level) {
+		mddev->new_chunk = mddev->chunk_size;
+		mddev->new_layout = mddev->layout;
+		mddev->new_level = mddev->level;
+		return -EINVAL;
+	}
+
+	raid_disks = mddev->raid_disks + mddev->delta_disks;
+
 	if (raid_disks < conf->raid_disks) {
 		cnt=0;
 		for (d= 0; d < conf->raid_disks; d++)
@@ -2043,6 +2055,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
 
 	mddev->degraded += (raid_disks - conf->raid_disks);
 	conf->raid_disks = mddev->raid_disks = raid_disks;
+	mddev->delta_disks = 0;
 
 	conf->last_used = 0; /* just make sure it is in-range */
 	lower_barrier(conf);
@@ -2084,7 +2097,7 @@ static struct mdk_personality raid1_personality =
 	.spare_active	= raid1_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid1_resize,
-	.reshape	= raid1_reshape,
+	.check_reshape	= raid1_reshape,
 	.quiesce	= raid1_quiesce,
 };
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 089a32604305..355dafb98aac 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2590,21 +2590,15 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 }
 
 #ifdef CONFIG_MD_RAID5_RESHAPE
-static int raid5_reshape(mddev_t *mddev, int raid_disks)
+static int raid5_check_reshape(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	int err;
-	mdk_rdev_t *rdev;
-	struct list_head *rtmp;
-	int spares = 0;
-	int added_devices = 0;
 
-	if (mddev->degraded ||
-	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
-		return -EBUSY;
-	if (conf->raid_disks > raid_disks)
-		return -EINVAL; /* Cannot shrink array yet */
-	if (conf->raid_disks == raid_disks)
+	if (mddev->delta_disks < 0 ||
+	    mddev->new_level != mddev->level)
+		return -EINVAL; /* Cannot shrink array or change level yet */
+	if (mddev->delta_disks == 0)
 		return 0; /* nothing to do */
 
 	/* Can only proceed if there are plenty of stripe_heads.
@@ -2615,30 +2609,48 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
 	 * If the chunk size is greater, user-space should request more
 	 * stripe_heads first.
 	 */
-	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
+	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
+	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
 		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
 		       (mddev->chunk_size / STRIPE_SIZE)*4);
 		return -ENOSPC;
 	}
 
+	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
+	if (err)
+		return err;
+
+	/* looks like we might be able to manage this */
+	return 0;
+}
+
+static int raid5_start_reshape(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t *rdev;
+	struct list_head *rtmp;
+	int spares = 0;
+	int added_devices = 0;
+
+	if (mddev->degraded ||
+	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
 	ITERATE_RDEV(mddev, rdev, rtmp)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags))
 			spares++;
-	if (conf->raid_disks + spares < raid_disks-1)
+
+	if (spares < mddev->delta_disks-1)
 		/* Not enough devices even to make a degraded array
 		 * of that size
 		 */
 		return -EINVAL;
 
-	err = resize_stripes(conf, raid_disks);
-	if (err)
-		return err;
-
 	atomic_set(&conf->reshape_stripes, 0);
 	spin_lock_irq(&conf->device_lock);
 	conf->previous_raid_disks = conf->raid_disks;
-	conf->raid_disks = raid_disks;
+	conf->raid_disks += mddev->delta_disks;
 	conf->expand_progress = 0;
 	conf->expand_lo = 0;
 	spin_unlock_irq(&conf->device_lock);
@@ -2660,12 +2672,8 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
 				break;
 		}
 
-	mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices;
-	mddev->new_chunk = mddev->chunk_size;
-	mddev->new_layout = mddev->layout;
-	mddev->new_level = mddev->level;
-	mddev->raid_disks = raid_disks;
-	mddev->delta_disks = raid_disks - conf->previous_raid_disks;
+	mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+	mddev->raid_disks = conf->raid_disks;
 	mddev->reshape_position = 0;
 	mddev->sb_dirty = 1;
 
@@ -2679,7 +2687,6 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks)
 		mddev->recovery = 0;
 		spin_lock_irq(&conf->device_lock);
 		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
-		mddev->delta_disks = 0;
 		conf->expand_progress = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		return -EAGAIN;
@@ -2752,7 +2759,8 @@ static struct mdk_personality raid5_personality =
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
 #ifdef CONFIG_MD_RAID5_RESHAPE
-	.reshape	= raid5_reshape,
+	.check_reshape	= raid5_check_reshape,
+	.start_reshape  = raid5_start_reshape,
 #endif
 	.quiesce	= raid5_quiesce,
 };
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 1a6f9f2f6282..002ee631fabb 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -261,7 +261,8 @@ struct mdk_personality
 	int (*spare_active) (mddev_t *mddev);
 	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
 	int (*resize) (mddev_t *mddev, sector_t sectors);
-	int (*reshape) (mddev_t *mddev, int raid_disks);
+	int (*check_reshape) (mddev_t *mddev);
+	int (*start_reshape) (mddev_t *mddev);
 	int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
 	/* quiesce moves between quiescence states
 	 * 0 - fully active
-- 
cgit v1.2.3


From e464eafdb4400c6d6576ba3840d8bd40340f8a96 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:14 -0800
Subject: [PATCH] md: Support suspending of IO to regions of an md array

This allows user-space to access data safely.  This is needed for raid5
reshape as user-space needs to take a backup of the first few stripes before
allowing reshape to commence.

It will also be useful in cluster-aware raid1 configurations so that all
cluster members can leave a section of the array untouched while a
resync/recovery happens.

A 'start' and 'end' of the suspended range are written to 2 sysfs attributes.
Note that only one range can be suspended at a time.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c           | 59 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c        | 14 +++++++++++
 include/linux/raid/md_k.h |  4 ++++
 3 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a79dd33d343d..92fd0104fa04 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2365,6 +2365,63 @@ sync_completed_show(mddev_t *mddev, char *page)
 static struct md_sysfs_entry
 md_sync_completed = __ATTR_RO(sync_completed);
 
+static ssize_t
+suspend_lo_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+}
+
+static ssize_t
+suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long new = simple_strtoull(buf, &e, 10);
+
+	if (mddev->pers->quiesce == NULL)
+		return -EINVAL;
+	if (buf == e || (*e && *e != '\n'))
+		return -EINVAL;
+	if (new >= mddev->suspend_hi ||
+	    (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
+		mddev->suspend_lo = new;
+		mddev->pers->quiesce(mddev, 2);
+		return len;
+	} else
+		return -EINVAL;
+}
+static struct md_sysfs_entry md_suspend_lo =
+__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
+
+
+static ssize_t
+suspend_hi_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+}
+
+static ssize_t
+suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	char *e;
+	unsigned long long new = simple_strtoull(buf, &e, 10);
+
+	if (mddev->pers->quiesce == NULL)
+		return -EINVAL;
+	if (buf == e || (*e && *e != '\n'))
+		return -EINVAL;
+	if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
+	    (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
+		mddev->suspend_hi = new;
+		mddev->pers->quiesce(mddev, 1);
+		mddev->pers->quiesce(mddev, 0);
+		return len;
+	} else
+		return -EINVAL;
+}
+static struct md_sysfs_entry md_suspend_hi =
+__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
+
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_raid_disks.attr,
@@ -2382,6 +2439,8 @@ static struct attribute *md_redundancy_attrs[] = {
 	&md_sync_max.attr,
 	&md_sync_speed.attr,
 	&md_sync_completed.attr,
+	&md_suspend_lo.attr,
+	&md_suspend_hi.attr,
 	NULL,
 };
 static struct attribute_group md_redundancy_group = {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 355dafb98aac..bb16ac231a40 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1805,6 +1805,15 @@ static int make_request(request_queue_t *q, struct bio * bi)
 					goto retry;
 				}
 			}
+			/* FIXME what if we get a false positive because these
+			 * are being updated.
+			 */
+			if (logical_sector >= mddev->suspend_lo &&
+			    logical_sector < mddev->suspend_hi) {
+				release_stripe(sh);
+				schedule();
+				goto retry;
+			}
 
 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
 			    !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
@@ -2725,6 +2734,10 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 
 	switch(state) {
+	case 2: /* resume for a suspend */
+		wake_up(&conf->wait_for_overlap);
+		break;
+
 	case 1: /* stop all writes */
 		spin_lock_irq(&conf->device_lock);
 		conf->quiesce = 1;
@@ -2738,6 +2751,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 		spin_lock_irq(&conf->device_lock);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_stripe);
+		wake_up(&conf->wait_for_overlap);
 		spin_unlock_irq(&conf->device_lock);
 		break;
 	}
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 002ee631fabb..c0d3097846a7 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -151,6 +151,10 @@ struct mddev_s
 	sector_t			resync_mismatches; /* count of sectors where
 							    * parity/replica mismatch found
 							    */
+
+	/* allow user-space to request suspension of IO to regions of the array */
+	sector_t			suspend_lo;
+	sector_t			suspend_hi;
 	/* if zero, use the system-wide default */
 	int				sync_speed_min;
 	int				sync_speed_max;
-- 
cgit v1.2.3


From df5b89b323b922f56650b4b4d7c41899b937cf19 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Mar 2006 01:18:20 -0800
Subject: [PATCH] md: Convert reconfig_sem to reconfig_mutex

... being careful that mutex_trylock is inverted wrt down_trylock

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/md.c           | 14 +++++++-------
 include/linux/raid/md_k.h |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c9c9c096ad80..039e071c1007 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -255,7 +255,7 @@ static mddev_t * mddev_find(dev_t unit)
 	else
 		new->md_minor = MINOR(unit) >> MdpMinorShift;
 
-	init_MUTEX(&new->reconfig_sem);
+	mutex_init(&new->reconfig_mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
 	init_timer(&new->safemode_timer);
@@ -277,22 +277,22 @@ static mddev_t * mddev_find(dev_t unit)
 
 static inline int mddev_lock(mddev_t * mddev)
 {
-	return down_interruptible(&mddev->reconfig_sem);
+	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
 static inline void mddev_lock_uninterruptible(mddev_t * mddev)
 {
-	down(&mddev->reconfig_sem);
+	mutex_lock(&mddev->reconfig_mutex);
 }
 
 static inline int mddev_trylock(mddev_t * mddev)
 {
-	return down_trylock(&mddev->reconfig_sem);
+	return mutex_trylock(&mddev->reconfig_mutex);
 }
 
 static inline void mddev_unlock(mddev_t * mddev)
 {
-	up(&mddev->reconfig_sem);
+	mutex_unlock(&mddev->reconfig_mutex);
 
 	md_wakeup_thread(mddev->thread);
 }
@@ -4893,7 +4893,7 @@ void md_check_recovery(mddev_t *mddev)
 		))
 		return;
 
-	if (mddev_trylock(mddev)==0) {
+	if (mddev_trylock(mddev)) {
 		int spares =0;
 
 		spin_lock_irq(&mddev->write_lock);
@@ -5029,7 +5029,7 @@ static int md_notify_reboot(struct notifier_block *this,
 		printk(KERN_INFO "md: stopping all md devices.\n");
 
 		ITERATE_MDDEV(mddev,tmp)
-			if (mddev_trylock(mddev)==0)
+			if (mddev_trylock(mddev))
 				do_md_stop (mddev, 1);
 		/*
 		 * certain more exotic SCSI devices are known to be
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index c0d3097846a7..e2df61f5b09a 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -185,7 +185,7 @@ struct mddev_s
 	unsigned long			recovery;
 
 	int				in_sync;	/* know to not need resync */
-	struct semaphore		reconfig_sem;
+	struct mutex			reconfig_mutex;
 	atomic_t			active;
 
 	int				changed;	/* true if we might need to reread partition info */
-- 
cgit v1.2.3


From e2d74ac0664c89757bde8fb18c98cd7bf53da61c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 28 Mar 2006 08:59:01 +0200
Subject: [PATCH] [BLOCK] cfq-iosched: change cfq io context linking from list
 to tree

On setups with many disks, we spend a considerable amount of time
looking up the process-disk mapping on each queue of io. Testing with
a NULL based block driver, this costs 40-50% reduction in throughput
for 1000 disks.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/cfq-iosched.c    | 205 +++++++++++++++++++++++--------------------------
 block/ll_rw_blk.c      |  19 +++--
 include/linux/blkdev.h |  14 ++--
 3 files changed, 114 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index bde40a6ae665..bb43a1677626 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1190,19 +1190,19 @@ cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
 	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
 }
 
-static void cfq_free_io_context(struct cfq_io_context *cic)
+static void cfq_free_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
-	struct list_head *entry, *next;
-	int freed = 1;
+	struct rb_node *n;
+	int freed = 0;
 
-	list_for_each_safe(entry, next, &cic->list) {
-		__cic = list_entry(entry, struct cfq_io_context, list);
+	while ((n = rb_first(&ioc->cic_root)) != NULL) {
+		__cic = rb_entry(n, struct cfq_io_context, rb_node);
+		rb_erase(&__cic->rb_node, &ioc->cic_root);
 		kmem_cache_free(cfq_ioc_pool, __cic);
 		freed++;
 	}
 
-	kmem_cache_free(cfq_ioc_pool, cic);
 	if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
 		complete(ioc_gone);
 }
@@ -1210,8 +1210,7 @@ static void cfq_free_io_context(struct cfq_io_context *cic)
 static void cfq_trim(struct io_context *ioc)
 {
 	ioc->set_ioprio = NULL;
-	if (ioc->cic)
-		cfq_free_io_context(ioc->cic);
+	cfq_free_io_context(ioc);
 }
 
 /*
@@ -1250,26 +1249,26 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 	spin_unlock(q->queue_lock);
 }
 
-static void cfq_exit_io_context(struct cfq_io_context *cic)
+static void cfq_exit_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
-	struct list_head *entry;
 	unsigned long flags;
-
-	local_irq_save(flags);
+	struct rb_node *n;
 
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
-	read_lock(&cfq_exit_lock);
-	list_for_each(entry, &cic->list) {
-		__cic = list_entry(entry, struct cfq_io_context, list);
+	read_lock_irqsave(&cfq_exit_lock, flags);
+
+	n = rb_first(&ioc->cic_root);
+	while (n != NULL) {
+		__cic = rb_entry(n, struct cfq_io_context, rb_node);
+
 		cfq_exit_single_io_context(__cic);
+		n = rb_next(n);
 	}
 
-	cfq_exit_single_io_context(cic);
-	read_unlock(&cfq_exit_lock);
-	local_irq_restore(flags);
+	read_unlock_irqrestore(&cfq_exit_lock, flags);
 }
 
 static struct cfq_io_context *
@@ -1278,10 +1277,10 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
 
 	if (cic) {
-		INIT_LIST_HEAD(&cic->list);
+		RB_CLEAR(&cic->rb_node);
+		cic->key = NULL;
 		cic->cfqq[ASYNC] = NULL;
 		cic->cfqq[SYNC] = NULL;
-		cic->key = NULL;
 		cic->last_end_request = jiffies;
 		cic->ttime_total = 0;
 		cic->ttime_samples = 0;
@@ -1373,15 +1372,17 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
 {
 	struct cfq_io_context *cic;
+	struct rb_node *n;
 
 	write_lock(&cfq_exit_lock);
 
-	cic = ioc->cic;
-
-	changed_ioprio(cic);
-
-	list_for_each_entry(cic, &cic->list, list)
+	n = rb_first(&ioc->cic_root);
+	while (n != NULL) {
+		cic = rb_entry(n, struct cfq_io_context, rb_node);
+ 
 		changed_ioprio(cic);
+		n = rb_next(n);
+	}
 
 	write_unlock(&cfq_exit_lock);
 
@@ -1445,14 +1446,67 @@ out:
 	return cfqq;
 }
 
+static struct cfq_io_context *
+cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
+{
+	struct rb_node *n = ioc->cic_root.rb_node;
+	struct cfq_io_context *cic;
+	void *key = cfqd;
+
+	while (n) {
+		cic = rb_entry(n, struct cfq_io_context, rb_node);
+
+		if (key < cic->key)
+			n = n->rb_left;
+		else if (key > cic->key)
+			n = n->rb_right;
+		else
+			return cic;
+	}
+
+	return NULL;
+}
+
+static inline void
+cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
+	     struct cfq_io_context *cic)
+{
+	struct rb_node **p = &ioc->cic_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfq_io_context *__cic;
+
+	read_lock(&cfq_exit_lock);
+
+	cic->ioc = ioc;
+	cic->key = cfqd;
+
+	ioc->set_ioprio = cfq_ioc_set_ioprio;
+
+	while (*p) {
+		parent = *p;
+		__cic = rb_entry(parent, struct cfq_io_context, rb_node);
+
+		if (cic->key < __cic->key)
+			p = &(*p)->rb_left;
+		else if (cic->key > __cic->key)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&cic->rb_node, parent, p);
+	rb_insert_color(&cic->rb_node, &ioc->cic_root);
+	list_add(&cic->queue_list, &cfqd->cic_list);
+	read_unlock(&cfq_exit_lock);
+}
+
 /*
  * Setup general io context and cfq io context. There can be several cfq
  * io contexts per general io context, if this process is doing io to more
- * than one device managed by cfq. Note that caller is holding a reference to
- * cfqq, so we don't need to worry about it disappearing
+ * than one device managed by cfq.
  */
 static struct cfq_io_context *
-cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
+cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct io_context *ioc = NULL;
 	struct cfq_io_context *cic;
@@ -1463,88 +1517,15 @@ cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, gfp_t gfp_mask)
 	if (!ioc)
 		return NULL;
 
-restart:
-	if ((cic = ioc->cic) == NULL) {
-		cic = cfq_alloc_io_context(cfqd, gfp_mask);
-
-		if (cic == NULL)
-			goto err;
-
-		/*
-		 * manually increment generic io_context usage count, it
-		 * cannot go away since we are already holding one ref to it
-		 */
-		cic->ioc = ioc;
-		cic->key = cfqd;
-		read_lock(&cfq_exit_lock);
-		ioc->set_ioprio = cfq_ioc_set_ioprio;
-		ioc->cic = cic;
-		list_add(&cic->queue_list, &cfqd->cic_list);
-		read_unlock(&cfq_exit_lock);
-	} else {
-		struct cfq_io_context *__cic;
-
-		/*
-		 * the first cic on the list is actually the head itself
-		 */
-		if (cic->key == cfqd)
-			goto out;
-
-		if (unlikely(!cic->key)) {
-			read_lock(&cfq_exit_lock);
-			if (list_empty(&cic->list))
-				ioc->cic = NULL;
-			else
-				ioc->cic = list_entry(cic->list.next,
-						      struct cfq_io_context,
-						      list);
-			read_unlock(&cfq_exit_lock);
-			kmem_cache_free(cfq_ioc_pool, cic);
-			atomic_dec(&ioc_count);
-			goto restart;
-		}
-
-		/*
-		 * cic exists, check if we already are there. linear search
-		 * should be ok here, the list will usually not be more than
-		 * 1 or a few entries long
-		 */
-		list_for_each_entry(__cic, &cic->list, list) {
-			/*
-			 * this process is already holding a reference to
-			 * this queue, so no need to get one more
-			 */
-			if (__cic->key == cfqd) {
-				cic = __cic;
-				goto out;
-			}
-			if (unlikely(!__cic->key)) {
-				read_lock(&cfq_exit_lock);
-				list_del(&__cic->list);
-				read_unlock(&cfq_exit_lock);
-				kmem_cache_free(cfq_ioc_pool, __cic);
-				atomic_dec(&ioc_count);
-				goto restart;
-			}
-		}
+	cic = cfq_cic_rb_lookup(cfqd, ioc);
+	if (cic)
+		goto out;
 
-		/*
-		 * nope, process doesn't have a cic assoicated with this
-		 * cfqq yet. get a new one and add to list
-		 */
-		__cic = cfq_alloc_io_context(cfqd, gfp_mask);
-		if (__cic == NULL)
-			goto err;
-
-		__cic->ioc = ioc;
-		__cic->key = cfqd;
-		read_lock(&cfq_exit_lock);
-		list_add(&__cic->list, &cic->list);
-		list_add(&__cic->queue_list, &cfqd->cic_list);
-		read_unlock(&cfq_exit_lock);
-		cic = __cic;
-	}
+	cic = cfq_alloc_io_context(cfqd, gfp_mask);
+	if (cic == NULL)
+		goto err;
 
+	cfq_cic_link(cfqd, ioc, cic);
 out:
 	return cic;
 err:
@@ -1965,7 +1946,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	cic = cfq_get_io_context(cfqd, key, gfp_mask);
+	cic = cfq_get_io_context(cfqd, gfp_mask);
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
@@ -2133,11 +2114,14 @@ static void cfq_exit_queue(elevator_t *e)
 	request_queue_t *q = cfqd->queue;
 
 	cfq_shutdown_timer_wq(cfqd);
+
 	write_lock(&cfq_exit_lock);
 	spin_lock_irq(q->queue_lock);
+
 	if (cfqd->active_queue)
 		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
-	while(!list_empty(&cfqd->cic_list)) {
+
+	while (!list_empty(&cfqd->cic_list)) {
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
 							struct cfq_io_context,
 							queue_list);
@@ -2152,6 +2136,7 @@ static void cfq_exit_queue(elevator_t *e)
 		cic->key = NULL;
 		list_del_init(&cic->queue_list);
 	}
+
 	spin_unlock_irq(q->queue_lock);
 	write_unlock(&cfq_exit_lock);
 
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 82469db25100..cb608768ca37 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3539,11 +3539,15 @@ void put_io_context(struct io_context *ioc)
 	BUG_ON(atomic_read(&ioc->refcount) == 0);
 
 	if (atomic_dec_and_test(&ioc->refcount)) {
+		struct cfq_io_context *cic;
+
 		rcu_read_lock();
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
-		if (ioc->cic && ioc->cic->dtor)
-			ioc->cic->dtor(ioc->cic);
+		if (ioc->cic_root.rb_node != NULL) {
+			cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+			cic->dtor(ioc);
+		}
 		rcu_read_unlock();
 
 		kmem_cache_free(iocontext_cachep, ioc);
@@ -3556,6 +3560,7 @@ void exit_io_context(void)
 {
 	unsigned long flags;
 	struct io_context *ioc;
+	struct cfq_io_context *cic;
 
 	local_irq_save(flags);
 	task_lock(current);
@@ -3567,9 +3572,11 @@ void exit_io_context(void)
 
 	if (ioc->aic && ioc->aic->exit)
 		ioc->aic->exit(ioc->aic);
-	if (ioc->cic && ioc->cic->exit)
-		ioc->cic->exit(ioc->cic);
-
+	if (ioc->cic_root.rb_node != NULL) {
+		cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
+		cic->exit(ioc);
+	}
+ 
 	put_io_context(ioc);
 }
 
@@ -3598,7 +3605,7 @@ struct io_context *current_io_context(gfp_t gfp_flags)
 		ret->last_waited = jiffies; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
-		ret->cic = NULL;
+		ret->cic_root.rb_node = NULL;
 		tsk->io_context = ret;
 	}
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c179966f1a2f..ed0ffa673568 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -55,13 +55,11 @@ struct as_io_context {
 
 struct cfq_queue;
 struct cfq_io_context {
-	/*
-	 * circular list of cfq_io_contexts belonging to a process io context
-	 */
-	struct list_head list;
-	struct cfq_queue *cfqq[2];
+	struct rb_node rb_node;
 	void *key;
 
+	struct cfq_queue *cfqq[2];
+
 	struct io_context *ioc;
 
 	unsigned long last_end_request;
@@ -72,8 +70,8 @@ struct cfq_io_context {
 
 	struct list_head queue_list;
 
-	void (*dtor)(struct cfq_io_context *);
-	void (*exit)(struct cfq_io_context *);
+	void (*dtor)(struct io_context *); /* destructor */
+	void (*exit)(struct io_context *); /* called on task exit */
 };
 
 /*
@@ -94,7 +92,7 @@ struct io_context {
 	int nr_batch_requests;     /* Number of requests left in the batch */
 
 	struct as_io_context *aic;
-	struct cfq_io_context *cic;
+	struct rb_root cic_root;
 };
 
 void put_io_context(struct io_context *ioc);
-- 
cgit v1.2.3


From 206dc69b31ca05baac68c75b8ed2ba7dd857d273 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 28 Mar 2006 13:03:44 +0200
Subject: [BLOCK] cfq-iosched: seek and async performance fixes

Detect whether a given process is seeky and if so disable (mostly) the
idle window if it is. We still allow just a little idle time, just enough
to allow that process to submit a new request. That is needed to maintain
fairness across priority groups.

In some cases, we could setup several async queues. This is not optimal
from a performance POV, since we want all async io in one queue to perform
good sorting on it. It also impacted sync queues, as async io got too much
slice time.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/cfq-iosched.c    | 102 +++++++++++++++++++++++++++++++------------------
 include/linux/blkdev.h |   8 +++-
 2 files changed, 72 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 15152e2da0d2..67d446de0227 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -26,18 +26,12 @@ static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
 static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
-static int cfq_slice_idle = HZ / 100;
+static int cfq_slice_idle = HZ / 70;
 
 #define CFQ_IDLE_GRACE		(HZ / 10)
 #define CFQ_SLICE_SCALE		(5)
 
 #define CFQ_KEY_ASYNC		(0)
-#define CFQ_KEY_ANY		(0xffff)
-
-/*
- * disable queueing at the driver/hardware level
- */
-static const int cfq_max_depth = 2;
 
 static DEFINE_RWLOCK(cfq_exit_lock);
 
@@ -102,6 +96,8 @@ static struct completion *ioc_gone;
 #define cfq_cfqq_sync(cfqq)		\
 	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
 
+#define sample_valid(samples)	((samples) > 80)
+
 /*
  * Per block device queue structure
  */
@@ -170,7 +166,6 @@ struct cfq_data {
 	unsigned int cfq_slice[2];
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
-	unsigned int cfq_max_depth;
 
 	struct list_head cic_list;
 };
@@ -343,6 +338,14 @@ static int cfq_queue_empty(request_queue_t *q)
 	return !cfqd->busy_queues;
 }
 
+static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
+{
+	if (rw == READ || process_sync(task))
+		return task->pid;
+
+	return CFQ_KEY_ASYNC;
+}
+
 /*
  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
  * We choose the request that is closest to the head right now. Distance
@@ -626,15 +629,20 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
 	cfq_add_crq_rb(crq);
 }
 
-static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
-
+static struct request *
+cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 {
-	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
+	struct task_struct *tsk = current;
+	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
+	struct cfq_queue *cfqq;
 	struct rb_node *n;
+	sector_t sector;
 
+	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
 	if (!cfqq)
 		goto out;
 
+	sector = bio->bi_sector + bio_sectors(bio);
 	n = cfqq->sort_list.rb_node;
 	while (n) {
 		struct cfq_rq *crq = rb_entry_crq(n);
@@ -688,7 +696,7 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 		goto out;
 	}
 
-	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
+	__rq = cfq_find_rq_fmerge(cfqd, bio);
 	if (__rq && elv_rq_merge_ok(__rq, bio)) {
 		ret = ELEVATOR_FRONT_MERGE;
 		goto out;
@@ -891,6 +899,7 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 {
+	struct cfq_io_context *cic;
 	unsigned long sl;
 
 	WARN_ON(!RB_EMPTY(&cfqq->sort_list));
@@ -906,13 +915,23 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	/*
 	 * task has exited, don't wait
 	 */
-	if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
+	cic = cfqd->active_cic;
+	if (!cic || !cic->ioc->task)
 		return 0;
 
 	cfq_mark_cfqq_must_dispatch(cfqq);
 	cfq_mark_cfqq_wait_request(cfqq);
 
 	sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
+
+	/*
+	 * we don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing back-to-back
+	 * seeks. so allow a little bit of time for him to submit a new rq
+	 */
+	if (sample_valid(cic->seek_samples) && cic->seek_mean > 131072)
+		sl = 2;
+
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 	return 1;
 }
@@ -1129,13 +1148,6 @@ cfq_dispatch_requests(request_queue_t *q, int force)
 	if (cfqq) {
 		int max_dispatch;
 
-		/*
-		 * if idle window is disabled, allow queue buildup
-		 */
-		if (!cfq_cfqq_idle_window(cfqq) &&
-		    cfqd->rq_in_driver >= cfqd->cfq_max_depth)
-			return 0;
-
 		cfq_clear_cfqq_must_dispatch(cfqq);
 		cfq_clear_cfqq_wait_request(cfqq);
 		del_timer(&cfqd->idle_slice_timer);
@@ -1185,13 +1197,13 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
 		    const int hashval)
 {
 	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
-	struct hlist_node *entry, *next;
+	struct hlist_node *entry;
+	struct cfq_queue *__cfqq;
 
-	hlist_for_each_safe(entry, next, hash_list) {
-		struct cfq_queue *__cfqq = list_entry_qhash(entry);
+	hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
 		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
 
-		if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
+		if (__cfqq->key == key && (__p == prio || !prio))
 			return __cfqq;
 	}
 
@@ -1572,7 +1584,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
 }
 
-#define sample_valid(samples)	((samples) > 80)
+static void
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+		       struct cfq_rq *crq)
+{
+	sector_t sdist;
+	u64 total;
+
+	if (cic->last_request_pos < crq->request->sector)
+		sdist = crq->request->sector - cic->last_request_pos;
+	else
+		sdist = cic->last_request_pos - crq->request->sector;
+
+	/*
+	 * Don't allow the seek distance to get too large from the
+	 * odd fragment, pagein, etc
+	 */
+	if (cic->seek_samples <= 60) /* second&third seek */
+		sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
+	else
+		sdist = min(sdist, (cic->seek_mean * 4)	+ 2*1024*64);
+
+	cic->seek_samples = (7*cic->seek_samples + 256) / 8;
+	cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
+	total = cic->seek_total + (cic->seek_samples/2);
+	do_div(total, cic->seek_samples);
+	cic->seek_mean = (sector_t)total;
+}
 
 /*
  * Disable idle window if the process thinks too long or seeks so much that
@@ -1685,9 +1723,11 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cic = crq->io_context;
 
 	cfq_update_io_thinktime(cfqd, cic);
+	cfq_update_io_seektime(cfqd, cic, crq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
 	cic->last_queue = jiffies;
+	cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
 
 	if (cfqq == cfqd->active_queue) {
 		/*
@@ -1820,14 +1860,6 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 		cfq_resort_rr_list(cfqq, 0);
 }
 
-static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
-{
-	if (rw == READ || process_sync(task))
-		return task->pid;
-
-	return CFQ_KEY_ASYNC;
-}
-
 static inline int
 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		struct task_struct *task, int rw)
@@ -2226,7 +2258,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
-	cfqd->cfq_max_depth = cfq_max_depth;
 
 	return 0;
 out_crqpool:
@@ -2309,7 +2340,6 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
-SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -2338,7 +2368,6 @@ STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
-STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
 #undef STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
@@ -2355,7 +2384,6 @@ static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(slice_async),
 	CFQ_ATTR(slice_async_rq),
 	CFQ_ATTR(slice_idle),
-	CFQ_ATTR(max_depth),
 	__ATTR_NULL
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed0ffa673568..d0cac8b58de7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -63,11 +63,17 @@ struct cfq_io_context {
 	struct io_context *ioc;
 
 	unsigned long last_end_request;
-	unsigned long last_queue;
+	sector_t last_request_pos;
+ 	unsigned long last_queue;
+
 	unsigned long ttime_total;
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
 
+	unsigned int seek_samples;
+	u64 seek_total;
+	sector_t seek_mean;
+
 	struct list_head queue_list;
 
 	void (*dtor)(struct io_context *); /* destructor */
-- 
cgit v1.2.3


From 0080b7aae88c75e2a6b38dfcb228b0f239e18e3c Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Tue, 28 Mar 2006 01:56:15 -0800
Subject: [PATCH] synclink_gt add gpio feature

Add driver support for general purpose I/O feature of the Synclink GT
adapters.

Signed-off-by: Paul Fulghum <paulkf@micrgate.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/synclink_gt.c | 246 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/synclink.h   |  11 +-
 2 files changed, 252 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 738ec2f4e563..8818042bad7c 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -1,5 +1,5 @@
 /*
- * $Id: synclink_gt.c,v 4.22 2006/01/09 20:16:06 paulkf Exp $
+ * $Id: synclink_gt.c,v 4.25 2006/02/06 21:20:33 paulkf Exp $
  *
  * Device driver for Microgate SyncLink GT serial adapters.
  *
@@ -92,7 +92,7 @@
  * module identification
  */
 static char *driver_name     = "SyncLink GT";
-static char *driver_version  = "$Revision: 4.22 $";
+static char *driver_version  = "$Revision: 4.25 $";
 static char *tty_driver_name = "synclink_gt";
 static char *tty_dev_prefix  = "ttySLG";
 MODULE_LICENSE("GPL");
@@ -187,6 +187,20 @@ static void hdlcdev_exit(struct slgt_info *info);
 #define SLGT_MAX_PORTS 4
 #define SLGT_REG_SIZE  256
 
+/*
+ * conditional wait facility
+ */
+struct cond_wait {
+	struct cond_wait *next;
+	wait_queue_head_t q;
+	wait_queue_t wait;
+	unsigned int data;
+};
+static void init_cond_wait(struct cond_wait *w, unsigned int data);
+static void add_cond_wait(struct cond_wait **head, struct cond_wait *w);
+static void remove_cond_wait(struct cond_wait **head, struct cond_wait *w);
+static void flush_cond_wait(struct cond_wait **head);
+
 /*
  * DMA buffer descriptor and access macros
  */
@@ -269,6 +283,9 @@ struct slgt_info {
 	struct timer_list	tx_timer;
 	struct timer_list	rx_timer;
 
+	unsigned int            gpio_present;
+	struct cond_wait        *gpio_wait_q;
+
 	spinlock_t lock;	/* spinlock for synchronizing with ISR */
 
 	struct work_struct task;
@@ -379,6 +396,11 @@ static MGSL_PARAMS default_params = {
 #define MASK_OVERRUN BIT4
 
 #define GSR   0x00 /* global status */
+#define JCR   0x04 /* JTAG control */
+#define IODR  0x08 /* GPIO direction */
+#define IOER  0x0c /* GPIO interrupt enable */
+#define IOVR  0x10 /* GPIO value */
+#define IOSR  0x14 /* GPIO interrupt status */
 #define TDR   0x80 /* tx data */
 #define RDR   0x80 /* rx data */
 #define TCR   0x82 /* tx control */
@@ -503,6 +525,9 @@ static int  tiocmset(struct tty_struct *tty, struct file *file,
 static void set_break(struct tty_struct *tty, int break_state);
 static int  get_interface(struct slgt_info *info, int __user *if_mode);
 static int  set_interface(struct slgt_info *info, int if_mode);
+static int  set_gpio(struct slgt_info *info, struct gpio_desc __user *gpio);
+static int  get_gpio(struct slgt_info *info, struct gpio_desc __user *gpio);
+static int  wait_gpio(struct slgt_info *info, struct gpio_desc __user *gpio);
 
 /*
  * driver functions
@@ -1112,6 +1137,12 @@ static int ioctl(struct tty_struct *tty, struct file *file,
 		return get_interface(info, argp);
 	case MGSL_IOCSIF:
 		return set_interface(info,(int)arg);
+	case MGSL_IOCSGPIO:
+		return set_gpio(info, argp);
+	case MGSL_IOCGGPIO:
+		return get_gpio(info, argp);
+	case MGSL_IOCWAITGPIO:
+		return wait_gpio(info, argp);
 	case TIOCGICOUNT:
 		spin_lock_irqsave(&info->lock,flags);
 		cnow = info->icount;
@@ -2158,6 +2189,24 @@ static void isr_txeom(struct slgt_info *info, unsigned short status)
 	}
 }
 
+static void isr_gpio(struct slgt_info *info, unsigned int changed, unsigned int state)
+{
+	struct cond_wait *w, *prev;
+
+	/* wake processes waiting for specific transitions */
+	for (w = info->gpio_wait_q, prev = NULL ; w != NULL ; w = w->next) {
+		if (w->data & changed) {
+			w->data = state;
+			wake_up_interruptible(&w->q);
+			if (prev != NULL)
+				prev->next = w->next;
+			else
+				info->gpio_wait_q = w->next;
+		} else
+			prev = w;
+	}
+}
+
 /* interrupt service routine
  *
  * 	irq	interrupt number
@@ -2193,6 +2242,22 @@ static irqreturn_t slgt_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 		}
 	}
 
+	if (info->gpio_present) {
+		unsigned int state;
+		unsigned int changed;
+		while ((changed = rd_reg32(info, IOSR)) != 0) {
+			DBGISR(("%s iosr=%08x\n", info->device_name, changed));
+			/* read latched state of GPIO signals */
+			state = rd_reg32(info, IOVR);
+			/* clear pending GPIO interrupt bits */
+			wr_reg32(info, IOSR, changed);
+			for (i=0 ; i < info->port_count ; i++) {
+				if (info->port_array[i] != NULL)
+					isr_gpio(info->port_array[i], changed, state);
+			}
+		}
+	}
+
 	for(i=0; i < info->port_count ; i++) {
 		struct slgt_info *port = info->port_array[i];
 
@@ -2276,6 +2341,8 @@ static void shutdown(struct slgt_info *info)
 		set_signals(info);
 	}
 
+	flush_cond_wait(&info->gpio_wait_q);
+
 	spin_unlock_irqrestore(&info->lock,flags);
 
 	if (info->tty)
@@ -2650,6 +2717,175 @@ static int set_interface(struct slgt_info *info, int if_mode)
 	return 0;
 }
 
+/*
+ * set general purpose IO pin state and direction
+ *
+ * user_gpio fields:
+ * state   each bit indicates a pin state
+ * smask   set bit indicates pin state to set
+ * dir     each bit indicates a pin direction (0=input, 1=output)
+ * dmask   set bit indicates pin direction to set
+ */
+static int set_gpio(struct slgt_info *info, struct gpio_desc __user *user_gpio)
+{
+ 	unsigned long flags;
+	struct gpio_desc gpio;
+	__u32 data;
+
+	if (!info->gpio_present)
+		return -EINVAL;
+	if (copy_from_user(&gpio, user_gpio, sizeof(gpio)))
+		return -EFAULT;
+	DBGINFO(("%s set_gpio state=%08x smask=%08x dir=%08x dmask=%08x\n",
+		 info->device_name, gpio.state, gpio.smask,
+		 gpio.dir, gpio.dmask));
+
+	spin_lock_irqsave(&info->lock,flags);
+	if (gpio.dmask) {
+		data = rd_reg32(info, IODR);
+		data |= gpio.dmask & gpio.dir;
+		data &= ~(gpio.dmask & ~gpio.dir);
+		wr_reg32(info, IODR, data);
+	}
+	if (gpio.smask) {
+		data = rd_reg32(info, IOVR);
+		data |= gpio.smask & gpio.state;
+		data &= ~(gpio.smask & ~gpio.state);
+		wr_reg32(info, IOVR, data);
+	}
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	return 0;
+}
+
+/*
+ * get general purpose IO pin state and direction
+ */
+static int get_gpio(struct slgt_info *info, struct gpio_desc __user *user_gpio)
+{
+	struct gpio_desc gpio;
+	if (!info->gpio_present)
+		return -EINVAL;
+	gpio.state = rd_reg32(info, IOVR);
+	gpio.smask = 0xffffffff;
+	gpio.dir   = rd_reg32(info, IODR);
+	gpio.dmask = 0xffffffff;
+	if (copy_to_user(user_gpio, &gpio, sizeof(gpio)))
+		return -EFAULT;
+	DBGINFO(("%s get_gpio state=%08x dir=%08x\n",
+		 info->device_name, gpio.state, gpio.dir));
+	return 0;
+}
+
+/*
+ * conditional wait facility
+ */
+static void init_cond_wait(struct cond_wait *w, unsigned int data)
+{
+	init_waitqueue_head(&w->q);
+	init_waitqueue_entry(&w->wait, current);
+	w->data = data;
+}
+
+static void add_cond_wait(struct cond_wait **head, struct cond_wait *w)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&w->q, &w->wait);
+	w->next = *head;
+	*head = w;
+}
+
+static void remove_cond_wait(struct cond_wait **head, struct cond_wait *cw)
+{
+	struct cond_wait *w, *prev;
+	remove_wait_queue(&cw->q, &cw->wait);
+	set_current_state(TASK_RUNNING);
+	for (w = *head, prev = NULL ; w != NULL ; prev = w, w = w->next) {
+		if (w == cw) {
+			if (prev != NULL)
+				prev->next = w->next;
+			else
+				*head = w->next;
+			break;
+		}
+	}
+}
+
+static void flush_cond_wait(struct cond_wait **head)
+{
+	while (*head != NULL) {
+		wake_up_interruptible(&(*head)->q);
+		*head = (*head)->next;
+	}
+}
+
+/*
+ * wait for general purpose I/O pin(s) to enter specified state
+ *
+ * user_gpio fields:
+ * state - bit indicates target pin state
+ * smask - set bit indicates watched pin
+ *
+ * The wait ends when at least one watched pin enters the specified
+ * state. When 0 (no error) is returned, user_gpio->state is set to the
+ * state of all GPIO pins when the wait ends.
+ *
+ * Note: Each pin may be a dedicated input, dedicated output, or
+ * configurable input/output. The number and configuration of pins
+ * varies with the specific adapter model. Only input pins (dedicated
+ * or configured) can be monitored with this function.
+ */
+static int wait_gpio(struct slgt_info *info, struct gpio_desc __user *user_gpio)
+{
+ 	unsigned long flags;
+	int rc = 0;
+	struct gpio_desc gpio;
+	struct cond_wait wait;
+	u32 state;
+
+	if (!info->gpio_present)
+		return -EINVAL;
+	if (copy_from_user(&gpio, user_gpio, sizeof(gpio)))
+		return -EFAULT;
+	DBGINFO(("%s wait_gpio() state=%08x smask=%08x\n",
+		 info->device_name, gpio.state, gpio.smask));
+	/* ignore output pins identified by set IODR bit */
+	if ((gpio.smask &= ~rd_reg32(info, IODR)) == 0)
+		return -EINVAL;
+	init_cond_wait(&wait, gpio.smask);
+
+	spin_lock_irqsave(&info->lock, flags);
+	/* enable interrupts for watched pins */
+	wr_reg32(info, IOER, rd_reg32(info, IOER) | gpio.smask);
+	/* get current pin states */
+	state = rd_reg32(info, IOVR);
+
+	if (gpio.smask & ~(state ^ gpio.state)) {
+		/* already in target state */
+		gpio.state = state;
+	} else {
+		/* wait for target state */
+		add_cond_wait(&info->gpio_wait_q, &wait);
+		spin_unlock_irqrestore(&info->lock, flags);
+		schedule();
+		if (signal_pending(current))
+			rc = -ERESTARTSYS;
+		else
+			gpio.state = wait.data;
+		spin_lock_irqsave(&info->lock, flags);
+		remove_cond_wait(&info->gpio_wait_q, &wait);
+	}
+
+	/* disable all GPIO interrupts if no waiting processes */
+	if (info->gpio_wait_q == NULL)
+		wr_reg32(info, IOER, 0);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	if ((rc == 0) && copy_to_user(user_gpio, &gpio, sizeof(gpio)))
+		rc = -EFAULT;
+	return rc;
+}
+
 static int modem_input_wait(struct slgt_info *info,int arg)
 {
  	unsigned long flags;
@@ -3166,8 +3402,10 @@ static void device_init(int adapter_num, struct pci_dev *pdev)
 		} else {
 			port_array[0]->irq_requested = 1;
 			adapter_test(port_array[0]);
-			for (i=1 ; i < port_count ; i++)
+			for (i=1 ; i < port_count ; i++) {
 				port_array[i]->init_error = port_array[0]->init_error;
+				port_array[i]->gpio_present = port_array[0]->gpio_present;
+			}
 		}
 	}
 }
@@ -4301,7 +4539,7 @@ static int register_test(struct slgt_info *info)
 			break;
 		}
 	}
-
+	info->gpio_present = (rd_reg32(info, JCR) & BIT5) ? 1 : 0;
 	info->init_error = rc ? 0 : DiagStatus_AddressFailure;
 	return rc;
 }
diff --git a/include/linux/synclink.h b/include/linux/synclink.h
index 1b7cd8d1a71b..2993302f7923 100644
--- a/include/linux/synclink.h
+++ b/include/linux/synclink.h
@@ -1,7 +1,7 @@
 /*
  * SyncLink Multiprotocol Serial Adapter Driver
  *
- * $Id: synclink.h,v 3.10 2005/11/08 19:50:54 paulkf Exp $
+ * $Id: synclink.h,v 3.11 2006/02/06 21:20:29 paulkf Exp $
  *
  * Copyright (C) 1998-2000 by Microgate Corporation
  *
@@ -221,6 +221,12 @@ struct mgsl_icount {
 	__u32	rxidle;
 };
 
+struct gpio_desc {
+	__u32 state;
+	__u32 smask;
+	__u32 dir;
+	__u32 dmask;
+};
 
 #define DEBUG_LEVEL_DATA	1
 #define DEBUG_LEVEL_ERROR 	2
@@ -276,5 +282,8 @@ struct mgsl_icount {
 #define MGSL_IOCLOOPTXDONE	_IO(MGSL_MAGIC_IOC,9)
 #define MGSL_IOCSIF		_IO(MGSL_MAGIC_IOC,10)
 #define MGSL_IOCGIF		_IO(MGSL_MAGIC_IOC,11)
+#define MGSL_IOCSGPIO		_IOW(MGSL_MAGIC_IOC,16,struct gpio_desc)
+#define MGSL_IOCGGPIO		_IOR(MGSL_MAGIC_IOC,17,struct gpio_desc)
+#define MGSL_IOCWAITGPIO	_IOWR(MGSL_MAGIC_IOC,18,struct gpio_desc)
 
 #endif /* _SYNCLINK_H_ */
-- 
cgit v1.2.3


From 273577165cd206d2d6689ee4b18aa13de1ec4bde Mon Sep 17 00:00:00 2001
From: Brian Rogan <bcr6@cornell.edu>
Date: Tue, 28 Mar 2006 01:56:20 -0800
Subject: [PATCH] Add oprofile_add_ext_sample

On ppc64 we look at a profiling register to work out the sample address and
if it was in userspace or kernel.

The backtrace interface oprofile_add_sample does not allow this.  Create
oprofile_add_ext_sample and make oprofile_add_sample use it too.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: Philippe Elie <phil.el@wanadoo.fr>
Cc: John Levon <levon@movementarian.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/oprofile/cpu_buffer.c | 13 ++++++++++---
 include/linux/oprofile.h      | 10 ++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 330d3869b41e..fc4bc9b94c74 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -217,11 +217,10 @@ static void oprofile_end_trace(struct oprofile_cpu_buffer * cpu_buf)
 	cpu_buf->tracing = 0;
 }
 
-void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
+void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
+				unsigned long event, int is_kernel)
 {
 	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
-	unsigned long pc = profile_pc(regs);
-	int is_kernel = !user_mode(regs);
 
 	if (!backtrace_depth) {
 		log_sample(cpu_buf, pc, is_kernel, event);
@@ -238,6 +237,14 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
 	oprofile_end_trace(cpu_buf);
 }
 
+void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
+{
+	int is_kernel = !user_mode(regs);
+	unsigned long pc = profile_pc(regs);
+
+	oprofile_add_ext_sample(pc, regs, event, is_kernel);
+}
+
 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
 {
 	struct oprofile_cpu_buffer * cpu_buf = &cpu_buffer[smp_processor_id()];
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 559c4c38a9c7..b5b3197dfd4f 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -61,6 +61,16 @@ void oprofile_arch_exit(void);
  */
 void oprofile_add_sample(struct pt_regs * const regs, unsigned long event);
 
+/**
+ * Add an extended sample.  Use this when the PC is not from the regs, and
+ * we cannot determine if we're in kernel mode from the regs.
+ *
+ * This function does perform a backtrace.
+ *
+ */
+void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
+				unsigned long event, int is_kernel);
+
 /* Use this instead when the PC value is not from the regs. Doesn't
  * backtrace. */
 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event);
-- 
cgit v1.2.3


From a28af471b8946de052a0eb0c080d5457be93f168 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 28 Mar 2006 01:56:26 -0800
Subject: [PATCH] fs/fat/: proper prototypes for two functions

Add proper prototypes for fat_cache_init() and fat_cache_destroy() in
msdos_fs.h.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fat/inode.c           | 3 ---
 include/linux/msdos_fs.h | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 404bfc9f7385..c1ce284f8a94 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1435,9 +1435,6 @@ out_fail:
 
 EXPORT_SYMBOL_GPL(fat_fill_super);
 
-int __init fat_cache_init(void);
-void fat_cache_destroy(void);
-
 static int __init init_fat_fs(void)
 {
 	int err;
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 779e6a5744c7..53cee1581650 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -420,6 +420,9 @@ extern int date_dos2unix(unsigned short time, unsigned short date);
 extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date);
 extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 
+int fat_cache_init(void);
+void fat_cache_destroy(void);
+
 #endif /* __KERNEL__ */
 
 #endif
-- 
cgit v1.2.3


From e51236092d2f7e40e87e88804b5b42e5f8025415 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 28 Mar 2006 01:56:27 -0800
Subject: [PATCH] remove relayfs_fs.h

This is obsolete.

Cc: Tom Zanussi <zanussi@us.ibm.com>
Cc: Jens Axboe <axboe@suse.de>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/relayfs_fs.h | 287 ---------------------------------------------
 1 file changed, 287 deletions(-)
 delete mode 100644 include/linux/relayfs_fs.h

(limited to 'include/linux')

diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
deleted file mode 100644
index 7342e66247fb..000000000000
--- a/include/linux/relayfs_fs.h
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * linux/include/linux/relayfs_fs.h
- *
- * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
- * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
- *
- * RelayFS definitions and declarations
- */
-
-#ifndef _LINUX_RELAYFS_FS_H
-#define _LINUX_RELAYFS_FS_H
-
-#include <linux/config.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/wait.h>
-#include <linux/list.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include <linux/kref.h>
-
-/*
- * Tracks changes to rchan/rchan_buf structs
- */
-#define RELAYFS_CHANNEL_VERSION		6
-
-/*
- * Per-cpu relay channel buffer
- */
-struct rchan_buf
-{
-	void *start;			/* start of channel buffer */
-	void *data;			/* start of current sub-buffer */
-	size_t offset;			/* current offset into sub-buffer */
-	size_t subbufs_produced;	/* count of sub-buffers produced */
-	size_t subbufs_consumed;	/* count of sub-buffers consumed */
-	struct rchan *chan;		/* associated channel */
-	wait_queue_head_t read_wait;	/* reader wait queue */
-	struct work_struct wake_readers; /* reader wake-up work struct */
-	struct dentry *dentry;		/* channel file dentry */
-	struct kref kref;		/* channel buffer refcount */
-	struct page **page_array;	/* array of current buffer pages */
-	unsigned int page_count;	/* number of current buffer pages */
-	unsigned int finalized;		/* buffer has been finalized */
-	size_t *padding;		/* padding counts per sub-buffer */
-	size_t prev_padding;		/* temporary variable */
-	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
-	unsigned int cpu;		/* this buf's cpu */
-} ____cacheline_aligned;
-
-/*
- * Relay channel data structure
- */
-struct rchan
-{
-	u32 version;			/* the version of this struct */
-	size_t subbuf_size;		/* sub-buffer size */
-	size_t n_subbufs;		/* number of sub-buffers per buffer */
-	size_t alloc_size;		/* total buffer size allocated */
-	struct rchan_callbacks *cb;	/* client callbacks */
-	struct kref kref;		/* channel refcount */
-	void *private_data;		/* for user-defined data */
-	size_t last_toobig;		/* tried to log event > subbuf size */
-	struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
-};
-
-/*
- * Relay channel client callbacks
- */
-struct rchan_callbacks
-{
-	/*
-	 * subbuf_start - called on buffer-switch to a new sub-buffer
-	 * @buf: the channel buffer containing the new sub-buffer
-	 * @subbuf: the start of the new sub-buffer
-	 * @prev_subbuf: the start of the previous sub-buffer
-	 * @prev_padding: unused space at the end of previous sub-buffer
-	 *
-	 * The client should return 1 to continue logging, 0 to stop
-	 * logging.
-	 *
-	 * NOTE: subbuf_start will also be invoked when the buffer is
-	 *       created, so that the first sub-buffer can be initialized
-	 *       if necessary.  In this case, prev_subbuf will be NULL.
-	 *
-	 * NOTE: the client can reserve bytes at the beginning of the new
-	 *       sub-buffer by calling subbuf_start_reserve() in this callback.
-	 */
-	int (*subbuf_start) (struct rchan_buf *buf,
-			     void *subbuf,
-			     void *prev_subbuf,
-			     size_t prev_padding);
-
-	/*
-	 * buf_mapped - relayfs buffer mmap notification
-	 * @buf: the channel buffer
-	 * @filp: relayfs file pointer
-	 *
-	 * Called when a relayfs file is successfully mmapped
-	 */
-        void (*buf_mapped)(struct rchan_buf *buf,
-			   struct file *filp);
-
-	/*
-	 * buf_unmapped - relayfs buffer unmap notification
-	 * @buf: the channel buffer
-	 * @filp: relayfs file pointer
-	 *
-	 * Called when a relayfs file is successfully unmapped
-	 */
-        void (*buf_unmapped)(struct rchan_buf *buf,
-			     struct file *filp);
-	/*
-	 * create_buf_file - create file to represent a relayfs channel buffer
-	 * @filename: the name of the file to create
-	 * @parent: the parent of the file to create
-	 * @mode: the mode of the file to create
-	 * @buf: the channel buffer
-	 * @is_global: outparam - set non-zero if the buffer should be global
-	 *
-	 * Called during relay_open(), once for each per-cpu buffer,
-	 * to allow the client to create a file to be used to
-	 * represent the corresponding channel buffer.  If the file is
-	 * created outside of relayfs, the parent must also exist in
-	 * that filesystem.
-	 *
-	 * The callback should return the dentry of the file created
-	 * to represent the relay buffer.
-	 *
-	 * Setting the is_global outparam to a non-zero value will
-	 * cause relay_open() to create a single global buffer rather
-	 * than the default set of per-cpu buffers.
-	 *
-	 * See Documentation/filesystems/relayfs.txt for more info.
-	 */
-	struct dentry *(*create_buf_file)(const char *filename,
-					  struct dentry *parent,
-					  int mode,
-					  struct rchan_buf *buf,
-					  int *is_global);
-
-	/*
-	 * remove_buf_file - remove file representing a relayfs channel buffer
-	 * @dentry: the dentry of the file to remove
-	 *
-	 * Called during relay_close(), once for each per-cpu buffer,
-	 * to allow the client to remove a file used to represent a
-	 * channel buffer.
-	 *
-	 * The callback should return 0 if successful, negative if not.
-	 */
-	int (*remove_buf_file)(struct dentry *dentry);
-};
-
-/*
- * relayfs kernel API, fs/relayfs/relay.c
- */
-
-struct rchan *relay_open(const char *base_filename,
-			 struct dentry *parent,
-			 size_t subbuf_size,
-			 size_t n_subbufs,
-			 struct rchan_callbacks *cb);
-extern void relay_close(struct rchan *chan);
-extern void relay_flush(struct rchan *chan);
-extern void relay_subbufs_consumed(struct rchan *chan,
-				   unsigned int cpu,
-				   size_t consumed);
-extern void relay_reset(struct rchan *chan);
-extern int relay_buf_full(struct rchan_buf *buf);
-
-extern size_t relay_switch_subbuf(struct rchan_buf *buf,
-				  size_t length);
-extern struct dentry *relayfs_create_dir(const char *name,
-					 struct dentry *parent);
-extern int relayfs_remove_dir(struct dentry *dentry);
-extern struct dentry *relayfs_create_file(const char *name,
-					  struct dentry *parent,
-					  int mode,
-					  struct file_operations *fops,
-					  void *data);
-extern int relayfs_remove_file(struct dentry *dentry);
-
-/**
- *	relay_write - write data into the channel
- *	@chan: relay channel
- *	@data: data to be written
- *	@length: number of bytes to write
- *
- *	Writes data into the current cpu's channel buffer.
- *
- *	Protects the buffer by disabling interrupts.  Use this
- *	if you might be logging from interrupt context.  Try
- *	__relay_write() if you know you	won't be logging from
- *	interrupt context.
- */
-static inline void relay_write(struct rchan *chan,
-			       const void *data,
-			       size_t length)
-{
-	unsigned long flags;
-	struct rchan_buf *buf;
-
-	local_irq_save(flags);
-	buf = chan->buf[smp_processor_id()];
-	if (unlikely(buf->offset + length > chan->subbuf_size))
-		length = relay_switch_subbuf(buf, length);
-	memcpy(buf->data + buf->offset, data, length);
-	buf->offset += length;
-	local_irq_restore(flags);
-}
-
-/**
- *	__relay_write - write data into the channel
- *	@chan: relay channel
- *	@data: data to be written
- *	@length: number of bytes to write
- *
- *	Writes data into the current cpu's channel buffer.
- *
- *	Protects the buffer by disabling preemption.  Use
- *	relay_write() if you might be logging from interrupt
- *	context.
- */
-static inline void __relay_write(struct rchan *chan,
-				 const void *data,
-				 size_t length)
-{
-	struct rchan_buf *buf;
-
-	buf = chan->buf[get_cpu()];
-	if (unlikely(buf->offset + length > buf->chan->subbuf_size))
-		length = relay_switch_subbuf(buf, length);
-	memcpy(buf->data + buf->offset, data, length);
-	buf->offset += length;
-	put_cpu();
-}
-
-/**
- *	relay_reserve - reserve slot in channel buffer
- *	@chan: relay channel
- *	@length: number of bytes to reserve
- *
- *	Returns pointer to reserved slot, NULL if full.
- *
- *	Reserves a slot in the current cpu's channel buffer.
- *	Does not protect the buffer at all - caller must provide
- *	appropriate synchronization.
- */
-static inline void *relay_reserve(struct rchan *chan, size_t length)
-{
-	void *reserved;
-	struct rchan_buf *buf = chan->buf[smp_processor_id()];
-
-	if (unlikely(buf->offset + length > buf->chan->subbuf_size)) {
-		length = relay_switch_subbuf(buf, length);
-		if (!length)
-			return NULL;
-	}
-	reserved = buf->data + buf->offset;
-	buf->offset += length;
-
-	return reserved;
-}
-
-/**
- *	subbuf_start_reserve - reserve bytes at the start of a sub-buffer
- *	@buf: relay channel buffer
- *	@length: number of bytes to reserve
- *
- *	Helper function used to reserve bytes at the beginning of
- *	a sub-buffer in the subbuf_start() callback.
- */
-static inline void subbuf_start_reserve(struct rchan_buf *buf,
-					size_t length)
-{
-	BUG_ON(length >= buf->chan->subbuf_size - 1);
-	buf->offset = length;
-}
-
-/*
- * exported relay file operations, fs/relayfs/inode.c
- */
-extern struct file_operations relay_file_operations;
-
-#endif /* _LINUX_RELAYFS_FS_H */
-
-- 
cgit v1.2.3


From d266ab88938e49aa95f1965ee020df1b1d4c5761 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 28 Mar 2006 01:56:31 -0800
Subject: [PATCH] Small fixes backported to old IDE SiS driver

Some quick backport bits from the libata PATA work to fix things found in
the sis driver.  The piix driver needs some fixes too but those are way to
large and need someone working on old IDE with time to do them.

This patch fixes the case where random bits get loaded into SIS timing
registers according to the description of the correct behaviour from
Vojtech Pavlik.  It also adds the SiS5517 ATA16 chipset which is not
currently supported by the driver.  Thanks to Conrad Harriss for loaning me
the machine with the 5517 chipset.

Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/pci/sis5513.c | 2 ++
 include/linux/pci_ids.h   | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 75a2253a3e68..8e9d87701ce2 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -112,6 +112,7 @@ static const struct {
 
 	{ "SiS5596",	PCI_DEVICE_ID_SI_5596,	ATA_16   },
 	{ "SiS5571",	PCI_DEVICE_ID_SI_5571,	ATA_16   },
+	{ "SiS5517",	PCI_DEVICE_ID_SI_5517,	ATA_16   },
 	{ "SiS551x",	PCI_DEVICE_ID_SI_5511,	ATA_16   },
 };
 
@@ -524,6 +525,7 @@ static void config_art_rwp_pio (ide_drive_t *drive, u8 pio)
 			case 3:		test1 = 0x30|0x03; break;
 			case 2:		test1 = 0x40|0x04; break;
 			case 1:		test1 = 0x60|0x07; break;
+			case 0:		test1 = 0x00; break;
 			default:	break;
 		}
 		pci_write_config_byte(dev, drive_pci, test1);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 02f6cf20b141..e2ab2ac18d6b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -642,6 +642,7 @@
 #define PCI_DEVICE_ID_SI_965		0x0965
 #define PCI_DEVICE_ID_SI_5511		0x5511
 #define PCI_DEVICE_ID_SI_5513		0x5513
+#define PCI_DEVICE_ID_SI_5517		0x5517
 #define PCI_DEVICE_ID_SI_5518		0x5518
 #define PCI_DEVICE_ID_SI_5571		0x5571
 #define PCI_DEVICE_ID_SI_5581		0x5581
-- 
cgit v1.2.3


From 70674f95c0a2ea694d5c39f4e514f538a09be36f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 28 Mar 2006 01:56:33 -0800
Subject: [PATCH] Optimize select/poll by putting small data sets on the stack

Optimize select and poll by a using stack space for small fd sets

This brings back an old optimization from Linux 2.0.  Using the stack is
faster than kmalloc.  On a Intel P4 system it speeds up a select of a
single pty fd by about 13% (~4000 cycles -> ~3500)

It also saves memory because a daemon hanging in select or poll will
usually save one or two less pages.  This can add up - e.g.  if you have 10
daemons blocking in poll/select you save 40KB of memory.

I did a patch for this long ago, but it was never applied.  This version is
a reimplementation of the old patch that tries to be less intrusive.  I
only did the minimal changes needed for the stack allocation.

The cut off point before external memory is allocated is currently at
832bytes.  The system calls always allocate this much memory on the stack.

These 832 bytes are divided into 256 bytes frontend data (for the select
bitmaps of the pollfds) and the rest of the space for the wait queues used
by the low level drivers.  There are some extreme cases where this won't
work out for select and it falls back to allocating memory too early -
especially with very sparse large select bitmaps - but the majority of
processes who only have a small number of file descriptors should be ok.
[TBD: 832/256 might not be the best split for select or poll]

I suspect more optimizations might be possible, but they would be more
complicated.  One way would be to cache the select/poll context over
multiple system calls because typically the input values should be similar.
 Problem is when to flush the file descriptors out though.

Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/select.c          | 106 +++++++++++++++++++++++++++++++--------------------
 include/linux/poll.h |  17 +++++++++
 2 files changed, 81 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/select.c b/fs/select.c
index 1815a57d2255..d8b4f0722b8d 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -29,12 +29,6 @@
 #define ROUND_UP(x,y) (((x)+(y)-1)/(y))
 #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
-struct poll_table_entry {
-	struct file * filp;
-	wait_queue_t wait;
-	wait_queue_head_t * wait_address;
-};
-
 struct poll_table_page {
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
@@ -64,13 +58,23 @@ void poll_initwait(struct poll_wqueues *pwq)
 	init_poll_funcptr(&pwq->pt, __pollwait);
 	pwq->error = 0;
 	pwq->table = NULL;
+	pwq->inline_index = 0;
 }
 
 EXPORT_SYMBOL(poll_initwait);
 
+static void free_poll_entry(struct poll_table_entry *entry)
+{
+	remove_wait_queue(entry->wait_address,&entry->wait);
+	fput(entry->filp);
+}
+
 void poll_freewait(struct poll_wqueues *pwq)
 {
 	struct poll_table_page * p = pwq->table;
+	int i;
+	for (i = 0; i < pwq->inline_index; i++)
+		free_poll_entry(pwq->inline_entries + i);
 	while (p) {
 		struct poll_table_entry * entry;
 		struct poll_table_page *old;
@@ -78,8 +82,7 @@ void poll_freewait(struct poll_wqueues *pwq)
 		entry = p->entry;
 		do {
 			entry--;
-			remove_wait_queue(entry->wait_address,&entry->wait);
-			fput(entry->filp);
+			free_poll_entry(entry);
 		} while (entry > p->entries);
 		old = p;
 		p = p->next;
@@ -89,12 +92,14 @@ void poll_freewait(struct poll_wqueues *pwq)
 
 EXPORT_SYMBOL(poll_freewait);
 
-static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
-		       poll_table *_p)
+static struct poll_table_entry *poll_get_entry(poll_table *_p)
 {
 	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
 	struct poll_table_page *table = p->table;
 
+	if (p->inline_index < N_INLINE_POLL_ENTRIES)
+		return p->inline_entries + p->inline_index++;
+
 	if (!table || POLL_TABLE_FULL(table)) {
 		struct poll_table_page *new_table;
 
@@ -102,7 +107,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 		if (!new_table) {
 			p->error = -ENOMEM;
 			__set_current_state(TASK_RUNNING);
-			return;
+			return NULL;
 		}
 		new_table->entry = new_table->entries;
 		new_table->next = table;
@@ -110,16 +115,21 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 		table = new_table;
 	}
 
-	/* Add a new entry */
-	{
-		struct poll_table_entry * entry = table->entry;
-		table->entry = entry+1;
-	 	get_file(filp);
-	 	entry->filp = filp;
-		entry->wait_address = wait_address;
-		init_waitqueue_entry(&entry->wait, current);
-		add_wait_queue(wait_address,&entry->wait);
-	}
+	return table->entry++;
+}
+
+/* Add a new entry */
+static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
+				poll_table *p)
+{
+	struct poll_table_entry *entry = poll_get_entry(p);
+	if (!entry)
+		return;
+	get_file(filp);
+	entry->filp = filp;
+	entry->wait_address = wait_address;
+	init_waitqueue_entry(&entry->wait, current);
+	add_wait_queue(wait_address,&entry->wait);
 }
 
 #define FDS_IN(fds, n)		(fds->in + n)
@@ -284,16 +294,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 	return retval;
 }
 
-static void *select_bits_alloc(int size)
-{
-	return kmalloc(6 * size, GFP_KERNEL);
-}
-
-static void select_bits_free(void *bits, int size)
-{
-	kfree(bits);
-}
-
 /*
  * We can actually return ERESTARTSYS instead of EINTR, but I'd
  * like to be certain this leads to no problems. So I return
@@ -312,6 +312,8 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	char *bits;
 	int ret, size, max_fdset;
 	struct fdtable *fdt;
+	/* Allocate small arguments on the stack to save memory and be faster */
+	char stack_fds[SELECT_STACK_ALLOC];
 
 	ret = -EINVAL;
 	if (n < 0)
@@ -332,7 +334,10 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	 */
 	ret = -ENOMEM;
 	size = FDS_BYTES(n);
-	bits = select_bits_alloc(size);
+	if (6*size < SELECT_STACK_ALLOC)
+		bits = stack_fds;
+	else
+		bits = kmalloc(6 * size, GFP_KERNEL);
 	if (!bits)
 		goto out_nofds;
 	fds.in      = (unsigned long *)  bits;
@@ -367,7 +372,8 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 		ret = -EFAULT;
 
 out:
-	select_bits_free(bits, size);
+	if (bits != stack_fds)
+		kfree(bits);
 out_nofds:
 	return ret;
 }
@@ -619,6 +625,9 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	return count;
 }
 
+#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
+			sizeof(struct pollfd))
+
 int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 {
 	struct poll_wqueues table;
@@ -628,6 +637,9 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
  	struct poll_list *walk;
 	struct fdtable *fdt;
 	int max_fdset;
+	/* Allocate small arguments on the stack to save memory and be faster */
+	char stack_pps[POLL_STACK_ALLOC];
+	struct poll_list *stack_pp = NULL;
 
 	/* Do a sanity check on nfds ... */
 	rcu_read_lock();
@@ -645,14 +657,23 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 	err = -ENOMEM;
 	while(i!=0) {
 		struct poll_list *pp;
-		pp = kmalloc(sizeof(struct poll_list)+
-				sizeof(struct pollfd)*
-				(i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i),
-					GFP_KERNEL);
-		if(pp==NULL)
-			goto out_fds;
+		int num, size;
+		if (stack_pp == NULL)
+			num = N_STACK_PPS;
+		else
+			num = POLLFD_PER_PAGE;
+		if (num > i)
+			num = i;
+		size = sizeof(struct poll_list) + sizeof(struct pollfd)*num;
+		if (!stack_pp)
+			stack_pp = pp = (struct poll_list *)stack_pps;
+		else {
+			pp = kmalloc(size, GFP_KERNEL);
+			if (!pp)
+				goto out_fds;
+		}
 		pp->next=NULL;
-		pp->len = (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i);
+		pp->len = num;
 		if (head == NULL)
 			head = pp;
 		else
@@ -660,7 +681,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 
 		walk = pp;
 		if (copy_from_user(pp->entries, ufds + nfds-i, 
-				sizeof(struct pollfd)*pp->len)) {
+				sizeof(struct pollfd)*num)) {
 			err = -EFAULT;
 			goto out_fds;
 		}
@@ -689,7 +710,8 @@ out_fds:
 	walk = head;
 	while(walk!=NULL) {
 		struct poll_list *pp = walk->next;
-		kfree(walk);
+		if (walk != stack_pp)
+			kfree(walk);
 		walk = pp;
 	}
 	poll_freewait(&table);
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 8e8f6098508a..51e1b56741fb 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -11,6 +11,15 @@
 #include <linux/mm.h>
 #include <asm/uaccess.h>
 
+/* ~832 bytes of stack space used max in sys_select/sys_poll before allocating
+   additional memory. */
+#define MAX_STACK_ALLOC 832
+#define FRONTEND_STACK_ALLOC	256
+#define SELECT_STACK_ALLOC	FRONTEND_STACK_ALLOC
+#define POLL_STACK_ALLOC	FRONTEND_STACK_ALLOC
+#define WQUEUES_STACK_ALLOC	(MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
+#define N_INLINE_POLL_ENTRIES	(WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))
+
 struct poll_table_struct;
 
 /* 
@@ -33,6 +42,12 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->qproc = qproc;
 }
 
+struct poll_table_entry {
+	struct file * filp;
+	wait_queue_t wait;
+	wait_queue_head_t * wait_address;
+};
+
 /*
  * Structures and helpers for sys_poll/sys_poll
  */
@@ -40,6 +55,8 @@ struct poll_wqueues {
 	poll_table pt;
 	struct poll_table_page * table;
 	int error;
+	int inline_index;
+	struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
 };
 
 extern void poll_initwait(struct poll_wqueues *pwq);
-- 
cgit v1.2.3


From 631d6747e1d877a4baa924cb373b8b9511a53e5e Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 28 Mar 2006 01:56:36 -0800
Subject: [PATCH] for_each_possible_cpu: defines for_each_possible_cpu

for_each_cpu() is a for-loop over cpu_possible_map.  for_each_online_cpu is
for-loop cpu over cpu_online_map.  .....for_each_cpu() is not sufficiently
explicit and can lead to mistakes.

This patch adds for_each_possible_cpu() in preparation for the removal of
for_each_cpu().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpumask.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 99e6115d8e52..9cbb781d6f80 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -67,7 +67,7 @@
  *
  * int any_online_cpu(mask)		First online cpu in mask
  *
- * for_each_cpu(cpu)			for-loop cpu over cpu_possible_map
+ * for_each_possible_cpu(cpu)		for-loop cpu over cpu_possible_map
  * for_each_online_cpu(cpu)		for-loop cpu over cpu_online_map
  * for_each_present_cpu(cpu)		for-loop cpu over cpu_present_map
  *
@@ -405,7 +405,8 @@ int __any_online_cpu(const cpumask_t *mask);
 #define any_online_cpu(mask)		0
 #endif
 
-#define for_each_cpu(cpu)	  for_each_cpu_mask((cpu), cpu_possible_map)
+#define for_each_cpu(cpu)  for_each_cpu_mask((cpu), cpu_possible_map)
+#define for_each_possible_cpu(cpu)  for_each_cpu_mask((cpu), cpu_possible_map)
 #define for_each_online_cpu(cpu)  for_each_cpu_mask((cpu), cpu_online_map)
 #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
 
-- 
cgit v1.2.3


From 0a945022778f100115d0cb6234eb28fc1b15ccaf Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 28 Mar 2006 01:56:37 -0800
Subject: [PATCH] for_each_possible_cpu: fixes for generic part

replaces for_each_cpu with for_each_possible_cpu().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ll_rw_blk.c            | 2 +-
 fs/file.c                    | 2 +-
 fs/proc/proc_misc.c          | 2 +-
 include/asm-generic/percpu.h | 2 +-
 include/linux/genhd.h        | 4 ++--
 include/linux/kernel_stat.h  | 2 +-
 init/main.c                  | 2 +-
 kernel/rcutorture.c          | 4 ++--
 kernel/sched.c               | 8 ++++----
 mm/slab.c                    | 4 ++--
 mm/swap.c                    | 2 +-
 11 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 82469db25100..5a19e2eb5711 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3514,7 +3514,7 @@ int __init blk_dev_init(void)
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
diff --git a/fs/file.c b/fs/file.c
index bbc743314730..55f4e7022563 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -373,6 +373,6 @@ static void __devinit fdtable_defer_list_init(int cpu)
 void __init files_defer_init(void)
 {
 	int i;
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		fdtable_defer_list_init(i);
 }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1e9ea37d457e..1edce0c34bfd 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -534,7 +534,7 @@ static int show_stat(struct seq_file *p, void *v)
 	if (wall_to_monotonic.tv_nsec)
 		--jif;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		int j;
 
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 78cf45547e31..c0caf433a7d7 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -19,7 +19,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for_each_cpu(__i)					\
+	for_each_possible_cpu(__i)				\
 		memcpy((pcpudst)+__per_cpu_offset[__i],		\
 		       (src), (size));				\
 } while (0)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 3c1b0294a742..10a27f29d692 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -152,14 +152,14 @@ struct disk_attribute {
 ({									\
 	typeof(gendiskp->dkstats->field) res = 0;			\
 	int i;								\
-	for_each_cpu(i)							\
+	for_each_possible_cpu(i)					\
 		res += per_cpu_ptr(gendiskp->dkstats, i)->field;	\
 	res;								\
 })
 
 static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)	{
 	int i;
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		memset(per_cpu_ptr(gendiskp->dkstats, i), value,
 				sizeof (struct disk_stats));
 }		
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a484572c302e..b46249082cca 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -46,7 +46,7 @@ static inline int kstat_irqs(int irq)
 {
 	int cpu, sum = 0;
 
-	for_each_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		sum += kstat_cpu(cpu).irqs[irq];
 
 	return sum;
diff --git a/init/main.c b/init/main.c
index 64466ea1984c..4a2f0898dda1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -341,7 +341,7 @@ static void __init setup_per_cpu_areas(void)
 #endif
 	ptr = alloc_bootmem(size * nr_possible_cpus);
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 		ptr += size;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b4b362b5baf5..8154e7589d12 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
 	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
 			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
 			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -535,7 +535,7 @@ rcu_torture_init(void)
 	atomic_set(&n_rcu_torture_error, 0);
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		atomic_set(&rcu_torture_wcount[i], 0);
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
 			per_cpu(rcu_torture_count, cpu)[i] = 0;
 			per_cpu(rcu_torture_batch, cpu)[i] = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7854ee516b92..a9ecac398bb9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1625,7 +1625,7 @@ unsigned long nr_uninterruptible(void)
 {
 	unsigned long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_uninterruptible;
 
 	/*
@@ -1642,7 +1642,7 @@ unsigned long long nr_context_switches(void)
 {
 	unsigned long long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_switches;
 
 	return sum;
@@ -1652,7 +1652,7 @@ unsigned long nr_iowait(void)
 {
 	unsigned long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += atomic_read(&cpu_rq(i)->nr_iowait);
 
 	return sum;
@@ -6080,7 +6080,7 @@ void __init sched_init(void)
 	runqueue_t *rq;
 	int i, j, k;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		prio_array_t *array;
 
 		rq = cpu_rq(i);
diff --git a/mm/slab.c b/mm/slab.c
index 681837499d7d..4cbf8bb13557 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3311,7 +3311,7 @@ void *__alloc_percpu(size_t size)
 	 * and we have no way of figuring out how to fix the array
 	 * that we have allocated then....
 	 */
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		int node = cpu_to_node(i);
 
 		if (node_online(node))
@@ -3398,7 +3398,7 @@ void free_percpu(const void *objp)
 	/*
 	 * We allocate for all cpus so we cannot use for online cpu here.
 	 */
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 	    kfree(p->ptrs[i]);
 	kfree(p);
 }
diff --git a/mm/swap.c b/mm/swap.c
index 91b7e2026f69..88895c249bc9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -512,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc)
 
 	spin_lock(&fbc->lock);
 	ret = fbc->count;
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		long *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
-- 
cgit v1.2.3


From 99ac48f54a91d02140c497edc31dc57d4bc5c85d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Tue, 28 Mar 2006 01:56:41 -0800
Subject: [PATCH] mark f_ops const in the inode

Mark the f_ops members of inodes as const, as well as fix the
ripple-through this causes by places that copy this f_ops and then "do
stuff" with it.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 arch/ppc/kernel/ppc_htab.c                |  2 +-
 drivers/char/drm/drm_fops.c               |  2 +-
 drivers/char/drm/i810_dma.c               |  2 +-
 drivers/char/drm/i830_dma.c               |  2 +-
 drivers/char/mem.c                        |  2 +-
 drivers/char/misc.c                       |  2 +-
 drivers/input/input.c                     |  2 +-
 drivers/isdn/capi/kcapi_proc.c            |  2 +-
 drivers/media/dvb/dvb-core/dvbdev.c       |  2 +-
 drivers/media/video/videodev.c            |  2 +-
 drivers/message/i2o/i2o_proc.c            |  2 +-
 drivers/oprofile/oprofilefs.c             |  6 +++---
 drivers/telephony/phonedev.c              |  2 +-
 drivers/usb/core/file.c                   |  6 +++---
 drivers/usb/gadget/inode.c                |  6 +++---
 fs/char_dev.c                             |  4 ++--
 fs/debugfs/inode.c                        |  2 +-
 fs/inode.c                                |  2 +-
 fs/nfsd/vfs.c                             |  2 +-
 fs/proc/generic.c                         |  2 +-
 fs/proc/internal.h                        |  2 +-
 fs/proc/proc_misc.c                       |  2 +-
 fs/select.c                               |  2 +-
 include/linux/cdev.h                      |  4 ++--
 include/linux/debugfs.h                   |  2 +-
 include/linux/fs.h                        |  6 +++---
 include/linux/input.h                     |  2 +-
 include/linux/miscdevice.h                |  2 +-
 include/linux/oprofile.h                  |  4 ++--
 include/linux/proc_fs.h                   |  4 ++--
 include/linux/sound.h                     | 12 ++++++------
 include/linux/sunrpc/stats.h              |  4 ++--
 include/linux/usb.h                       |  2 +-
 include/linux/videodev2.h                 |  2 +-
 include/sound/core.h                      |  6 +++---
 net/sunrpc/rpc_pipe.c                     |  2 +-
 net/sunrpc/stats.c                        |  4 ++--
 sound/core/init.c                         |  3 ++-
 sound/core/sound.c                        |  4 ++--
 sound/core/sound_oss.c                    |  2 +-
 sound/sound_core.c                        | 22 +++++++++++-----------
 42 files changed, 75 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index b3962c3a0348..5be40aa483fd 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -103,7 +103,7 @@ spufs_setattr(struct dentry *dentry, struct iattr *attr)
 
 static int
 spufs_new_file(struct super_block *sb, struct dentry *dentry,
-		struct file_operations *fops, int mode,
+		const struct file_operations *fops, int mode,
 		struct spu_context *ctx)
 {
 	static struct inode_operations spufs_file_iops = {
diff --git a/arch/ppc/kernel/ppc_htab.c b/arch/ppc/kernel/ppc_htab.c
index 2f5c7650274f..9b84bffdefce 100644
--- a/arch/ppc/kernel/ppc_htab.c
+++ b/arch/ppc/kernel/ppc_htab.c
@@ -52,7 +52,7 @@ static int ppc_htab_open(struct inode *inode, struct file *file)
 	return single_open(file, ppc_htab_show, NULL);
 }
 
-struct file_operations ppc_htab_operations = {
+const struct file_operations ppc_htab_operations = {
 	.open		= ppc_htab_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/drivers/char/drm/drm_fops.c b/drivers/char/drm/drm_fops.c
index 641f7633878c..b7f7951c4587 100644
--- a/drivers/char/drm/drm_fops.c
+++ b/drivers/char/drm/drm_fops.c
@@ -175,7 +175,7 @@ int drm_stub_open(struct inode *inode, struct file *filp)
 	drm_device_t *dev = NULL;
 	int minor = iminor(inode);
 	int err = -ENODEV;
-	struct file_operations *old_fops;
+	const struct file_operations *old_fops;
 
 	DRM_DEBUG("\n");
 
diff --git a/drivers/char/drm/i810_dma.c b/drivers/char/drm/i810_dma.c
index ae0aa6d7e0bb..c658dde3633b 100644
--- a/drivers/char/drm/i810_dma.c
+++ b/drivers/char/drm/i810_dma.c
@@ -126,7 +126,7 @@ static int i810_map_buffer(drm_buf_t * buf, struct file *filp)
 	drm_device_t *dev = priv->head->dev;
 	drm_i810_buf_priv_t *buf_priv = buf->dev_private;
 	drm_i810_private_t *dev_priv = dev->dev_private;
-	struct file_operations *old_fops;
+	const struct file_operations *old_fops;
 	int retcode = 0;
 
 	if (buf_priv->currently_mapped == I810_BUF_MAPPED)
diff --git a/drivers/char/drm/i830_dma.c b/drivers/char/drm/i830_dma.c
index 163f2cbfe60d..b0f815d8cea8 100644
--- a/drivers/char/drm/i830_dma.c
+++ b/drivers/char/drm/i830_dma.c
@@ -128,7 +128,7 @@ static int i830_map_buffer(drm_buf_t * buf, struct file *filp)
 	drm_device_t *dev = priv->head->dev;
 	drm_i830_buf_priv_t *buf_priv = buf->dev_private;
 	drm_i830_private_t *dev_priv = dev->dev_private;
-	struct file_operations *old_fops;
+	const struct file_operations *old_fops;
 	unsigned long virtual;
 	int retcode = 0;
 
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 5245ba1649ed..66719f9d294c 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -899,7 +899,7 @@ static const struct {
 	unsigned int		minor;
 	char			*name;
 	umode_t			mode;
-	struct file_operations	*fops;
+	const struct file_operations	*fops;
 } devlist[] = { /* list of minor devices */
 	{1, "mem",     S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops},
 	{2, "kmem",    S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops},
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 3e4c0414a01a..96eb2a709e21 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -129,7 +129,7 @@ static int misc_open(struct inode * inode, struct file * file)
 	int minor = iminor(inode);
 	struct miscdevice *c;
 	int err = -ENODEV;
-	struct file_operations *old_fops, *new_fops = NULL;
+	const struct file_operations *old_fops, *new_fops = NULL;
 	
 	down(&misc_sem);
 	
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 4fe3da3c667a..f8af0945964e 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -923,7 +923,7 @@ void input_unregister_handler(struct input_handler *handler)
 static int input_open_file(struct inode *inode, struct file *file)
 {
 	struct input_handler *handler = input_table[iminor(inode) >> 5];
-	struct file_operations *old_fops, *new_fops = NULL;
+	const struct file_operations *old_fops, *new_fops = NULL;
 	int err;
 
 	/* No load-on-demand here? */
diff --git a/drivers/isdn/capi/kcapi_proc.c b/drivers/isdn/capi/kcapi_proc.c
index 2cc8b27e4c3b..ca9dc00a45c4 100644
--- a/drivers/isdn/capi/kcapi_proc.c
+++ b/drivers/isdn/capi/kcapi_proc.c
@@ -233,7 +233,7 @@ static struct file_operations proc_applstats_ops = {
 };
 
 static void
-create_seq_entry(char *name, mode_t mode, struct file_operations *f)
+create_seq_entry(char *name, mode_t mode, const struct file_operations *f)
 {
 	struct proc_dir_entry *entry;
 	entry = create_proc_entry(name, mode, NULL);
diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c
index 54f8b95717b0..96fe0ecae250 100644
--- a/drivers/media/dvb/dvb-core/dvbdev.c
+++ b/drivers/media/dvb/dvb-core/dvbdev.c
@@ -86,7 +86,7 @@ static int dvb_device_open(struct inode *inode, struct file *file)
 
 	if (dvbdev && dvbdev->fops) {
 		int err = 0;
-		struct file_operations *old_fops;
+		const struct file_operations *old_fops;
 
 		file->private_data = dvbdev;
 		old_fops = file->f_op;
diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 75e3d41382f2..5f87dd5f1d0b 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -97,7 +97,7 @@ static int video_open(struct inode *inode, struct file *file)
 	unsigned int minor = iminor(inode);
 	int err = 0;
 	struct video_device *vfl;
-	struct file_operations *old_fops;
+	const struct file_operations *old_fops;
 
 	if(minor>=VIDEO_NUM_DEVICES)
 		return -ENODEV;
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index 2a0c42b8cda5..3d2e76eea93e 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -56,7 +56,7 @@
 typedef struct _i2o_proc_entry_t {
 	char *name;		/* entry name */
 	mode_t mode;		/* mode */
-	struct file_operations *fops;	/* open function */
+	const struct file_operations *fops;	/* open function */
 } i2o_proc_entry;
 
 /* global I2O /proc/i2o entry */
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index d6bae699749a..b62da9b0cbf0 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -130,7 +130,7 @@ static struct file_operations ulong_ro_fops = {
 
 
 static struct dentry * __oprofilefs_create_file(struct super_block * sb,
-	struct dentry * root, char const * name, struct file_operations * fops,
+	struct dentry * root, char const * name, const struct file_operations * fops,
 	int perm)
 {
 	struct dentry * dentry;
@@ -203,7 +203,7 @@ int oprofilefs_create_ro_atomic(struct super_block * sb, struct dentry * root,
 
  
 int oprofilefs_create_file(struct super_block * sb, struct dentry * root,
-	char const * name, struct file_operations * fops)
+	char const * name, const struct file_operations * fops)
 {
 	if (!__oprofilefs_create_file(sb, root, name, fops, 0644))
 		return -EFAULT;
@@ -212,7 +212,7 @@ int oprofilefs_create_file(struct super_block * sb, struct dentry * root,
 
 
 int oprofilefs_create_file_perm(struct super_block * sb, struct dentry * root,
-	char const * name, struct file_operations * fops, int perm)
+	char const * name, const struct file_operations * fops, int perm)
 {
 	if (!__oprofilefs_create_file(sb, root, name, fops, perm))
 		return -EFAULT;
diff --git a/drivers/telephony/phonedev.c b/drivers/telephony/phonedev.c
index 7a6db1c5c8c5..e166fffea86b 100644
--- a/drivers/telephony/phonedev.c
+++ b/drivers/telephony/phonedev.c
@@ -49,7 +49,7 @@ static int phone_open(struct inode *inode, struct file *file)
 	unsigned int minor = iminor(inode);
 	int err = 0;
 	struct phone_device *p;
-	struct file_operations *old_fops, *new_fops = NULL;
+	const struct file_operations *old_fops, *new_fops = NULL;
 
 	if (minor >= PHONE_NUM_DEVICES)
 		return -ENODEV;
diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c
index 37b13368c814..b263a54a13c0 100644
--- a/drivers/usb/core/file.c
+++ b/drivers/usb/core/file.c
@@ -24,15 +24,15 @@
 #include "usb.h"
 
 #define MAX_USB_MINORS	256
-static struct file_operations *usb_minors[MAX_USB_MINORS];
+static const struct file_operations *usb_minors[MAX_USB_MINORS];
 static DEFINE_SPINLOCK(minor_lock);
 
 static int usb_open(struct inode * inode, struct file * file)
 {
 	int minor = iminor(inode);
-	struct file_operations *c;
+	const struct file_operations *c;
 	int err = -ENODEV;
-	struct file_operations *old_fops, *new_fops = NULL;
+	const struct file_operations *old_fops, *new_fops = NULL;
 
 	spin_lock (&minor_lock);
 	c = usb_minors[minor];
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index b44cfda76b61..3f618ce6998d 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -1581,7 +1581,7 @@ restart:
 
 static struct inode *
 gadgetfs_create_file (struct super_block *sb, char const *name,
-		void *data, struct file_operations *fops,
+		void *data, const struct file_operations *fops,
 		struct dentry **dentry_p);
 
 static int activate_ep_files (struct dev_data *dev)
@@ -1955,7 +1955,7 @@ module_param (default_perm, uint, 0644);
 
 static struct inode *
 gadgetfs_make_inode (struct super_block *sb,
-		void *data, struct file_operations *fops,
+		void *data, const struct file_operations *fops,
 		int mode)
 {
 	struct inode *inode = new_inode (sb);
@@ -1979,7 +1979,7 @@ gadgetfs_make_inode (struct super_block *sb,
  */
 static struct inode *
 gadgetfs_create_file (struct super_block *sb, char const *name,
-		void *data, struct file_operations *fops,
+		void *data, const struct file_operations *fops,
 		struct dentry **dentry_p)
 {
 	struct dentry	*dentry;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 8c6eb04d31e2..b53dffa46bac 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -250,7 +250,7 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
 }
 
 int register_chrdev(unsigned int major, const char *name,
-		    struct file_operations *fops)
+		    const struct file_operations *fops)
 {
 	struct char_device_struct *cd;
 	struct cdev *cdev;
@@ -473,7 +473,7 @@ struct cdev *cdev_alloc(void)
 	return p;
 }
 
-void cdev_init(struct cdev *cdev, struct file_operations *fops)
+void cdev_init(struct cdev *cdev, const struct file_operations *fops)
 {
 	memset(cdev, 0, sizeof *cdev);
 	INIT_LIST_HEAD(&cdev->list);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d4f1a2cddd47..85d166cdcae4 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -191,7 +191,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
  */
 struct dentry *debugfs_create_file(const char *name, mode_t mode,
 				   struct dentry *parent, void *data,
-				   struct file_operations *fops)
+				   const struct file_operations *fops)
 {
 	struct dentry *dentry = NULL;
 	int error;
diff --git a/fs/inode.c b/fs/inode.c
index 1fddf2803af8..32b7c3375021 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -104,7 +104,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 {
 	static struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
-	static struct file_operations empty_fops;
+	static const struct file_operations empty_fops;
 	struct inode *inode;
 
 	if (sb->s_op->alloc_inode)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5320e5afaddb..31018333dc38 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -706,7 +706,7 @@ nfsd_close(struct file *filp)
  * after it.
  */
 static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
-			      struct file_operations *fop)
+			      const struct file_operations *fop)
 {
 	struct inode *inode = dp->d_inode;
 	int (*fsync) (struct file *, struct dentry *, int);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 47b7a20d45eb..4ba03009cf72 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -560,7 +560,7 @@ static void proc_kill_inodes(struct proc_dir_entry *de)
 		struct file * filp = list_entry(p, struct file, f_u.fu_list);
 		struct dentry * dentry = filp->f_dentry;
 		struct inode * inode;
-		struct file_operations *fops;
+		const struct file_operations *fops;
 
 		if (dentry->d_op != &proc_dentry_operations)
 			continue;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 95a1cf32b838..0502f17b860d 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -30,7 +30,7 @@ do {						\
 
 #endif
 
-extern void create_seq_entry(char *name, mode_t mode, struct file_operations *f);
+extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f);
 extern int proc_exe_link(struct inode *, struct dentry **, struct vfsmount **);
 extern int proc_tid_stat(struct task_struct *,  char *);
 extern int proc_tgid_stat(struct task_struct *, char *);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1edce0c34bfd..ef5a3323f4b5 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -731,7 +731,7 @@ static struct file_operations proc_sysrq_trigger_operations = {
 
 struct proc_dir_entry *proc_root_kcore;
 
-void create_seq_entry(char *name, mode_t mode, struct file_operations *f)
+void create_seq_entry(char *name, mode_t mode, const struct file_operations *f)
 {
 	struct proc_dir_entry *entry;
 	entry = create_proc_entry(name, mode, NULL);
diff --git a/fs/select.c b/fs/select.c
index 05cd199a1127..b3a3a1326af6 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -220,7 +220,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
 			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
 			unsigned long res_in = 0, res_out = 0, res_ex = 0;
-			struct file_operations *f_op = NULL;
+			const struct file_operations *f_op = NULL;
 			struct file *file = NULL;
 
 			in = *inp++; out = *outp++; ex = *exp++;
diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index 8da37e29cb87..2216638962d2 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -5,13 +5,13 @@
 struct cdev {
 	struct kobject kobj;
 	struct module *owner;
-	struct file_operations *ops;
+	const struct file_operations *ops;
 	struct list_head list;
 	dev_t dev;
 	unsigned int count;
 };
 
-void cdev_init(struct cdev *, struct file_operations *);
+void cdev_init(struct cdev *, const struct file_operations *);
 
 struct cdev *cdev_alloc(void);
 
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 4b0428e335be..176e2d371577 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -29,7 +29,7 @@ struct debugfs_blob_wrapper {
 #if defined(CONFIG_DEBUG_FS)
 struct dentry *debugfs_create_file(const char *name, mode_t mode,
 				   struct dentry *parent, void *data,
-				   struct file_operations *fops);
+				   const struct file_operations *fops);
 
 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 680d913350e7..ef355bc73714 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -496,7 +496,7 @@ struct inode {
 	struct mutex		i_mutex;
 	struct rw_semaphore	i_alloc_sem;
 	struct inode_operations	*i_op;
-	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
+	const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct super_block	*i_sb;
 	struct file_lock	*i_flock;
 	struct address_space	*i_mapping;
@@ -636,7 +636,7 @@ struct file {
 	} f_u;
 	struct dentry		*f_dentry;
 	struct vfsmount         *f_vfsmnt;
-	struct file_operations	*f_op;
+	const struct file_operations	*f_op;
 	atomic_t		f_count;
 	unsigned int 		f_flags;
 	mode_t			f_mode;
@@ -1414,7 +1414,7 @@ extern void bd_release_from_disk(struct block_device *, struct gendisk *);
 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
 extern int register_chrdev_region(dev_t, unsigned, const char *);
 extern int register_chrdev(unsigned int, const char *,
-			   struct file_operations *);
+			   const struct file_operations *);
 extern int unregister_chrdev(unsigned int, const char *);
 extern void unregister_chrdev_region(dev_t, unsigned);
 extern int chrdev_open(struct inode *, struct file *);
diff --git a/include/linux/input.h b/include/linux/input.h
index 6d4cc3c110d6..1d4e341b72e6 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -957,7 +957,7 @@ struct input_handler {
 	struct input_handle* (*connect)(struct input_handler *handler, struct input_dev *dev, struct input_device_id *id);
 	void (*disconnect)(struct input_handle *handle);
 
-	struct file_operations *fops;
+	const struct file_operations *fops;
 	int minor;
 	char *name;
 
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 14ceebfc1efa..5b584dafb5a6 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -36,7 +36,7 @@ struct class_device;
 struct miscdevice  {
 	int minor;
 	const char *name;
-	struct file_operations *fops;
+	const struct file_operations *fops;
 	struct list_head list;
 	struct device *dev;
 	struct class_device *class;
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index b5b3197dfd4f..0d514b252454 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -84,10 +84,10 @@ void oprofile_add_trace(unsigned long eip);
  * the specified file operations.
  */
 int oprofilefs_create_file(struct super_block * sb, struct dentry * root,
-	char const * name, struct file_operations * fops);
+	char const * name, const struct file_operations * fops);
 
 int oprofilefs_create_file_perm(struct super_block * sb, struct dentry * root,
-	char const * name, struct file_operations * fops, int perm);
+	char const * name, const struct file_operations * fops, int perm);
  
 /** Create a file for read/write access to an unsigned long. */
 int oprofilefs_create_ulong(struct super_block * sb, struct dentry * root,
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index cb224cf653b1..6d03d025fcd5 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -58,7 +58,7 @@ struct proc_dir_entry {
 	gid_t gid;
 	loff_t size;
 	struct inode_operations * proc_iops;
-	struct file_operations * proc_fops;
+	const struct file_operations * proc_fops;
 	get_info_t *get_info;
 	struct module *owner;
 	struct proc_dir_entry *next, *parent, *subdir;
@@ -189,7 +189,7 @@ static inline struct proc_dir_entry *proc_net_create(const char *name,
 }
 
 static inline struct proc_dir_entry *proc_net_fops_create(const char *name,
-	mode_t mode, struct file_operations *fops)
+	mode_t mode, const struct file_operations *fops)
 {
 	struct proc_dir_entry *res = create_proc_entry(name, mode, proc_net);
 	if (res)
diff --git a/include/linux/sound.h b/include/linux/sound.h
index 72b9af4c3fd4..f63d8342ffa3 100644
--- a/include/linux/sound.h
+++ b/include/linux/sound.h
@@ -30,12 +30,12 @@
  */
  
 struct device;
-extern int register_sound_special(struct file_operations *fops, int unit);
-extern int register_sound_special_device(struct file_operations *fops, int unit, struct device *dev);
-extern int register_sound_mixer(struct file_operations *fops, int dev);
-extern int register_sound_midi(struct file_operations *fops, int dev);
-extern int register_sound_dsp(struct file_operations *fops, int dev);
-extern int register_sound_synth(struct file_operations *fops, int dev);
+extern int register_sound_special(const struct file_operations *fops, int unit);
+extern int register_sound_special_device(const struct file_operations *fops, int unit, struct device *dev);
+extern int register_sound_mixer(const struct file_operations *fops, int dev);
+extern int register_sound_midi(const struct file_operations *fops, int dev);
+extern int register_sound_dsp(const struct file_operations *fops, int dev);
+extern int register_sound_synth(const struct file_operations *fops, int dev);
 
 extern void unregister_sound_special(int unit);
 extern void unregister_sound_mixer(int unit);
diff --git a/include/linux/sunrpc/stats.h b/include/linux/sunrpc/stats.h
index 0d6ed3c8bdc4..d93c24b47f3f 100644
--- a/include/linux/sunrpc/stats.h
+++ b/include/linux/sunrpc/stats.h
@@ -50,7 +50,7 @@ struct proc_dir_entry *	rpc_proc_register(struct rpc_stat *);
 void			rpc_proc_unregister(const char *);
 void			rpc_proc_zero(struct rpc_program *);
 struct proc_dir_entry *	svc_proc_register(struct svc_stat *,
-					  struct file_operations *);
+					  const struct file_operations *);
 void			svc_proc_unregister(const char *);
 
 void			svc_seq_show(struct seq_file *,
@@ -65,7 +65,7 @@ static inline void rpc_proc_unregister(const char *p) {}
 static inline void rpc_proc_zero(struct rpc_program *p) {}
 
 static inline struct proc_dir_entry *svc_proc_register(struct svc_stat *s,
-						       struct file_operations *f) { return NULL; }
+						       const struct file_operations *f) { return NULL; }
 static inline void svc_proc_unregister(const char *p) {}
 
 static inline void svc_seq_show(struct seq_file *seq,
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 130d125fda12..e34e5e3dce52 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -615,7 +615,7 @@ extern struct bus_type usb_bus_type;
  */
 struct usb_class_driver {
 	char *name;
-	struct file_operations *fops;
+	const struct file_operations *fops;
 	int minor_base;
 };
 
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 2275bfec5b68..af2d6155d3fe 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -75,7 +75,7 @@ struct video_device
 	int minor;
 
 	/* device ops + callbacks */
-	struct file_operations *fops;
+	const struct file_operations *fops;
 	void (*release)(struct video_device *vfd);
 
 
diff --git a/include/sound/core.h b/include/sound/core.h
index 144bdc2f217f..7f32c12b4a0a 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -186,7 +186,7 @@ struct snd_minor {
 	int type;			/* SNDRV_DEVICE_TYPE_XXX */
 	int card;			/* card number */
 	int device;			/* device number */
-	struct file_operations *f_ops;	/* file operations */
+	const struct file_operations *f_ops;	/* file operations */
 	void *private_data;		/* private data for f_ops->open */
 	char name[0];			/* device name (keep at the end of
 								structure) */
@@ -200,14 +200,14 @@ extern int snd_ecards_limit;
 void snd_request_card(int card);
 
 int snd_register_device(int type, struct snd_card *card, int dev,
-			struct file_operations *f_ops, void *private_data,
+			const struct file_operations *f_ops, void *private_data,
 			const char *name);
 int snd_unregister_device(int type, struct snd_card *card, int dev);
 void *snd_lookup_minor_data(unsigned int minor, int type);
 
 #ifdef CONFIG_SND_OSSEMUL
 int snd_register_oss_device(int type, struct snd_card *card, int dev,
-			    struct file_operations *f_ops, void *private_data,
+			    const struct file_operations *f_ops, void *private_data,
 			    const char *name);
 int snd_unregister_oss_device(int type, struct snd_card *card, int dev);
 void *snd_lookup_oss_minor_data(unsigned int minor, int type);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index aa4158be9900..cc673dd8433f 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -395,7 +395,7 @@ enum {
  */
 struct rpc_filelist {
 	char *name;
-	struct file_operations *i_fop;
+	const struct file_operations *i_fop;
 	int mode;
 };
 
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 790941e8af4d..dea529666d69 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -225,7 +225,7 @@ EXPORT_SYMBOL(rpc_print_iostats);
  * Register/unregister RPC proc files
  */
 static inline struct proc_dir_entry *
-do_register(const char *name, void *data, struct file_operations *fops)
+do_register(const char *name, void *data, const struct file_operations *fops)
 {
 	struct proc_dir_entry *ent;
 
@@ -253,7 +253,7 @@ rpc_proc_unregister(const char *name)
 }
 
 struct proc_dir_entry *
-svc_proc_register(struct svc_stat *statp, struct file_operations *fops)
+svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
 {
 	return do_register(statp->program->pg_name, statp, fops);
 }
diff --git a/sound/core/init.c b/sound/core/init.c
index ad68761abba1..5bb8a8b23d51 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -223,7 +223,8 @@ int snd_card_disconnect(struct snd_card *card)
 	struct snd_monitor_file *mfile;
 	struct file *file;
 	struct snd_shutdown_f_ops *s_f_ops;
-	struct file_operations *f_ops, *old_f_ops;
+	struct file_operations *f_ops;
+	const struct file_operations *old_f_ops;
 	int err;
 
 	spin_lock(&card->files_lock);
diff --git a/sound/core/sound.c b/sound/core/sound.c
index 4d28e5212611..108e430b5036 100644
--- a/sound/core/sound.c
+++ b/sound/core/sound.c
@@ -137,7 +137,7 @@ static int snd_open(struct inode *inode, struct file *file)
 {
 	unsigned int minor = iminor(inode);
 	struct snd_minor *mptr = NULL;
-	struct file_operations *old_fops;
+	const struct file_operations *old_fops;
 	int err = 0;
 
 	if (minor >= ARRAY_SIZE(snd_minors))
@@ -240,7 +240,7 @@ static int snd_kernel_minor(int type, struct snd_card *card, int dev)
  * Retrurns zero if successful, or a negative error code on failure.
  */
 int snd_register_device(int type, struct snd_card *card, int dev,
-			struct file_operations *f_ops, void *private_data,
+			const struct file_operations *f_ops, void *private_data,
 			const char *name)
 {
 	int minor;
diff --git a/sound/core/sound_oss.c b/sound/core/sound_oss.c
index 4023d3b406de..9055c6de9587 100644
--- a/sound/core/sound_oss.c
+++ b/sound/core/sound_oss.c
@@ -95,7 +95,7 @@ static int snd_oss_kernel_minor(int type, struct snd_card *card, int dev)
 }
 
 int snd_register_oss_device(int type, struct snd_card *card, int dev,
-			    struct file_operations *f_ops, void *private_data,
+			    const struct file_operations *f_ops, void *private_data,
 			    const char *name)
 {
 	int minor = snd_oss_kernel_minor(type, card, dev);
diff --git a/sound/sound_core.c b/sound/sound_core.c
index 394b53e20cb8..6f849720aef3 100644
--- a/sound/sound_core.c
+++ b/sound/sound_core.c
@@ -53,7 +53,7 @@
 struct sound_unit
 {
 	int unit_minor;
-	struct file_operations *unit_fops;
+	const struct file_operations *unit_fops;
 	struct sound_unit *next;
 	char name[32];
 };
@@ -73,7 +73,7 @@ EXPORT_SYMBOL(sound_class);
  *	join into it. Called with the lock asserted
  */
 
-static int __sound_insert_unit(struct sound_unit * s, struct sound_unit **list, struct file_operations *fops, int index, int low, int top)
+static int __sound_insert_unit(struct sound_unit * s, struct sound_unit **list, const struct file_operations *fops, int index, int low, int top)
 {
 	int n=low;
 
@@ -153,7 +153,7 @@ static DEFINE_SPINLOCK(sound_loader_lock);
  *	list. Acquires locks as needed
  */
 
-static int sound_insert_unit(struct sound_unit **list, struct file_operations *fops, int index, int low, int top, const char *name, umode_t mode, struct device *dev)
+static int sound_insert_unit(struct sound_unit **list, const struct file_operations *fops, int index, int low, int top, const char *name, umode_t mode, struct device *dev)
 {
 	struct sound_unit *s = kmalloc(sizeof(*s), GFP_KERNEL);
 	int r;
@@ -237,7 +237,7 @@ static struct sound_unit *chains[SOUND_STEP];
  *	a negative error code is returned.
  */
  
-int register_sound_special_device(struct file_operations *fops, int unit,
+int register_sound_special_device(const struct file_operations *fops, int unit,
 				  struct device *dev)
 {
 	const int chain = unit % SOUND_STEP;
@@ -301,7 +301,7 @@ int register_sound_special_device(struct file_operations *fops, int unit,
  
 EXPORT_SYMBOL(register_sound_special_device);
 
-int register_sound_special(struct file_operations *fops, int unit)
+int register_sound_special(const struct file_operations *fops, int unit)
 {
 	return register_sound_special_device(fops, unit, NULL);
 }
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(register_sound_special);
  *	number is returned, on failure a negative error code is returned.
  */
 
-int register_sound_mixer(struct file_operations *fops, int dev)
+int register_sound_mixer(const struct file_operations *fops, int dev)
 {
 	return sound_insert_unit(&chains[0], fops, dev, 0, 128,
 				 "mixer", S_IRUSR | S_IWUSR, NULL);
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(register_sound_mixer);
  *	number is returned, on failure a negative error code is returned.
  */
 
-int register_sound_midi(struct file_operations *fops, int dev)
+int register_sound_midi(const struct file_operations *fops, int dev)
 {
 	return sound_insert_unit(&chains[2], fops, dev, 2, 130,
 				 "midi", S_IRUSR | S_IWUSR, NULL);
@@ -362,7 +362,7 @@ EXPORT_SYMBOL(register_sound_midi);
  *	and will always allocate them as a matching pair - eg dsp3/audio3
  */
 
-int register_sound_dsp(struct file_operations *fops, int dev)
+int register_sound_dsp(const struct file_operations *fops, int dev)
 {
 	return sound_insert_unit(&chains[3], fops, dev, 3, 131,
 				 "dsp", S_IWUSR | S_IRUSR, NULL);
@@ -381,7 +381,7 @@ EXPORT_SYMBOL(register_sound_dsp);
  */
 
 
-int register_sound_synth(struct file_operations *fops, int dev)
+int register_sound_synth(const struct file_operations *fops, int dev)
 {
 	return sound_insert_unit(&chains[9], fops, dev, 9, 137,
 				 "synth", S_IRUSR | S_IWUSR, NULL);
@@ -501,7 +501,7 @@ int soundcore_open(struct inode *inode, struct file *file)
 	int chain;
 	int unit = iminor(inode);
 	struct sound_unit *s;
-	struct file_operations *new_fops = NULL;
+	const struct file_operations *new_fops = NULL;
 
 	chain=unit&0x0F;
 	if(chain==4 || chain==5)	/* dsp/audio/dsp16 */
@@ -540,7 +540,7 @@ int soundcore_open(struct inode *inode, struct file *file)
 		 * switching ->f_op in the first place.
 		 */
 		int err = 0;
-		struct file_operations *old_fops = file->f_op;
+		const struct file_operations *old_fops = file->f_op;
 		file->f_op = new_fops;
 		spin_unlock(&sound_loader_lock);
 		if(file->f_op->open)
-- 
cgit v1.2.3


From 4b6f5d20b04dcbc3d888555522b90ba6d36c4106 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Tue, 28 Mar 2006 01:56:42 -0800
Subject: [PATCH] Make most file operations structs in fs/ const

This is a conversion to make the various file_operations structs in fs/
const.  Basically a regexp job, with a few manual fixups

The goal is both to increase correctness (harder to accidentally write to
shared datastructures) and reducing the false sharing of cachelines with
things that get dirty in .data (while .rodata is nicely read only and thus
cache clean)

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/misc/ibmasm/ibmasmfs.c  |  2 +-
 fs/9p/v9fs_vfs.h                |  4 ++--
 fs/9p/vfs_dir.c                 |  2 +-
 fs/9p/vfs_file.c                |  2 +-
 fs/adfs/adfs.h                  |  4 ++--
 fs/adfs/dir.c                   |  2 +-
 fs/adfs/file.c                  |  2 +-
 fs/affs/affs.h                  |  6 +++---
 fs/affs/dir.c                   |  2 +-
 fs/affs/file.c                  |  2 +-
 fs/afs/dir.c                    |  2 +-
 fs/afs/internal.h               |  4 ++--
 fs/afs/mntpt.c                  |  2 +-
 fs/afs/proc.c                   | 10 +++++-----
 fs/autofs/autofs_i.h            |  2 +-
 fs/autofs/root.c                |  2 +-
 fs/autofs4/autofs_i.h           |  4 ++--
 fs/autofs4/root.c               |  4 ++--
 fs/bad_inode.c                  |  2 +-
 fs/befs/linuxvfs.c              |  2 +-
 fs/bfs/bfs.h                    |  4 ++--
 fs/bfs/dir.c                    |  2 +-
 fs/bfs/file.c                   |  2 +-
 fs/binfmt_misc.c                |  6 +++---
 fs/block_dev.c                  |  2 +-
 fs/char_dev.c                   |  2 +-
 fs/cifs/cifsfs.c                | 10 +++++-----
 fs/cifs/cifsfs.h                | 10 +++++-----
 fs/coda/dir.c                   |  2 +-
 fs/coda/file.c                  |  2 +-
 fs/coda/pioctl.c                |  2 +-
 fs/coda/psdev.c                 |  2 +-
 fs/configfs/configfs_internal.h |  6 +++---
 fs/configfs/dir.c               |  2 +-
 fs/configfs/file.c              |  2 +-
 fs/cramfs/inode.c               |  4 ++--
 fs/debugfs/file.c               |  4 ++--
 fs/devfs/base.c                 | 12 ++++++------
 fs/efs/dir.c                    |  2 +-
 fs/eventpoll.c                  |  2 +-
 fs/ext2/dir.c                   |  2 +-
 fs/ext2/ext2.h                  |  6 +++---
 fs/ext2/file.c                  |  4 ++--
 fs/ext3/dir.c                   |  2 +-
 fs/ext3/file.c                  |  2 +-
 fs/fat/dir.c                    |  2 +-
 fs/fat/file.c                   |  2 +-
 fs/fifo.c                       |  2 +-
 fs/freevxfs/vxfs_extern.h       |  2 +-
 fs/freevxfs/vxfs_lookup.c       |  2 +-
 fs/fuse/dev.c                   |  2 +-
 fs/fuse/dir.c                   |  2 +-
 fs/fuse/file.c                  |  6 +++---
 fs/fuse/fuse_i.h                |  2 +-
 fs/hfs/dir.c                    |  2 +-
 fs/hfs/hfs_fs.h                 |  2 +-
 fs/hfs/inode.c                  |  4 ++--
 fs/hfsplus/dir.c                |  2 +-
 fs/hfsplus/inode.c              |  2 +-
 fs/hostfs/hostfs_kern.c         |  4 ++--
 fs/hpfs/dir.c                   |  2 +-
 fs/hpfs/file.c                  |  2 +-
 fs/hpfs/hpfs_fn.h               |  4 ++--
 fs/hppfs/hppfs_kern.c           |  4 ++--
 fs/hugetlbfs/inode.c            |  4 ++--
 fs/inotify.c                    |  2 +-
 fs/isofs/dir.c                  |  2 +-
 fs/isofs/isofs.h                |  2 +-
 fs/jffs/inode-v23.c             |  8 ++++----
 fs/jffs2/dir.c                  |  2 +-
 fs/jffs2/file.c                 |  2 +-
 fs/jffs2/os-linux.h             |  4 ++--
 fs/jfs/file.c                   |  2 +-
 fs/jfs/jfs_inode.h              |  4 ++--
 fs/jfs/namei.c                  |  2 +-
 fs/libfs.c                      |  2 +-
 fs/minix/dir.c                  |  2 +-
 fs/minix/file.c                 |  2 +-
 fs/minix/minix.h                |  4 ++--
 fs/ncpfs/dir.c                  |  2 +-
 fs/ncpfs/file.c                 |  2 +-
 fs/nfs/dir.c                    |  2 +-
 fs/nfs/file.c                   |  2 +-
 fs/nfsd/nfsctl.c                |  4 ++--
 fs/nfsd/stats.c                 |  2 +-
 fs/ntfs/dir.c                   |  2 +-
 fs/ntfs/file.c                  |  4 ++--
 fs/ntfs/ntfs.h                  |  6 +++---
 fs/ocfs2/dlmglue.c              |  2 +-
 fs/ocfs2/file.c                 |  4 ++--
 fs/ocfs2/file.h                 |  4 ++--
 fs/openpromfs/inode.c           |  6 +++---
 fs/pipe.c                       |  6 +++---
 fs/proc/kcore.c                 |  2 +-
 fs/proc/kmsg.c                  |  2 +-
 fs/proc/vmcore.c                |  2 +-
 fs/qnx4/dir.c                   |  2 +-
 fs/qnx4/file.c                  |  2 +-
 fs/ramfs/file-mmu.c             |  2 +-
 fs/ramfs/file-nommu.c           |  2 +-
 fs/ramfs/internal.h             |  2 +-
 fs/read_write.c                 |  2 +-
 fs/reiserfs/dir.c               |  2 +-
 fs/reiserfs/file.c              |  2 +-
 fs/reiserfs/procfs.c            |  2 +-
 fs/romfs/inode.c                |  2 +-
 fs/smbfs/dir.c                  |  2 +-
 fs/smbfs/file.c                 |  2 +-
 fs/smbfs/proto.h                |  4 ++--
 fs/sysfs/bin.c                  |  2 +-
 fs/sysfs/dir.c                  |  2 +-
 fs/sysfs/file.c                 |  2 +-
 fs/sysfs/sysfs.h                |  6 +++---
 fs/sysv/dir.c                   |  2 +-
 fs/sysv/file.c                  |  2 +-
 fs/sysv/sysv.h                  |  4 ++--
 fs/udf/dir.c                    |  2 +-
 fs/udf/file.c                   |  2 +-
 fs/udf/udfdecl.h                |  4 ++--
 fs/ufs/dir.c                    |  2 +-
 fs/ufs/file.c                   |  2 +-
 fs/xfs/linux-2.6/xfs_file.c     |  6 +++---
 fs/xfs/linux-2.6/xfs_iops.h     |  6 +++---
 include/linux/coda_linux.h      |  6 +++---
 include/linux/crash_dump.h      |  2 +-
 include/linux/efs_fs.h          |  2 +-
 include/linux/ext3_fs.h         |  4 ++--
 include/linux/fs.h              | 20 ++++++++++----------
 include/linux/hugetlb.h         |  2 +-
 include/linux/msdos_fs.h        |  4 ++--
 include/linux/ncp_fs.h          |  4 ++--
 include/linux/nfs_fs.h          |  4 ++--
 include/linux/proc_fs.h         |  6 +++---
 include/linux/qnx4_fs.h         |  4 ++--
 include/linux/ramfs.h           |  2 +-
 include/linux/reiserfs_fs.h     |  4 ++--
 include/linux/ufs_fs.h          |  4 ++--
 net/nonet.c                     |  2 +-
 net/socket.c                    |  2 +-
 139 files changed, 225 insertions(+), 225 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index 5c550fcac2c4..26a230b6ff80 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -101,7 +101,7 @@ static struct super_operations ibmasmfs_s_ops = {
 	.drop_inode	= generic_delete_inode,
 };
 
-static struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations;
+static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations;
 
 static struct file_system_type ibmasmfs_type = {
 	.owner          = THIS_MODULE,
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 43c9f7de0314..f867b8d3e973 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -39,8 +39,8 @@
 
 extern struct file_system_type v9fs_fs_type;
 extern struct address_space_operations v9fs_addr_operations;
-extern struct file_operations v9fs_file_operations;
-extern struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_dir_operations;
 extern struct dentry_operations v9fs_dentry_operations;
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 766f11f1215c..e32d5971039b 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -204,7 +204,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-struct file_operations v9fs_dir_operations = {
+const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = v9fs_dir_readdir,
 	.open = v9fs_file_open,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 59e744163407..083dcfcd158e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -266,7 +266,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
-struct file_operations v9fs_file_operations = {
+const struct file_operations v9fs_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = v9fs_file_read,
 	.write = v9fs_file_write,
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index f6cd01352cc8..29217ff36d44 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -85,7 +85,7 @@ void __adfs_error(struct super_block *sb, const char *function,
 
 /* dir_*.c */
 extern struct inode_operations adfs_dir_inode_operations;
-extern struct file_operations adfs_dir_operations;
+extern const struct file_operations adfs_dir_operations;
 extern struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
@@ -94,7 +94,7 @@ extern int adfs_dir_update(struct super_block *sb, struct object_info *obj);
 
 /* file.c */
 extern struct inode_operations adfs_file_inode_operations;
-extern struct file_operations adfs_file_operations;
+extern const struct file_operations adfs_file_operations;
 
 static inline __u32 signed_asl(__u32 val, signed int shift)
 {
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 0b4c3a028076..7b075fc397da 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -196,7 +196,7 @@ out:
 	return ret;
 }
 
-struct file_operations adfs_dir_operations = {
+const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= adfs_readdir,
 	.fsync		= file_fsync,
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 6af10885f9d6..1014b9f2117b 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -25,7 +25,7 @@
 
 #include "adfs.h"
 
-struct file_operations adfs_file_operations = {
+const struct file_operations adfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.mmap		= generic_file_mmap,
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0c6799f2137a..a43a876742b8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -192,9 +192,9 @@ extern void   affs_dir_truncate(struct inode *);
 extern struct inode_operations	 affs_file_inode_operations;
 extern struct inode_operations	 affs_dir_inode_operations;
 extern struct inode_operations   affs_symlink_inode_operations;
-extern struct file_operations	 affs_file_operations;
-extern struct file_operations	 affs_file_operations_ofs;
-extern struct file_operations	 affs_dir_operations;
+extern const struct file_operations	 affs_file_operations;
+extern const struct file_operations	 affs_file_operations_ofs;
+extern const struct file_operations	 affs_dir_operations;
 extern struct address_space_operations	 affs_symlink_aops;
 extern struct address_space_operations	 affs_aops;
 extern struct address_space_operations	 affs_aops_ofs;
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 548efd0ee98c..5d9649fa1814 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -17,7 +17,7 @@
 
 static int affs_readdir(struct file *, void *, filldir_t);
 
-struct file_operations affs_dir_operations = {
+const struct file_operations affs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= affs_readdir,
 	.fsync		= file_fsync,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index f72fb776ecdf..7076262af39b 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -25,7 +25,7 @@ static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
 static int affs_file_open(struct inode *inode, struct file *filp);
 static int affs_file_release(struct inode *inode, struct file *filp);
 
-struct file_operations affs_file_operations = {
+const struct file_operations affs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5c61c24dab2a..a6dff6a4f204 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -32,7 +32,7 @@ static int afs_d_delete(struct dentry *dentry);
 static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
 				  loff_t fpos, ino_t ino, unsigned dtype);
 
-struct file_operations afs_dir_file_operations = {
+const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
 	.readdir	= afs_dir_readdir,
 };
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ab8f87c66319..72febdf9a35a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -64,7 +64,7 @@ extern struct cachefs_index_def afs_cache_cell_index_def;
  * dir.c
  */
 extern struct inode_operations afs_dir_inode_operations;
-extern struct file_operations afs_dir_file_operations;
+extern const struct file_operations afs_dir_file_operations;
 
 /*
  * file.c
@@ -105,7 +105,7 @@ extern struct cachefs_netfs afs_cache_netfs;
  * mntpt.c
  */
 extern struct inode_operations afs_mntpt_inode_operations;
-extern struct file_operations afs_mntpt_file_operations;
+extern const struct file_operations afs_mntpt_file_operations;
 extern struct afs_timer afs_mntpt_expiry_timer;
 extern struct afs_timer_ops afs_mntpt_expiry_timer_ops;
 extern unsigned long afs_mntpt_expiry_timeout;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 31ee06590de5..4e6eeb59b83c 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -32,7 +32,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
 static int afs_mntpt_open(struct inode *inode, struct file *file);
 static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
 
-struct file_operations afs_mntpt_file_operations = {
+const struct file_operations afs_mntpt_file_operations = {
 	.open		= afs_mntpt_open,
 };
 
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 9c81b8f7eef0..101d21b6c037 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -37,7 +37,7 @@ static struct seq_operations afs_proc_cells_ops = {
 	.show	= afs_proc_cells_show,
 };
 
-static struct file_operations afs_proc_cells_fops = {
+static const struct file_operations afs_proc_cells_fops = {
 	.open		= afs_proc_cells_open,
 	.read		= seq_read,
 	.write		= afs_proc_cells_write,
@@ -53,7 +53,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 				       const char __user *buf,
 				       size_t size, loff_t *_pos);
 
-static struct file_operations afs_proc_rootcell_fops = {
+static const struct file_operations afs_proc_rootcell_fops = {
 	.open		= afs_proc_rootcell_open,
 	.read		= afs_proc_rootcell_read,
 	.write		= afs_proc_rootcell_write,
@@ -77,7 +77,7 @@ static struct seq_operations afs_proc_cell_volumes_ops = {
 	.show	= afs_proc_cell_volumes_show,
 };
 
-static struct file_operations afs_proc_cell_volumes_fops = {
+static const struct file_operations afs_proc_cell_volumes_fops = {
 	.open		= afs_proc_cell_volumes_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -101,7 +101,7 @@ static struct seq_operations afs_proc_cell_vlservers_ops = {
 	.show	= afs_proc_cell_vlservers_show,
 };
 
-static struct file_operations afs_proc_cell_vlservers_fops = {
+static const struct file_operations afs_proc_cell_vlservers_fops = {
 	.open		= afs_proc_cell_vlservers_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -124,7 +124,7 @@ static struct seq_operations afs_proc_cell_servers_ops = {
 	.show	= afs_proc_cell_servers_show,
 };
 
-static struct file_operations afs_proc_cell_servers_fops = {
+static const struct file_operations afs_proc_cell_servers_fops = {
 	.open		= afs_proc_cell_servers_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 990c28da5aec..a62327f1bdff 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -146,7 +146,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *,struct autofs_sb_info
 
 extern struct inode_operations autofs_root_inode_operations;
 extern struct inode_operations autofs_symlink_inode_operations;
-extern struct file_operations autofs_root_operations;
+extern const struct file_operations autofs_root_operations;
 
 /* Initializing function */
 
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 870e2cf33016..9cac08d6a873 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -26,7 +26,7 @@ static int autofs_root_rmdir(struct inode *,struct dentry *);
 static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
 
-struct file_operations autofs_root_operations = {
+const struct file_operations autofs_root_operations = {
 	.read		= generic_read_dir,
 	.readdir	= autofs_root_readdir,
 	.ioctl		= autofs_root_ioctl,
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c70204a61268..57c4903614e5 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -176,8 +176,8 @@ extern struct inode_operations autofs4_dir_inode_operations;
 extern struct inode_operations autofs4_root_inode_operations;
 extern struct inode_operations autofs4_indirect_root_inode_operations;
 extern struct inode_operations autofs4_direct_root_inode_operations;
-extern struct file_operations autofs4_dir_operations;
-extern struct file_operations autofs4_root_operations;
+extern const struct file_operations autofs4_dir_operations;
+extern const struct file_operations autofs4_root_operations;
 
 /* Initializing function */
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c8fe43a475e2..84e030c8ddd0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -32,7 +32,7 @@ static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t fil
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
 
-struct file_operations autofs4_root_operations = {
+const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
@@ -40,7 +40,7 @@ struct file_operations autofs4_root_operations = {
 	.ioctl		= autofs4_root_ioctl,
 };
 
-struct file_operations autofs4_dir_operations = {
+const struct file_operations autofs4_dir_operations = {
 	.open		= autofs4_dir_open,
 	.release	= autofs4_dir_close,
 	.read		= generic_read_dir,
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index e172180a1d8c..80599ae33966 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -22,7 +22,7 @@ static int return_EIO(void)
 
 #define EIO_ERROR ((void *) (return_EIO))
 
-static struct file_operations bad_file_ops =
+static const struct file_operations bad_file_ops =
 {
 	.llseek		= EIO_ERROR,
 	.aio_read	= EIO_ERROR,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 044a59587829..68ebd10f345d 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -64,7 +64,7 @@ static const struct super_operations befs_sops = {
 /* slab cache for befs_inode_info objects */
 static kmem_cache_t *befs_inode_cachep;
 
-static struct file_operations befs_dir_operations = {
+static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= befs_readdir,
 };
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 1fbc53f14aba..9d791004b21c 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -49,11 +49,11 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
 
 /* file.c */
 extern struct inode_operations bfs_file_inops;
-extern struct file_operations bfs_file_operations;
+extern const struct file_operations bfs_file_operations;
 extern struct address_space_operations bfs_aops;
 
 /* dir.c */
 extern struct inode_operations bfs_dir_inops;
-extern struct file_operations bfs_dir_operations;
+extern const struct file_operations bfs_dir_operations;
 
 #endif /* _FS_BFS_BFS_H */
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 5af928fa0449..26fad9621738 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -70,7 +70,7 @@ static int bfs_readdir(struct file * f, void * dirent, filldir_t filldir)
 	return 0;	
 }
 
-struct file_operations bfs_dir_operations = {
+const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
 	.fsync		= file_fsync,
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 807723b65daf..d83cd74a2e4e 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -17,7 +17,7 @@
 #define dprintf(x...)
 #endif
 
-struct file_operations bfs_file_operations = {
+const struct file_operations bfs_file_operations = {
 	.llseek 	= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 6a7b730c206b..d73d75591a39 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -600,7 +600,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
-static struct file_operations bm_entry_operations = {
+static const struct file_operations bm_entry_operations = {
 	.read		= bm_entry_read,
 	.write		= bm_entry_write,
 };
@@ -668,7 +668,7 @@ out:
 	return count;
 }
 
-static struct file_operations bm_register_operations = {
+static const struct file_operations bm_register_operations = {
 	.write		= bm_register_write,
 };
 
@@ -715,7 +715,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 	return count;
 }
 
-static struct file_operations bm_status_operations = {
+static const struct file_operations bm_status_operations = {
 	.read		= bm_status_read,
 	.write		= bm_status_write,
 };
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 17c76182f389..af88c43043d5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1087,7 +1087,7 @@ struct address_space_operations def_blk_aops = {
 	.direct_IO	= blkdev_direct_IO,
 };
 
-struct file_operations def_blk_fops = {
+const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
diff --git a/fs/char_dev.c b/fs/char_dev.c
index b53dffa46bac..4e1b849f912f 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -406,7 +406,7 @@ static void cdev_purge(struct cdev *cdev)
  * is contain the open that then fills in the correct operations
  * depending on the special file...
  */
-struct file_operations def_chr_fops = {
+const struct file_operations def_chr_fops = {
 	.open = chrdev_open,
 };
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6b99b51d6694..4bbc544857bc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -583,7 +583,7 @@ struct inode_operations cifs_symlink_inode_ops = {
 #endif 
 };
 
-struct file_operations cifs_file_ops = {
+const struct file_operations cifs_file_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
 	.readv = generic_file_readv,
@@ -607,7 +607,7 @@ struct file_operations cifs_file_ops = {
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
-struct file_operations cifs_file_direct_ops = {
+const struct file_operations cifs_file_direct_ops = {
 	/* no mmap, no aio, no readv - 
 	   BB reevaluate whether they can be done with directio, no cache */
 	.read = cifs_user_read,
@@ -626,7 +626,7 @@ struct file_operations cifs_file_direct_ops = {
 	.dir_notify = cifs_dir_notify,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
-struct file_operations cifs_file_nobrl_ops = {
+const struct file_operations cifs_file_nobrl_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
 	.readv = generic_file_readv,
@@ -649,7 +649,7 @@ struct file_operations cifs_file_nobrl_ops = {
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
-struct file_operations cifs_file_direct_nobrl_ops = {
+const struct file_operations cifs_file_direct_nobrl_ops = {
 	/* no mmap, no aio, no readv - 
 	   BB reevaluate whether they can be done with directio, no cache */
 	.read = cifs_user_read,
@@ -668,7 +668,7 @@ struct file_operations cifs_file_direct_nobrl_ops = {
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
-struct file_operations cifs_dir_ops = {
+const struct file_operations cifs_dir_ops = {
 	.readdir = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 821a8eb22559..74f405ae4da3 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,10 +61,10 @@ extern struct inode_operations cifs_file_inode_ops;
 extern struct inode_operations cifs_symlink_inode_ops;
 
 /* Functions related to files and directories */
-extern struct file_operations cifs_file_ops;
-extern struct file_operations cifs_file_direct_ops; /* if directio mount */
-extern struct file_operations cifs_file_nobrl_ops;
-extern struct file_operations cifs_file_direct_nobrl_ops; /* if directio mount */
+extern const struct file_operations cifs_file_ops;
+extern const struct file_operations cifs_file_direct_ops; /* if directio mount */
+extern const struct file_operations cifs_file_nobrl_ops;
+extern const struct file_operations cifs_file_direct_nobrl_ops; /* if directio mount */
 extern int cifs_open(struct inode *inode, struct file *file);
 extern int cifs_close(struct inode *inode, struct file *file);
 extern int cifs_closedir(struct inode *inode, struct file *file);
@@ -76,7 +76,7 @@ extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, struct dentry *, int);
 extern int cifs_flush(struct file *);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
-extern struct file_operations cifs_dir_ops;
+extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
 extern int cifs_dir_notify(struct file *, unsigned long arg);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 54f76de8a686..71f2ea632e53 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -82,7 +82,7 @@ struct inode_operations coda_dir_inode_operations =
 	.setattr	= coda_setattr,
 };
 
-struct file_operations coda_dir_operations = {
+const struct file_operations coda_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= coda_readdir,
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 146a991d6eb5..7c2642431fa5 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -288,7 +288,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	return err;
 }
 
-struct file_operations coda_file_operations = {
+const struct file_operations coda_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= coda_file_read,
 	.write		= coda_file_write,
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 127714936c66..214822be87bd 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -36,7 +36,7 @@ struct inode_operations coda_ioctl_inode_operations =
 	.setattr	= coda_setattr,
 };
 
-struct file_operations coda_ioctl_operations = {
+const struct file_operations coda_ioctl_operations = {
 	.owner		= THIS_MODULE,
 	.ioctl		= coda_pioctl,
 };
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 98c74fe2e139..6c6771db36da 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -342,7 +342,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
 }
 
 
-static struct file_operations coda_psdev_fops = {
+static const struct file_operations coda_psdev_fops = {
 	.owner		= THIS_MODULE,
 	.read		= coda_psdev_read,
 	.write		= coda_psdev_write,
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index f70e46951b37..3f4ff7a242b9 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -72,9 +72,9 @@ extern void configfs_release_fs(void);
 
 extern struct rw_semaphore configfs_rename_sem;
 extern struct super_block * configfs_sb;
-extern struct file_operations configfs_dir_operations;
-extern struct file_operations configfs_file_operations;
-extern struct file_operations bin_fops;
+extern const struct file_operations configfs_dir_operations;
+extern const struct file_operations configfs_file_operations;
+extern const struct file_operations bin_fops;
 extern struct inode_operations configfs_dir_inode_operations;
 extern struct inode_operations configfs_symlink_inode_operations;
 
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index ca60e3abef45..8ed9b06a9828 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1027,7 +1027,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
 	return offset;
 }
 
-struct file_operations configfs_dir_operations = {
+const struct file_operations configfs_dir_operations = {
 	.open		= configfs_dir_open,
 	.release	= configfs_dir_close,
 	.llseek		= configfs_dir_lseek,
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3921920d8716..f499803743e0 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -322,7 +322,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-struct file_operations configfs_file_operations = {
+const struct file_operations configfs_file_operations = {
 	.read		= configfs_read_file,
 	.write		= configfs_write_file,
 	.llseek		= generic_file_llseek,
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index acc1b2c10a86..9efcc3a164e8 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -29,7 +29,7 @@
 
 static struct super_operations cramfs_ops;
 static struct inode_operations cramfs_dir_inode_operations;
-static struct file_operations cramfs_directory_operations;
+static const struct file_operations cramfs_directory_operations;
 static struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
@@ -512,7 +512,7 @@ static struct address_space_operations cramfs_aops = {
 /*
  * A directory can only readdir
  */
-static struct file_operations cramfs_directory_operations = {
+static const struct file_operations cramfs_directory_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= cramfs_readdir,
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 40c4fc973fad..66a505422e5c 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -39,7 +39,7 @@ static int default_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-struct file_operations debugfs_file_operations = {
+const struct file_operations debugfs_file_operations = {
 	.read =		default_read_file,
 	.write =	default_write_file,
 	.open =		default_open,
@@ -213,7 +213,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
 	return count;
 }
 
-static struct file_operations fops_bool = {
+static const struct file_operations fops_bool = {
 	.read =		read_file_bool,
 	.write =	write_file_bool,
 	.open =		default_open,
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index b621521e09d4..52f5059c4f31 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -856,14 +856,14 @@ static int devfsd_close(struct inode *inode, struct file *file);
 #ifdef CONFIG_DEVFS_DEBUG
 static ssize_t stat_read(struct file *file, char __user *buf, size_t len,
 			 loff_t * ppos);
-static struct file_operations stat_fops = {
+static const struct file_operations stat_fops = {
 	.open = nonseekable_open,
 	.read = stat_read,
 };
 #endif
 
 /*  Devfs daemon file operations  */
-static struct file_operations devfsd_fops = {
+static const struct file_operations devfsd_fops = {
 	.open = nonseekable_open,
 	.read = devfsd_read,
 	.ioctl = devfsd_ioctl,
@@ -1842,8 +1842,8 @@ static int try_modload(struct devfs_entry *parent, struct fs_info *fs_info,
 
 static struct inode_operations devfs_iops;
 static struct inode_operations devfs_dir_iops;
-static struct file_operations devfs_fops;
-static struct file_operations devfs_dir_fops;
+static const struct file_operations devfs_fops;
+static const struct file_operations devfs_dir_fops;
 static struct inode_operations devfs_symlink_iops;
 
 static int devfs_notify_change(struct dentry *dentry, struct iattr *iattr)
@@ -2061,11 +2061,11 @@ static int devfs_open(struct inode *inode, struct file *file)
 	return err;
 }				/*  End Function devfs_open  */
 
-static struct file_operations devfs_fops = {
+static const struct file_operations devfs_fops = {
 	.open = devfs_open,
 };
 
-static struct file_operations devfs_dir_fops = {
+static const struct file_operations devfs_dir_fops = {
 	.read = generic_read_dir,
 	.readdir = devfs_readdir,
 };
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 777c614ff360..17f5b2d3c16a 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -10,7 +10,7 @@
 
 static int efs_readdir(struct file *, void *, filldir_t);
 
-struct file_operations efs_dir_operations = {
+const struct file_operations efs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= efs_readdir,
 };
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e067a06c6464..242fe1a66ce5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -290,7 +290,7 @@ static kmem_cache_t *pwq_cache __read_mostly;
 static struct vfsmount *eventpoll_mnt __read_mostly;
 
 /* File callbacks that implement the eventpoll file behaviour */
-static struct file_operations eventpoll_fops = {
+static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_close,
 	.poll		= ep_eventpoll_poll
 };
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 0165388c425c..d672aa9f4061 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -658,7 +658,7 @@ not_empty:
 	return 0;
 }
 
-struct file_operations ext2_dir_operations = {
+const struct file_operations ext2_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext2_readdir,
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 11035ac7986f..9f74a62be555 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -154,12 +154,12 @@ extern void ext2_write_super (struct super_block *);
  */
 
 /* dir.c */
-extern struct file_operations ext2_dir_operations;
+extern const struct file_operations ext2_dir_operations;
 
 /* file.c */
 extern struct inode_operations ext2_file_inode_operations;
-extern struct file_operations ext2_file_operations;
-extern struct file_operations ext2_xip_file_operations;
+extern const struct file_operations ext2_file_operations;
+extern const struct file_operations ext2_xip_file_operations;
 
 /* inode.c */
 extern struct address_space_operations ext2_aops;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a484412fc782..509cceca04db 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -39,7 +39,7 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
  * We have mostly NULL's here: the current defaults are ok for
  * the ext2 filesystem.
  */
-struct file_operations ext2_file_operations = {
+const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
@@ -56,7 +56,7 @@ struct file_operations ext2_file_operations = {
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
-struct file_operations ext2_xip_file_operations = {
+const struct file_operations ext2_xip_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= xip_file_read,
 	.write		= xip_file_write,
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 38bd3f6ec147..f37528ed222e 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -39,7 +39,7 @@ static int ext3_dx_readdir(struct file * filp,
 static int ext3_release_dir (struct inode * inode,
 				struct file * filp);
 
-struct file_operations ext3_dir_operations = {
+const struct file_operations ext3_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 59098ea56711..783a796220bb 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -105,7 +105,7 @@ force_commit:
 	return ret;
 }
 
-struct file_operations ext3_file_operations = {
+const struct file_operations ext3_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4095bc149eb1..698b85bb1dd4 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -741,7 +741,7 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
 	return ret;
 }
 
-struct file_operations fat_dir_operations = {
+const struct file_operations fat_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= fat_readdir,
 	.ioctl		= fat_dir_ioctl,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 88aa1ae13f9f..1ee25232e6af 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -112,7 +112,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	}
 }
 
-struct file_operations fat_file_operations = {
+const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
diff --git a/fs/fifo.c b/fs/fifo.c
index d13fcd3ec803..889f722ee36d 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -145,6 +145,6 @@ err_nocleanup:
  * is contain the open that then fills in the correct operations
  * depending on the access mode of the file...
  */
-struct file_operations def_fifo_fops = {
+const struct file_operations def_fifo_fops = {
 	.open		= fifo_open,	/* will set read or write pipe_fops */
 };
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 927acf70c591..1cf1fe8466a2 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -63,7 +63,7 @@ extern void			vxfs_clear_inode(struct inode *);
 
 /* vxfs_lookup.c */
 extern struct inode_operations	vxfs_dir_inode_ops;
-extern struct file_operations	vxfs_dir_operations;
+extern const struct file_operations	vxfs_dir_operations;
 
 /* vxfs_olt.c */
 extern int			vxfs_read_olt(struct super_block *, u_long);
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 554eb455722c..29cce456c7ce 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -56,7 +56,7 @@ struct inode_operations vxfs_dir_inode_ops = {
 	.lookup =		vxfs_lookup,
 };
 
-struct file_operations vxfs_dir_operations = {
+const struct file_operations vxfs_dir_operations = {
 	.readdir =		vxfs_readdir,
 };
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0c9a2ee54c91..23d1f52eb1b8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -922,7 +922,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-struct file_operations fuse_dev_operations = {
+const struct file_operations fuse_dev_operations = {
 	.owner		= THIS_MODULE,
 	.llseek		= no_llseek,
 	.read		= fuse_dev_read,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c72a8a97935c..256355b80256 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1170,7 +1170,7 @@ static struct inode_operations fuse_dir_inode_operations = {
 	.removexattr	= fuse_removexattr,
 };
 
-static struct file_operations fuse_dir_operations = {
+static const struct file_operations fuse_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= fuse_readdir,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6f05379b0a0d..975f2697e866 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 
-static struct file_operations fuse_direct_io_file_operations;
+static const struct file_operations fuse_direct_io_file_operations;
 
 static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 			  struct fuse_open_out *outargp)
@@ -611,7 +611,7 @@ static int fuse_set_page_dirty(struct page *page)
 	return 0;
 }
 
-static struct file_operations fuse_file_operations = {
+static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
@@ -623,7 +623,7 @@ static struct file_operations fuse_file_operations = {
 	.sendfile	= generic_file_sendfile,
 };
 
-static struct file_operations fuse_direct_io_file_operations = {
+static const struct file_operations fuse_direct_io_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= fuse_direct_read,
 	.write		= fuse_direct_write,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4a83adfec968..a16a04fcf41e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -346,7 +346,7 @@ static inline u64 get_node_id(struct inode *inode)
 }
 
 /** Device operations */
-extern struct file_operations fuse_dev_operations;
+extern const struct file_operations fuse_dev_operations;
 
 /**
  * This is the single global spinlock which protects FUSE's structures
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 534e5a7480ef..7cd8cc03aea7 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -313,7 +313,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return res;
 }
 
-struct file_operations hfs_dir_operations = {
+const struct file_operations hfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= hfs_readdir,
 	.llseek		= generic_file_llseek,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 18ce47ab1b71..3ed8663a8db1 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -169,7 +169,7 @@ extern int hfs_cat_move(u32, struct inode *, struct qstr *,
 extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *);
 
 /* dir.c */
-extern struct file_operations hfs_dir_operations;
+extern const struct file_operations hfs_dir_operations;
 extern struct inode_operations hfs_dir_inode_operations;
 
 /* extent.c */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2c564701724f..2d4ced22201b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -17,7 +17,7 @@
 #include "hfs_fs.h"
 #include "btree.h"
 
-static struct file_operations hfs_file_operations;
+static const struct file_operations hfs_file_operations;
 static struct inode_operations hfs_file_inode_operations;
 
 /*================ Variable-like macros ================*/
@@ -601,7 +601,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 }
 
 
-static struct file_operations hfs_file_operations = {
+static const struct file_operations hfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 01a6fe3a395c..1f9ece0de326 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -483,7 +483,7 @@ struct inode_operations hfsplus_dir_inode_operations = {
 	.rename		= hfsplus_rename,
 };
 
-struct file_operations hfsplus_dir_operations = {
+const struct file_operations hfsplus_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= hfsplus_readdir,
 	.ioctl          = hfsplus_ioctl,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 9fbe4d2aeece..acf66dba3e01 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -280,7 +280,7 @@ static struct inode_operations hfsplus_file_inode_operations = {
 	.listxattr	= hfsplus_listxattr,
 };
 
-static struct file_operations hfsplus_file_operations = {
+static const struct file_operations hfsplus_file_operations = {
 	.llseek 	= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b3ad0bd0312f..bf0f8e16e433 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -384,7 +384,7 @@ int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync);
 }
 
-static struct file_operations hostfs_file_fops = {
+static const struct file_operations hostfs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.sendfile	= generic_file_sendfile,
@@ -399,7 +399,7 @@ static struct file_operations hostfs_file_fops = {
 	.fsync		= hostfs_fsync,
 };
 
-static struct file_operations hostfs_dir_fops = {
+static const struct file_operations hostfs_dir_fops = {
 	.llseek		= generic_file_llseek,
 	.readdir	= hostfs_readdir,
 	.read		= generic_read_dir,
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 5591f9623aa2..ecc9180645ae 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -310,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	return ERR_PTR(-ENOENT);
 }
 
-struct file_operations hpfs_dir_ops =
+const struct file_operations hpfs_dir_ops =
 {
 	.llseek		= hpfs_dir_lseek,
 	.read		= generic_read_dir,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 7c995ac4081b..d3b9fffe45a1 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -119,7 +119,7 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
 	return retval;
 }
 
-struct file_operations hpfs_file_ops =
+const struct file_operations hpfs_file_ops =
 {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 4c6473ab3b34..29b7a3e55173 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -240,7 +240,7 @@ void hpfs_set_dentry_operations(struct dentry *);
 /* dir.c */
 
 struct dentry *hpfs_lookup(struct inode *, struct dentry *, struct nameidata *);
-extern struct file_operations hpfs_dir_ops;
+extern const struct file_operations hpfs_dir_ops;
 
 /* dnode.c */
 
@@ -266,7 +266,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int);
 /* file.c */
 
 int hpfs_file_fsync(struct file *, struct dentry *, int);
-extern struct file_operations hpfs_file_ops;
+extern const struct file_operations hpfs_file_ops;
 extern struct inode_operations hpfs_file_iops;
 extern struct address_space_operations hpfs_aops;
 
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index a44dc5897399..2ba20cdb5baa 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -558,7 +558,7 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
 	return(default_llseek(file, off, where));
 }
 
-static struct file_operations hppfs_file_fops = {
+static const struct file_operations hppfs_file_fops = {
 	.owner		= NULL,
 	.llseek		= hppfs_llseek,
 	.read		= hppfs_read,
@@ -609,7 +609,7 @@ static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	return(0);
 }
 
-static struct file_operations hppfs_dir_fops = {
+static const struct file_operations hppfs_dir_fops = {
 	.owner		= NULL,
 	.readdir	= hppfs_readdir,
 	.open		= hppfs_dir_open,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 25fa8bba8cb5..3a5b4e923455 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -35,7 +35,7 @@
 
 static struct super_operations hugetlbfs_ops;
 static struct address_space_operations hugetlbfs_aops;
-struct file_operations hugetlbfs_file_operations;
+const struct file_operations hugetlbfs_file_operations;
 static struct inode_operations hugetlbfs_dir_inode_operations;
 static struct inode_operations hugetlbfs_inode_operations;
 
@@ -566,7 +566,7 @@ static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
 		inode_init_once(&ei->vfs_inode);
 }
 
-struct file_operations hugetlbfs_file_operations = {
+const struct file_operations hugetlbfs_file_operations = {
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= simple_sync_file,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
diff --git a/fs/inotify.c b/fs/inotify.c
index f48a3dae0712..367c487c014b 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -920,7 +920,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 	return ret;
 }
 
-static struct file_operations inotify_fops = {
+static const struct file_operations inotify_fops = {
 	.poll           = inotify_poll,
 	.read           = inotify_read,
 	.release        = inotify_release,
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 7901ac9f97ab..5440ea292c69 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -16,7 +16,7 @@
 
 static int isofs_readdir(struct file *, void *, filldir_t);
 
-struct file_operations isofs_dir_operations =
+const struct file_operations isofs_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= isofs_readdir,
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 439a19b1bf3e..b87ba066f5e7 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -175,6 +175,6 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de,
 }
 
 extern struct inode_operations isofs_dir_inode_operations;
-extern struct file_operations isofs_dir_operations;
+extern const struct file_operations isofs_dir_operations;
 extern struct address_space_operations isofs_symlink_aops;
 extern struct export_operations isofs_export_ops;
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 5a4519e834da..020cc097c539 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -55,9 +55,9 @@
 static int jffs_remove(struct inode *dir, struct dentry *dentry, int type);
 
 static struct super_operations jffs_ops;
-static struct file_operations jffs_file_operations;
+static const struct file_operations jffs_file_operations;
 static struct inode_operations jffs_file_inode_operations;
-static struct file_operations jffs_dir_operations;
+static const struct file_operations jffs_dir_operations;
 static struct inode_operations jffs_dir_inode_operations;
 static struct address_space_operations jffs_address_operations;
 
@@ -1629,7 +1629,7 @@ static int jffs_fsync(struct file *f, struct dentry *d, int datasync)
 }
 
 
-static struct file_operations jffs_file_operations =
+static const struct file_operations jffs_file_operations =
 {
 	.open		= generic_file_open,
 	.llseek		= generic_file_llseek,
@@ -1649,7 +1649,7 @@ static struct inode_operations jffs_file_inode_operations =
 };
 
 
-static struct file_operations jffs_dir_operations =
+static const struct file_operations jffs_dir_operations =
 {
 	.readdir	= jffs_readdir,
 };
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index a7bf9cb2567f..8bc7a5018e40 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -37,7 +37,7 @@ static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t);
 static int jffs2_rename (struct inode *, struct dentry *,
                         struct inode *, struct dentry *);
 
-struct file_operations jffs2_dir_operations =
+const struct file_operations jffs2_dir_operations =
 {
 	.read =		generic_read_dir,
 	.readdir =	jffs2_readdir,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 935f273dc57b..9f4171213e58 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -38,7 +38,7 @@ int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	return 0;
 }
 
-struct file_operations jffs2_file_operations =
+const struct file_operations jffs2_file_operations =
 {
 	.llseek =	generic_file_llseek,
 	.open =		generic_file_open,
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 59e7a393200c..d307cf548625 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -159,11 +159,11 @@ void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c);
 void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c);
 
 /* dir.c */
-extern struct file_operations jffs2_dir_operations;
+extern const struct file_operations jffs2_dir_operations;
 extern struct inode_operations jffs2_dir_inode_operations;
 
 /* file.c */
-extern struct file_operations jffs2_file_operations;
+extern const struct file_operations jffs2_file_operations;
 extern struct inode_operations jffs2_file_inode_operations;
 extern struct address_space_operations jffs2_file_address_operations;
 int jffs2_fsync(struct file *, struct dentry *, int);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index e1ac6e497e2b..1c9745be5ada 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -100,7 +100,7 @@ struct inode_operations jfs_file_inode_operations = {
 #endif
 };
 
-struct file_operations jfs_file_operations = {
+const struct file_operations jfs_file_operations = {
 	.open		= jfs_open,
 	.llseek		= generic_file_llseek,
 	.write		= generic_file_write,
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 095d471b9f9a..c30072674464 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -35,9 +35,9 @@ extern void jfs_set_inode_flags(struct inode *);
 
 extern struct address_space_operations jfs_aops;
 extern struct inode_operations jfs_dir_inode_operations;
-extern struct file_operations jfs_dir_operations;
+extern const struct file_operations jfs_dir_operations;
 extern struct inode_operations jfs_file_inode_operations;
-extern struct file_operations jfs_file_operations;
+extern const struct file_operations jfs_file_operations;
 extern struct inode_operations jfs_symlink_inode_operations;
 extern struct dentry_operations jfs_ci_dentry_operations;
 #endif				/* _H_JFS_INODE */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 309cee575f7d..09ea03f62277 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1519,7 +1519,7 @@ struct inode_operations jfs_dir_inode_operations = {
 #endif
 };
 
-struct file_operations jfs_dir_operations = {
+const struct file_operations jfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= jfs_readdir,
 	.fsync		= jfs_fsync,
diff --git a/fs/libfs.c b/fs/libfs.c
index 4fdeaceb892c..7145ba7a48d0 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -179,7 +179,7 @@ ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t
 	return -EISDIR;
 }
 
-struct file_operations simple_dir_operations = {
+const struct file_operations simple_dir_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.llseek		= dcache_dir_lseek,
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 732502aabc05..69224d1fe043 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -14,7 +14,7 @@ typedef struct minix_dir_entry minix_dirent;
 
 static int minix_readdir(struct file *, void *, filldir_t);
 
-struct file_operations minix_dir_operations = {
+const struct file_operations minix_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= minix_readdir,
 	.fsync		= minix_sync_file,
diff --git a/fs/minix/file.c b/fs/minix/file.c
index f1d77acb3f01..420b32882a10 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -15,7 +15,7 @@
  */
 int minix_sync_file(struct file *, struct dentry *, int);
 
-struct file_operations minix_file_operations = {
+const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index e42a8bb89001..c55b77cdcc8e 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -81,8 +81,8 @@ extern int minix_sync_file(struct file *, struct dentry *, int);
 
 extern struct inode_operations minix_file_inode_operations;
 extern struct inode_operations minix_dir_inode_operations;
-extern struct file_operations minix_file_operations;
-extern struct file_operations minix_dir_operations;
+extern const struct file_operations minix_file_operations;
+extern const struct file_operations minix_dir_operations;
 extern struct dentry_operations minix_dentry_operations;
 
 static inline struct minix_sb_info *minix_sb(struct super_block *sb)
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index cfd76f431dc0..f0860c602d8b 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,7 +49,7 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
 #define ncp_symlink NULL
 #endif
 		      
-struct file_operations ncp_dir_operations =
+const struct file_operations ncp_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= ncp_readdir,
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index ebdad8f6398f..e6b7c67cf057 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -283,7 +283,7 @@ static int ncp_release(struct inode *inode, struct file *file) {
 	return 0;
 }
 
-struct file_operations ncp_file_operations =
+const struct file_operations ncp_file_operations =
 {
 	.llseek		= remote_llseek,
 	.read		= ncp_file_read,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06c48b385c94..a23f34894167 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -54,7 +54,7 @@ static int nfs_rename(struct inode *, struct dentry *,
 static int nfs_fsync_dir(struct file *, struct dentry *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 
-struct file_operations nfs_dir_operations = {
+const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
 	.read		= generic_read_dir,
 	.readdir	= nfs_readdir,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index dee49a0cb995..f1df2c8d9259 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -49,7 +49,7 @@ static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
 
-struct file_operations nfs_file_operations = {
+const struct file_operations nfs_file_operations = {
 	.llseek		= nfs_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c8960aff0968..3ef017b3b5bd 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -134,7 +134,7 @@ static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size
 	return simple_transaction_read(file, buf, size, pos);
 }
 
-static struct file_operations transaction_ops = {
+static const struct file_operations transaction_ops = {
 	.write		= nfsctl_transaction_write,
 	.read		= nfsctl_transaction_read,
 	.release	= simple_transaction_release,
@@ -146,7 +146,7 @@ static int exports_open(struct inode *inode, struct file *file)
 	return seq_open(file, &nfs_exports_op);
 }
 
-static struct file_operations exports_operations = {
+static const struct file_operations exports_operations = {
 	.open		= exports_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 1cf955bcc526..57265d563804 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -80,7 +80,7 @@ static int nfsd_proc_open(struct inode *inode, struct file *file)
 	return single_open(file, nfsd_proc_show, NULL);
 }
 
-static struct file_operations nfsd_proc_fops = {
+static const struct file_operations nfsd_proc_fops = {
 	.owner = THIS_MODULE,
 	.open = nfsd_proc_open,
 	.read  = seq_read,
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9d9ed3fe371d..d1e2c6f9f05e 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1553,7 +1553,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
 
 #endif /* NTFS_RW */
 
-struct file_operations ntfs_dir_ops = {
+const struct file_operations ntfs_dir_ops = {
 	.llseek		= generic_file_llseek,	/* Seek inside directory. */
 	.read		= generic_read_dir,	/* Return -EISDIR. */
 	.readdir	= ntfs_readdir,		/* Read directory contents. */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f5d057e4acc2..c63a83e8da98 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2294,7 +2294,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 
 #endif /* NTFS_RW */
 
-struct file_operations ntfs_file_ops = {
+const struct file_operations ntfs_file_ops = {
 	.llseek		= generic_file_llseek,	 /* Seek inside file. */
 	.read		= generic_file_read,	 /* Read from file. */
 	.aio_read	= generic_file_aio_read, /* Async read from file. */
@@ -2337,6 +2337,6 @@ struct inode_operations ntfs_file_inode_ops = {
 #endif /* NTFS_RW */
 };
 
-struct file_operations ntfs_empty_file_ops = {};
+const struct file_operations ntfs_empty_file_ops = {};
 
 struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index 166142960b53..bf7b3d7c0930 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -60,13 +60,13 @@ extern struct kmem_cache *ntfs_index_ctx_cache;
 extern struct address_space_operations ntfs_aops;
 extern struct address_space_operations ntfs_mst_aops;
 
-extern struct  file_operations ntfs_file_ops;
+extern const struct  file_operations ntfs_file_ops;
 extern struct inode_operations ntfs_file_inode_ops;
 
-extern struct  file_operations ntfs_dir_ops;
+extern const struct  file_operations ntfs_dir_ops;
 extern struct inode_operations ntfs_dir_inode_ops;
 
-extern struct  file_operations ntfs_empty_file_ops;
+extern const struct  file_operations ntfs_empty_file_ops;
 extern struct inode_operations ntfs_empty_inode_ops;
 
 extern struct export_operations ntfs_export_ops;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 84f153aca692..64cd52860c87 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2017,7 +2017,7 @@ out:
 	return ret;
 }
 
-static struct file_operations ocfs2_dlm_debug_fops = {
+static const struct file_operations ocfs2_dlm_debug_fops = {
 	.open =		ocfs2_dlm_debug_open,
 	.release =	ocfs2_dlm_debug_release,
 	.read =		seq_read,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4b4cbadd5838..34e903a6a46b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1176,7 +1176,7 @@ struct inode_operations ocfs2_special_file_iops = {
 	.getattr	= ocfs2_getattr,
 };
 
-struct file_operations ocfs2_fops = {
+const struct file_operations ocfs2_fops = {
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.sendfile	= generic_file_sendfile,
@@ -1188,7 +1188,7 @@ struct file_operations ocfs2_fops = {
 	.aio_write	= ocfs2_file_aio_write,
 };
 
-struct file_operations ocfs2_dops = {
+const struct file_operations ocfs2_dops = {
 	.read		= generic_read_dir,
 	.readdir	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a5ea33b24060..740c9e7ca599 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -26,8 +26,8 @@
 #ifndef OCFS2_FILE_H
 #define OCFS2_FILE_H
 
-extern struct file_operations ocfs2_fops;
-extern struct file_operations ocfs2_dops;
+extern const struct file_operations ocfs2_fops;
+extern const struct file_operations ocfs2_dops;
 extern struct inode_operations ocfs2_file_iops;
 extern struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index aeb0106890e4..0f14276a2e51 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -581,17 +581,17 @@ int property_release (struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static struct file_operations openpromfs_prop_ops = {
+static const struct file_operations openpromfs_prop_ops = {
 	.read		= property_read,
 	.write		= property_write,
 	.release	= property_release,
 };
 
-static struct file_operations openpromfs_nodenum_ops = {
+static const struct file_operations openpromfs_nodenum_ops = {
 	.read		= nodenum_read,
 };
 
-static struct file_operations openprom_operations = {
+static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
 	.readdir	= openpromfs_readdir,
 };
diff --git a/fs/pipe.c b/fs/pipe.c
index 4384c9290943..e2f4f1d9ffc2 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -568,7 +568,7 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
  * The file_operations structs are not static because they
  * are also used in linux/fs/fifo.c to do operations on FIFOs.
  */
-struct file_operations read_fifo_fops = {
+const struct file_operations read_fifo_fops = {
 	.llseek		= no_llseek,
 	.read		= pipe_read,
 	.readv		= pipe_readv,
@@ -580,7 +580,7 @@ struct file_operations read_fifo_fops = {
 	.fasync		= pipe_read_fasync,
 };
 
-struct file_operations write_fifo_fops = {
+const struct file_operations write_fifo_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
 	.write		= pipe_write,
@@ -592,7 +592,7 @@ struct file_operations write_fifo_fops = {
 	.fasync		= pipe_write_fasync,
 };
 
-struct file_operations rdwr_fifo_fops = {
+const struct file_operations rdwr_fifo_fops = {
 	.llseek		= no_llseek,
 	.read		= pipe_read,
 	.readv		= pipe_readv,
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index adc2cd95169a..17f6e8fa1397 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -31,7 +31,7 @@ static int open_kcore(struct inode * inode, struct file * filp)
 
 static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
 
-struct file_operations proc_kcore_operations = {
+const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
 };
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 10d37bf25206..ff3b90b56e9d 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -47,7 +47,7 @@ static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 }
 
 
-struct file_operations proc_kmsg_operations = {
+const struct file_operations proc_kmsg_operations = {
 	.read		= kmsg_read,
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4063fb32f78c..7efa73d44c9a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -172,7 +172,7 @@ static int open_vmcore(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-struct file_operations proc_vmcore_operations = {
+const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
 	.open		= open_vmcore,
 };
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 7a8f5595c26f..9031948fefd0 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -81,7 +81,7 @@ out:
 	return 0;
 }
 
-struct file_operations qnx4_dir_operations =
+const struct file_operations qnx4_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index c33963fded9e..62af4b1348bd 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -19,7 +19,7 @@
  * We have mostly NULL's here: the current defaults are ok for
  * the qnx4 filesystem.
  */
-struct file_operations qnx4_file_operations =
+const struct file_operations qnx4_file_operations =
 {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 6ada2095b9ac..00a933eb820c 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -32,7 +32,7 @@ struct address_space_operations ramfs_aops = {
 	.commit_write	= simple_commit_write
 };
 
-struct file_operations ramfs_file_operations = {
+const struct file_operations ramfs_file_operations = {
 	.read		= generic_file_read,
 	.write		= generic_file_write,
 	.mmap		= generic_file_mmap,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index b1ca234068f6..f443a84b98a5 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -33,7 +33,7 @@ struct address_space_operations ramfs_aops = {
 	.commit_write		= simple_commit_write
 };
 
-struct file_operations ramfs_file_operations = {
+const struct file_operations ramfs_file_operations = {
 	.mmap			= ramfs_nommu_mmap,
 	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
 	.read			= generic_file_read,
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 272c8a7120b0..313237631b49 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -11,5 +11,5 @@
 
 
 extern struct address_space_operations ramfs_aops;
-extern struct file_operations ramfs_file_operations;
+extern const struct file_operations ramfs_file_operations;
 extern struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/read_write.c b/fs/read_write.c
index 34b1bf259efd..6256ca81a718 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -19,7 +19,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
-struct file_operations generic_ro_fops = {
+const struct file_operations generic_ro_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.mmap		= generic_file_readonly_mmap,
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index d71ac6579289..973c819f8033 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -18,7 +18,7 @@ static int reiserfs_readdir(struct file *, void *, filldir_t);
 static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 			      int datasync);
 
-struct file_operations reiserfs_dir_operations = {
+const struct file_operations reiserfs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index d0c1e865963e..010094d14da6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1566,7 +1566,7 @@ static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
 	return generic_file_aio_write(iocb, buf, count, pos);
 }
 
-struct file_operations reiserfs_file_operations = {
+const struct file_operations reiserfs_file_operations = {
 	.read = generic_file_read,
 	.write = reiserfs_file_write,
 	.ioctl = reiserfs_ioctl,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index ef6caed9336b..731688e1cfe3 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -470,7 +470,7 @@ static int r_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static struct file_operations r_file_operations = {
+static const struct file_operations r_file_operations = {
 	.open = r_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index c2fc424d7d5c..9b9eda7b335c 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -463,7 +463,7 @@ static struct address_space_operations romfs_aops = {
 	.readpage = romfs_readpage
 };
 
-static struct file_operations romfs_dir_operations = {
+static const struct file_operations romfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= romfs_readdir,
 };
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 0424d06b147e..34c7a11d91f0 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -34,7 +34,7 @@ static int smb_rename(struct inode *, struct dentry *,
 static int smb_make_node(struct inode *,struct dentry *,int,dev_t);
 static int smb_link(struct dentry *, struct inode *, struct dentry *);
 
-struct file_operations smb_dir_operations =
+const struct file_operations smb_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= smb_readdir,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 7042e62726a4..c56bd99a9701 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -401,7 +401,7 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
 	return error;
 }
 
-struct file_operations smb_file_operations =
+const struct file_operations smb_file_operations =
 {
 	.llseek		= remote_llseek,
 	.read		= smb_file_read,
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index e866ec8660d0..47664597e6b1 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -35,7 +35,7 @@ extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const
 extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
 extern void smb_install_null_ops(struct smb_ops *ops);
 /* dir.c */
-extern struct file_operations smb_dir_operations;
+extern const struct file_operations smb_dir_operations;
 extern struct inode_operations smb_dir_inode_operations;
 extern struct inode_operations smb_dir_inode_operations_unix;
 extern void smb_new_dentry(struct dentry *dentry);
@@ -64,7 +64,7 @@ extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
 extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
 /* file.c */
 extern struct address_space_operations smb_file_aops;
-extern struct file_operations smb_file_operations;
+extern const struct file_operations smb_file_operations;
 extern struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
 extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 78899eeab974..c16a93c353c0 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -163,7 +163,7 @@ static int release(struct inode * inode, struct file * file)
 	return 0;
 }
 
-struct file_operations bin_fops = {
+const struct file_operations bin_fops = {
 	.read		= read,
 	.write		= write,
 	.mmap		= mmap,
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 9ee956864445..f26880a4785e 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -503,7 +503,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
 	return offset;
 }
 
-struct file_operations sysfs_dir_operations = {
+const struct file_operations sysfs_dir_operations = {
 	.open		= sysfs_dir_open,
 	.release	= sysfs_dir_close,
 	.llseek		= sysfs_dir_lseek,
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 5e83e7246788..830f76fa098c 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -348,7 +348,7 @@ static int sysfs_release(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-struct file_operations sysfs_file_operations = {
+const struct file_operations sysfs_file_operations = {
 	.read		= sysfs_read_file,
 	.write		= sysfs_write_file,
 	.llseek		= generic_file_llseek,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index cf11d5b789d9..32958a7c50e9 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -21,9 +21,9 @@ extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 
 extern struct rw_semaphore sysfs_rename_sem;
 extern struct super_block * sysfs_sb;
-extern struct file_operations sysfs_dir_operations;
-extern struct file_operations sysfs_file_operations;
-extern struct file_operations bin_fops;
+extern const struct file_operations sysfs_dir_operations;
+extern const struct file_operations sysfs_file_operations;
+extern const struct file_operations bin_fops;
 extern struct inode_operations sysfs_dir_inode_operations;
 extern struct inode_operations sysfs_symlink_inode_operations;
 
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index cce8b05cba5a..8c66e9270dd6 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -20,7 +20,7 @@
 
 static int sysv_readdir(struct file *, void *, filldir_t);
 
-struct file_operations sysv_dir_operations = {
+const struct file_operations sysv_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysv_readdir,
 	.fsync		= sysv_sync_file,
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index da69abc06240..a59e303135fa 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -19,7 +19,7 @@
  * We have mostly NULLs here: the current defaults are OK for
  * the coh filesystem.
  */
-struct file_operations sysv_file_operations = {
+const struct file_operations sysv_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index b7f9b4a42aab..393a480e4deb 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -159,8 +159,8 @@ extern ino_t sysv_inode_by_name(struct dentry *);
 extern struct inode_operations sysv_file_inode_operations;
 extern struct inode_operations sysv_dir_inode_operations;
 extern struct inode_operations sysv_fast_symlink_inode_operations;
-extern struct file_operations sysv_file_operations;
-extern struct file_operations sysv_dir_operations;
+extern const struct file_operations sysv_file_operations;
+extern const struct file_operations sysv_dir_operations;
 extern struct address_space_operations sysv_aops;
 extern struct super_operations sysv_sops;
 extern struct dentry_operations sysv_dentry_operations;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f5222527fe39..8c28efa3b8ff 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -42,7 +42,7 @@ static int do_udf_readdir(struct inode *, struct file *, filldir_t, void *);
 
 /* readdir and lookup functions */
 
-struct file_operations udf_dir_operations = {
+const struct file_operations udf_dir_operations = {
 	.read			= generic_read_dir,
 	.readdir		= udf_readdir,
 	.ioctl			= udf_ioctl,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index a6f2acc1f15c..e34b00e303f1 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -248,7 +248,7 @@ static int udf_release_file(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-struct file_operations udf_file_operations = {
+const struct file_operations udf_file_operations = {
 	.read			= generic_file_read,
 	.ioctl			= udf_ioctl,
 	.open			= generic_file_open,
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 1d5800e0cbe7..023e19ba5a2e 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -44,9 +44,9 @@ struct buffer_head;
 struct super_block;
 
 extern struct inode_operations udf_dir_inode_operations;
-extern struct file_operations udf_dir_operations;
+extern const struct file_operations udf_dir_operations;
 extern struct inode_operations udf_file_inode_operations;
-extern struct file_operations udf_file_operations;
+extern const struct file_operations udf_file_operations;
 extern struct address_space_operations udf_aops;
 extern struct address_space_operations udf_adinicb_aops;
 extern struct address_space_operations udf_symlink_aops;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 7c10c68902ae..1a561202d3f4 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -620,7 +620,7 @@ int ufs_empty_dir (struct inode * inode)
 	return 1;
 }
 
-struct file_operations ufs_dir_operations = {
+const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
 	.fsync		= file_fsync,
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 62ad481810ef..312fd3f86313 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -31,7 +31,7 @@
  * the ufs filesystem.
  */
  
-struct file_operations ufs_file_operations = {
+const struct file_operations ufs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
 	.write		= generic_file_write,
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 185567a6a561..85997b1205f5 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -528,7 +528,7 @@ open_exec_out:
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
 
-struct file_operations xfs_file_operations = {
+const struct file_operations xfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
@@ -550,7 +550,7 @@ struct file_operations xfs_file_operations = {
 #endif
 };
 
-struct file_operations xfs_invis_file_operations = {
+const struct file_operations xfs_invis_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
@@ -570,7 +570,7 @@ struct file_operations xfs_invis_file_operations = {
 };
 
 
-struct file_operations xfs_dir_file_operations = {
+const struct file_operations xfs_dir_file_operations = {
 	.read		= generic_read_dir,
 	.readdir	= xfs_file_readdir,
 	.unlocked_ioctl	= xfs_file_ioctl,
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index a8417d7af5f9..ad6173da5678 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -22,9 +22,9 @@ extern struct inode_operations xfs_inode_operations;
 extern struct inode_operations xfs_dir_inode_operations;
 extern struct inode_operations xfs_symlink_inode_operations;
 
-extern struct file_operations xfs_file_operations;
-extern struct file_operations xfs_dir_file_operations;
-extern struct file_operations xfs_invis_file_operations;
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+extern const struct file_operations xfs_invis_file_operations;
 
 extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *,
                         int, unsigned int, void __user *);
diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h
index cc621ec409d8..b3ecf8f71d97 100644
--- a/include/linux/coda_linux.h
+++ b/include/linux/coda_linux.h
@@ -30,9 +30,9 @@ extern struct inode_operations coda_ioctl_inode_operations;
 extern struct address_space_operations coda_file_aops;
 extern struct address_space_operations coda_symlink_aops;
 
-extern struct file_operations coda_dir_operations;
-extern struct file_operations coda_file_operations;
-extern struct file_operations coda_ioctl_operations;
+extern const struct file_operations coda_dir_operations;
+extern const struct file_operations coda_file_operations;
+extern const struct file_operations coda_ioctl_operations;
 
 /* operations shared over more than one file */
 int coda_open(struct inode *i, struct file *f);
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 534d750d922d..32503657f14f 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -11,7 +11,7 @@
 extern unsigned long long elfcorehdr_addr;
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
-extern struct file_operations proc_vmcore_operations;
+extern const struct file_operations proc_vmcore_operations;
 extern struct proc_dir_entry *proc_vmcore;
 
 #endif /* CONFIG_CRASH_DUMP */
diff --git a/include/linux/efs_fs.h b/include/linux/efs_fs.h
index 28f368c526fb..fbfa6b52e2fb 100644
--- a/include/linux/efs_fs.h
+++ b/include/linux/efs_fs.h
@@ -37,7 +37,7 @@ static inline struct efs_sb_info *SUPER_INFO(struct super_block *sb)
 struct statfs;
 
 extern struct inode_operations efs_dir_inode_operations;
-extern struct file_operations efs_dir_operations;
+extern const struct file_operations efs_dir_operations;
 extern struct address_space_operations efs_symlink_aops;
 
 extern void efs_read_inode(struct inode *);
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 8bb4f842cded..3ade6a4e3bdd 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -833,11 +833,11 @@ do {								\
  */
 
 /* dir.c */
-extern struct file_operations ext3_dir_operations;
+extern const struct file_operations ext3_dir_operations;
 
 /* file.c */
 extern struct inode_operations ext3_file_inode_operations;
-extern struct file_operations ext3_file_operations;
+extern const struct file_operations ext3_file_operations;
 
 /* namei.c */
 extern struct inode_operations ext3_dir_inode_operations;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ef355bc73714..408fe89498f4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1390,11 +1390,11 @@ extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern struct block_device *open_by_devnum(dev_t, unsigned);
-extern struct file_operations def_blk_fops;
+extern const struct file_operations def_blk_fops;
 extern struct address_space_operations def_blk_aops;
-extern struct file_operations def_chr_fops;
-extern struct file_operations bad_sock_fops;
-extern struct file_operations def_fifo_fops;
+extern const struct file_operations def_chr_fops;
+extern const struct file_operations bad_sock_fops;
+extern const struct file_operations def_fifo_fops;
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
@@ -1444,9 +1444,9 @@ extern void init_special_inode(struct inode *, umode_t, dev_t);
 extern void make_bad_inode(struct inode *);
 extern int is_bad_inode(struct inode *);
 
-extern struct file_operations read_fifo_fops;
-extern struct file_operations write_fifo_fops;
-extern struct file_operations rdwr_fifo_fops;
+extern const struct file_operations read_fifo_fops;
+extern const struct file_operations write_fifo_fops;
+extern const struct file_operations rdwr_fifo_fops;
 
 extern int fs_may_remount_ro(struct super_block *);
 
@@ -1688,7 +1688,7 @@ static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
 				nr_segs, get_block, end_io, DIO_OWN_LOCKING);
 }
 
-extern struct file_operations generic_ro_fops;
+extern const struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
 
@@ -1744,9 +1744,9 @@ extern int simple_commit_write(struct file *file, struct page *page,
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
-extern struct file_operations simple_dir_operations;
+extern const struct file_operations simple_dir_operations;
 extern struct inode_operations simple_dir_inode_operations;
-struct tree_descr { char *name; struct file_operations *ops; int mode; };
+struct tree_descr { char *name; const struct file_operations *ops; int mode; };
 struct dentry *d_alloc_name(struct dentry *, const char *);
 extern int simple_fill_super(struct super_block *, int, struct tree_descr *);
 extern int simple_pin_fs(char *name, struct vfsmount **mount, int *count);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d6f1019625af..4c5e610fe442 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -154,7 +154,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
-extern struct file_operations hugetlbfs_file_operations;
+extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
 int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 53cee1581650..d9035c73e5d1 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -334,7 +334,7 @@ extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
 		    unsigned long *mapped_blocks);
 
 /* fat/dir.c */
-extern struct file_operations fat_dir_operations;
+extern const struct file_operations fat_dir_operations;
 extern int fat_search_long(struct inode *inode, const unsigned char *name,
 			   int name_len, struct fat_slot_info *sinfo);
 extern int fat_dir_empty(struct inode *dir);
@@ -397,7 +397,7 @@ extern int fat_count_free_clusters(struct super_block *sb);
 /* fat/file.c */
 extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
 			     unsigned int cmd, unsigned long arg);
-extern struct file_operations fat_file_operations;
+extern const struct file_operations fat_file_operations;
 extern struct inode_operations fat_file_inode_operations;
 extern int fat_notify_change(struct dentry * dentry, struct iattr * attr);
 extern void fat_truncate(struct inode *inode);
diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h
index e01342568530..96dc237b8f03 100644
--- a/include/linux/ncp_fs.h
+++ b/include/linux/ncp_fs.h
@@ -209,7 +209,7 @@ void ncp_update_inode2(struct inode *, struct ncp_entry_info *);
 
 /* linux/fs/ncpfs/dir.c */
 extern struct inode_operations ncp_dir_inode_operations;
-extern struct file_operations ncp_dir_operations;
+extern const struct file_operations ncp_dir_operations;
 int ncp_conn_logged_in(struct super_block *);
 int ncp_date_dos2unix(__le16 time, __le16 date);
 void ncp_date_unix2dos(int unix_date, __le16 * time, __le16 * date);
@@ -230,7 +230,7 @@ void ncp_unlock_server(struct ncp_server *server);
 
 /* linux/fs/ncpfs/file.c */
 extern struct inode_operations ncp_file_inode_operations;
-extern struct file_operations ncp_file_operations;
+extern const struct file_operations ncp_file_operations;
 int ncp_make_open(struct inode *, int);
 
 /* linux/fs/ncpfs/mmap.c */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index cbebd7d1b9e8..c71227dd4389 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -324,7 +324,7 @@ extern struct inode_operations nfs_file_inode_operations;
 #ifdef CONFIG_NFS_V3
 extern struct inode_operations nfs3_file_inode_operations;
 #endif /* CONFIG_NFS_V3 */
-extern struct file_operations nfs_file_operations;
+extern const struct file_operations nfs_file_operations;
 extern struct address_space_operations nfs_file_aops;
 
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
@@ -371,7 +371,7 @@ extern struct inode_operations nfs_dir_inode_operations;
 #ifdef CONFIG_NFS_V3
 extern struct inode_operations nfs3_dir_inode_operations;
 #endif /* CONFIG_NFS_V3 */
-extern struct file_operations nfs_dir_operations;
+extern const struct file_operations nfs_dir_operations;
 extern struct dentry_operations nfs_dentry_operations;
 
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 6d03d025fcd5..135871df9911 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -128,9 +128,9 @@ extern int proc_match(int, const char *,struct proc_dir_entry *);
 extern int proc_readdir(struct file *, void *, filldir_t);
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
 
-extern struct file_operations proc_kcore_operations;
-extern struct file_operations proc_kmsg_operations;
-extern struct file_operations ppc_htab_operations;
+extern const struct file_operations proc_kcore_operations;
+extern const struct file_operations proc_kmsg_operations;
+extern const struct file_operations ppc_htab_operations;
 
 /*
  * proc_tty.c
diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h
index fc610bb0f733..27f49c85d5d6 100644
--- a/include/linux/qnx4_fs.h
+++ b/include/linux/qnx4_fs.h
@@ -118,8 +118,8 @@ extern struct buffer_head *qnx4_bread(struct inode *, int, int);
 
 extern struct inode_operations qnx4_file_inode_operations;
 extern struct inode_operations qnx4_dir_inode_operations;
-extern struct file_operations qnx4_file_operations;
-extern struct file_operations qnx4_dir_operations;
+extern const struct file_operations qnx4_file_operations;
+extern const struct file_operations qnx4_dir_operations;
 extern int qnx4_is_free(struct super_block *sb, long block);
 extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
 extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index 953b6df5d037..78ecfa28b1c2 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -15,7 +15,7 @@ extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
 #endif
 
-extern struct file_operations ramfs_file_operations;
+extern const struct file_operations ramfs_file_operations;
 extern struct vm_operations_struct generic_file_vm_ops;
 
 #endif
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 912f1b7cb18f..5676c4210e2c 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1960,7 +1960,7 @@ int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
 extern struct inode_operations reiserfs_dir_inode_operations;
 extern struct inode_operations reiserfs_symlink_inode_operations;
 extern struct inode_operations reiserfs_special_inode_operations;
-extern struct file_operations reiserfs_dir_operations;
+extern const struct file_operations reiserfs_dir_operations;
 
 /* tail_conversion.c */
 int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
@@ -1972,7 +1972,7 @@ void reiserfs_unmap_buffer(struct buffer_head *);
 
 /* file.c */
 extern struct inode_operations reiserfs_file_inode_operations;
-extern struct file_operations reiserfs_file_operations;
+extern const struct file_operations reiserfs_file_operations;
 extern struct address_space_operations reiserfs_address_space_operations;
 
 /* fix_nodes.c */
diff --git a/include/linux/ufs_fs.h b/include/linux/ufs_fs.h
index b0ffe4356e5a..843aeaaa79d4 100644
--- a/include/linux/ufs_fs.h
+++ b/include/linux/ufs_fs.h
@@ -895,7 +895,7 @@ extern void ufs_set_link(struct inode *, struct ufs_dir_entry *, struct buffer_h
 
 /* file.c */
 extern struct inode_operations ufs_file_inode_operations;
-extern struct file_operations ufs_file_operations;
+extern const struct file_operations ufs_file_operations;
 
 extern struct address_space_operations ufs_aops;
 
@@ -915,7 +915,7 @@ extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
 extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
 
 /* namei.c */
-extern struct file_operations ufs_dir_operations;
+extern const struct file_operations ufs_dir_operations;
         
 /* super.c */
 extern void ufs_warning (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4)));
diff --git a/net/nonet.c b/net/nonet.c
index 1230f0ae832e..92e76640c7cd 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -19,7 +19,7 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 	return -ENXIO;
 }
 
-struct file_operations bad_sock_fops = {
+const struct file_operations bad_sock_fops = {
 	.owner = THIS_MODULE,
 	.open = sock_no_open,
 };
diff --git a/net/socket.c b/net/socket.c
index 5211ba270375..fcd77eac0ccf 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -539,7 +539,7 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 	return -ENXIO;
 }
 
-struct file_operations bad_sock_fops = {
+const struct file_operations bad_sock_fops = {
 	.owner = THIS_MODULE,
 	.open = sock_no_open,
 };
-- 
cgit v1.2.3


From 910638ae7ed4be27d6af55f6c9b5bf54b838e78b Mon Sep 17 00:00:00 2001
From: Matthias Gehre <M.Gehre@gmx.de>
Date: Tue, 28 Mar 2006 01:56:48 -0800
Subject: [PATCH] Replace 0xff.. with correct DMA_xBIT_MASK

Replace all occurences of 0xff..  in calls to function pci_set_dma_mask()
and pci_set_consistant_dma_mask() with the corresponding DMA_xBIT_MASK from
linux/dma-mapping.h.

Signed-off-by: Matthias Gehre <M.Gehre@gmx.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/DMA-mapping.txt                 | 2 ++
 drivers/atm/lanai.c                           | 2 +-
 drivers/block/umem.c                          | 5 +++--
 drivers/net/forcedeth.c                       | 3 ++-
 drivers/net/ioc3-eth.c                        | 7 ++++---
 drivers/net/ns83820.c                         | 6 +++---
 drivers/net/wan/wanxl.c                       | 4 ++--
 drivers/net/wireless/prism54/islpci_hotplug.c | 3 ++-
 drivers/scsi/BusLogic.c                       | 7 ++++---
 drivers/scsi/a100u2w.c                        | 3 ++-
 drivers/scsi/aacraid/aachba.c                 | 1 +
 drivers/scsi/aacraid/linit.c                  | 5 +++--
 drivers/scsi/atp870u.c                        | 3 ++-
 drivers/scsi/dpt_i2o.c                        | 5 +++--
 drivers/scsi/eata.c                           | 3 ++-
 drivers/scsi/gdth.c                           | 7 ++++---
 drivers/scsi/initio.c                         | 3 ++-
 drivers/scsi/ips.c                            | 5 +++--
 drivers/scsi/megaraid.c                       | 7 ++++---
 drivers/scsi/nsp32.c                          | 3 ++-
 drivers/scsi/qla1280.c                        | 5 +++--
 drivers/scsi/qlogicfc.c                       | 5 +++--
 include/linux/dma-mapping.h                   | 1 +
 sound/oss/esssolo1.c                          | 2 +-
 sound/oss/sonicvibes.c                        | 3 ++-
 sound/pci/ad1889.c                            | 1 +
 sound/pci/ali5451/ali5451.c                   | 5 +++--
 sound/pci/als4000.c                           | 5 +++--
 sound/pci/azt3328.c                           | 5 +++--
 sound/pci/emu10k1/emu10k1x.c                  | 1 +
 sound/pci/es1938.c                            | 5 +++--
 sound/pci/es1968.c                            | 1 +
 sound/pci/ice1712/ice1712.c                   | 2 ++
 sound/pci/maestro3.c                          | 1 +
 sound/pci/mixart/mixart.c                     | 2 ++
 sound/pci/pcxhr/pcxhr.c                       | 1 +
 sound/pci/sonicvibes.c                        | 5 +++--
 sound/pci/trident/trident_main.c              | 5 +++--
 38 files changed, 88 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt
index 684557474c15..ee4bb73683cd 100644
--- a/Documentation/DMA-mapping.txt
+++ b/Documentation/DMA-mapping.txt
@@ -199,6 +199,8 @@ address during PCI bus mastering you might do something like:
 		       "mydev: 24-bit DMA addressing not available.\n");
 		goto ignore_this_device;
 	}
+[Better use DMA_24BIT_MASK instead of 0x00ffffff.
+See linux/include/dma-mapping.h for reference.]
 
 When pci_set_dma_mask() is successful, and returns zero, the PCI layer
 saves away this mask you have provided.  The PCI layer will use this
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 69f4c7ce9a63..cac09e353be8 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1972,7 +1972,7 @@ static int __devinit lanai_pci_start(struct lanai_dev *lanai)
 		    "(itf %d): No suitable DMA available.\n", lanai->number);
 		return -EBUSY;
 	}
-	if (pci_set_consistent_dma_mask(pci, 0xFFFFFFFF) != 0) {
+	if (pci_set_consistent_dma_mask(pci, DMA_32BIT_MASK) != 0) {
 		printk(KERN_WARNING DEV_LABEL
 		    "(itf %d): No suitable DMA available.\n", lanai->number);
 		return -EBUSY;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index c16e66b9c7a7..f7d4c65a7b8c 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -50,6 +50,7 @@
 #include <linux/timer.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
+#include <linux/dma-mapping.h>
 
 #include <linux/fcntl.h>        /* O_ACCMODE */
 #include <linux/hdreg.h>  /* HDIO_GETGEO */
@@ -881,8 +882,8 @@ static int __devinit mm_pci_probe(struct pci_dev *dev, const struct pci_device_i
 	printk(KERN_INFO "Micro Memory(tm) controller #%d found at %02x:%02x (PCI Mem Module (Battery Backup))\n",
 	       card->card_number, dev->bus->number, dev->devfn);
 
-	if (pci_set_dma_mask(dev, 0xffffffffffffffffLL) &&
-	    pci_set_dma_mask(dev, 0xffffffffLL)) {
+	if (pci_set_dma_mask(dev, DMA_64BIT_MASK) &&
+	    pci_set_dma_mask(dev, DMA_32BIT_MASK)) {
 		printk(KERN_WARNING "MM%d: NO suitable DMA found\n",num_cards);
 		return  -ENOMEM;
 	}
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index e7fc28b07e5a..7627a75f4f7c 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -134,6 +134,7 @@
 #include <linux/random.h>
 #include <linux/init.h>
 #include <linux/if_vlan.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/irq.h>
 #include <asm/io.h>
@@ -2932,7 +2933,7 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 	if (id->driver_data & DEV_HAS_HIGH_DMA) {
 		/* packet format 3: supports 40-bit addressing */
 		np->desc_ver = DESC_VER_3;
-		if (pci_set_dma_mask(pci_dev, 0x0000007fffffffffULL)) {
+		if (pci_set_dma_mask(pci_dev, DMA_39BIT_MASK)) {
 			printk(KERN_INFO "forcedeth: 64-bit DMA failed, using 32-bit addressing for device %s.\n",
 					pci_name(pci_dev));
 		} else {
diff --git a/drivers/net/ioc3-eth.c b/drivers/net/ioc3-eth.c
index 9b8295ee06ef..ae71ed57c12d 100644
--- a/drivers/net/ioc3-eth.c
+++ b/drivers/net/ioc3-eth.c
@@ -44,6 +44,7 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
+#include <linux/dma-mapping.h>
 
 #ifdef CONFIG_SERIAL_8250
 #include <linux/serial_core.h>
@@ -1195,17 +1196,17 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	int err, pci_using_dac;
 
 	/* Configure DMA attributes. */
-	err = pci_set_dma_mask(pdev, 0xffffffffffffffffULL);
+	err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
 	if (!err) {
 		pci_using_dac = 1;
-		err = pci_set_consistent_dma_mask(pdev, 0xffffffffffffffffULL);
+		err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
 		if (err < 0) {
 			printk(KERN_ERR "%s: Unable to obtain 64 bit DMA "
 			       "for consistent allocations\n", pci_name(pdev));
 			goto out;
 		}
 	} else {
-		err = pci_set_dma_mask(pdev, 0xffffffffULL);
+		err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
 		if (err) {
 			printk(KERN_ERR "%s: No usable DMA configuration, "
 			       "aborting.\n", pci_name(pdev));
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index 0fede50abd3e..8e9b1a537dee 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -1828,10 +1828,10 @@ static int __devinit ns83820_init_one(struct pci_dev *pci_dev, const struct pci_
 	int using_dac = 0;
 
 	/* See if we can set the dma mask early on; failure is fatal. */
-	if (sizeof(dma_addr_t) == 8 && 
-	 	!pci_set_dma_mask(pci_dev, 0xffffffffffffffffULL)) {
+	if (sizeof(dma_addr_t) == 8 &&
+	 	!pci_set_dma_mask(pci_dev, DMA_64BIT_MASK)) {
 		using_dac = 1;
-	} else if (!pci_set_dma_mask(pci_dev, 0xffffffff)) {
+	} else if (!pci_set_dma_mask(pci_dev, DMA_32BIT_MASK)) {
 		using_dac = 0;
 	} else {
 		printk(KERN_WARNING "ns83820.c: pci_set_dma_mask failed!\n");
diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c
index 9d3b51c3ef54..29a756dd979b 100644
--- a/drivers/net/wan/wanxl.c
+++ b/drivers/net/wan/wanxl.c
@@ -577,8 +577,8 @@ static int __devinit wanxl_pci_init_one(struct pci_dev *pdev,
 	   We set both dma_mask and consistent_dma_mask to 28 bits
 	   and pray pci_alloc_consistent() will use this info. It should
 	   work on most platforms */
-	if (pci_set_consistent_dma_mask(pdev, 0x0FFFFFFF) ||
-	    pci_set_dma_mask(pdev, 0x0FFFFFFF)) {
+	if (pci_set_consistent_dma_mask(pdev, DMA_28BIT_MASK) ||
+	    pci_set_dma_mask(pdev, DMA_28BIT_MASK)) {
 		printk(KERN_ERR "wanXL: No usable DMA configuration\n");
 		return -EIO;
 	}
diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c
index b41d666fea3c..bfa0cc319a09 100644
--- a/drivers/net/wireless/prism54/islpci_hotplug.c
+++ b/drivers/net/wireless/prism54/islpci_hotplug.c
@@ -22,6 +22,7 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/init.h> /* For __init, __exit */
+#include <linux/dma-mapping.h>
 
 #include "prismcompat.h"
 #include "islpci_dev.h"
@@ -124,7 +125,7 @@ prism54_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	}
 
 	/* enable PCI DMA */
-	if (pci_set_dma_mask(pdev, 0xffffffff)) {
+	if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
 		printk(KERN_ERR "%s: 32-bit PCI DMA not supported", DRV_NAME);
 		goto do_pci_disable_device;
         }
diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c
index 5bf83cbca868..bde3d5834ade 100644
--- a/drivers/scsi/BusLogic.c
+++ b/drivers/scsi/BusLogic.c
@@ -42,6 +42,7 @@
 #include <linux/pci.h>
 #include <linux/spinlock.h>
 #include <linux/jiffies.h>
+#include <linux/dma-mapping.h>
 #include <scsi/scsicam.h>
 
 #include <asm/dma.h>
@@ -677,7 +678,7 @@ static int __init BusLogic_InitializeMultiMasterProbeInfo(struct BusLogic_HostAd
 		if (pci_enable_device(PCI_Device))
 			continue;
 
-		if (pci_set_dma_mask(PCI_Device, (u64) 0xffffffff))
+		if (pci_set_dma_mask(PCI_Device, DMA_32BIT_MASK ))
 			continue;
 
 		Bus = PCI_Device->bus->number;
@@ -832,7 +833,7 @@ static int __init BusLogic_InitializeMultiMasterProbeInfo(struct BusLogic_HostAd
 		if (pci_enable_device(PCI_Device))
 			continue;
 
-		if (pci_set_dma_mask(PCI_Device, (u64) 0xffffffff))
+		if (pci_set_dma_mask(PCI_Device, DMA_32BIT_MASK))
 			continue;
 
 		Bus = PCI_Device->bus->number;
@@ -886,7 +887,7 @@ static int __init BusLogic_InitializeFlashPointProbeInfo(struct BusLogic_HostAda
 		if (pci_enable_device(PCI_Device))
 			continue;
 
-		if (pci_set_dma_mask(PCI_Device, (u64) 0xffffffff))
+		if (pci_set_dma_mask(PCI_Device, DMA_32BIT_MASK))
 			continue;
 
 		Bus = PCI_Device->bus->number;
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index 9f45ae1745da..3dce21c78737 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -89,6 +89,7 @@
 #include <linux/string.h>
 #include <linux/ioport.h>
 #include <linux/slab.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -1052,7 +1053,7 @@ static int __devinit inia100_probe_one(struct pci_dev *pdev,
 
 	if (pci_enable_device(pdev))
 		goto out;
-	if (pci_set_dma_mask(pdev, 0xffffffffULL)) {
+	if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
 		printk(KERN_WARNING "Unable to set 32bit DMA "
 				    "on inia100 adapter, ignoring.\n");
 		goto out_disable_device;
diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index a16f8ded8f1d..8df4a0ea3761 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -32,6 +32,7 @@
 #include <linux/slab.h>
 #include <linux/completion.h>
 #include <linux/blkdev.h>
+#include <linux/dma-mapping.h>
 #include <asm/semaphore.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index c2596335549d..720330778648 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -45,6 +45,7 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/dma-mapping.h>
 #include <linux/syscalls.h>
 #include <linux/delay.h>
 #include <linux/smp_lock.h>
@@ -806,8 +807,8 @@ static int __devinit aac_probe_one(struct pci_dev *pdev,
 	 * to driver communication memory to be allocated below 2gig
 	 */
 	if (aac_drivers[index].quirks & AAC_QUIRK_31BIT) 
-		if (pci_set_dma_mask(pdev, 0x7FFFFFFFULL) ||
-				pci_set_consistent_dma_mask(pdev, 0x7FFFFFFFULL))
+		if (pci_set_dma_mask(pdev, DMA_31BIT_MASK) ||
+				pci_set_consistent_dma_mask(pdev, DMA_31BIT_MASK))
 			goto out;
 	
 	pci_set_master(pdev);
diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c
index 5227a779c05c..a198d86667e9 100644
--- a/drivers/scsi/atp870u.c
+++ b/drivers/scsi/atp870u.c
@@ -28,6 +28,7 @@
 #include <linux/spinlock.h>
 #include <linux/pci.h>
 #include <linux/blkdev.h>
+#include <linux/dma-mapping.h>
 #include <asm/system.h>
 #include <asm/io.h>
 
@@ -2631,7 +2632,7 @@ static int atp870u_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (pci_enable_device(pdev))
 		return -EIO;
 
-        if (!pci_set_dma_mask(pdev, 0xFFFFFFFFUL)) {
+        if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
                 printk(KERN_INFO "atp870u: use 32bit DMA mask.\n");
         } else {
                 printk(KERN_ERR "atp870u: DMA mask required but not available.\n");
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index 6e6b293dcb28..b1b704a42efd 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -57,6 +57,7 @@ MODULE_DESCRIPTION("Adaptec I2O RAID Driver");
 #include <linux/reboot.h>
 #include <linux/spinlock.h>
 #include <linux/smp_lock.h>
+#include <linux/dma-mapping.h>
 
 #include <linux/timer.h>
 #include <linux/string.h>
@@ -906,8 +907,8 @@ static int adpt_install_hba(struct scsi_host_template* sht, struct pci_dev* pDev
 	}
 
 	pci_set_master(pDev);
-	if (pci_set_dma_mask(pDev, 0xffffffffffffffffULL) &&
-	    pci_set_dma_mask(pDev, 0xffffffffULL))
+	if (pci_set_dma_mask(pDev, DMA_64BIT_MASK) &&
+	    pci_set_dma_mask(pDev, DMA_32BIT_MASK))
 		return -EINVAL;
 
 	base_addr0_phys = pci_resource_start(pDev,0);
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index b3f9de8f7595..059eeee4b554 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -490,6 +490,7 @@
 #include <linux/init.h>
 #include <linux/ctype.h>
 #include <linux/spinlock.h>
+#include <linux/dma-mapping.h>
 #include <asm/byteorder.h>
 #include <asm/dma.h>
 #include <asm/io.h>
@@ -1426,7 +1427,7 @@ static int port_detect(unsigned long port_base, unsigned int j,
 
 	if (ha->pdev) {
 		pci_set_master(ha->pdev);
-		if (pci_set_dma_mask(ha->pdev, 0xffffffff))
+		if (pci_set_dma_mask(ha->pdev, DMA_32BIT_MASK))
 			printk("%s: warning, pci_set_dma_mask failed.\n",
 			       ha->board_name);
 	}
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index 7f7013e80a88..d5740bbdef3e 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -388,6 +388,7 @@
 #include <linux/proc_fs.h>
 #include <linux/time.h>
 #include <linux/timer.h>
+#include <linux/dma-mapping.h>
 #ifdef GDTH_RTC
 #include <linux/mc146818rtc.h>
 #endif
@@ -4527,15 +4528,15 @@ static int __init gdth_detect(struct scsi_host_template *shtp)
             if (!(ha->cache_feat & ha->raw_feat & ha->screen_feat &GDT_64BIT)||
                 /* 64-bit DMA only supported from FW >= x.43 */
                 (!ha->dma64_support)) {
-                if (pci_set_dma_mask(pcistr[ctr].pdev, 0xffffffff)) {
+                if (pci_set_dma_mask(pcistr[ctr].pdev, DMA_32BIT_MASK)) {
                     printk(KERN_WARNING "GDT-PCI %d: Unable to set 32-bit DMA\n", hanum);
                     err = TRUE;
                 }
             } else {
                 shp->max_cmd_len = 16;
-                if (!pci_set_dma_mask(pcistr[ctr].pdev, 0xffffffffffffffffULL)) {
+                if (!pci_set_dma_mask(pcistr[ctr].pdev, DMA_64BIT_MASK)) {
                     printk("GDT-PCI %d: 64-bit DMA enabled\n", hanum);
-                } else if (pci_set_dma_mask(pcistr[ctr].pdev, 0xffffffff)) {
+                } else if (pci_set_dma_mask(pcistr[ctr].pdev, DMA_32BIT_MASK)) {
                     printk(KERN_WARNING "GDT-PCI %d: Unable to set 64/32-bit DMA\n", hanum);
                     err = TRUE;
                 }
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index ea6f3c0e05d9..0cc7f65b584f 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -127,6 +127,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/jiffies.h>
+#include <linux/dma-mapping.h>
 #include <asm/io.h>
 
 #include <scsi/scsi.h>
@@ -2780,7 +2781,7 @@ static int tul_NewReturnNumberOfAdapters(void)
 			if (((dRegValue & 0xFF00) >> 8) == 0xFF)
 				dRegValue = 0;
 			wBIOS = (wBIOS << 8) + ((UWORD) ((dRegValue & 0xFF00) >> 8));
-			if (pci_set_dma_mask(pDev, 0xffffffff)) {
+			if (pci_set_dma_mask(pDev, DMA_32BIT_MASK)) {
 				printk(KERN_WARNING 
 				       "i91u: Could not set 32 bit DMA mask\n");
 				continue;
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 481708d527ae..a4c0b04cfdbd 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -179,6 +179,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/types.h>
+#include <linux/dma-mapping.h>
 
 #include <scsi/sg.h>
 
@@ -7284,10 +7285,10 @@ ips_init_phase1(struct pci_dev *pci_dev, int *indexPtr)
 	 * are guaranteed to be < 4G.
 	 */
 	if (IPS_ENABLE_DMA64 && IPS_HAS_ENH_SGLIST(ha) &&
-	    !pci_set_dma_mask(ha->pcidev, 0xffffffffffffffffULL)) {
+	    !pci_set_dma_mask(ha->pcidev, DMA_64BIT_MASK)) {
 		(ha)->flags |= IPS_HA_ENH_SG;
 	} else {
-		if (pci_set_dma_mask(ha->pcidev, 0xffffffffULL) != 0) {
+		if (pci_set_dma_mask(ha->pcidev, DMA_32BIT_MASK) != 0) {
 			printk(KERN_WARNING "Unable to set DMA Mask\n");
 			return ips_abort_init(ha, index);
 		}
diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c
index 7144674bc8e6..80b68a2481b3 100644
--- a/drivers/scsi/megaraid.c
+++ b/drivers/scsi/megaraid.c
@@ -45,6 +45,7 @@
 #include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/dma-mapping.h>
 #include <scsi/scsicam.h>
 
 #include "scsi.h"
@@ -2094,7 +2095,7 @@ make_local_pdev(adapter_t *adapter, struct pci_dev **pdev)
 
 	memcpy(*pdev, adapter->dev, sizeof(struct pci_dev));
 
-	if( pci_set_dma_mask(*pdev, 0xffffffff) != 0 ) {
+	if( pci_set_dma_mask(*pdev, DMA_32BIT_MASK) != 0 ) {
 		kfree(*pdev);
 		return -1;
 	}
@@ -4859,10 +4860,10 @@ megaraid_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	/* Set the Mode of addressing to 64 bit if we can */
 	if ((adapter->flag & BOARD_64BIT) && (sizeof(dma_addr_t) == 8)) {
-		pci_set_dma_mask(pdev, 0xffffffffffffffffULL);
+		pci_set_dma_mask(pdev, DMA_64BIT_MASK);
 		adapter->has_64bit_addr = 1;
 	} else  {
-		pci_set_dma_mask(pdev, 0xffffffff);
+		pci_set_dma_mask(pdev, DMA_32BIT_MASK);
 		adapter->has_64bit_addr = 0;
 	}
 		
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index a279ebb61447..30ee0ef4b459 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -38,6 +38,7 @@
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/dma.h>
 #include <asm/system.h>
@@ -2776,7 +2777,7 @@ static int nsp32_detect(struct scsi_host_template *sht)
 	/*
 	 * setup DMA 
 	 */
-	if (pci_set_dma_mask(PCIDEV, 0xffffffffUL) != 0) {
+	if (pci_set_dma_mask(PCIDEV, DMA_32BIT_MASK) != 0) {
 		nsp32_msg (KERN_ERR, "failed to set PCI DMA mask");
 		goto scsi_unregister;
 	}
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index e0230249fa0f..5a48e55f9418 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -350,6 +350,7 @@
 #include <linux/pci_ids.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -4321,7 +4322,7 @@ qla1280_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 
 #ifdef QLA_64BIT_PTR
 	if (pci_set_dma_mask(ha->pdev, (dma_addr_t) ~ 0ULL)) {
-		if (pci_set_dma_mask(ha->pdev, 0xffffffff)) {
+		if (pci_set_dma_mask(ha->pdev, DMA_32BIT_MASK)) {
 			printk(KERN_WARNING "scsi(%li): Unable to set a "
 			       "suitable DMA mask - aborting\n", ha->host_no);
 			error = -ENODEV;
@@ -4331,7 +4332,7 @@ qla1280_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		dprintk(2, "scsi(%li): 64 Bit PCI Addressing Enabled\n",
 			ha->host_no);
 #else
-	if (pci_set_dma_mask(ha->pdev, 0xffffffff)) {
+	if (pci_set_dma_mask(ha->pdev, DMA_32BIT_MASK)) {
 		printk(KERN_WARNING "scsi(%li): Unable to set a "
 		       "suitable DMA mask - aborting\n", ha->host_no);
 		error = -ENODEV;
diff --git a/drivers/scsi/qlogicfc.c b/drivers/scsi/qlogicfc.c
index 5b15998c71a6..52b224a5d6fd 100644
--- a/drivers/scsi/qlogicfc.c
+++ b/drivers/scsi/qlogicfc.c
@@ -61,6 +61,7 @@
 #include <linux/unistd.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
 #include <linux/jiffies.h>
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -738,8 +739,8 @@ static int isp2x00_detect(struct scsi_host_template * tmpt)
 				continue;
 
 			/* Try to configure DMA attributes. */
-			if (pci_set_dma_mask(pdev, 0xffffffffffffffffULL) &&
-			    pci_set_dma_mask(pdev, 0xffffffffULL))
+			if (pci_set_dma_mask(pdev, DMA_64BIT_MASK) &&
+			    pci_set_dma_mask(pdev, DMA_32BIT_MASK))
 					continue;
 
 		        host = scsi_register(tmpt, sizeof(struct isp2x00_hostdata));
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index a8731062a74c..9b4751aecc23 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -21,6 +21,7 @@ enum dma_data_direction {
 #define DMA_30BIT_MASK	0x000000003fffffffULL
 #define DMA_29BIT_MASK	0x000000001fffffffULL
 #define DMA_28BIT_MASK	0x000000000fffffffULL
+#define DMA_24BIT_MASK 0x0000000000ffffffULL
 
 #include <asm/dma-mapping.h>
 
diff --git a/sound/oss/esssolo1.c b/sound/oss/esssolo1.c
index 78d3e29ce968..6861563d7525 100644
--- a/sound/oss/esssolo1.c
+++ b/sound/oss/esssolo1.c
@@ -2348,7 +2348,7 @@ static int __devinit solo1_probe(struct pci_dev *pcidev, const struct pci_device
 	/* Recording requires 24-bit DMA, so attempt to set dma mask
 	 * to 24 bits first, then 32 bits (playback only) if that fails.
 	 */
-	if (pci_set_dma_mask(pcidev, 0x00ffffff) &&
+	if (pci_set_dma_mask(pcidev, DMA_24BIT_MASK) &&
 	    pci_set_dma_mask(pcidev, DMA_32BIT_MASK)) {
 		printk(KERN_WARNING "solo1: architecture does not support 24bit or 32bit PCI busmaster DMA\n");
 		return -ENODEV;
diff --git a/sound/oss/sonicvibes.c b/sound/oss/sonicvibes.c
index 4471757b7985..42bd276cfc39 100644
--- a/sound/oss/sonicvibes.c
+++ b/sound/oss/sonicvibes.c
@@ -116,6 +116,7 @@
 #include <linux/spinlock.h>
 #include <linux/smp_lock.h>
 #include <linux/gameport.h>
+#include <linux/dma-mapping.h>
 #include <linux/mutex.h>
 
 
@@ -2535,7 +2536,7 @@ static int __devinit sv_probe(struct pci_dev *pcidev, const struct pci_device_id
 		return -ENODEV;
 	if (pcidev->irq == 0)
 		return -ENODEV;
-	if (pci_set_dma_mask(pcidev, 0x00ffffff)) {
+	if (pci_set_dma_mask(pcidev, DMA_24BIT_MASK)) {
 		printk(KERN_WARNING "sonicvibes: architecture does not support 24bit PCI busmaster DMA\n");
 		return -ENODEV;
 	}
diff --git a/sound/pci/ad1889.c b/sound/pci/ad1889.c
index 2aa5a7fdb6e0..c6c8333acc62 100644
--- a/sound/pci/ad1889.c
+++ b/sound/pci/ad1889.c
@@ -39,6 +39,7 @@
 #include <linux/interrupt.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/dma-mapping.h>
 
 #include <sound/driver.h>
 #include <sound/core.h>
diff --git a/sound/pci/ali5451/ali5451.c b/sound/pci/ali5451/ali5451.c
index e264136e8fb4..fc92b6896c24 100644
--- a/sound/pci/ali5451/ali5451.c
+++ b/sound/pci/ali5451/ali5451.c
@@ -33,6 +33,7 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/pcm.h>
 #include <sound/info.h>
@@ -2220,8 +2221,8 @@ static int __devinit snd_ali_create(struct snd_card *card,
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
 	/* check, if we can restrict PCI DMA transfers to 31 bits */
-	if (pci_set_dma_mask(pci, 0x7fffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x7fffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_31BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_31BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 31bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
diff --git a/sound/pci/als4000.c b/sound/pci/als4000.c
index 7b2ff5f4672e..100d8127a411 100644
--- a/sound/pci/als4000.c
+++ b/sound/pci/als4000.c
@@ -70,6 +70,7 @@
 #include <linux/slab.h>
 #include <linux/gameport.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/pcm.h>
 #include <sound/rawmidi.h>
@@ -688,8 +689,8 @@ static int __devinit snd_card_als4000_probe(struct pci_dev *pci,
 		return err;
 	}
 	/* check, if we can restrict PCI DMA transfers to 24 bits */
-	if (pci_set_dma_mask(pci, 0x00ffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x00ffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_24BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_24BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 24bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
diff --git a/sound/pci/azt3328.c b/sound/pci/azt3328.c
index e077eb3fbe2f..680077e1e057 100644
--- a/sound/pci/azt3328.c
+++ b/sound/pci/azt3328.c
@@ -104,6 +104,7 @@
 #include <linux/slab.h>
 #include <linux/gameport.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/control.h>
 #include <sound/pcm.h>
@@ -1669,8 +1670,8 @@ snd_azf3328_create(struct snd_card *card,
 	chip->irq = -1;
 
 	/* check if we can restrict PCI DMA transfers to 24 bits */
-	if (pci_set_dma_mask(pci, 0x00ffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x00ffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_24BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_24BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 24bit PCI busmaster DMA\n");
 		err = -ENXIO;
 		goto out_err;
diff --git a/sound/pci/emu10k1/emu10k1x.c b/sound/pci/emu10k1/emu10k1x.c
index 2208dbd48be9..3e332f398162 100644
--- a/sound/pci/emu10k1/emu10k1x.c
+++ b/sound/pci/emu10k1/emu10k1x.c
@@ -36,6 +36,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/initval.h>
 #include <sound/pcm.h>
diff --git a/sound/pci/es1938.c b/sound/pci/es1938.c
index 0d556b09ad04..4d62fe439177 100644
--- a/sound/pci/es1938.c
+++ b/sound/pci/es1938.c
@@ -55,6 +55,7 @@
 #include <linux/gameport.h>
 #include <linux/moduleparam.h>
 #include <linux/delay.h>
+#include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/control.h>
 #include <sound/pcm.h>
@@ -1517,8 +1518,8 @@ static int __devinit snd_es1938_create(struct snd_card *card,
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
         /* check, if we can restrict PCI DMA transfers to 24 bits */
-	if (pci_set_dma_mask(pci, 0x00ffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x00ffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_24BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_24BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 24bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
                 return -ENXIO;
diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c
index dd465a186e11..e3ad17f53c29 100644
--- a/sound/pci/es1968.c
+++ b/sound/pci/es1968.c
@@ -104,6 +104,7 @@
 #include <linux/slab.h>
 #include <linux/gameport.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 #include <linux/mutex.h>
 
 #include <sound/core.h>
diff --git a/sound/pci/ice1712/ice1712.c b/sound/pci/ice1712/ice1712.c
index 672e198317e1..b88eeba2f5d1 100644
--- a/sound/pci/ice1712/ice1712.c
+++ b/sound/pci/ice1712/ice1712.c
@@ -56,7 +56,9 @@
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 #include <linux/mutex.h>
+
 #include <sound/core.h>
 #include <sound/cs8427.h>
 #include <sound/info.h>
diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c
index 8bc084956c28..44393e190929 100644
--- a/sound/pci/maestro3.c
+++ b/sound/pci/maestro3.c
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/info.h>
 #include <sound/control.h>
diff --git a/sound/pci/mixart/mixart.c b/sound/pci/mixart/mixart.c
index 43ee3b2b948f..b5a095052d4c 100644
--- a/sound/pci/mixart/mixart.c
+++ b/sound/pci/mixart/mixart.c
@@ -28,6 +28,8 @@
 #include <linux/dma-mapping.h>
 #include <linux/moduleparam.h>
 #include <linux/mutex.h>
+#include <linux/dma-mapping.h>
+
 #include <sound/core.h>
 #include <sound/initval.h>
 #include <sound/info.h>
diff --git a/sound/pci/pcxhr/pcxhr.c b/sound/pci/pcxhr/pcxhr.c
index f679779d96e3..35875c8aa299 100644
--- a/sound/pci/pcxhr/pcxhr.c
+++ b/sound/pci/pcxhr/pcxhr.c
@@ -30,6 +30,7 @@
 #include <linux/delay.h>
 #include <linux/moduleparam.h>
 #include <linux/mutex.h>
+#include <linux/dma-mapping.h>
 
 #include <sound/core.h>
 #include <sound/initval.h>
diff --git a/sound/pci/sonicvibes.c b/sound/pci/sonicvibes.c
index 7bbea3738b8a..2d66a09fe5ee 100644
--- a/sound/pci/sonicvibes.c
+++ b/sound/pci/sonicvibes.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/gameport.h>
 #include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
 
 #include <sound/core.h>
 #include <sound/pcm.h>
@@ -1227,8 +1228,8 @@ static int __devinit snd_sonicvibes_create(struct snd_card *card,
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
 	/* check, if we can restrict PCI DMA transfers to 24 bits */
-        if (pci_set_dma_mask(pci, 0x00ffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x00ffffff) < 0) {
+        if (pci_set_dma_mask(pci, DMA_24BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_24BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 24bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
                 return -ENXIO;
diff --git a/sound/pci/trident/trident_main.c b/sound/pci/trident/trident_main.c
index 83b7d8aba9e6..52178b8ad49d 100644
--- a/sound/pci/trident/trident_main.c
+++ b/sound/pci/trident/trident_main.c
@@ -35,6 +35,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/gameport.h>
+#include <linux/dma-mapping.h>
 
 #include <sound/core.h>
 #include <sound/info.h>
@@ -3554,8 +3555,8 @@ int __devinit snd_trident_create(struct snd_card *card,
 	if ((err = pci_enable_device(pci)) < 0)
 		return err;
 	/* check, if we can restrict PCI DMA transfers to 30 bits */
-	if (pci_set_dma_mask(pci, 0x3fffffff) < 0 ||
-	    pci_set_consistent_dma_mask(pci, 0x3fffffff) < 0) {
+	if (pci_set_dma_mask(pci, DMA_30BIT_MASK) < 0 ||
+	    pci_set_consistent_dma_mask(pci, DMA_30BIT_MASK) < 0) {
 		snd_printk(KERN_ERR "architecture does not support 30bit PCI busmaster DMA\n");
 		pci_disable_device(pci);
 		return -ENXIO;
-- 
cgit v1.2.3


From 7f927fcc2fd1575d01efb4b76665975007945690 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 28 Mar 2006 01:56:53 -0800
Subject: [PATCH] Typo fixes

Fix a lot of typos.  Eyeballed by jmc@ in OpenBSD.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/m68k/README.buddha         | 2 +-
 Documentation/networking/ifenslave.c     | 2 +-
 arch/arm/lib/copy_template.S             | 2 +-
 drivers/char/mxser.h                     | 2 +-
 drivers/char/synclink.c                  | 2 +-
 drivers/edac/edac_mc.c                   | 2 +-
 drivers/net/irda/nsc-ircc.c              | 2 +-
 drivers/net/sis900.c                     | 2 +-
 drivers/net/tulip/de4x5.c                | 2 +-
 drivers/net/tulip/pnic2.c                | 2 +-
 drivers/net/typhoon.c                    | 4 ++--
 drivers/net/wireless/orinoco.c           | 2 +-
 drivers/net/wireless/prism54/isl_ioctl.c | 2 +-
 drivers/scsi/3w-9xxx.c                   | 2 +-
 drivers/serial/8250.c                    | 2 +-
 drivers/serial/serial_txx9.c             | 2 +-
 drivers/serial/sunsu.c                   | 2 +-
 drivers/usb/host/ohci-s3c2410.c          | 2 +-
 drivers/usb/net/zaurus.c                 | 2 +-
 fs/mbcache.c                             | 2 +-
 include/asm-parisc/pdc.h                 | 2 +-
 include/asm-sh/addrspace.h               | 2 +-
 include/asm-sparc64/floppy.h             | 2 +-
 include/linux/fb.h                       | 2 +-
 mm/mempolicy.c                           | 2 +-
 net/irda/af_irda.c                       | 6 +++---
 sound/pci/rme32.c                        | 8 ++++----
 sound/pci/rme96.c                        | 8 ++++----
 sound/pci/rme9652/hdspm.c                | 2 +-
 sound/usb/usx2y/usx2yhwdeppcm.c          | 2 +-
 30 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/m68k/README.buddha b/Documentation/m68k/README.buddha
index bf802ffc98ad..ef484a719bb9 100644
--- a/Documentation/m68k/README.buddha
+++ b/Documentation/m68k/README.buddha
@@ -29,7 +29,7 @@ address is written to $4a, then the whole Byte is written to
 $48, while it doesn't matter how often you're writing to $4a
 as  long as $48 is not touched.  After $48 has been written,
 the  whole card disappears from $e8 and is mapped to the new
-address just written.  Make shure $4a is written before $48,
+address just written.  Make sure $4a is written before $48,
 otherwise your chance is only 1:16 to find the board :-).
 
 The local memory-map is even active when mapped to $e8:
diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c
index 545447ac503a..a12059886755 100644
--- a/Documentation/networking/ifenslave.c
+++ b/Documentation/networking/ifenslave.c
@@ -87,7 +87,7 @@
  *	   would fail and generate an error message in the system log.
  * 	 - For opt_c: slave should not be set to the master's setting
  *	   while it is running. It was already set during enslave. To
- *	   simplify things, it is now handeled separately.
+ *	   simplify things, it is now handled separately.
  *
  *    - 2003/12/01 - Shmulik Hen <shmulik.hen at intel dot com>
  *	 - Code cleanup and style changes
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 838e435e4922..cab355c0c1f7 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -236,7 +236,7 @@
 
 
 /*
- * Abort preanble and completion macros.
+ * Abort preamble and completion macros.
  * If a fixup handler is required then those macros must surround it.
  * It is assumed that the fixup code will handle the private part of
  * the exit macro.
diff --git a/drivers/char/mxser.h b/drivers/char/mxser.h
index e7fd0b08e0b7..7e188a4d602a 100644
--- a/drivers/char/mxser.h
+++ b/drivers/char/mxser.h
@@ -118,7 +118,7 @@
 
 // enable CTS interrupt
 #define MOXA_MUST_IER_ECTSI		0x80
-// eanble RTS interrupt
+// enable RTS interrupt
 #define MOXA_MUST_IER_ERTSI		0x40
 // enable Xon/Xoff interrupt
 #define MOXA_MUST_IER_XINT		0x20
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index abb03e52fe12..fee2aca3f6a5 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -5996,7 +5996,7 @@ static void usc_set_async_mode( struct mgsl_struct *info )
 	 * <15..8>	?		RxFIFO IRQ Request Level
 	 *
 	 * Note: For async mode the receive FIFO level must be set
-	 * to 0 to aviod the situation where the FIFO contains fewer bytes
+	 * to 0 to avoid the situation where the FIFO contains fewer bytes
 	 * than the trigger level and no more data is expected.
 	 *
 	 * <7>		0		Exited Hunt IA (Interrupt Arm)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 905f58ba8e16..ea06e3a4dc35 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -2044,7 +2044,7 @@ static int __init edac_mc_init(void)
 	 */
 	clear_pci_parity_errors();
 
-	/* Create the MC sysfs entires */
+	/* Create the MC sysfs entries */
 	if (edac_sysfs_memctrl_setup()) {
 		edac_printk(KERN_ERR, EDAC_MC,
 			"Error initializing sysfs code\n");
diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c
index 9aa074b44dd3..cc7ff8f00e42 100644
--- a/drivers/net/irda/nsc-ircc.c
+++ b/drivers/net/irda/nsc-ircc.c
@@ -812,7 +812,7 @@ static int nsc_ircc_init_39x(nsc_chip_t *chip, chipio_t *info)
 	int cfg_base = info->cfg_base;
 	int enabled;
 
-	/* User is shure about his config... accept it. */
+	/* User is sure about his config... accept it. */
 	IRDA_DEBUG(2, "%s(): nsc_ircc_init_39x (user settings): "
 		   "io=0x%04x, irq=%d, dma=%d\n", 
 		   __FUNCTION__, info->fir_base, info->irq, info->dma);
diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c
index 8429ceb01389..b82191d2bee1 100644
--- a/drivers/net/sis900.c
+++ b/drivers/net/sis900.c
@@ -2283,7 +2283,7 @@ static void set_rx_mode(struct net_device *net_dev)
 	int i, table_entries;
 	u32 rx_mode;
 
-	/* 635 Hash Table entires = 256(2^16) */
+	/* 635 Hash Table entries = 256(2^16) */
 	if((sis_priv->chipset_rev >= SIS635A_900_REV) ||
 			(sis_priv->chipset_rev == SIS900B_900_REV))
 		table_entries = 16;
diff --git a/drivers/net/tulip/de4x5.c b/drivers/net/tulip/de4x5.c
index ee48bfd67349..d1a86a080a65 100644
--- a/drivers/net/tulip/de4x5.c
+++ b/drivers/net/tulip/de4x5.c
@@ -513,7 +513,7 @@ struct mii_phy {
     u_char  *rst;           /* Start of reset sequence in SROM           */
     u_int mc;               /* Media Capabilities                        */
     u_int ana;              /* NWay Advertisement                        */
-    u_int fdx;              /* Full DupleX capabilites for each media    */
+    u_int fdx;              /* Full DupleX capabilities for each media   */
     u_int ttm;              /* Transmit Threshold Mode for each media    */
     u_int mci;              /* 21142 MII Connector Interrupt info        */
 };
diff --git a/drivers/net/tulip/pnic2.c b/drivers/net/tulip/pnic2.c
index 55f4a9a631bc..ab985023fcca 100644
--- a/drivers/net/tulip/pnic2.c
+++ b/drivers/net/tulip/pnic2.c
@@ -199,7 +199,7 @@ void pnic2_lnk_change(struct net_device *dev, int csr5)
 	               /* negotiation ended successfully */
 
 	               /* get the link partners reply and mask out all but
-                        * bits 24-21 which show the partners capabilites
+                        * bits 24-21 which show the partners capabilities
                         * and match those to what we advertised
                         *
                         * then begin to interpret the results of the negotiation.
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index cde35dd87906..c1ce87a5f8d3 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -208,7 +208,7 @@ static const struct typhoon_card_info typhoon_card_info[] __devinitdata = {
 };
 
 /* Notes on the new subsystem numbering scheme:
- * bits 0-1 indicate crypto capabilites: (0) variable, (1) DES, or (2) 3DES
+ * bits 0-1 indicate crypto capabilities: (0) variable, (1) DES, or (2) 3DES
  * bit 4 indicates if this card has secured firmware (we don't support it)
  * bit 8 indicates if this is a (0) copper or (1) fiber card
  * bits 12-16 indicate card type: (0) client and (1) server
@@ -788,7 +788,7 @@ typhoon_start_tx(struct sk_buff *skb, struct net_device *dev)
 	/* we have two rings to choose from, but we only use txLo for now
 	 * If we start using the Hi ring as well, we'll need to update
 	 * typhoon_stop_runtime(), typhoon_interrupt(), typhoon_num_free_tx(),
-	 * and TXHI_ENTIRES to match, as well as update the TSO code below
+	 * and TXHI_ENTRIES to match, as well as update the TSO code below
 	 * to get the right DMA address
 	 */
 	txRing = &tp->txLoRing;
diff --git a/drivers/net/wireless/orinoco.c b/drivers/net/wireless/orinoco.c
index 6fd0bf736830..8dfdfbd5966c 100644
--- a/drivers/net/wireless/orinoco.c
+++ b/drivers/net/wireless/orinoco.c
@@ -3858,7 +3858,7 @@ static int orinoco_ioctl_setscan(struct net_device *dev,
 	unsigned long flags;
 
 	/* Note : you may have realised that, as this is a SET operation,
-	 * this is priviledged and therefore a normal user can't
+	 * this is privileged and therefore a normal user can't
 	 * perform scanning.
 	 * This is not an error, while the device perform scanning,
 	 * traffic doesn't flow, so it's a perfect DoS...
diff --git a/drivers/net/wireless/prism54/isl_ioctl.c b/drivers/net/wireless/prism54/isl_ioctl.c
index e5bb9f5ae429..989599ad33ef 100644
--- a/drivers/net/wireless/prism54/isl_ioctl.c
+++ b/drivers/net/wireless/prism54/isl_ioctl.c
@@ -747,7 +747,7 @@ prism54_get_essid(struct net_device *ndev, struct iw_request_info *info,
 
 	if (essid->length) {
 		dwrq->flags = 1;	/* set ESSID to ON for Wireless Extensions */
-		/* if it is to big, trunk it */
+		/* if it is too big, trunk it */
 		dwrq->length = min((u8)IW_ESSID_MAX_SIZE, essid->length);
 	} else {
 		dwrq->flags = 0;
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 0ab26d01877b..0d2b447c50ed 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1026,7 +1026,7 @@ static void twa_free_request_id(TW_Device_Extension *tw_dev, int request_id)
 	tw_dev->free_tail = (tw_dev->free_tail + 1) % TW_Q_LENGTH;
 } /* End twa_free_request_id() */
 
-/* This function will get parameter table entires from the firmware */
+/* This function will get parameter table entries from the firmware */
 static void *twa_get_param(TW_Device_Extension *tw_dev, int request_id, int table_id, int parameter_id, int parameter_size_bytes)
 {
 	TW_Command_Full *full_command_packet;
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 5996d3cd0ed8..674b15c78f68 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1528,7 +1528,7 @@ static int serial8250_startup(struct uart_port *port)
 
 	/*
 	 * Clear the FIFO buffers and disable them.
-	 * (they will be reeanbled in set_termios())
+	 * (they will be reenabled in set_termios())
 	 */
 	serial8250_clear_fifos(up);
 
diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c
index b848b7d94412..3bdee64d1a99 100644
--- a/drivers/serial/serial_txx9.c
+++ b/drivers/serial/serial_txx9.c
@@ -483,7 +483,7 @@ static int serial_txx9_startup(struct uart_port *port)
 
 	/*
 	 * Clear the FIFO buffers and disable them.
-	 * (they will be reeanbled in set_termios())
+	 * (they will be reenabled in set_termios())
 	 */
 	sio_set(up, TXX9_SIFCR,
 		TXX9_SIFCR_TFRST | TXX9_SIFCR_RFRST | TXX9_SIFCR_FRSTE);
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 9fe2283d91e5..1c4396c2962d 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -641,7 +641,7 @@ static int sunsu_startup(struct uart_port *port)
 
 	/*
 	 * Clear the FIFO buffers and disable them.
-	 * (they will be reeanbled in set_termios())
+	 * (they will be reenabled in set_termios())
 	 */
 	if (uart_config[up->port.type].flags & UART_CLEAR_FIFO) {
 		serial_outp(up, UART_FCR, UART_FCR_ENABLE_FIFO);
diff --git a/drivers/usb/host/ohci-s3c2410.c b/drivers/usb/host/ohci-s3c2410.c
index 372527a83593..682bf2215660 100644
--- a/drivers/usb/host/ohci-s3c2410.c
+++ b/drivers/usb/host/ohci-s3c2410.c
@@ -158,7 +158,7 @@ static int ohci_s3c2410_hub_control (
 		"s3c2410_hub_control(%p,0x%04x,0x%04x,0x%04x,%p,%04x)\n",
 		hcd, typeReq, wValue, wIndex, buf, wLength);
 
-	/* if we are only an humble host without any special capabilites
+	/* if we are only an humble host without any special capabilities
 	 * process the request straight away and exit */
 
 	if (info == NULL) {
diff --git a/drivers/usb/net/zaurus.c b/drivers/usb/net/zaurus.c
index 9c5ab251370c..f7ac9d6b9856 100644
--- a/drivers/usb/net/zaurus.c
+++ b/drivers/usb/net/zaurus.c
@@ -217,7 +217,7 @@ static int blan_mdlm_bind(struct usbnet *dev, struct usb_interface *intf)
 			 * with devices that use it and those that don't.
 			 */
 			if ((detail->bDetailData[1] & ~0x02) != 0x01) {
-				/* bmDataCapabilites == 0 would be fine too,
+				/* bmDataCapabilities == 0 would be fine too,
 				 * but framing is minidriver-coupled for now.
 				 */
 bad_detail:
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 73e754fea2d8..e4fde1ab22cd 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -311,7 +311,7 @@ fail:
 /*
  * mb_cache_shrink()
  *
- * Removes all cache entires of a device from the cache. All cache entries
+ * Removes all cache entries of a device from the cache. All cache entries
  * currently in use cannot be freed, and thus remain in the cache. All others
  * are freed.
  *
diff --git a/include/asm-parisc/pdc.h b/include/asm-parisc/pdc.h
index 8e23e4c674f6..0a3face6c480 100644
--- a/include/asm-parisc/pdc.h
+++ b/include/asm-parisc/pdc.h
@@ -333,7 +333,7 @@ struct pdc_model {		/* for PDC_MODEL */
 	unsigned long curr_key;
 };
 
-/* Values for PDC_MODEL_CAPABILITES non-equivalent virtual aliasing support */
+/* Values for PDC_MODEL_CAPABILITIES non-equivalent virtual aliasing support */
 
 #define PDC_MODEL_IOPDIR_FDC            (1 << 2)        /* see sba_iommu.c */
 #define PDC_MODEL_NVA_MASK		(3 << 4)
diff --git a/include/asm-sh/addrspace.h b/include/asm-sh/addrspace.h
index dbb05d1a26d1..720afc11c2ca 100644
--- a/include/asm-sh/addrspace.h
+++ b/include/asm-sh/addrspace.h
@@ -13,7 +13,7 @@
 
 #include <asm/cpu/addrspace.h>
 
-/* Memory segments (32bit Priviledged mode addresses)  */
+/* Memory segments (32bit Privileged mode addresses)  */
 #define P0SEG		0x00000000
 #define P1SEG		0x80000000
 #define P2SEG		0xa0000000
diff --git a/include/asm-sparc64/floppy.h b/include/asm-sparc64/floppy.h
index 49d49a285943..6a95d5d0c576 100644
--- a/include/asm-sparc64/floppy.h
+++ b/include/asm-sparc64/floppy.h
@@ -738,7 +738,7 @@ static unsigned long __init sun_floppy_init(void)
 		if (!sun_floppy_types[0] && sun_floppy_types[1]) {
 			/*
 			 * Set the drive exchange bit in FCR on NS87303,
-			 * make shure other bits are sane before doing so.
+			 * make sure other bits are sane before doing so.
 			 */
 			ns87303_modify(config, FER, FER_EDM, 0);
 			ns87303_modify(config, ASC, ASC_DRV2_SEL, 0);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 2cb19e6503aa..d03fadfcafe3 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -734,7 +734,7 @@ struct fb_tile_ops {
 
 /* A driver may set this flag to indicate that it does want a set_par to be
  * called every time when fbcon_switch is executed. The advantage is that with
- * this flag set you can really be shure that set_par is always called before
+ * this flag set you can really be sure that set_par is always called before
  * any of the functions dependant on the correct hardware state or altering
  * that state, even if you are using some broken X releases. The disadvantage
  * is that it introduces unwanted delays to every console switch if set_par
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4f71cfd29c6f..dec8249e972d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -912,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	/*
 	 * Check if this process has the right to modify the specified
 	 * process. The right exists if the process has administrative
-	 * capabilities, superuser priviledges or the same
+	 * capabilities, superuser privileges or the same
 	 * userid as the target process.
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 759445648667..627b11342233 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1302,7 +1302,7 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
 	if (sk->sk_state != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
-	/* Check that we don't send out to big frames */
+	/* Check that we don't send out too big frames */
 	if (len > self->max_data_size) {
 		IRDA_DEBUG(2, "%s(), Chopping frame from %zd to %d bytes!\n",
 			   __FUNCTION__, len, self->max_data_size);
@@ -1546,7 +1546,7 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
 	IRDA_ASSERT(self != NULL, return -1;);
 
 	/*
-	 * Check that we don't send out to big frames. This is an unreliable
+	 * Check that we don't send out too big frames. This is an unreliable
 	 * service, so we have no fragmentation and no coalescence
 	 */
 	if (len > self->max_data_size) {
@@ -1642,7 +1642,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
 	}
 
 	/*
-	 * Check that we don't send out to big frames. This is an unreliable
+	 * Check that we don't send out too big frames. This is an unreliable
 	 * service, so we have no fragmentation and no coalescence
 	 */
 	if (len > self->max_data_size) {
diff --git a/sound/pci/rme32.c b/sound/pci/rme32.c
index 0cbef5fe6c63..ab78544bf042 100644
--- a/sound/pci/rme32.c
+++ b/sound/pci/rme32.c
@@ -313,7 +313,7 @@ static int snd_rme32_capture_copy(struct snd_pcm_substream *substream, int chann
 }
 
 /*
- * SPDIF I/O capabilites (half-duplex mode)
+ * SPDIF I/O capabilities (half-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_spdif_info = {
 	.info =		(SNDRV_PCM_INFO_MMAP_IOMEM |
@@ -339,7 +339,7 @@ static struct snd_pcm_hardware snd_rme32_spdif_info = {
 };
 
 /*
- * ADAT I/O capabilites (half-duplex mode)
+ * ADAT I/O capabilities (half-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_adat_info =
 {
@@ -364,7 +364,7 @@ static struct snd_pcm_hardware snd_rme32_adat_info =
 };
 
 /*
- * SPDIF I/O capabilites (full-duplex mode)
+ * SPDIF I/O capabilities (full-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_spdif_fd_info = {
 	.info =		(SNDRV_PCM_INFO_MMAP |
@@ -390,7 +390,7 @@ static struct snd_pcm_hardware snd_rme32_spdif_fd_info = {
 };
 
 /*
- * ADAT I/O capabilites (full-duplex mode)
+ * ADAT I/O capabilities (full-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_adat_fd_info =
 {
diff --git a/sound/pci/rme96.c b/sound/pci/rme96.c
index 0e694b011dcc..6c2a9f4a7659 100644
--- a/sound/pci/rme96.c
+++ b/sound/pci/rme96.c
@@ -359,7 +359,7 @@ snd_rme96_capture_copy(struct snd_pcm_substream *substream,
 }
 
 /*
- * Digital output capabilites (S/PDIF)
+ * Digital output capabilities (S/PDIF)
  */
 static struct snd_pcm_hardware snd_rme96_playback_spdif_info =
 {
@@ -388,7 +388,7 @@ static struct snd_pcm_hardware snd_rme96_playback_spdif_info =
 };
 
 /*
- * Digital input capabilites (S/PDIF)
+ * Digital input capabilities (S/PDIF)
  */
 static struct snd_pcm_hardware snd_rme96_capture_spdif_info =
 {
@@ -417,7 +417,7 @@ static struct snd_pcm_hardware snd_rme96_capture_spdif_info =
 };
 
 /*
- * Digital output capabilites (ADAT)
+ * Digital output capabilities (ADAT)
  */
 static struct snd_pcm_hardware snd_rme96_playback_adat_info =
 {
@@ -442,7 +442,7 @@ static struct snd_pcm_hardware snd_rme96_playback_adat_info =
 };
 
 /*
- * Digital input capabilites (ADAT)
+ * Digital input capabilities (ADAT)
  */
 static struct snd_pcm_hardware snd_rme96_capture_adat_info =
 {
diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c
index 980b9cd689dd..b5538efd146b 100644
--- a/sound/pci/rme9652/hdspm.c
+++ b/sound/pci/rme9652/hdspm.c
@@ -2256,7 +2256,7 @@ static int snd_hdspm_create_controls(struct snd_card *card, struct hdspm * hdspm
 	}
 
 	/* Channel playback mixer as default control 
-	   Note: the whole matrix would be 128*HDSPM_MIXER_CHANNELS Faders, thats to big for any alsamixer 
+	   Note: the whole matrix would be 128*HDSPM_MIXER_CHANNELS Faders, thats too big for any alsamixer
 	   they are accesible via special IOCTL on hwdep
 	   and the mixer 2dimensional mixer control */
 
diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c
index 315855082fe1..fe67a92e2a1a 100644
--- a/sound/usb/usx2y/usx2yhwdeppcm.c
+++ b/sound/usb/usx2y/usx2yhwdeppcm.c
@@ -404,7 +404,7 @@ static void usX2Y_usbpcm_subs_startup(struct snd_usX2Y_substream *subs)
 	struct usX2Ydev * usX2Y = subs->usX2Y;
 	usX2Y->prepare_subs = subs;
 	subs->urb[0]->start_frame = -1;
-	smp_wmb();	// Make shure above modifications are seen by i_usX2Y_subs_startup()
+	smp_wmb();	// Make sure above modifications are seen by i_usX2Y_subs_startup()
 	usX2Y_urbs_set_complete(usX2Y, i_usX2Y_usbpcm_subs_startup);
 }
 
-- 
cgit v1.2.3


From 6c99c5cb94319a601b5ec5ee31c331f84755dd74 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 28 Mar 2006 16:11:00 -0800
Subject: [PATCH] Remove dead kill_sl prototype from sched.h

The kill_sl function doesn't exist in the kernel so a prototype is completely
unnecessary.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 20b4f0372e44..5f5ab98bbb6b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1101,7 +1101,6 @@ extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
 extern void zap_other_threads(struct task_struct *p);
 extern int kill_pg(pid_t, int, int);
-extern int kill_sl(pid_t, int, int);
 extern int kill_proc(pid_t, int, int);
 extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
-- 
cgit v1.2.3


From d73d65293e3e2de7e916a89c8da30be0948afab7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 28 Mar 2006 16:11:03 -0800
Subject: [PATCH] pidhash: kill switch_exec_pids

switch_exec_pids is only called from de_thread by way of exec, and it is
only called when we are exec'ing from a non thread group leader.

Currently switch_exec_pids gives the leader the pid of the thread and
unhashes and rehashes all of the process groups.  The leader is already in
the EXIT_DEAD state so no one cares about it's pids.  The only concern for
the leader is that __unhash_process called from release_task will function
correctly.  If we don't touch the leader at all we know that
__unhash_process will work fine so there is no need to touch the leader.

For the task becomming the thread group leader, we just need to give it the
pid of the old thread group leader, add it to the task list, and attach it
to the session and the process group of the thread group.

Currently de_thread is also adding the task to the task list which is just
silly.

Currently the only leader of __detach_pid besides detach_pid is
switch_exec_pids because of the ugly extra work that was being
performed.

So this patch removes switch_exec_pids because it is doing too much, it is
creating an unnecessary special case in pid.c, duing work duplicated in
de_thread, and generally obscuring what it is going on.

The necessary work is added to de_thread, and it seems to be a little
clearer there what is going on.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Kirill Korotaev <dev@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c           | 14 +++++++++++---
 include/linux/pid.h |  1 -
 kernel/pid.c        | 30 ------------------------------
 3 files changed, 11 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index dd194923c52c..db0769447d33 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -708,7 +708,17 @@ static int de_thread(struct task_struct *tsk)
 		remove_parent(current);
 		remove_parent(leader);
 
-		switch_exec_pids(leader, current);
+
+		/* Become a process group leader with the old leader's pid.
+		 * Note: The old leader also uses thispid until release_task
+		 *       is called.  Odd but simple and correct.
+		 */
+		detach_pid(current, PIDTYPE_PID);
+		current->pid = leader->pid;
+		attach_pid(current, PIDTYPE_PID,  current->pid);
+		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
+		attach_pid(current, PIDTYPE_SID,  current->signal->session);
+		list_add_tail(&current->tasks, &init_task.tasks);
 
 		current->parent = current->real_parent = leader->real_parent;
 		leader->parent = leader->real_parent = child_reaper;
@@ -722,8 +732,6 @@ static int de_thread(struct task_struct *tsk)
 			__ptrace_link(current, parent);
 		}
 
-		list_del(&current->tasks);
-		list_add_tail(&current->tasks, &init_task.tasks);
 		current->exit_signal = SIGCHLD;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 5b2fcb19d2da..099e70ecf7c7 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -38,7 +38,6 @@ extern struct pid *FASTCALL(find_pid(enum pid_type, int));
 
 extern int alloc_pidmap(void);
 extern void FASTCALL(free_pidmap(int));
-extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread);
 
 #define do_each_task_pid(who, type, task)				\
 	if ((task = find_task_by_pid_type(type, who))) {		\
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc07246991..7781d9999058 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,36 +217,6 @@ task_t *find_task_by_pid_type(int type, int nr)
 
 EXPORT_SYMBOL(find_task_by_pid_type);
 
-/*
- * This function switches the PIDs if a non-leader thread calls
- * sys_execve() - this must be done without releasing the PID.
- * (which a detach_pid() would eventually do.)
- */
-void switch_exec_pids(task_t *leader, task_t *thread)
-{
-	__detach_pid(leader, PIDTYPE_PID);
-	__detach_pid(leader, PIDTYPE_TGID);
-	__detach_pid(leader, PIDTYPE_PGID);
-	__detach_pid(leader, PIDTYPE_SID);
-
-	__detach_pid(thread, PIDTYPE_PID);
-	__detach_pid(thread, PIDTYPE_TGID);
-
-	leader->pid = leader->tgid = thread->pid;
-	thread->pid = thread->tgid;
-
-	attach_pid(thread, PIDTYPE_PID, thread->pid);
-	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
-	list_add_tail(&thread->tasks, &init_task.tasks);
-
-	attach_pid(leader, PIDTYPE_PID, leader->pid);
-	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
-}
-
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
-- 
cgit v1.2.3


From 8fafabd86f1b75ed3cc6a6ffbe6c3e53e3d8457d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:05 -0800
Subject: [PATCH] remove add_parent()'s parent argument

add_parent(p, parent) is always called with parent == p->parent, and it makes
no sense to do it differently.  This patch removes this argument.

No changes in affected .o files.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/kernel/irixsig.c | 4 ++--
 fs/exec.c                  | 4 ++--
 include/linux/sched.h      | 4 ++--
 kernel/exit.c              | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kernel/irixsig.c b/arch/mips/kernel/irixsig.c
index 08273a2a501d..8150f071f80a 100644
--- a/arch/mips/kernel/irixsig.c
+++ b/arch/mips/kernel/irixsig.c
@@ -603,7 +603,7 @@ repeat:
 			/* move to end of parent's list to avoid starvation */
 			write_lock_irq(&tasklist_lock);
 			remove_parent(p);
-			add_parent(p, p->parent);
+			add_parent(p);
 			write_unlock_irq(&tasklist_lock);
 			retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 			if (retval)
@@ -643,7 +643,7 @@ repeat:
 				write_lock_irq(&tasklist_lock);
 				remove_parent(p);
 				p->parent = p->real_parent;
-				add_parent(p, p->parent);
+				add_parent(p);
 				do_notify_parent(p, SIGCHLD);
 				write_unlock_irq(&tasklist_lock);
 			} else
diff --git a/fs/exec.c b/fs/exec.c
index db0769447d33..9046ad2b0614 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -725,8 +725,8 @@ static int de_thread(struct task_struct *tsk)
 		current->group_leader = current;
 		leader->group_leader = leader;
 
-		add_parent(current, current->parent);
-		add_parent(leader, leader->parent);
+		add_parent(current);
+		add_parent(leader);
 		if (ptrace) {
 			current->ptrace = ptrace;
 			__ptrace_link(current, parent);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f5ab98bbb6b..b4b14c32b28a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1184,7 +1184,7 @@ extern void wait_task_inactive(task_t * p);
 #endif
 
 #define remove_parent(p)	list_del_init(&(p)->sibling)
-#define add_parent(p, parent)	list_add_tail(&(p)->sibling,&(parent)->children)
+#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
 #define REMOVE_LINKS(p) do {					\
 	if (thread_group_leader(p))				\
@@ -1195,7 +1195,7 @@ extern void wait_task_inactive(task_t * p);
 #define SET_LINKS(p) do {					\
 	if (thread_group_leader(p))				\
 		list_add_tail(&(p)->tasks,&init_task.tasks);	\
-	add_parent(p, (p)->parent);				\
+	add_parent(p);						\
 	} while (0)
 
 #define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
diff --git a/kernel/exit.c b/kernel/exit.c
index e04a59405e99..df26c33037d2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1281,7 +1281,7 @@ bail_ref:
 
 	/* move to end of parent's list to avoid starvation */
 	remove_parent(p);
-	add_parent(p, p->parent);
+	add_parent(p);
 
 	write_unlock_irq(&tasklist_lock);
 
-- 
cgit v1.2.3


From c97d98931ac52ef110b62d9b75c6a6f2bfbc1898 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:06 -0800
Subject: [PATCH] kill SET_LINKS/REMOVE_LINKS

Both SET_LINKS() and SET_LINKS/REMOVE_LINKS() have exactly one caller, and
these callers already check thread_group_leader().

This patch kills theese macros, they mix two different things: setting
process's parent and registering it in init_task.tasks list.  Callers are
updated to do these actions by hand.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 12 ------------
 kernel/exit.c         |  4 +++-
 kernel/fork.c         |  4 +++-
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4b14c32b28a..1f16fb1fea22 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1186,18 +1186,6 @@ extern void wait_task_inactive(task_t * p);
 #define remove_parent(p)	list_del_init(&(p)->sibling)
 #define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
-#define REMOVE_LINKS(p) do {					\
-	if (thread_group_leader(p))				\
-		list_del_init(&(p)->tasks);			\
-	remove_parent(p);					\
-	} while (0)
-
-#define SET_LINKS(p) do {					\
-	if (thread_group_leader(p))				\
-		list_add_tail(&(p)->tasks,&init_task.tasks);	\
-	add_parent(p);						\
-	} while (0)
-
 #define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
 #define prev_task(p)	list_entry((p)->tasks.prev, struct task_struct, tasks)
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b5e8b67680e..f436a6bd3fb7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,11 +54,13 @@ static void __unhash_process(struct task_struct *p)
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
+
+		list_del_init(&p->tasks);
 		if (p->pid)
 			__get_cpu_var(process_counts)--;
 	}
 
-	REMOVE_LINKS(p);
+	remove_parent(p);
 }
 
 void release_task(struct task_struct * p)
diff --git a/kernel/fork.c b/kernel/fork.c
index c49bd193b058..74c67629ee62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1181,7 +1181,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->ioprio = current->ioprio;
 
-	SET_LINKS(p);
+	add_parent(p);
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
 
@@ -1191,6 +1191,8 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->signal->session = current->signal->session;
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
 		attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+		list_add_tail(&p->tasks, &init_task.tasks);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	}
-- 
cgit v1.2.3


From 73b9ebfe126a4a886ee46cbab637374d7024668a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:07 -0800
Subject: [PATCH] pidhash: don't count idle threads

fork_idle() does unhash_process() just after copy_process().  Contrary,
boot_cpu's idle thread explicitely registers itself for each pid_type with nr
= 0.

copy_process() already checks p->pid != 0 before process_counts++, I think we
can just skip attach_pid() calls and job control inits for idle threads and
kill unhash_process().  We don't need to cleanup ->proc_dentry in fork_idle()
because with this patch idle threads are never hashed in
kernel/pid.c:pid_hash[].

We don't need to hash pid == 0 in pidmap_init().  free_pidmap() is never
called with pid == 0 arg, so it will never be reused.  So it is still possible
to use pid == 0 in any PIDTYPE_xxx namespace from kernel/pid.c's POV.

However with this patch we don't hash pid == 0 for PIDTYPE_PID case.  We still
have have PIDTYPE_PGID/PIDTYPE_SID entries with pid == 0: /sbin/init and
kernel threads which don't call daemonize().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/smp.c  |  1 -
 include/linux/sched.h |  2 --
 kernel/exit.c         | 18 +-----------------
 kernel/fork.c         | 35 ++++++++++++++++++-----------------
 kernel/pid.c          | 10 +---------
 5 files changed, 20 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index c8d8d0ac1a7f..511116aebaf7 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -143,7 +143,6 @@ void smp_prepare_cpus(unsigned int maxcpus)
 		idle = idle_thread(cpu);
 
 		init_idle(idle, cpu);
-		unhash_process(idle);
 
 		waittime = 200000000;
 		while (waittime-- && !cpu_isset(cpu, cpu_callin_map))
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f16fb1fea22..ddc0df7f8bf5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1214,8 +1214,6 @@ static inline int thread_group_empty(task_t *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-extern void unhash_process(struct task_struct *p);
-
 /*
  * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
diff --git a/kernel/exit.c b/kernel/exit.c
index f436a6bd3fb7..a94e1c31131b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,8 +56,7 @@ static void __unhash_process(struct task_struct *p)
 		detach_pid(p, PIDTYPE_SID);
 
 		list_del_init(&p->tasks);
-		if (p->pid)
-			__get_cpu_var(process_counts)--;
+		__get_cpu_var(process_counts)--;
 	}
 
 	remove_parent(p);
@@ -118,21 +117,6 @@ repeat:
 		goto repeat;
 }
 
-/* we are using it only for SMP init */
-
-void unhash_process(struct task_struct *p)
-{
-	struct dentry *proc_dentry;
-
-	spin_lock(&p->proc_lock);
-	proc_dentry = proc_pid_unhash(p);
-	write_lock_irq(&tasklist_lock);
-	__unhash_process(p);
-	write_unlock_irq(&tasklist_lock);
-	spin_unlock(&p->proc_lock);
-	proc_pid_flush(proc_dentry);
-}
-
 /*
  * This checks not only the pgrp, but falls back on the pid if no
  * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
diff --git a/kernel/fork.c b/kernel/fork.c
index 74c67629ee62..0c32e28cdc5f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1181,25 +1181,26 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->ioprio = current->ioprio;
 
-	add_parent(p);
-	if (unlikely(p->ptrace & PT_PTRACED))
-		__ptrace_link(p, current->parent);
-
-	if (thread_group_leader(p)) {
-		p->signal->tty = current->signal->tty;
-		p->signal->pgrp = process_group(current);
-		p->signal->session = current->signal->session;
-		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->signal->session);
-
-		list_add_tail(&p->tasks, &init_task.tasks);
-		if (p->pid)
+	if (likely(p->pid)) {
+		add_parent(p);
+		if (unlikely(p->ptrace & PT_PTRACED))
+			__ptrace_link(p, current->parent);
+
+		if (thread_group_leader(p)) {
+			p->signal->tty = current->signal->tty;
+			p->signal->pgrp = process_group(current);
+			p->signal->session = current->signal->session;
+			attach_pid(p, PIDTYPE_PGID, process_group(p));
+			attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+			list_add_tail(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
+		}
+		attach_pid(p, PIDTYPE_TGID, p->tgid);
+		attach_pid(p, PIDTYPE_PID, p->pid);
+		nr_threads++;
 	}
-	attach_pid(p, PIDTYPE_TGID, p->tgid);
-	attach_pid(p, PIDTYPE_PID, p->pid);
 
-	nr_threads++;
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
@@ -1263,7 +1264,7 @@ task_t * __devinit fork_idle(int cpu)
 	if (!task)
 		return ERR_PTR(-ENOMEM);
 	init_idle(task, cpu);
-	unhash_process(task);
+
 	return task;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 7781d9999058..a9f2dfd006d2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -247,16 +247,8 @@ void __init pidhash_init(void)
 
 void __init pidmap_init(void)
 {
-	int i;
-
 	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+	/* Reserve PID 0. We never call free_pidmap(0) */
 	set_bit(0, pidmap_array->page);
 	atomic_dec(&pidmap_array->nr_free);
-
-	/*
-	 * Allocate PID 0, and hash it via all PID types:
-	 */
-
-	for (i = 0; i < PIDTYPE_MAX; i++)
-		attach_pid(current, i, 0);
 }
-- 
cgit v1.2.3


From c7c6464117a02b0d54feb4ebeca4db70fa493678 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:09 -0800
Subject: [PATCH] pidhash: don't use zero pids

daemonize() calls set_special_pids(1,1), while init and kernel threads spawned
from init/main.c:init() run with 0,0 special pids.  This patch changes
INIT_SIGNALS() so that that they run with ->pgrp == ->session == 1 also.  This
patch relies on fact that swapper's pid == 1.

Now we have no hashed zero pids in pid_hash[].

User-space visibible change is that now /sbin/init runs with (1,1) special
pids and becomes a session leader.

Quoting Eric W. Biederman:
>
> daemonize consuming pids (1,1) then consumes pgrp 1.  So that when
> /sbin/init calls setsid() it thinks /sbin/init is a process group
> leader and setsid() fails.  So /sbin/init wants pgrp 1 session 1
> but doesn't get it.  I am pretty certain daemonize did not exist so
> /sbin/init got pgrp 1 session 1 in 2.4.
>
> That is the bug that is being fixed.
>
> This patch takes things one step farther and essentially calls
> setsid() for pid == 1 before init is execed.  That is new behavior
> but it cleans up the kernel as we now do not need to support the
> case of a process without a process group or a session.
>
> The only process that could have possibly cared was /sbin/init
> and it already calls setsid() because it doesn't want that.
>
> If this was going to break anything noticeable the change in behavior
> from 2.4 to 2.6 would have already done that.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/init_task.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 92146f3b7423..41ecbb847f32 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -62,6 +62,8 @@
 	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
+	.pgrp		= 1,						\
+	.session	= 1,						\
 }
 
 #define INIT_SIGHAND(sighand) {						\
-- 
cgit v1.2.3


From aa1757f90bea3f598b6e5d04d922a6a60200f1da Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:12 -0800
Subject: [PATCH] convert sighand_cache to use SLAB_DESTROY_BY_RCU

This patch borrows a clever Hugh's 'struct anon_vma' trick.

Without tasklist_lock held we can't trust task->sighand until we locked it
and re-checked that it is still the same.

But this means we don't need to defer 'kmem_cache_free(sighand)'.  We can
return the memory to slab immediately, all we need is to be sure that
sighand->siglock can't dissapear inside rcu protected section.

To do so we need to initialize ->siglock inside ctor function,
SLAB_DESTROY_BY_RCU does the rest.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |  3 +--
 include/linux/sched.h |  8 --------
 kernel/fork.c         | 21 +++++++++++----------
 kernel/signal.c       |  2 +-
 4 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 9046ad2b0614..950ebd43cdc3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ no_thread_group:
 		/*
 		 * Move our state over to newsighand and switch it in.
 		 */
-		spin_lock_init(&newsighand->siglock);
 		atomic_set(&newsighand->count, 1);
 		memcpy(newsighand->action, oldsighand->action,
 		       sizeof(newsighand->action));
@@ -785,7 +784,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			sighand_free(oldsighand);
+			kmem_cache_free(sighand_cachep, oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ddc0df7f8bf5..bbcfc873bd98 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -355,16 +355,8 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
-	struct rcu_head		rcu;
 };
 
-extern void sighand_free_cb(struct rcu_head *rhp);
-
-static inline void sighand_free(struct sighand_struct *sp)
-{
-	call_rcu(&sp->rcu, sighand_free_cb);
-}
-
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c32e28cdc5f..33ffb5bf0dbc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -786,14 +786,6 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
-void sighand_free_cb(struct rcu_head *rhp)
-{
-	struct sighand_struct *sp;
-
-	sp = container_of(rhp, struct sighand_struct, rcu);
-	kmem_cache_free(sighand_cachep, sp);
-}
-
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
@@ -806,7 +798,6 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
-	spin_lock_init(&sig->siglock);
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
@@ -1356,11 +1347,21 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
+static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct sighand_struct *sighand = data;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+					SLAB_CTOR_CONSTRUCTOR)
+		spin_lock_init(&sighand->siglock);
+}
+
 void __init proc_caches_init(void)
 {
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+			sighand_ctor, NULL);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index dc8f91bf9f89..b0b1ca9daa33 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -330,7 +330,7 @@ void __exit_sighand(struct task_struct *tsk)
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		sighand_free(sighand);
+		kmem_cache_free(sighand_cachep, sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
-- 
cgit v1.2.3


From f63ee72e0fb82e504a0489490babc7612c7cd6c2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:13 -0800
Subject: [PATCH] introduce lock_task_sighand() helper

Add lock_task_sighand() helper and converts group_send_sig_info() to use
it.  Hopefully we will have more users soon.

This patch also removes '!sighand->count' and '!p->usage' checks, I think
they both are bogus, racy and unneeded (but probably it makes sense to
restore them as BUG_ON()s).

->sighand is cleared and it's ->count is decremented in release_task() with
sighand->siglock held, so it is a bug to have '!p->usage || !->count' after
we already locked and verified it is the same.  On the other hand, an
already dead task without ->sighand can have a non-zero ->usage due to
ptrace, for example.

If we read the stale value of ->sighand we must see the change after
spin_lock(), because that change was done while holding that same old
->sighand.siglock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  9 +++++++++
 kernel/signal.c       | 38 ++++++++++++++++++++++++--------------
 2 files changed, 33 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index bbcfc873bd98..ca1fd31aae97 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1225,6 +1225,15 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
+extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+							unsigned long *flags);
+
+static inline void unlock_task_sighand(struct task_struct *tsk,
+						unsigned long *flags)
+{
+	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
+}
+
 #ifndef __HAVE_THREAD_FUNCTIONS
 
 #define task_thread_info(task) (task)->thread_info
diff --git a/kernel/signal.c b/kernel/signal.c
index b0b1ca9daa33..819fa49aa70a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1120,27 +1120,37 @@ void zap_other_threads(struct task_struct *p)
 /*
  * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
+struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
+{
+	struct sighand_struct *sighand;
+
+	for (;;) {
+		sighand = rcu_dereference(tsk->sighand);
+		if (unlikely(sighand == NULL))
+			break;
+
+		spin_lock_irqsave(&sighand->siglock, *flags);
+		if (likely(sighand == tsk->sighand))
+			break;
+		spin_unlock_irqrestore(&sighand->siglock, *flags);
+	}
+
+	return sighand;
+}
+
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
-	struct sighand_struct *sp;
 	int ret;
 
-retry:
 	ret = check_kill_permission(sig, info, p);
-	if (!ret && sig && (sp = rcu_dereference(p->sighand))) {
-		spin_lock_irqsave(&sp->siglock, flags);
-		if (p->sighand != sp) {
-			spin_unlock_irqrestore(&sp->siglock, flags);
-			goto retry;
-		}
-		if ((atomic_read(&sp->count) == 0) ||
-				(atomic_read(&p->usage) == 0)) {
-			spin_unlock_irqrestore(&sp->siglock, flags);
-			return -ESRCH;
+
+	if (!ret && sig) {
+		ret = -ESRCH;
+		if (lock_task_sighand(p, &flags)) {
+			ret = __group_send_sig_info(sig, info, p);
+			unlock_task_sighand(p, &flags);
 		}
-		ret = __group_send_sig_info(sig, info, p);
-		spin_unlock_irqrestore(&sp->siglock, flags);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 7001510d0cbf51ad202dd2d0744f54104285cbb9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:14 -0800
Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_sighand

The only caller of exit_sighand(tsk) is copy_process's error path.  We can
call __exit_sighand() directly and kill exit_sighand().

This 'tsk' was not yet registered in pid_hash[] or init_task.tasks, it has no
external references, nobody can see it, and

	IF (clone_flags & CLONE_SIGHAND)
		At least 'current' has a reference to ->sighand, this
		means atomic_dec_and_test(sighand->count) can't be true.

	ELSE
		Nobody can see this ->sighand, this means we can free it
		without any locking.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 -
 kernel/fork.c         |  3 ++-
 kernel/signal.c       | 14 --------------
 3 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ca1fd31aae97..69c2a1e1529e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,6 @@ extern void exit_thread(void);
 extern void exit_files(struct task_struct *);
 extern void exit_signal(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
-extern void exit_sighand(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 33ffb5bf0dbc..8a46ad52be8f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1208,7 +1208,8 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	exit_signal(p);
 bad_fork_cleanup_sighand:
-	exit_sighand(p);
+	if (p->sighand)
+		__exit_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index c5b65aa4c2bc..1d7f4463c32d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -336,20 +336,6 @@ void __exit_sighand(struct task_struct *tsk)
 		kmem_cache_free(sighand_cachep, sighand);
 }
 
-void exit_sighand(struct task_struct *tsk)
-{
-	write_lock_irq(&tasklist_lock);
-	rcu_read_lock();
-	if (tsk->sighand != NULL) {
-		struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
-		spin_lock(&sighand->siglock);
-		__exit_sighand(tsk);
-		spin_unlock(&sighand->siglock);
-	}
-	rcu_read_unlock();
-	write_unlock_irq(&tasklist_lock);
-}
-
 /*
  * This function expects the tasklist_lock write-locked.
  */
-- 
cgit v1.2.3


From 6b3934ef52712ece50605dfc72e55d00c580831a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:16 -0800
Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_signal

__exit_signal() does important cleanups atomically under ->siglock.  It is
also called from copy_process's error path.  This is not good, for example we
can't move __unhash_process() under ->siglock for that reason.

We should not mix these 2 paths, just look at ugly 'if (p->sighand)' under
'bad_fork_cleanup_sighand:' label.  For copy_process() case it is sufficient
to just backout copy_signal(), nothing more.

Again, nobody can see this task yet.  For CLONE_THREAD case we just decrement
signal->count, otherwise nobody can see this ->signal and we can free it
lockless.

This patch assumes it is safe to do exit_thread_group_keys() without
tasklist_lock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 +-
 include/linux/slab.h  |  1 -
 kernel/fork.c         | 23 +++++++++++++++++++----
 kernel/signal.c       | 15 +--------------
 4 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 69c2a1e1529e..7dd430b697aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1149,7 +1149,7 @@ extern void flush_thread(void);
 extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
-extern void exit_signal(struct task_struct *);
+extern void __cleanup_signal(struct signal_struct *);
 extern void __exit_signal(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 15e1d9736b1b..3af03b19c983 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -210,7 +210,6 @@ extern kmem_cache_t	*names_cachep;
 extern kmem_cache_t	*files_cachep;
 extern kmem_cache_t	*filp_cachep;
 extern kmem_cache_t	*fs_cachep;
-extern kmem_cache_t	*signal_cachep;
 extern kmem_cache_t	*sighand_cachep;
 extern kmem_cache_t	*bio_cachep;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a46ad52be8f..0aff28cdbadd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-kmem_cache_t *signal_cachep;
+static kmem_cache_t *signal_cachep;
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
 kmem_cache_t *sighand_cachep;
@@ -872,6 +872,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	return 0;
 }
 
+void __cleanup_signal(struct signal_struct *sig)
+{
+	exit_thread_group_keys(sig);
+	kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void cleanup_signal(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	atomic_dec(&sig->live);
+
+	if (atomic_dec_and_test(&sig->count))
+		__cleanup_signal(sig);
+}
+
 static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
@@ -1206,10 +1222,9 @@ bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
 bad_fork_cleanup_signal:
-	exit_signal(p);
+	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	if (p->sighand)
-		__exit_sighand(p);
+	__exit_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index 1d7f4463c32d..54e9ef673e68 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -395,23 +395,10 @@ void __exit_signal(struct task_struct *tsk)
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
-		/*
-		 * We are cleaning up the signal_struct here.
-		 */
-		exit_thread_group_keys(sig);
-		kmem_cache_free(signal_cachep, sig);
+		__cleanup_signal(sig);
 	}
 }
 
-void exit_signal(struct task_struct *tsk)
-{
-	atomic_dec(&tsk->signal->live);
-
-	write_lock_irq(&tasklist_lock);
-	__exit_signal(tsk);
-	write_unlock_irq(&tasklist_lock);
-}
-
 /*
  * Flush all handlers for a task.
  */
-- 
cgit v1.2.3


From c81addc9d3a0ebff2155e0cd86f90820ab97147e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:17 -0800
Subject: [PATCH] rename __exit_sighand to cleanup_sighand

Cosmetic, rename __exit_sighand to cleanup_sighand and move it close to
copy_sighand().

This matches copy_signal/cleanup_signal naming, and I think it is easier to
follow.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 +-
 kernel/fork.c         | 12 +++++++++++-
 kernel/signal.c       | 19 ++-----------------
 3 files changed, 14 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7dd430b697aa..921148277da9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1150,8 +1150,8 @@ extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
+extern void cleanup_sighand(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
-extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0aff28cdbadd..12cdd9fc9d02 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,6 +803,16 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	return 0;
 }
 
+void cleanup_sighand(struct task_struct *tsk)
+{
+	struct sighand_struct * sighand = tsk->sighand;
+
+	/* Ok, we're done with the signal handlers */
+	tsk->sighand = NULL;
+	if (atomic_dec_and_test(&sighand->count))
+		kmem_cache_free(sighand_cachep, sighand);
+}
+
 static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct signal_struct *sig;
@@ -1224,7 +1234,7 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	__exit_sighand(p);
+	cleanup_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index ca1fa854e469..b29c868bd5ee 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -310,9 +310,7 @@ static void flush_sigqueue(struct sigpending *queue)
 /*
  * Flush all pending signals for a task.
  */
-
-void
-flush_signals(struct task_struct *t)
+void flush_signals(struct task_struct *t)
 {
 	unsigned long flags;
 
@@ -323,19 +321,6 @@ flush_signals(struct task_struct *t)
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
-/*
- * This function expects the tasklist_lock write-locked.
- */
-void __exit_sighand(struct task_struct *tsk)
-{
-	struct sighand_struct * sighand = tsk->sighand;
-
-	/* Ok, we're done with the signal handlers */
-	tsk->sighand = NULL;
-	if (atomic_dec_and_test(&sighand->count))
-		kmem_cache_free(sighand_cachep, sighand);
-}
-
 /*
  * This function expects the tasklist_lock write-locked.
  */
@@ -386,7 +371,7 @@ void __exit_signal(struct task_struct *tsk)
 	}
 
 	tsk->signal = NULL;
-	__exit_sighand(tsk);
+	cleanup_sighand(tsk);
 	spin_unlock(&sighand->siglock);
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From 6a14c5c9da0b4c34b5be783403c54f0396fcfe77 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:18 -0800
Subject: [PATCH] move __exit_signal() to kernel/exit.c

__exit_signal() is private to release_task() now.  I think it is better to
make it static in kernel/exit.c and export flush_sigqueue() instead - this
function is much more simple and straightforward.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h  |  1 -
 include/linux/signal.h |  2 ++
 kernel/exit.c          | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/signal.c        | 65 +-------------------------------------------------
 4 files changed, 66 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 921148277da9..a913fca9e70d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,6 @@ extern void exit_thread(void);
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
 extern void cleanup_sighand(struct task_struct *);
-extern void __exit_signal(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index b7d093520bb6..162a8fd10b29 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -249,6 +249,8 @@ static inline void init_sigpending(struct sigpending *sig)
 	INIT_LIST_HEAD(&sig->list);
 }
 
+extern void flush_sigqueue(struct sigpending *queue);
+
 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
 static inline int valid_signal(unsigned long sig)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index 77c35efad88c..3823ec89d7b8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,6 +29,7 @@
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <linux/posix-timers.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
 #include <linux/futex.h>
@@ -62,6 +63,68 @@ static void __unhash_process(struct task_struct *p)
 	remove_parent(p);
 }
 
+/*
+ * This function expects the tasklist_lock write-locked.
+ */
+static void __exit_signal(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct sighand_struct *sighand;
+
+	BUG_ON(!sig);
+	BUG_ON(!atomic_read(&sig->count));
+
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
+	spin_lock(&sighand->siglock);
+
+	posix_cpu_timers_exit(tsk);
+	if (atomic_dec_and_test(&sig->count))
+		posix_cpu_timers_exit_group(tsk);
+	else {
+		/*
+		 * If there is any task waiting for the group exit
+		 * then notify it:
+		 */
+		if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
+			wake_up_process(sig->group_exit_task);
+			sig->group_exit_task = NULL;
+		}
+		if (tsk == sig->curr_target)
+			sig->curr_target = next_thread(tsk);
+		/*
+		 * Accumulate here the counters for all threads but the
+		 * group leader as they die, so they can be added into
+		 * the process-wide totals when those are taken.
+		 * The group leader stays around as a zombie as long
+		 * as there are other threads.  When it gets reaped,
+		 * the exit.c code will add its counts into these totals.
+		 * We won't ever get here for the group leader, since it
+		 * will have been the last reference on the signal_struct.
+		 */
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
+		sig->min_flt += tsk->min_flt;
+		sig->maj_flt += tsk->maj_flt;
+		sig->nvcsw += tsk->nvcsw;
+		sig->nivcsw += tsk->nivcsw;
+		sig->sched_time += tsk->sched_time;
+		sig = NULL; /* Marker for below. */
+	}
+
+	tsk->signal = NULL;
+	cleanup_sighand(tsk);
+	spin_unlock(&sighand->siglock);
+	rcu_read_unlock();
+
+	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+	flush_sigqueue(&tsk->pending);
+	if (sig) {
+		flush_sigqueue(&sig->shared_pending);
+		__cleanup_signal(sig);
+	}
+}
+
 void release_task(struct task_struct * p)
 {
 	int zap_leader;
diff --git a/kernel/signal.c b/kernel/signal.c
index b29c868bd5ee..6ea49f742a2f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/ptrace.h>
-#include <linux/posix-timers.h>
 #include <linux/signal.h>
 #include <linux/audit.h>
 #include <linux/capability.h>
@@ -295,7 +294,7 @@ static void __sigqueue_free(struct sigqueue *q)
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
-static void flush_sigqueue(struct sigpending *queue)
+void flush_sigqueue(struct sigpending *queue)
 {
 	struct sigqueue *q;
 
@@ -321,68 +320,6 @@ void flush_signals(struct task_struct *t)
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
-/*
- * This function expects the tasklist_lock write-locked.
- */
-void __exit_signal(struct task_struct *tsk)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct sighand_struct *sighand;
-
-	BUG_ON(!sig);
-	BUG_ON(!atomic_read(&sig->count));
-
-	rcu_read_lock();
-	sighand = rcu_dereference(tsk->sighand);
-	spin_lock(&sighand->siglock);
-
-	posix_cpu_timers_exit(tsk);
-	if (atomic_dec_and_test(&sig->count))
-		posix_cpu_timers_exit_group(tsk);
-	else {
-		/*
-		 * If there is any task waiting for the group exit
-		 * then notify it:
-		 */
-		if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
-			wake_up_process(sig->group_exit_task);
-			sig->group_exit_task = NULL;
-		}
-		if (tsk == sig->curr_target)
-			sig->curr_target = next_thread(tsk);
-		/*
-		 * Accumulate here the counters for all threads but the
-		 * group leader as they die, so they can be added into
-		 * the process-wide totals when those are taken.
-		 * The group leader stays around as a zombie as long
-		 * as there are other threads.  When it gets reaped,
-		 * the exit.c code will add its counts into these totals.
-		 * We won't ever get here for the group leader, since it
-		 * will have been the last reference on the signal_struct.
-		 */
-		sig->utime = cputime_add(sig->utime, tsk->utime);
-		sig->stime = cputime_add(sig->stime, tsk->stime);
-		sig->min_flt += tsk->min_flt;
-		sig->maj_flt += tsk->maj_flt;
-		sig->nvcsw += tsk->nvcsw;
-		sig->nivcsw += tsk->nivcsw;
-		sig->sched_time += tsk->sched_time;
-		sig = NULL; /* Marker for below. */
-	}
-
-	tsk->signal = NULL;
-	cleanup_sighand(tsk);
-	spin_unlock(&sighand->siglock);
-	rcu_read_unlock();
-
-	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
-	flush_sigqueue(&tsk->pending);
-	if (sig) {
-		flush_sigqueue(&sig->shared_pending);
-		__cleanup_signal(sig);
-	}
-}
-
 /*
  * Flush all handlers for a task.
  */
-- 
cgit v1.2.3


From 47e65328a7b1cdfc4e3102e50d60faf94ebba7d3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:25 -0800
Subject: [PATCH] pids: kill PIDTYPE_TGID

This patch kills PIDTYPE_TGID pid_type thus saving one hash table in
kernel/pid.c and speeding up subthreads create/destroy a bit.  It is also a
preparation for the further tref/pids rework.

This patch adds 'struct list_head thread_group' to 'struct task_struct'
instead.

We don't detach group leader from PIDTYPE_PID namespace until another
thread inherits it's ->pid == ->tgid, so we are safe wrt premature
free_pidmap(->tgid) call.

Currently there are no users of find_task_by_pid_type(PIDTYPE_TGID).
Should the need arise, we can use find_task_by_pid()->group_leader.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-By: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pid.h   |  1 -
 include/linux/sched.h | 11 ++++++++---
 kernel/exit.c         | 10 +---------
 kernel/fork.c         |  4 +++-
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 099e70ecf7c7..5b9082cc600f 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,7 +4,6 @@
 enum pid_type
 {
 	PIDTYPE_PID,
-	PIDTYPE_TGID,
 	PIDTYPE_PGID,
 	PIDTYPE_SID,
 	PIDTYPE_MAX
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a913fca9e70d..99855f694ebd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -752,6 +752,7 @@ struct task_struct {
 
 	/* PID/PID hash table linkage. */
 	struct pid pids[PIDTYPE_MAX];
+	struct list_head thread_group;
 
 	struct completion *vfork_done;		/* for vfork() */
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
@@ -1192,13 +1193,17 @@ extern void wait_task_inactive(task_t * p);
 #define while_each_thread(g, t) \
 	while ((t = next_thread(t)) != g)
 
-extern task_t * FASTCALL(next_thread(const task_t *p));
-
 #define thread_group_leader(p)	(p->pid == p->tgid)
 
+static inline task_t *next_thread(task_t *p)
+{
+	return list_entry(rcu_dereference(p->thread_group.next),
+				task_t, thread_group);
+}
+
 static inline int thread_group_empty(task_t *p)
 {
-	return list_empty(&p->pids[PIDTYPE_TGID].pid_list);
+	return list_empty(&p->thread_group);
 }
 
 #define delay_group_leader(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index aea23e713cf4..22399caf7574 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@ static void __unhash_process(struct task_struct *p)
 {
 	nr_threads--;
 	detach_pid(p, PIDTYPE_PID);
-	detach_pid(p, PIDTYPE_TGID);
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
@@ -59,7 +58,7 @@ static void __unhash_process(struct task_struct *p)
 		list_del_init(&p->tasks);
 		__get_cpu_var(process_counts)--;
 	}
-
+	list_del_rcu(&p->thread_group);
 	remove_parent(p);
 }
 
@@ -964,13 +963,6 @@ asmlinkage long sys_exit(int error_code)
 	do_exit((error_code&0xff)<<8);
 }
 
-task_t fastcall *next_thread(const task_t *p)
-{
-	return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
-}
-
-EXPORT_SYMBOL(next_thread);
-
 /*
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
diff --git a/kernel/fork.c b/kernel/fork.c
index 12cdd9fc9d02..bc551efb5fd4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1112,6 +1112,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * We dont wake it up yet.
 	 */
 	p->group_leader = p;
+	INIT_LIST_HEAD(&p->thread_group);
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
@@ -1165,7 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags,
 			retval = -EAGAIN;
 			goto bad_fork_cleanup_namespace;
 		}
+
 		p->group_leader = current->group_leader;
+		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 
 		if (current->signal->group_stop_count > 0) {
 			/*
@@ -1213,7 +1216,6 @@ static task_t *copy_process(unsigned long clone_flags,
 			list_add_tail(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
 		}
-		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PID, p->pid);
 		nr_threads++;
 	}
-- 
cgit v1.2.3


From a7e5328a06a2beee3a2bbfaf87ce2a7bbe937de1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:27 -0800
Subject: [PATCH] cleanup __exit_signal->cleanup_sighand path

Move 'tsk->sighand = NULL' from cleanup_sighand() to __exit_signal().  This
makes the exit path more understandable and allows us to do
cleanup_sighand() outside of ->siglock protected section.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 2 +-
 kernel/exit.c         | 3 ++-
 kernel/fork.c         | 8 ++------
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 99855f694ebd..d04186d8cc68 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,7 @@ extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
-extern void cleanup_sighand(struct task_struct *);
+extern void __cleanup_sighand(struct sighand_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/kernel/exit.c b/kernel/exit.c
index 22399caf7574..bc0ec674d3f4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -114,10 +114,11 @@ static void __exit_signal(struct task_struct *tsk)
 	__unhash_process(tsk);
 
 	tsk->signal = NULL;
-	cleanup_sighand(tsk);
+	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 	rcu_read_unlock();
 
+	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
diff --git a/kernel/fork.c b/kernel/fork.c
index aa50c848fae7..b3f7a1bb5e55 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,12 +803,8 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	return 0;
 }
 
-void cleanup_sighand(struct task_struct *tsk)
+void __cleanup_sighand(struct sighand_struct *sighand)
 {
-	struct sighand_struct * sighand = tsk->sighand;
-
-	/* Ok, we're done with the signal handlers */
-	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
 		kmem_cache_free(sighand_cachep, sighand);
 }
@@ -1233,7 +1229,7 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	cleanup_sighand(p);
+	__cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
-- 
cgit v1.2.3


From 4e5ec5dba22ea509b1a004f9815751f0ffc815e5 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 27 Mar 2006 18:42:40 +0100
Subject: [PATCH] libata: BMDMA handling updates

This is the minimal patch set to enable the current code to be used with
a controller following SFF (ie any PATA and early SATA controllers)
safely without crashes if there is no BMDMA area or if BMDMA is not
assigned by the BIOS for some reason.

Simplex status is recorded but not acted upon in this change, this isn't
a problem with the current drivers as none of them are for simplex
hardware. A following diff will deal with that.

The flags in the probe structure remain ->host_set_flags although Jeff
asked me to rename them, simply because the rename would break the usual
Linux rules that old code should break when there are changes. not
compile and run and then blow up/eat your computer/etc. Renaming this
later is a trivial exercise once a better name is chosen.

Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-bmdma.c | 26 ++++++++++++++++++++++----
 include/linux/libata.h      |  4 ++++
 2 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-bmdma.c b/drivers/scsi/libata-bmdma.c
index 95d81d86d8b7..835dff0bafdc 100644
--- a/drivers/scsi/libata-bmdma.c
+++ b/drivers/scsi/libata-bmdma.c
@@ -703,6 +703,7 @@ ata_pci_init_native_mode(struct pci_dev *pdev, struct ata_port_info **port, int
 	struct ata_probe_ent *probe_ent =
 		ata_probe_ent_alloc(pci_dev_to_dev(pdev), port[0]);
 	int p = 0;
+	unsigned long bmdma;
 
 	if (!probe_ent)
 		return NULL;
@@ -716,7 +717,12 @@ ata_pci_init_native_mode(struct pci_dev *pdev, struct ata_port_info **port, int
 		probe_ent->port[p].altstatus_addr =
 		probe_ent->port[p].ctl_addr =
 			pci_resource_start(pdev, 1) | ATA_PCI_CTL_OFS;
-		probe_ent->port[p].bmdma_addr = pci_resource_start(pdev, 4);
+		bmdma = pci_resource_start(pdev, 4);
+		if (bmdma) {
+			if (inb(bmdma + 2) & 0x80)
+				probe_ent->host_set_flags |= ATA_HOST_SIMPLEX;
+			probe_ent->port[p].bmdma_addr = bmdma;
+		}
 		ata_std_ports(&probe_ent->port[p]);
 		p++;
 	}
@@ -726,7 +732,13 @@ ata_pci_init_native_mode(struct pci_dev *pdev, struct ata_port_info **port, int
 		probe_ent->port[p].altstatus_addr =
 		probe_ent->port[p].ctl_addr =
 			pci_resource_start(pdev, 3) | ATA_PCI_CTL_OFS;
-		probe_ent->port[p].bmdma_addr = pci_resource_start(pdev, 4) + 8;
+		bmdma = pci_resource_start(pdev, 4);
+		if (bmdma) {
+			bmdma += 8;
+			if(inb(bmdma + 2) & 0x80)
+			probe_ent->host_set_flags |= ATA_HOST_SIMPLEX;
+			probe_ent->port[p].bmdma_addr = bmdma;
+		}
 		ata_std_ports(&probe_ent->port[p]);
 		p++;
 	}
@@ -740,6 +752,7 @@ static struct ata_probe_ent *ata_pci_init_legacy_port(struct pci_dev *pdev,
 				struct ata_port_info *port, int port_num)
 {
 	struct ata_probe_ent *probe_ent;
+	unsigned long bmdma;
 
 	probe_ent = ata_probe_ent_alloc(pci_dev_to_dev(pdev), port);
 	if (!probe_ent)
@@ -766,8 +779,13 @@ static struct ata_probe_ent *ata_pci_init_legacy_port(struct pci_dev *pdev,
 			break;
 	}
 
-	probe_ent->port[0].bmdma_addr =
-		pci_resource_start(pdev, 4) + 8 * port_num;
+	bmdma = pci_resource_start(pdev, 4);
+	if (bmdma != 0) {
+		bmdma += 8 * port_num;
+		probe_ent->port[0].bmdma_addr = bmdma;
+		if (inb(bmdma + 2) & 0x80)
+			probe_ent->host_set_flags |= ATA_HOST_SIMPLEX;
+	}
 	ata_std_ports(&probe_ent->port[0]);
 
 	return probe_ent;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9fcc061e3adf..a5c213ce97c9 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -160,6 +160,9 @@ enum {
 	ATA_QCFLAG_DMAMAP	= ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
 	ATA_QCFLAG_EH_SCHEDULED = (1 << 5), /* EH scheduled */
 
+	/* host set flags */
+	ATA_HOST_SIMPLEX	= (1 << 0),	/* Host is simplex, one DMA channel per host_set only */
+	
 	/* various lengths of time */
 	ATA_TMOUT_PIO		= 30 * HZ,
 	ATA_TMOUT_BOOT		= 30 * HZ,	/* heuristic */
@@ -278,6 +281,7 @@ struct ata_probe_ent {
 	unsigned long		irq;
 	unsigned int		irq_flags;
 	unsigned long		host_flags;
+	unsigned long		host_set_flags;
 	void __iomem		*mmio_base;
 	void			*private_data;
 };
-- 
cgit v1.2.3


From e35a9e01f2a504871e70576a9e11dbe4d8dee456 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 27 Mar 2006 18:46:37 +0100
Subject: [PATCH] libata: Add ->set_mode hook for odd drivers

Some hardware doesn't want the usual mode setup logic running. This
allows the hardware driver to replace it for special cases in the least
invasive way possible.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/scsi/libata-core.c | 6 +++++-
 include/linux/libata.h     | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 86310562da8b..10933cb722e6 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1409,7 +1409,11 @@ static int ata_bus_probe(struct ata_port *ap)
 	if (!found)
 		goto err_out_disable;
 
-	ata_set_mode(ap);
+	if (ap->ops->set_mode)
+		ap->ops->set_mode(ap);
+	else
+		ata_set_mode(ap);
+
 	if (ap->flags & ATA_FLAG_PORT_DISABLED)
 		goto err_out_disable;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a5c213ce97c9..6a9316cbb70b 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -433,6 +433,7 @@ struct ata_port_operations {
 	void (*dev_select)(struct ata_port *ap, unsigned int device);
 
 	void (*phy_reset) (struct ata_port *ap); /* obsolete */
+	void (*set_mode) (struct ata_port *ap);
 	int (*probe_reset) (struct ata_port *ap, unsigned int *classes);
 
 	void (*post_set_mode) (struct ata_port *ap);
-- 
cgit v1.2.3


From 5444a6f405618706eddbe1605ef8533b1b655764 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 27 Mar 2006 18:58:20 +0100
Subject: [PATCH] libata: Simplex and other mode filtering logic

Add a field to the host_set called 'flags' (was host_set_flags changed
to suit Jeff)
Add a simplex_claimed field so we can remember who owns the DMA channel
Add a ->mode_filter() hook to allow drivers to filter modes
Add docs for mode_filter and set_mode
Filter according to simplex state
Filter cable in core

This provides the needed framework to support all the mode rules found
in the PATA world. The simplex filter deals with 'to spec' simplex DMA
systems found in older chips. The cable filter avoids duplicating the
same rules in each chip driver with PATA. Finally the mode filter is
neccessary because drive/chip combinations have errata that forbid
certain modes with some drives or types of ATA object.

Drive speed setup remains per channel for now and the filters now use
the framework Tejun put into place which cleans them up a lot from the
older libata-pata patches.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 Documentation/DocBook/libata.tmpl | 47 +++++++++++++++++++++++++++++++++++----
 drivers/scsi/libata-core.c        | 31 ++++++++++++++++++++++++--
 include/linux/libata.h            |  4 ++++
 3 files changed, 76 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
index d260d92089ad..5bcbb6ee3bc0 100644
--- a/Documentation/DocBook/libata.tmpl
+++ b/Documentation/DocBook/libata.tmpl
@@ -120,14 +120,27 @@ void (*dev_config) (struct ata_port *, struct ata_device *);
 	<programlisting>
 void (*set_piomode) (struct ata_port *, struct ata_device *);
 void (*set_dmamode) (struct ata_port *, struct ata_device *);
-void (*post_set_mode) (struct ata_port *ap);
+void (*post_set_mode) (struct ata_port *);
+unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int);
 	</programlisting>
 
 	<para>
 	Hooks called prior to the issue of SET FEATURES - XFER MODE
-	command.  dev->pio_mode is guaranteed to be valid when
-	->set_piomode() is called, and dev->dma_mode is guaranteed to be
-	valid when ->set_dmamode() is called.  ->post_set_mode() is
+	command.  The optional ->mode_filter() hook is called when libata
+	has built a mask of the possible modes. This is passed to the 
+	->mode_filter() function which should return a mask of valid modes
+	after filtering those unsuitable due to hardware limits. It is not
+	valid to use this interface to add modes.
+	</para>
+	<para>
+	dev->pio_mode and dev->dma_mode are guaranteed to be valid when
+	->set_piomode() and when ->set_dmamode() is called. The timings for
+	any other drive sharing the cable will also be valid at this point.
+	That is the library records the decisions for the modes of each
+	drive on a channel before it attempts to set any of them.
+	</para>
+	<para>
+	->post_set_mode() is
 	called unconditionally, after the SET FEATURES - XFER MODE
 	command completes successfully.
 	</para>
@@ -230,6 +243,32 @@ void (*dev_select)(struct ata_port *ap, unsigned int device);
 
 	</sect2>
 
+	<sect2><title>Private tuning method</title>
+	<programlisting>
+void (*set_mode) (struct ata_port *ap);
+	</programlisting>
+
+	<para>
+	By default libata performs drive and controller tuning in
+	accordance with the ATA timing rules and also applies blacklists
+	and cable limits. Some controllers need special handling and have
+	custom tuning rules, typically raid controllers that use ATA
+	commands but do not actually do drive timing.
+	</para>
+
+	<warning>
+	<para>
+	This hook should not be used to replace the standard controller
+	tuning logic when a controller has quirks. Replacing the default
+	tuning logic in that case would bypass handling for drive and
+	bridge quirks that may be important to data reliability. If a
+	controller needs to filter the mode selection it should use the
+	mode_filter hook instead.
+	</para>
+	</warning>
+
+	</sect2>
+
 	<sect2><title>Reset ATA bus</title>
 	<programlisting>
 void (*phy_reset) (struct ata_port *ap);
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 41659448a204..b1ddf40533b5 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1818,7 +1818,7 @@ static void ata_host_set_dma(struct ata_port *ap)
  */
 static void ata_set_mode(struct ata_port *ap)
 {
-	int i, rc;
+	int i, rc, used_dma = 0;
 
 	/* step 1: calculate xfer_mask */
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
@@ -1836,6 +1836,9 @@ static void ata_set_mode(struct ata_port *ap)
 		dma_mask = ata_pack_xfermask(0, dev->mwdma_mask, dev->udma_mask);
 		dev->pio_mode = ata_xfer_mask2mode(pio_mask);
 		dev->dma_mode = ata_xfer_mask2mode(dma_mask);
+
+		if (dev->dma_mode)
+			used_dma = 1;
 	}
 
 	/* step 2: always set host PIO timings */
@@ -1857,6 +1860,17 @@ static void ata_set_mode(struct ata_port *ap)
 			goto err_out;
 	}
 
+	/*
+	 *	Record simplex status. If we selected DMA then the other
+	 *	host channels are not permitted to do so.
+	 */
+
+	if (used_dma && (ap->host_set->flags & ATA_HOST_SIMPLEX))
+		ap->host_set->simplex_claimed = 1;
+
+	/*
+	 *	Chip specific finalisation
+	 */
 	if (ap->ops->post_set_mode)
 		ap->ops->post_set_mode(ap);
 
@@ -2646,13 +2660,14 @@ static int ata_dma_blacklisted(const struct ata_device *dev)
  */
 static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev)
 {
+	struct ata_host_set *hs = ap->host_set;
 	unsigned long xfer_mask;
 	int i;
 
 	xfer_mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask,
 				      ap->udma_mask);
 
-	/* use port-wide xfermask for now */
+	/* FIXME: Use port-wide xfermask for now */
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
 		struct ata_device *d = &ap->device[i];
 		if (!ata_dev_present(d))
@@ -2662,12 +2677,23 @@ static void ata_dev_xfermask(struct ata_port *ap, struct ata_device *dev)
 		xfer_mask &= ata_id_xfermask(d->id);
 		if (ata_dma_blacklisted(d))
 			xfer_mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA);
+		/* Apply cable rule here. Don't apply it early because when
+		   we handle hot plug the cable type can itself change */
+		if (ap->cbl == ATA_CBL_PATA40)
+			xfer_mask &= ~(0xF8 << ATA_SHIFT_UDMA);
 	}
 
 	if (ata_dma_blacklisted(dev))
 		printk(KERN_WARNING "ata%u: dev %u is on DMA blacklist, "
 		       "disabling DMA\n", ap->id, dev->devno);
 
+	if (hs->flags & ATA_HOST_SIMPLEX) {
+		if (hs->simplex_claimed)
+			xfer_mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA);
+	}
+	if (ap->ops->mode_filter)
+		xfer_mask = ap->ops->mode_filter(ap, dev, xfer_mask);
+
 	ata_unpack_xfermask(xfer_mask, &dev->pio_mask, &dev->mwdma_mask,
 			    &dev->udma_mask);
 }
@@ -4531,6 +4557,7 @@ int ata_device_add(const struct ata_probe_ent *ent)
 	host_set->mmio_base = ent->mmio_base;
 	host_set->private_data = ent->private_data;
 	host_set->ops = ent->port_ops;
+	host_set->flags = ent->host_set_flags;
 
 	/* register each port bound to this device */
 	for (i = 0; i < ent->n_ports; i++) {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 6a9316cbb70b..0d61357604d5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -294,6 +294,9 @@ struct ata_host_set {
 	unsigned int		n_ports;
 	void			*private_data;
 	const struct ata_port_operations *ops;
+	unsigned long		flags;
+	int			simplex_claimed;	/* Keep seperate in case we
+							   ever need to do this locked */
 	struct ata_port *	ports[0];
 };
 
@@ -423,6 +426,7 @@ struct ata_port_operations {
 
 	void (*set_piomode) (struct ata_port *, struct ata_device *);
 	void (*set_dmamode) (struct ata_port *, struct ata_device *);
+	unsigned long (*mode_filter) (const struct ata_port *, struct ata_device *, unsigned long);
 
 	void (*tf_load) (struct ata_port *ap, const struct ata_taskfile *tf);
 	void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
-- 
cgit v1.2.3


From 5274f052e7b3dbd81935772eb551dfd0325dfa9d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 30 Mar 2006 15:15:30 +0200
Subject: [PATCH] Introduce sys_splice() system call

This adds support for the sys_splice system call. Using a pipe as a
transport, it can connect to files or sockets (latter as output only).

From the splice.c comments:

   "splice": joining two ropes together by interweaving their strands.

   This is the "extended pipe" functionality, where a pipe is used as
   an arbitrary in-memory buffer. Think of a pipe as a small kernel
   buffer that you can use to transfer data from one end to the other.

   The traditional unix read/write is extended with a "splice()" operation
   that transfers data buffers to or from a pipe buffer.

   Named by Larry McVoy, original implementation from Linus, extended by
   Jens to support splicing to files and fixing the initial implementation
   bugs.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S |   1 +
 arch/ia64/kernel/entry.S         |   1 +
 fs/Makefile                      |   2 +-
 fs/ext2/file.c                   |   2 +
 fs/ext3/file.c                   |   2 +
 fs/pipe.c                        |  33 ++-
 fs/reiserfs/file.c               |   2 +
 fs/splice.c                      | 612 +++++++++++++++++++++++++++++++++++++++
 include/asm-i386/unistd.h        |   3 +-
 include/asm-ia64/unistd.h        |   3 +-
 include/asm-powerpc/unistd.h     |   3 +-
 include/asm-x86_64/unistd.h      |   4 +-
 include/linux/fs.h               |   4 +
 include/linux/syscalls.h         |   2 +
 net/socket.c                     |   6 +-
 15 files changed, 669 insertions(+), 11 deletions(-)
 create mode 100644 fs/splice.c

(limited to 'include/linux')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 326595f3fa4d..ce3ef4fa0551 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -312,3 +312,4 @@ ENTRY(sys_call_table)
 	.long sys_unshare		/* 310 */
 	.long sys_set_robust_list
 	.long sys_get_robust_list
+	.long sys_splice
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0e3eda99e549..750e8e7fbdc3 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1605,5 +1605,6 @@ sys_call_table:
 	data8 sys_ni_syscall			// reserved for pselect
 	data8 sys_ni_syscall			// 1295 reserved for ppoll
 	data8 sys_unshare
+	data8 sys_splice
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/fs/Makefile b/fs/Makefile
index 080b3867be4d..f3a4f7077175 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o drop_caches.o
+		ioprio.o pnode.o drop_caches.o splice.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 509cceca04db..23e2c7ccec1d 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -53,6 +53,8 @@ const struct file_operations ext2_file_operations = {
 	.readv		= generic_file_readv,
 	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 783a796220bb..1efefb630ea9 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -119,6 +119,8 @@ const struct file_operations ext3_file_operations = {
 	.release	= ext3_release_file,
 	.fsync		= ext3_sync_file,
 	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/pipe.c b/fs/pipe.c
index e2f4f1d9ffc2..2414bf270db6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -15,6 +15,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/uio.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -94,11 +95,20 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff
 {
 	struct page *page = buf->page;
 
-	if (info->tmp_page) {
-		__free_page(page);
+	/*
+	 * If nobody else uses this page, and we don't already have a
+	 * temporary page, let's keep track of it as a one-deep
+	 * allocation cache
+	 */
+	if (page_count(page) == 1 && !info->tmp_page) {
+		info->tmp_page = page;
 		return;
 	}
-	info->tmp_page = page;
+
+	/*
+	 * Otherwise just release our reference to it
+	 */
+	page_cache_release(page);
 }
 
 static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
@@ -152,6 +162,11 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 				chars = total_len;
 
 			addr = ops->map(filp, info, buf);
+			if (IS_ERR(addr)) {
+				if (!ret)
+					ret = PTR_ERR(addr);
+				break;
+			}
 			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
 			ops->unmap(info, buf);
 			if (unlikely(error)) {
@@ -254,8 +269,16 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 		struct pipe_buf_operations *ops = buf->ops;
 		int offset = buf->offset + buf->len;
 		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
-			void *addr = ops->map(filp, info, buf);
-			int error = pipe_iov_copy_from_user(offset + addr, iov, chars);
+			void *addr;
+			int error;
+
+			addr = ops->map(filp, info, buf);
+			if (IS_ERR(addr)) {
+				error = PTR_ERR(addr);
+				goto out;
+			}
+			error = pipe_iov_copy_from_user(offset + addr, iov,
+							chars);
 			ops->unmap(info, buf);
 			ret = error;
 			do_wakeup = 1;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 010094d14da6..cf6e1cf40351 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1576,6 +1576,8 @@ const struct file_operations reiserfs_file_operations = {
 	.sendfile = generic_file_sendfile,
 	.aio_read = generic_file_aio_read,
 	.aio_write = reiserfs_aio_write,
+	.splice_read = generic_file_splice_read,
+	.splice_write = generic_file_splice_write,
 };
 
 struct inode_operations reiserfs_file_inode_operations = {
diff --git a/fs/splice.c b/fs/splice.c
new file mode 100644
index 000000000000..efa47c1c4e13
--- /dev/null
+++ b/fs/splice.c
@@ -0,0 +1,612 @@
+/*
+ * "splice": joining two ropes together by interweaving their strands.
+ *
+ * This is the "extended pipe" functionality, where a pipe is used as
+ * an arbitrary in-memory buffer. Think of a pipe as a small kernel
+ * buffer that you can use to transfer data from one end to the other.
+ *
+ * The traditional unix read/write is extended with a "splice()" operation
+ * that transfers data buffers to or from a pipe buffer.
+ *
+ * Named by Larry McVoy, original implementation from Linus, extended by
+ * Jens to support splicing to files and fixing the initial implementation
+ * bugs.
+ *
+ * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
+ *
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mm_inline.h>
+
+/*
+ * Passed to the actors
+ */
+struct splice_desc {
+	unsigned int len, total_len;	/* current and remaining length */
+	unsigned int flags;		/* splice flags */
+	struct file *file;		/* file to read/write */
+	loff_t pos;			/* file position */
+};
+
+static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
+					struct pipe_buffer *buf)
+{
+	page_cache_release(buf->page);
+	buf->page = NULL;
+}
+
+static void *page_cache_pipe_buf_map(struct file *file,
+				     struct pipe_inode_info *info,
+				     struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	lock_page(page);
+
+	if (!PageUptodate(page)) {
+		unlock_page(page);
+		return ERR_PTR(-EIO);
+	}
+
+	if (!page->mapping) {
+		unlock_page(page);
+		return ERR_PTR(-ENODATA);
+	}
+
+	return kmap(buf->page);
+}
+
+static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
+				      struct pipe_buffer *buf)
+{
+	unlock_page(buf->page);
+	kunmap(buf->page);
+}
+
+static struct pipe_buf_operations page_cache_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = page_cache_pipe_buf_map,
+	.unmap = page_cache_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+};
+
+static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
+			    int nr_pages, unsigned long offset,
+			    unsigned long len)
+{
+	struct pipe_inode_info *info;
+	int ret, do_wakeup, i;
+
+	ret = 0;
+	do_wakeup = 0;
+	i = 0;
+
+	mutex_lock(PIPE_MUTEX(*inode));
+
+	info = inode->i_pipe;
+	for (;;) {
+		int bufs;
+
+		if (!PIPE_READERS(*inode)) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		bufs = info->nrbufs;
+		if (bufs < PIPE_BUFFERS) {
+			int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
+			struct pipe_buffer *buf = info->bufs + newbuf;
+			struct page *page = pages[i++];
+			unsigned long this_len;
+
+			this_len = PAGE_CACHE_SIZE - offset;
+			if (this_len > len)
+				this_len = len;
+
+			buf->page = page;
+			buf->offset = offset;
+			buf->len = this_len;
+			buf->ops = &page_cache_pipe_buf_ops;
+			info->nrbufs = ++bufs;
+			do_wakeup = 1;
+
+			ret += this_len;
+			len -= this_len;
+			offset = 0;
+			if (!--nr_pages)
+				break;
+			if (!len)
+				break;
+			if (bufs < PIPE_BUFFERS)
+				continue;
+
+			break;
+		}
+
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			wake_up_interruptible_sync(PIPE_WAIT(*inode));
+			kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
+				    POLL_IN);
+			do_wakeup = 0;
+		}
+
+		PIPE_WAITING_WRITERS(*inode)++;
+		pipe_wait(inode);
+		PIPE_WAITING_WRITERS(*inode)--;
+	}
+
+	mutex_unlock(PIPE_MUTEX(*inode));
+
+	if (do_wakeup) {
+		wake_up_interruptible(PIPE_WAIT(*inode));
+		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+	}
+
+	while (i < nr_pages)
+		page_cache_release(pages[i++]);
+
+	return ret;
+}
+
+static int __generic_file_splice_read(struct file *in, struct inode *pipe,
+				      size_t len)
+{
+	struct address_space *mapping = in->f_mapping;
+	unsigned int offset, nr_pages;
+	struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
+	struct page *page;
+	pgoff_t index, pidx;
+	int i, j;
+
+	index = in->f_pos >> PAGE_CACHE_SHIFT;
+	offset = in->f_pos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (nr_pages > PIPE_BUFFERS)
+		nr_pages = PIPE_BUFFERS;
+
+	/*
+	 * initiate read-ahead on this page range
+	 */
+	do_page_cache_readahead(mapping, in, index, nr_pages);
+
+	/*
+	 * Get as many pages from the page cache as possible..
+	 * Start IO on the page cache entries we create (we
+	 * can assume that any pre-existing ones we find have
+	 * already had IO started on them).
+	 */
+	i = find_get_pages(mapping, index, nr_pages, pages);
+
+	/*
+	 * common case - we found all pages and they are contiguous,
+	 * kick them off
+	 */
+	if (i && (pages[i - 1]->index == index + i - 1))
+		goto splice_them;
+
+	/*
+	 * fill shadow[] with pages at the right locations, so we only
+	 * have to fill holes
+	 */
+	memset(shadow, 0, i * sizeof(struct page *));
+	for (j = 0, pidx = index; j < i; pidx++, j++)
+		shadow[pages[j]->index - pidx] = pages[j];
+
+	/*
+	 * now fill in the holes
+	 */
+	for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
+		int error;
+
+		if (shadow[i])
+			continue;
+
+		/*
+		 * no page there, look one up / create it
+		 */
+		page = find_or_create_page(mapping, pidx,
+						   mapping_gfp_mask(mapping));
+		if (!page)
+			break;
+
+		if (PageUptodate(page))
+			unlock_page(page);
+		else {
+			error = mapping->a_ops->readpage(in, page);
+
+			if (unlikely(error)) {
+				page_cache_release(page);
+				break;
+			}
+		}
+		shadow[i] = page;
+	}
+
+	if (!i) {
+		for (i = 0; i < nr_pages; i++) {
+			 if (shadow[i])
+				page_cache_release(shadow[i]);
+		}
+		return 0;
+	}
+
+	memcpy(pages, shadow, i * sizeof(struct page *));
+
+	/*
+	 * Now we splice them into the pipe..
+	 */
+splice_them:
+	return move_to_pipe(pipe, pages, i, offset, len);
+}
+
+ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
+				 size_t len, unsigned int flags)
+{
+	ssize_t spliced;
+	int ret;
+
+	ret = 0;
+	spliced = 0;
+	while (len) {
+		ret = __generic_file_splice_read(in, pipe, len);
+
+		if (ret <= 0)
+			break;
+
+		in->f_pos += ret;
+		len -= ret;
+		spliced += ret;
+	}
+
+	if (spliced)
+		return spliced;
+
+	return ret;
+}
+
+/*
+ * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
+ */
+static int pipe_to_sendpage(struct pipe_inode_info *info,
+			    struct pipe_buffer *buf, struct splice_desc *sd)
+{
+	struct file *file = sd->file;
+	loff_t pos = sd->pos;
+	unsigned int offset;
+	ssize_t ret;
+	void *ptr;
+
+	/*
+	 * sub-optimal, but we are limited by the pipe ->map. we don't
+	 * need a kmap'ed buffer here, we just want to make sure we
+	 * have the page pinned if the pipe page originates from the
+	 * page cache
+	 */
+	ptr = buf->ops->map(file, info, buf);
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+
+	offset = pos & ~PAGE_CACHE_MASK;
+
+	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
+					sd->len < sd->total_len);
+
+	buf->ops->unmap(info, buf);
+	if (ret == sd->len)
+		return 0;
+
+	return -EIO;
+}
+
+/*
+ * This is a little more tricky than the file -> pipe splicing. There are
+ * basically three cases:
+ *
+ *	- Destination page already exists in the address space and there
+ *	  are users of it. For that case we have no other option that
+ *	  copying the data. Tough luck.
+ *	- Destination page already exists in the address space, but there
+ *	  are no users of it. Make sure it's uptodate, then drop it. Fall
+ *	  through to last case.
+ *	- Destination page does not exist, we can add the pipe page to
+ *	  the page cache and avoid the copy.
+ *
+ * For now we just do the slower thing and always copy pages over, it's
+ * easier than migrating pages from the pipe to the target file. For the
+ * case of doing file | file splicing, the migrate approach had some LRU
+ * nastiness...
+ */
+static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
+			struct splice_desc *sd)
+{
+	struct file *file = sd->file;
+	struct address_space *mapping = file->f_mapping;
+	unsigned int offset;
+	struct page *page;
+	char *src, *dst;
+	pgoff_t index;
+	int ret;
+
+	/*
+	 * after this, page will be locked and unmapped
+	 */
+	src = buf->ops->map(file, info, buf);
+	if (IS_ERR(src))
+		return PTR_ERR(src);
+
+	index = sd->pos >> PAGE_CACHE_SHIFT;
+	offset = sd->pos & ~PAGE_CACHE_MASK;
+
+find_page:
+	ret = -ENOMEM;
+	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
+	if (!page)
+		goto out;
+
+	/*
+	 * If the page is uptodate, it is also locked. If it isn't
+	 * uptodate, we can mark it uptodate if we are filling the
+	 * full page. Otherwise we need to read it in first...
+	 */
+	if (!PageUptodate(page)) {
+		if (sd->len < PAGE_CACHE_SIZE) {
+			ret = mapping->a_ops->readpage(file, page);
+			if (unlikely(ret))
+				goto out;
+
+			lock_page(page);
+
+			if (!PageUptodate(page)) {
+				/*
+				 * page got invalidated, repeat
+				 */
+				if (!page->mapping) {
+					unlock_page(page);
+					page_cache_release(page);
+					goto find_page;
+				}
+				ret = -EIO;
+				goto out;
+			}
+		} else {
+			WARN_ON(!PageLocked(page));
+			SetPageUptodate(page);
+		}
+	}
+
+	ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
+	if (ret)
+		goto out;
+
+	dst = kmap_atomic(page, KM_USER0);
+	memcpy(dst + offset, src + buf->offset, sd->len);
+	flush_dcache_page(page);
+	kunmap_atomic(dst, KM_USER0);
+
+	ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
+	if (ret < 0)
+		goto out;
+
+	set_page_dirty(page);
+	ret = write_one_page(page, 0);
+out:
+	if (ret < 0)
+		unlock_page(page);
+	page_cache_release(page);
+	buf->ops->unmap(info, buf);
+	return ret;
+}
+
+typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
+			   struct splice_desc *);
+
+static ssize_t move_from_pipe(struct inode *inode, struct file *out,
+			      size_t len, unsigned int flags,
+			      splice_actor *actor)
+{
+	struct pipe_inode_info *info;
+	int ret, do_wakeup, err;
+	struct splice_desc sd;
+
+	ret = 0;
+	do_wakeup = 0;
+
+	sd.total_len = len;
+	sd.flags = flags;
+	sd.file = out;
+	sd.pos = out->f_pos;
+
+	mutex_lock(PIPE_MUTEX(*inode));
+
+	info = inode->i_pipe;
+	for (;;) {
+		int bufs = info->nrbufs;
+
+		if (bufs) {
+			int curbuf = info->curbuf;
+			struct pipe_buffer *buf = info->bufs + curbuf;
+			struct pipe_buf_operations *ops = buf->ops;
+
+			sd.len = buf->len;
+			if (sd.len > sd.total_len)
+				sd.len = sd.total_len;
+
+			err = actor(info, buf, &sd);
+			if (err) {
+				if (!ret && err != -ENODATA)
+					ret = err;
+
+				break;
+			}
+
+			ret += sd.len;
+			buf->offset += sd.len;
+			buf->len -= sd.len;
+			if (!buf->len) {
+				buf->ops = NULL;
+				ops->release(info, buf);
+				curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
+				info->curbuf = curbuf;
+				info->nrbufs = --bufs;
+				do_wakeup = 1;
+			}
+
+			sd.pos += sd.len;
+			sd.total_len -= sd.len;
+			if (!sd.total_len)
+				break;
+		}
+
+		if (bufs)
+			continue;
+		if (!PIPE_WRITERS(*inode))
+			break;
+		if (!PIPE_WAITING_WRITERS(*inode)) {
+			if (ret)
+				break;
+		}
+
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			wake_up_interruptible_sync(PIPE_WAIT(*inode));
+			kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
+			do_wakeup = 0;
+		}
+
+		pipe_wait(inode);
+	}
+
+	mutex_unlock(PIPE_MUTEX(*inode));
+
+	if (do_wakeup) {
+		wake_up_interruptible(PIPE_WAIT(*inode));
+		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+	}
+
+	mutex_lock(&out->f_mapping->host->i_mutex);
+	out->f_pos = sd.pos;
+	mutex_unlock(&out->f_mapping->host->i_mutex);
+	return ret;
+
+}
+
+ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
+				  size_t len, unsigned int flags)
+{
+	return move_from_pipe(inode, out, len, flags, pipe_to_file);
+}
+
+ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+				size_t len, unsigned int flags)
+{
+	return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
+}
+
+static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
+			   unsigned int flags)
+{
+	loff_t pos;
+	int ret;
+
+	if (!out->f_op || !out->f_op->splice_write)
+		return -EINVAL;
+
+	if (!(out->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	pos = out->f_pos;
+	ret = rw_verify_area(WRITE, out, &pos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	return out->f_op->splice_write(pipe, out, len, flags);
+}
+
+static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
+			 unsigned int flags)
+{
+	loff_t pos, isize, left;
+	int ret;
+
+	if (!in->f_op || !in->f_op->splice_read)
+		return -EINVAL;
+
+	if (!(in->f_mode & FMODE_READ))
+		return -EBADF;
+
+	pos = in->f_pos;
+	ret = rw_verify_area(READ, in, &pos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	isize = i_size_read(in->f_mapping->host);
+	if (unlikely(in->f_pos >= isize))
+		return 0;
+	
+	left = isize - in->f_pos;
+	if (left < len)
+		len = left;
+
+	return in->f_op->splice_read(in, pipe, len, flags);
+}
+
+static long do_splice(struct file *in, struct file *out, size_t len,
+		      unsigned int flags)
+{
+	struct inode *pipe;
+
+	pipe = in->f_dentry->d_inode;
+	if (pipe->i_pipe)
+		return do_splice_from(pipe, out, len, flags);
+
+	pipe = out->f_dentry->d_inode;
+	if (pipe->i_pipe)
+		return do_splice_to(in, pipe, len, flags);
+
+	return -EINVAL;
+}
+
+asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
+{
+	long error;
+	struct file *in, *out;
+	int fput_in, fput_out;
+
+	if (unlikely(!len))
+		return 0;
+
+	error = -EBADF;
+	in = fget_light(fdin, &fput_in);
+	if (in) {
+		if (in->f_mode & FMODE_READ) {
+			out = fget_light(fdout, &fput_out);
+			if (out) {
+				if (out->f_mode & FMODE_WRITE)
+					error = do_splice(in, out, len, flags);
+				fput_light(out, fput_out);
+			}
+		}
+
+		fput_light(in, fput_in);
+	}
+
+	return error;
+}
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 014e3562895b..789e9bdd0a40 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -318,8 +318,9 @@
 #define __NR_unshare		310
 #define __NR_set_robust_list	311
 #define __NR_get_robust_list	312
+#define __NR_sys_splice		313
 
-#define NR_syscalls 313
+#define NR_syscalls 314
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 019956c613e4..36070c1014d8 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -285,12 +285,13 @@
 #define __NR_faccessat			1293
 /* 1294, 1295 reserved for pselect/ppoll */
 #define __NR_unshare			1296
+#define __NR_splice			1297
 
 #ifdef __KERNEL__
 
 #include <linux/config.h>
 
-#define NR_syscalls			273 /* length of syscall table */
+#define NR_syscalls			274 /* length of syscall table */
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index 1e990747dce7..536ba0873052 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -301,8 +301,9 @@
 #define __NR_pselect6		280
 #define __NR_ppoll		281
 #define __NR_unshare		282
+#define __NR_splice		283
 
-#define __NR_syscalls		283
+#define __NR_syscalls		284
 
 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index fcc516353087..f21ff2c1e960 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -609,8 +609,10 @@ __SYSCALL(__NR_unshare,	sys_unshare)
 __SYSCALL(__NR_set_robust_list, sys_set_robust_list)
 #define __NR_get_robust_list	274
 __SYSCALL(__NR_get_robust_list, sys_get_robust_list)
+#define __NR_splice		275
+__SYSCALL(__NR_splice, sys_splice)
 
-#define __NR_syscall_max __NR_get_robust_list
+#define __NR_syscall_max __NR_splice
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 408fe89498f4..20fa5f6d7269 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1032,6 +1032,8 @@ struct file_operations {
 	int (*check_flags)(int);
 	int (*dir_notify)(struct file *filp, unsigned long arg);
 	int (*flock) (struct file *, int, struct file_lock *);
+	ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int);
+	ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int);
 };
 
 struct inode_operations {
@@ -1609,6 +1611,8 @@ extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor
 extern void do_generic_mapping_read(struct address_space *mapping,
 				    struct file_ra_state *, struct file *,
 				    loff_t *, read_descriptor_t *, read_actor_t);
+extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int);
+extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int);
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e487e3b60f60..e78ffc7d5b56 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -569,5 +569,7 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
 asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
 				   int flags, int mode);
 asmlinkage long sys_unshare(unsigned long unshare_flags);
+asmlinkage long sys_splice(int fdin, int fdout, size_t len,
+				unsigned int flags);
 
 #endif
diff --git a/net/socket.c b/net/socket.c
index fcd77eac0ccf..b13042f68c02 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -119,6 +119,9 @@ static ssize_t sock_writev(struct file *file, const struct iovec *vector,
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
 
+extern ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
+				size_t len, unsigned int flags);
+
 
 /*
  *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
@@ -141,7 +144,8 @@ static struct file_operations socket_file_ops = {
 	.fasync =	sock_fasync,
 	.readv =	sock_readv,
 	.writev =	sock_writev,
-	.sendpage =	sock_sendpage
+	.sendpage =	sock_sendpage,
+	.splice_write = generic_splice_sendpage,
 };
 
 /*
-- 
cgit v1.2.3


From 5abc97aa25b2c41413b3a520faee83f2282d9f18 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 30 Mar 2006 15:16:46 +0200
Subject: [PATCH] splice: add support for SPLICE_F_MOVE flag

This enables the caller to migrate pages from one address space page
cache to another.  In buzz word marketing, you can do zero-copy file
copies!

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/pipe.c                 |   8 +++
 fs/splice.c               | 121 ++++++++++++++++++++++++++++++++--------------
 include/linux/pipe_fs_i.h |   8 +++
 3 files changed, 100 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pipe.c b/fs/pipe.c
index 2414bf270db6..109a102c150d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -121,11 +121,19 @@ static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer
 	kunmap(buf->page);
 }
 
+static int anon_pipe_buf_steal(struct pipe_inode_info *info,
+			       struct pipe_buffer *buf)
+{
+	buf->stolen = 1;
+	return 0;
+}
+
 static struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
 	.map = anon_pipe_buf_map,
 	.unmap = anon_pipe_buf_unmap,
 	.release = anon_pipe_buf_release,
+	.steal = anon_pipe_buf_steal,
 };
 
 static ssize_t
diff --git a/fs/splice.c b/fs/splice.c
index efa47c1c4e13..4a026f95884f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mm_inline.h>
+#include <linux/swap.h>
 
 /*
  * Passed to the actors
@@ -32,11 +33,37 @@ struct splice_desc {
 	loff_t pos;			/* file position */
 };
 
+static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
+				     struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	WARN_ON(!PageLocked(page));
+	WARN_ON(!PageUptodate(page));
+
+	if (!remove_mapping(page_mapping(page), page))
+		return 1;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+
+		spin_lock_irq(&zone->lru_lock);
+		BUG_ON(!PageLRU(page));
+		__ClearPageLRU(page);
+		del_page_from_lru(zone, page);
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	buf->stolen = 1;
+	return 0;
+}
+
 static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 					struct pipe_buffer *buf)
 {
 	page_cache_release(buf->page);
 	buf->page = NULL;
+	buf->stolen = 0;
 }
 
 static void *page_cache_pipe_buf_map(struct file *file,
@@ -63,7 +90,8 @@ static void *page_cache_pipe_buf_map(struct file *file,
 static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
 				      struct pipe_buffer *buf)
 {
-	unlock_page(buf->page);
+	if (!buf->stolen)
+		unlock_page(buf->page);
 	kunmap(buf->page);
 }
 
@@ -72,6 +100,7 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.map = page_cache_pipe_buf_map,
 	.unmap = page_cache_pipe_buf_unmap,
 	.release = page_cache_pipe_buf_release,
+	.steal = page_cache_pipe_buf_steal,
 };
 
 static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
@@ -336,8 +365,8 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	struct address_space *mapping = file->f_mapping;
 	unsigned int offset;
 	struct page *page;
-	char *src, *dst;
 	pgoff_t index;
+	char *src;
 	int ret;
 
 	/*
@@ -350,40 +379,54 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
-find_page:
-	ret = -ENOMEM;
-	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
-	if (!page)
-		goto out;
-
 	/*
-	 * If the page is uptodate, it is also locked. If it isn't
-	 * uptodate, we can mark it uptodate if we are filling the
-	 * full page. Otherwise we need to read it in first...
+	 * reuse buf page, if SPLICE_F_MOVE is set
 	 */
-	if (!PageUptodate(page)) {
-		if (sd->len < PAGE_CACHE_SIZE) {
-			ret = mapping->a_ops->readpage(file, page);
-			if (unlikely(ret))
-				goto out;
-
-			lock_page(page);
-
-			if (!PageUptodate(page)) {
-				/*
-				 * page got invalidated, repeat
-				 */
-				if (!page->mapping) {
-					unlock_page(page);
-					page_cache_release(page);
-					goto find_page;
+	if (sd->flags & SPLICE_F_MOVE) {
+		if (buf->ops->steal(info, buf))
+			goto find_page;
+
+		page = buf->page;
+		if (add_to_page_cache_lru(page, mapping, index,
+						mapping_gfp_mask(mapping)))
+			goto find_page;
+	} else {
+find_page:
+		ret = -ENOMEM;
+		page = find_or_create_page(mapping, index,
+						mapping_gfp_mask(mapping));
+		if (!page)
+			goto out;
+
+		/*
+		 * If the page is uptodate, it is also locked. If it isn't
+		 * uptodate, we can mark it uptodate if we are filling the
+		 * full page. Otherwise we need to read it in first...
+		 */
+		if (!PageUptodate(page)) {
+			if (sd->len < PAGE_CACHE_SIZE) {
+				ret = mapping->a_ops->readpage(file, page);
+				if (unlikely(ret))
+					goto out;
+
+				lock_page(page);
+
+				if (!PageUptodate(page)) {
+					/*
+					 * page got invalidated, repeat
+					 */
+					if (!page->mapping) {
+						unlock_page(page);
+						page_cache_release(page);
+						goto find_page;
+					}
+					ret = -EIO;
+					goto out;
 				}
-				ret = -EIO;
-				goto out;
+			} else {
+				WARN_ON(!PageLocked(page));
+				SetPageUptodate(page);
 			}
-		} else {
-			WARN_ON(!PageLocked(page));
-			SetPageUptodate(page);
 		}
 	}
 
@@ -391,10 +434,13 @@ find_page:
 	if (ret)
 		goto out;
 
-	dst = kmap_atomic(page, KM_USER0);
-	memcpy(dst + offset, src + buf->offset, sd->len);
-	flush_dcache_page(page);
-	kunmap_atomic(dst, KM_USER0);
+	if (!buf->stolen) {
+		char *dst = kmap_atomic(page, KM_USER0);
+
+		memcpy(dst + offset, src + buf->offset, sd->len);
+		flush_dcache_page(page);
+		kunmap_atomic(dst, KM_USER0);
+	}
 
 	ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
 	if (ret < 0)
@@ -405,7 +451,8 @@ find_page:
 out:
 	if (ret < 0)
 		unlock_page(page);
-	page_cache_release(page);
+	if (!buf->stolen)
+		page_cache_release(page);
 	buf->ops->unmap(info, buf);
 	return ret;
 }
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index b12e59c75752..75c7f55023ab 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -9,6 +9,7 @@ struct pipe_buffer {
 	struct page *page;
 	unsigned int offset, len;
 	struct pipe_buf_operations *ops;
+	unsigned int stolen;
 };
 
 struct pipe_buf_operations {
@@ -16,6 +17,7 @@ struct pipe_buf_operations {
 	void * (*map)(struct file *, struct pipe_inode_info *, struct pipe_buffer *);
 	void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *);
 	void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
+	int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
 };
 
 struct pipe_inode_info {
@@ -53,4 +55,10 @@ void pipe_wait(struct inode * inode);
 struct inode* pipe_new(struct inode* inode);
 void free_pipe_info(struct inode* inode);
 
+/*
+ * splice is tied to pipes as a transport (at least for now), so we'll just
+ * add the splice flags here.
+ */
+#define SPLICE_F_MOVE	(0x01)	/* move pages instead of copying */
+
 #endif
-- 
cgit v1.2.3