From df46b9a44ceb5af2ea2351ce8e28ae7bd840b00f Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Mon, 20 Jun 2005 14:04:44 +0200
Subject: [PATCH] Add blk_rq_map_kern()

Add blk_rq_map_kern which takes a kernel buffer and maps it into
a request and bio. This can be used by the dm hw_handlers, old
sg_scsi_ioctl, and one day scsi special requests so all requests
comming into scsi will have bios. All requests having bios
should allow scsi to use scatter lists for all IO and allow it
to use block layer functions.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 include/linux/bio.h    | 2 ++
 include/linux/blkdev.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 038022763f09..1dd2bc2e84ae 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -282,6 +282,8 @@ extern int bio_get_nr_vecs(struct block_device *);
 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
 				unsigned long, unsigned int, int);
 extern void bio_unmap_user(struct bio *);
+extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
+				unsigned int);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4a99b76c5a33..67339bc5f6bc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -560,6 +560,8 @@ extern void blk_run_queue(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct request *, struct bio *, unsigned int);
+extern struct request *blk_rq_map_kern(request_queue_t *, int, void *,
+					unsigned int, unsigned int);
 extern int blk_execute_rq(request_queue_t *, struct gendisk *, struct request *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
-- 
cgit v1.2.3


From dd1cab95f356f1395278633565f198463cf6bd24 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 20 Jun 2005 14:06:01 +0200
Subject: [PATCH] Cleanup blk_rq_map_* interfaces

Change the blk_rq_map_user() and blk_rq_map_kern() interface to require
a previously allocated request to be passed in. This is both more efficient
for multiple iterations of mapping data to the same request, and it is also
a much nicer API.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/block/ll_rw_blk.c  | 68 ++++++++++++++++++----------------------------
 drivers/block/scsi_ioctl.c | 23 ++++++++++------
 drivers/cdrom/cdrom.c      | 13 ++++++---
 include/linux/blkdev.h     |  7 ++---
 4 files changed, 53 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 1471aca6fa18..42c4f3651cf8 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -2107,21 +2107,19 @@ EXPORT_SYMBOL(blk_insert_request);
  *    original bio must be passed back in to blk_rq_unmap_user() for proper
  *    unmapping.
  */
-struct request *blk_rq_map_user(request_queue_t *q, int rw, void __user *ubuf,
-				unsigned int len)
+int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf,
+		    unsigned int len)
 {
 	unsigned long uaddr;
-	struct request *rq;
 	struct bio *bio;
+	int reading;
 
 	if (len > (q->max_sectors << 9))
-		return ERR_PTR(-EINVAL);
-	if ((!len && ubuf) || (len && !ubuf))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
+	if (!len || !ubuf)
+		return -EINVAL;
 
-	rq = blk_get_request(q, rw, __GFP_WAIT);
-	if (!rq)
-		return ERR_PTR(-ENOMEM);
+	reading = rq_data_dir(rq) == READ;
 
 	/*
 	 * if alignment requirement is satisfied, map in user pages for
@@ -2129,9 +2127,9 @@ struct request *blk_rq_map_user(request_queue_t *q, int rw, void __user *ubuf,
 	 */
 	uaddr = (unsigned long) ubuf;
 	if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
-		bio = bio_map_user(q, NULL, uaddr, len, rw == READ);
+		bio = bio_map_user(q, NULL, uaddr, len, reading);
 	else
-		bio = bio_copy_user(q, uaddr, len, rw == READ);
+		bio = bio_copy_user(q, uaddr, len, reading);
 
 	if (!IS_ERR(bio)) {
 		rq->bio = rq->biotail = bio;
@@ -2139,14 +2137,13 @@ struct request *blk_rq_map_user(request_queue_t *q, int rw, void __user *ubuf,
 
 		rq->buffer = rq->data = NULL;
 		rq->data_len = len;
-		return rq;
+		return 0;
 	}
 
 	/*
 	 * bio is the err-ptr
 	 */
-	blk_put_request(rq);
-	return (struct request *) bio;
+	return PTR_ERR(bio);
 }
 
 EXPORT_SYMBOL(blk_rq_map_user);
@@ -2160,7 +2157,7 @@ EXPORT_SYMBOL(blk_rq_map_user);
  * Description:
  *    Unmap a request previously mapped by blk_rq_map_user().
  */
-int blk_rq_unmap_user(struct request *rq, struct bio *bio, unsigned int ulen)
+int blk_rq_unmap_user(struct bio *bio, unsigned int ulen)
 {
 	int ret = 0;
 
@@ -2171,8 +2168,7 @@ int blk_rq_unmap_user(struct request *rq, struct bio *bio, unsigned int ulen)
 			ret = bio_uncopy_user(bio);
 	}
 
-	blk_put_request(rq);
-	return ret;
+	return 0;
 }
 
 EXPORT_SYMBOL(blk_rq_unmap_user);
@@ -2184,39 +2180,29 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
  * @kbuf:	the kernel buffer
  * @len:	length of user data
  */
-struct request *blk_rq_map_kern(request_queue_t *q, int rw, void *kbuf,
-				unsigned int len, unsigned int gfp_mask)
+int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf,
+		    unsigned int len, unsigned int gfp_mask)
 {
-	struct request *rq;
 	struct bio *bio;
 
 	if (len > (q->max_sectors << 9))
-		return ERR_PTR(-EINVAL);
-	if ((!len && kbuf) || (len && !kbuf))
-		return ERR_PTR(-EINVAL);
-
-	rq = blk_get_request(q, rw, gfp_mask);
-	if (!rq)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
+	if (!len || !kbuf)
+		return -EINVAL;
 
 	bio = bio_map_kern(q, kbuf, len, gfp_mask);
-	if (!IS_ERR(bio)) {
-		if (rw)
-			bio->bi_rw |= (1 << BIO_RW);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
 
-		rq->bio = rq->biotail = bio;
-		blk_rq_bio_prep(q, rq, bio);
+	if (rq_data_dir(rq) == WRITE)
+		bio->bi_rw |= (1 << BIO_RW);
 
-		rq->buffer = rq->data = NULL;
-		rq->data_len = len;
-		return rq;
-	}
+	rq->bio = rq->biotail = bio;
+	blk_rq_bio_prep(q, rq, bio);
 
-	/*
-	 * bio is the err-ptr
-	 */
-	blk_put_request(rq);
-	return (struct request *) bio;
+	rq->buffer = rq->data = NULL;
+	rq->data_len = len;
+	return 0;
 }
 
 EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/drivers/block/scsi_ioctl.c b/drivers/block/scsi_ioctl.c
index 681871ca5d60..93c4ca874be3 100644
--- a/drivers/block/scsi_ioctl.c
+++ b/drivers/block/scsi_ioctl.c
@@ -216,7 +216,7 @@ static int sg_io(struct file *file, request_queue_t *q,
 		struct gendisk *bd_disk, struct sg_io_hdr *hdr)
 {
 	unsigned long start_time;
-	int reading, writing;
+	int reading, writing, ret;
 	struct request *rq;
 	struct bio *bio;
 	char sense[SCSI_SENSE_BUFFERSIZE];
@@ -255,14 +255,17 @@ static int sg_io(struct file *file, request_queue_t *q,
 			reading = 1;
 			break;
 		}
+	}
 
-		rq = blk_rq_map_user(q, writing ? WRITE : READ, hdr->dxferp,
-				     hdr->dxfer_len);
+	rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
+	if (!rq)
+		return -ENOMEM;
 
-		if (IS_ERR(rq))
-			return PTR_ERR(rq);
-	} else
-		rq = blk_get_request(q, READ, __GFP_WAIT);
+	if (reading || writing) {
+		ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
+		if (ret)
+			goto out;
+	}
 
 	/*
 	 * fill in request structure
@@ -321,11 +324,13 @@ static int sg_io(struct file *file, request_queue_t *q,
 	}
 
 	if (blk_rq_unmap_user(rq, bio, hdr->dxfer_len))
-		return -EFAULT;
+		ret = -EFAULT;
 
 	/* may not have succeeded, but output values written to control
 	 * structure (struct sg_io_hdr).  */
-	return 0;
+out:
+	blk_put_request(rq);
+	return ret;
 }
 
 #define OMAX_SB_LEN 16          /* For backward compatibility */
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index beaa561f2ed8..6a7d926774a1 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2097,6 +2097,10 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 	if (!q)
 		return -ENXIO;
 
+	rq = blk_get_request(q, READ, GFP_KERNEL);
+	if (!rq)
+		return -ENOMEM;
+
 	cdi->last_sense = 0;
 
 	while (nframes) {
@@ -2108,9 +2112,9 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
 		len = nr * CD_FRAMESIZE_RAW;
 
-		rq = blk_rq_map_user(q, READ, ubuf, len);
-		if (IS_ERR(rq))
-			return PTR_ERR(rq);
+		ret = blk_rq_map_user(q, rq, ubuf, len);
+		if (ret)
+			break;
 
 		memset(rq->cmd, 0, sizeof(rq->cmd));
 		rq->cmd[0] = GPCMD_READ_CD;
@@ -2138,7 +2142,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 			cdi->last_sense = s->sense_key;
 		}
 
-		if (blk_rq_unmap_user(rq, bio, len))
+		if (blk_rq_unmap_user(bio, len))
 			ret = -EFAULT;
 
 		if (ret)
@@ -2149,6 +2153,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		ubuf += len;
 	}
 
+	blk_put_request(rq);
 	return ret;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 67339bc5f6bc..fc0dce078616 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -558,10 +558,9 @@ extern void blk_sync_queue(struct request_queue *q);
 extern void __blk_stop_queue(request_queue_t *q);
 extern void blk_run_queue(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
-extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
-extern int blk_rq_unmap_user(struct request *, struct bio *, unsigned int);
-extern struct request *blk_rq_map_kern(request_queue_t *, int, void *,
-					unsigned int, unsigned int);
+extern int blk_rq_map_user(request_queue_t *, struct request *, void __user *, unsigned int);
+extern int blk_rq_unmap_user(struct bio *, unsigned int);
+extern int blk_rq_map_kern(request_queue_t *, struct request *, void *, unsigned int, unsigned int);
 extern int blk_execute_rq(request_queue_t *, struct gendisk *, struct request *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
-- 
cgit v1.2.3


From f1970baf6d74e03bd32072ab453f2fc01bc1b8d3 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Mon, 20 Jun 2005 14:06:52 +0200
Subject: [PATCH] Add scatter-gather support for the block layer SG_IO

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 drivers/block/ll_rw_blk.c  |  64 +++++++++++++++++--
 drivers/block/scsi_ioctl.c |  34 ++++++----
 fs/bio.c                   | 150 +++++++++++++++++++++++++++++++--------------
 include/linux/bio.h        |   4 ++
 include/linux/blkdev.h     |   1 +
 5 files changed, 191 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 42c4f3651cf8..874e46fc3748 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -2148,6 +2148,50 @@ int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf,
 
 EXPORT_SYMBOL(blk_rq_map_user);
 
+/**
+ * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
+ * @q:		request queue where request should be inserted
+ * @rq:		request to map data to
+ * @iov:	pointer to the iovec
+ * @iov_count:	number of elements in the iovec
+ *
+ * Description:
+ *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    a kernel bounce buffer is used.
+ *
+ *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    still in process context.
+ *
+ *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
+ *    before being submitted to the device, as pages mapped may be out of
+ *    reach. It's the callers responsibility to make sure this happens. The
+ *    original bio must be passed back in to blk_rq_unmap_user() for proper
+ *    unmapping.
+ */
+int blk_rq_map_user_iov(request_queue_t *q, struct request *rq,
+			struct sg_iovec *iov, int iov_count)
+{
+	struct bio *bio;
+
+	if (!iov || iov_count <= 0)
+		return -EINVAL;
+
+	/* we don't allow misaligned data like bio_map_user() does.  If the
+	 * user is using sg, they're expected to know the alignment constraints
+	 * and respect them accordingly */
+	bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	rq->bio = rq->biotail = bio;
+	blk_rq_bio_prep(q, rq, bio);
+	rq->buffer = rq->data = NULL;
+	rq->data_len = bio->bi_size;
+	return 0;
+}
+
+EXPORT_SYMBOL(blk_rq_map_user_iov);
+
 /**
  * blk_rq_unmap_user - unmap a request with user data
  * @rq:		request to be unmapped
@@ -2207,6 +2251,19 @@ int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf,
 
 EXPORT_SYMBOL(blk_rq_map_kern);
 
+void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
+			   struct request *rq, int at_head,
+			   void (*done)(struct request *))
+{
+	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
+
+	rq->rq_disk = bd_disk;
+	rq->flags |= REQ_NOMERGE;
+	rq->end_io = done;
+	elv_add_request(q, rq, where, 1);
+	generic_unplug_device(q);
+}
+
 /**
  * blk_execute_rq - insert a request into queue for execution
  * @q:		queue to insert the request in
@@ -2224,8 +2281,6 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
 	char sense[SCSI_SENSE_BUFFERSIZE];
 	int err = 0;
 
-	rq->rq_disk = bd_disk;
-
 	/*
 	 * we need an extra reference to the request, so we can look at
 	 * it after io completion
@@ -2238,11 +2293,8 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
 		rq->sense_len = 0;
 	}
 
-	rq->flags |= REQ_NOMERGE;
 	rq->waiting = &wait;
-	rq->end_io = blk_end_sync_rq;
-	elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
-	generic_unplug_device(q);
+	blk_execute_rq_nowait(q, bd_disk, rq, 0, blk_end_sync_rq);
 	wait_for_completion(&wait);
 	rq->waiting = NULL;
 
diff --git a/drivers/block/scsi_ioctl.c b/drivers/block/scsi_ioctl.c
index 93c4ca874be3..09a7e73a0812 100644
--- a/drivers/block/scsi_ioctl.c
+++ b/drivers/block/scsi_ioctl.c
@@ -231,17 +231,11 @@ static int sg_io(struct file *file, request_queue_t *q,
 	if (verify_command(file, cmd))
 		return -EPERM;
 
-	/*
-	 * we'll do that later
-	 */
-	if (hdr->iovec_count)
-		return -EOPNOTSUPP;
-
 	if (hdr->dxfer_len > (q->max_sectors << 9))
 		return -EIO;
 
 	reading = writing = 0;
-	if (hdr->dxfer_len) {
+	if (hdr->dxfer_len)
 		switch (hdr->dxfer_direction) {
 		default:
 			return -EINVAL;
@@ -261,11 +255,29 @@ static int sg_io(struct file *file, request_queue_t *q,
 	if (!rq)
 		return -ENOMEM;
 
-	if (reading || writing) {
-		ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
-		if (ret)
+	if (hdr->iovec_count) {
+		const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
+		struct sg_iovec *iov;
+
+		iov = kmalloc(size, GFP_KERNEL);
+		if (!iov) {
+			ret = -ENOMEM;
 			goto out;
-	}
+		}
+
+		if (copy_from_user(iov, hdr->dxferp, size)) {
+			kfree(iov);
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret = blk_rq_map_user_iov(q, rq, iov, hdr->iovec_count);
+		kfree(iov);
+	} else if (hdr->dxfer_len)
+		ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
+
+	if (ret)
+		goto out;
 
 	/*
 	 * fill in request structure
diff --git a/fs/bio.c b/fs/bio.c
index c0d9140e470c..24e4045788e2 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
 
@@ -549,22 +550,34 @@ out_bmd:
 	return ERR_PTR(ret);
 }
 
-static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
-				  unsigned long uaddr, unsigned int len,
-				  int write_to_vm)
+static struct bio *__bio_map_user_iov(request_queue_t *q,
+				      struct block_device *bdev,
+				      struct sg_iovec *iov, int iov_count,
+				      int write_to_vm)
 {
-	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = uaddr >> PAGE_SHIFT;
-	const int nr_pages = end - start;
-	int ret, offset, i;
+	int i, j;
+	int nr_pages = 0;
 	struct page **pages;
 	struct bio *bio;
+	int cur_page = 0;
+	int ret, offset;
 
-	/*
-	 * transfer and buffer must be aligned to at least hardsector
-	 * size for now, in the future we can relax this restriction
-	 */
-	if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
+	for (i = 0; i < iov_count; i++) {
+		unsigned long uaddr = (unsigned long)iov[i].iov_base;
+		unsigned long len = iov[i].iov_len;
+		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned long start = uaddr >> PAGE_SHIFT;
+
+		nr_pages += end - start;
+		/*
+		 * transfer and buffer must be aligned to at least hardsector
+		 * size for now, in the future we can relax this restriction
+		 */
+		if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
+			return ERR_PTR(-EINVAL);
+	}
+
+	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
 	bio = bio_alloc(GFP_KERNEL, nr_pages);
@@ -576,42 +589,54 @@ static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
 	if (!pages)
 		goto out;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, nr_pages,
-						write_to_vm, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < nr_pages)
-		goto out;
-
-	bio->bi_bdev = bdev;
-
-	offset = uaddr & ~PAGE_MASK;
-	for (i = 0; i < nr_pages; i++) {
-		unsigned int bytes = PAGE_SIZE - offset;
-
-		if (len <= 0)
-			break;
-
-		if (bytes > len)
-			bytes = len;
+	memset(pages, 0, nr_pages * sizeof(struct page *));
+
+	for (i = 0; i < iov_count; i++) {
+		unsigned long uaddr = (unsigned long)iov[i].iov_base;
+		unsigned long len = iov[i].iov_len;
+		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned long start = uaddr >> PAGE_SHIFT;
+		const int local_nr_pages = end - start;
+		const int page_limit = cur_page + local_nr_pages;
+		
+		down_read(&current->mm->mmap_sem);
+		ret = get_user_pages(current, current->mm, uaddr,
+				     local_nr_pages,
+				     write_to_vm, 0, &pages[cur_page], NULL);
+		up_read(&current->mm->mmap_sem);
+
+		if (ret < local_nr_pages)
+			goto out_unmap;
+
+
+		offset = uaddr & ~PAGE_MASK;
+		for (j = cur_page; j < page_limit; j++) {
+			unsigned int bytes = PAGE_SIZE - offset;
+
+			if (len <= 0)
+				break;
+			
+			if (bytes > len)
+				bytes = len;
+
+			/*
+			 * sorry...
+			 */
+			if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes)
+				break;
+
+			len -= bytes;
+			offset = 0;
+		}
 
+		cur_page = j;
 		/*
-		 * sorry...
+		 * release the pages we didn't map into the bio, if any
 		 */
-		if (__bio_add_page(q, bio, pages[i], bytes, offset) < bytes)
-			break;
-
-		len -= bytes;
-		offset = 0;
+		while (j < page_limit)
+			page_cache_release(pages[j++]);
 	}
 
-	/*
-	 * release the pages we didn't map into the bio, if any
-	 */
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
-
 	kfree(pages);
 
 	/*
@@ -620,9 +645,17 @@ static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
 	if (!write_to_vm)
 		bio->bi_rw |= (1 << BIO_RW);
 
+	bio->bi_bdev = bdev;
 	bio->bi_flags |= (1 << BIO_USER_MAPPED);
 	return bio;
-out:
+
+ out_unmap:
+	for (i = 0; i < nr_pages; i++) {
+		if(!pages[i])
+			break;
+		page_cache_release(pages[i]);
+	}
+ out:
 	kfree(pages);
 	bio_put(bio);
 	return ERR_PTR(ret);
@@ -641,10 +674,34 @@ out:
  */
 struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
 			 unsigned long uaddr, unsigned int len, int write_to_vm)
+{
+	struct sg_iovec iov;
+
+	iov.iov_base = (__user void *)uaddr;
+	iov.iov_len = len;
+
+	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+}
+
+/**
+ *	bio_map_user_iov - map user sg_iovec table into bio
+ *	@q: the request_queue_t for the bio
+ *	@bdev: destination block device
+ *	@iov:	the iovec.
+ *	@iov_count: number of elements in the iovec
+ *	@write_to_vm: bool indicating writing to pages or not
+ *
+ *	Map the user space address into a bio suitable for io to a block
+ *	device. Returns an error pointer in case of error.
+ */
+struct bio *bio_map_user_iov(request_queue_t *q, struct block_device *bdev,
+			     struct sg_iovec *iov, int iov_count,
+			     int write_to_vm)
 {
 	struct bio *bio;
+	int len = 0, i;
 
-	bio = __bio_map_user(q, bdev, uaddr, len, write_to_vm);
+	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
 
 	if (IS_ERR(bio))
 		return bio;
@@ -657,6 +714,9 @@ struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
 	 */
 	bio_get(bio);
 
+	for (i = 0; i < iov_count; i++)
+		len += iov[i].iov_len;
+
 	if (bio->bi_size == len)
 		return bio;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1dd2bc2e84ae..ebcd03ba2e20 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -281,6 +281,10 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_get_nr_vecs(struct block_device *);
 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
 				unsigned long, unsigned int, int);
+struct sg_iovec;
+extern struct bio *bio_map_user_iov(struct request_queue *,
+				    struct block_device *,
+				    struct sg_iovec *, int, int);
 extern void bio_unmap_user(struct bio *);
 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
 				unsigned int);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fc0dce078616..0430ea3e5f2e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -561,6 +561,7 @@ extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern int blk_rq_map_user(request_queue_t *, struct request *, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct bio *, unsigned int);
 extern int blk_rq_map_kern(request_queue_t *, struct request *, void *, unsigned int, unsigned int);
+extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_iovec *, int);
 extern int blk_execute_rq(request_queue_t *, struct gendisk *, struct request *);
 
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
-- 
cgit v1.2.3


From 994ca9a19616f0d4161a9e825f0835925d522426 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Mon, 20 Jun 2005 14:11:09 +0200
Subject: [PATCH] update blk_execute_rq to take an at_head parameter

Original From: Mike Christie <michaelc@cs.wisc.edu>

Modified to split out block changes (this patch) and SCSI pieces.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/block/ll_rw_blk.c  | 7 ++++---
 drivers/block/scsi_ioctl.c | 6 +++---
 drivers/cdrom/cdrom.c      | 2 +-
 drivers/ide/ide-disk.c     | 2 +-
 include/linux/blkdev.h     | 4 ++--
 5 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 874e46fc3748..d260a2ce9a70 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -2269,13 +2269,14 @@ void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
  * @q:		queue to insert the request in
  * @bd_disk:	matching gendisk
  * @rq:		request to insert
+ * @at_head:    insert request at head or tail of queue
  *
  * Description:
  *    Insert a fully prepared request at the back of the io scheduler queue
  *    for execution.
  */
 int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
-		   struct request *rq)
+		   struct request *rq, int at_head)
 {
 	DECLARE_COMPLETION(wait);
 	char sense[SCSI_SENSE_BUFFERSIZE];
@@ -2294,7 +2295,7 @@ int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
 	}
 
 	rq->waiting = &wait;
-	blk_execute_rq_nowait(q, bd_disk, rq, 0, blk_end_sync_rq);
+	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
 	wait_for_completion(&wait);
 	rq->waiting = NULL;
 
@@ -2361,7 +2362,7 @@ int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
 	rq->data_len = 0;
 	rq->timeout = 60 * HZ;
 
-	ret = blk_execute_rq(q, disk, rq);
+	ret = blk_execute_rq(q, disk, rq, 0);
 
 	if (ret && error_sector)
 		*error_sector = rq->sector;
diff --git a/drivers/block/scsi_ioctl.c b/drivers/block/scsi_ioctl.c
index 7717b76f7f20..abb2df249fd3 100644
--- a/drivers/block/scsi_ioctl.c
+++ b/drivers/block/scsi_ioctl.c
@@ -308,7 +308,7 @@ static int sg_io(struct file *file, request_queue_t *q,
 	 * (if he doesn't check that is his problem).
 	 * N.B. a non-zero SCSI status is _not_ necessarily an error.
 	 */
-	blk_execute_rq(q, bd_disk, rq);
+	blk_execute_rq(q, bd_disk, rq, 0);
 
 	/* write to all output members */
 	hdr->status = 0xff & rq->errors;
@@ -420,7 +420,7 @@ static int sg_scsi_ioctl(struct file *file, request_queue_t *q,
 	rq->data_len = bytes;
 	rq->flags |= REQ_BLOCK_PC;
 
-	blk_execute_rq(q, bd_disk, rq);
+	blk_execute_rq(q, bd_disk, rq, 0);
 	err = rq->errors & 0xff;	/* only 8 bit SCSI status */
 	if (err) {
 		if (rq->sense_len && rq->sense) {
@@ -573,7 +573,7 @@ int scsi_cmd_ioctl(struct file *file, struct gendisk *bd_disk, unsigned int cmd,
 			rq->cmd[0] = GPCMD_START_STOP_UNIT;
 			rq->cmd[4] = 0x02 + (close != 0);
 			rq->cmd_len = 6;
-			err = blk_execute_rq(q, bd_disk, rq);
+			err = blk_execute_rq(q, bd_disk, rq, 0);
 			blk_put_request(rq);
 			break;
 		default:
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 6a7d926774a1..153960348414 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2136,7 +2136,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		if (rq->bio)
 			blk_queue_bounce(q, &rq->bio);
 
-		if (blk_execute_rq(q, cdi->disk, rq)) {
+		if (blk_execute_rq(q, cdi->disk, rq, 0)) {
 			struct request_sense *s = rq->sense;
 			ret = -EIO;
 			cdi->last_sense = s->sense_key;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 3302cd8eab4c..9176da7a9858 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -750,7 +750,7 @@ static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
 
 	idedisk_prepare_flush(q, rq);
 
-	ret = blk_execute_rq(q, disk, rq);
+	ret = blk_execute_rq(q, disk, rq, 0);
 
 	/*
 	 * if we failed and caller wants error offset, get it
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0430ea3e5f2e..a48dc12c6699 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -562,8 +562,8 @@ extern int blk_rq_map_user(request_queue_t *, struct request *, void __user *, u
 extern int blk_rq_unmap_user(struct bio *, unsigned int);
 extern int blk_rq_map_kern(request_queue_t *, struct request *, void *, unsigned int, unsigned int);
 extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_iovec *, int);
-extern int blk_execute_rq(request_queue_t *, struct gendisk *, struct request *);
-
+extern int blk_execute_rq(request_queue_t *, struct gendisk *,
+			  struct request *, int);
 static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 {
 	return bdev->bd_disk->queue;
-- 
cgit v1.2.3


From d0a7e574007fd547d72ec693bfa35778623d0738 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Sun, 14 Aug 2005 17:09:01 -0500
Subject: [SCSI] correct transport class abstraction to work outside SCSI

I recently tried to construct a totally generic transport class and
found there were certain features missing from the current abstract
transport class.  Most notable is that you have to hang the data on the
class_device but most of the API is framed in terms of the generic
device, not the class_device.

These changes are two fold

- Provide the class_device to all of the setup and configure APIs
- Provide and extra API to take the device and the attribute class and
  return the corresponding class_device

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/base/attribute_container.c  | 38 +++++++++++++++++++++++++++++++++++++
 drivers/base/transport_class.c      | 17 +++++++++++------
 drivers/scsi/scsi_transport_fc.c    |  6 ++++--
 drivers/scsi/scsi_transport_spi.c   | 11 ++++++++---
 include/linux/attribute_container.h |  9 +++------
 include/linux/transport_class.h     | 11 ++++++++---
 6 files changed, 72 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/attribute_container.c b/drivers/base/attribute_container.c
index ec615d854be9..62c093db11e6 100644
--- a/drivers/base/attribute_container.c
+++ b/drivers/base/attribute_container.c
@@ -58,6 +58,7 @@ attribute_container_register(struct attribute_container *cont)
 {
 	INIT_LIST_HEAD(&cont->node);
 	INIT_LIST_HEAD(&cont->containers);
+	spin_lock_init(&cont->containers_lock);
 		
 	down(&attribute_container_mutex);
 	list_add_tail(&cont->node, &attribute_container_list);
@@ -77,11 +78,13 @@ attribute_container_unregister(struct attribute_container *cont)
 {
 	int retval = -EBUSY;
 	down(&attribute_container_mutex);
+	spin_lock(&cont->containers_lock);
 	if (!list_empty(&cont->containers))
 		goto out;
 	retval = 0;
 	list_del(&cont->node);
  out:
+	spin_unlock(&cont->containers_lock);
 	up(&attribute_container_mutex);
 	return retval;
 		
@@ -151,7 +154,9 @@ attribute_container_add_device(struct device *dev,
 			fn(cont, dev, &ic->classdev);
 		else
 			attribute_container_add_class_device(&ic->classdev);
+		spin_lock(&cont->containers_lock);
 		list_add_tail(&ic->node, &cont->containers);
+		spin_unlock(&cont->containers_lock);
 	}
 	up(&attribute_container_mutex);
 }
@@ -189,6 +194,7 @@ attribute_container_remove_device(struct device *dev,
 
 		if (!cont->match(cont, dev))
 			continue;
+		spin_lock(&cont->containers_lock);
 		list_for_each_entry_safe(ic, tmp, &cont->containers, node) {
 			if (dev != ic->classdev.dev)
 				continue;
@@ -200,6 +206,7 @@ attribute_container_remove_device(struct device *dev,
 				class_device_unregister(&ic->classdev);
 			}
 		}
+		spin_unlock(&cont->containers_lock);
 	}
 	up(&attribute_container_mutex);
 }
@@ -230,10 +237,12 @@ attribute_container_device_trigger(struct device *dev,
 		if (!cont->match(cont, dev))
 			continue;
 
+		spin_lock(&cont->containers_lock);
 		list_for_each_entry_safe(ic, tmp, &cont->containers, node) {
 			if (dev == ic->classdev.dev)
 				fn(cont, dev, &ic->classdev);
 		}
+		spin_unlock(&cont->containers_lock);
 	}
 	up(&attribute_container_mutex);
 }
@@ -368,6 +377,35 @@ attribute_container_class_device_del(struct class_device *classdev)
 }
 EXPORT_SYMBOL_GPL(attribute_container_class_device_del);
 
+/**
+ * attribute_container_find_class_device - find the corresponding class_device
+ *
+ * @cont:	the container
+ * @dev:	the generic device
+ *
+ * Looks up the device in the container's list of class devices and returns
+ * the corresponding class_device.
+ */
+struct class_device *
+attribute_container_find_class_device(struct attribute_container *cont,
+				      struct device *dev)
+{
+	struct class_device *cdev = NULL;
+	struct internal_container *ic;
+
+	spin_lock(&cont->containers_lock);
+	list_for_each_entry(ic, &cont->containers, node) {
+		if (ic->classdev.dev == dev) {
+			cdev = &ic->classdev;
+			break;
+		}
+	}
+	spin_unlock(&cont->containers_lock);
+
+	return cdev;
+}
+EXPORT_SYMBOL_GPL(attribute_container_find_class_device);
+
 int __init
 attribute_container_init(void)
 {
diff --git a/drivers/base/transport_class.c b/drivers/base/transport_class.c
index 6c2b447a3336..4fb4c5de8470 100644
--- a/drivers/base/transport_class.c
+++ b/drivers/base/transport_class.c
@@ -64,7 +64,9 @@ void transport_class_unregister(struct transport_class *tclass)
 }
 EXPORT_SYMBOL_GPL(transport_class_unregister);
 
-static int anon_transport_dummy_function(struct device *dev)
+static int anon_transport_dummy_function(struct transport_container *tc,
+					 struct device *dev,
+					 struct class_device *cdev)
 {
 	/* do nothing */
 	return 0;
@@ -115,9 +117,10 @@ static int transport_setup_classdev(struct attribute_container *cont,
 				    struct class_device *classdev)
 {
 	struct transport_class *tclass = class_to_transport_class(cont->class);
+	struct transport_container *tcont = attribute_container_to_transport_container(cont);
 
 	if (tclass->setup)
-		tclass->setup(dev);
+		tclass->setup(tcont, dev, classdev);
 
 	return 0;
 }
@@ -178,12 +181,14 @@ void transport_add_device(struct device *dev)
 EXPORT_SYMBOL_GPL(transport_add_device);
 
 static int transport_configure(struct attribute_container *cont,
-			       struct device *dev)
+			       struct device *dev,
+			       struct class_device *cdev)
 {
 	struct transport_class *tclass = class_to_transport_class(cont->class);
+	struct transport_container *tcont = attribute_container_to_transport_container(cont);
 
 	if (tclass->configure)
-		tclass->configure(dev);
+		tclass->configure(tcont, dev, cdev);
 
 	return 0;
 }
@@ -202,7 +207,7 @@ static int transport_configure(struct attribute_container *cont,
  */
 void transport_configure_device(struct device *dev)
 {
-	attribute_container_trigger(dev, transport_configure);
+	attribute_container_device_trigger(dev, transport_configure);
 }
 EXPORT_SYMBOL_GPL(transport_configure_device);
 
@@ -215,7 +220,7 @@ static int transport_remove_classdev(struct attribute_container *cont,
 	struct transport_class *tclass = class_to_transport_class(cont->class);
 
 	if (tclass->remove)
-		tclass->remove(dev);
+		tclass->remove(tcont, dev, classdev);
 
 	if (tclass->remove != anon_transport_dummy_function) {
 		if (tcont->statistics)
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 35d1c1e8e345..96243c7fe110 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -252,7 +252,8 @@ struct fc_internal {
 
 #define to_fc_internal(tmpl)	container_of(tmpl, struct fc_internal, t)
 
-static int fc_target_setup(struct device *dev)
+static int fc_target_setup(struct transport_container *tc, struct device *dev,
+			   struct class_device *cdev)
 {
 	struct scsi_target *starget = to_scsi_target(dev);
 	struct fc_rport *rport = starget_to_rport(starget);
@@ -281,7 +282,8 @@ static DECLARE_TRANSPORT_CLASS(fc_transport_class,
 			       NULL,
 			       NULL);
 
-static int fc_host_setup(struct device *dev)
+static int fc_host_setup(struct transport_container *tc, struct device *dev,
+			 struct class_device *cdev)
 {
 	struct Scsi_Host *shost = dev_to_shost(dev);
 
diff --git a/drivers/scsi/scsi_transport_spi.c b/drivers/scsi/scsi_transport_spi.c
index 02134fce2174..89f6b7feb9c2 100644
--- a/drivers/scsi/scsi_transport_spi.c
+++ b/drivers/scsi/scsi_transport_spi.c
@@ -162,7 +162,8 @@ static inline enum spi_signal_type spi_signal_to_value(const char *name)
 	return SPI_SIGNAL_UNKNOWN;
 }
 
-static int spi_host_setup(struct device *dev)
+static int spi_host_setup(struct transport_container *tc, struct device *dev,
+			  struct class_device *cdev)
 {
 	struct Scsi_Host *shost = dev_to_shost(dev);
 
@@ -196,7 +197,9 @@ static int spi_host_match(struct attribute_container *cont,
 	return &i->t.host_attrs.ac == cont;
 }
 
-static int spi_device_configure(struct device *dev)
+static int spi_device_configure(struct transport_container *tc,
+				struct device *dev,
+				struct class_device *cdev)
 {
 	struct scsi_device *sdev = to_scsi_device(dev);
 	struct scsi_target *starget = sdev->sdev_target;
@@ -214,7 +217,9 @@ static int spi_device_configure(struct device *dev)
 	return 0;
 }
 
-static int spi_setup_transport_attrs(struct device *dev)
+static int spi_setup_transport_attrs(struct transport_container *tc,
+				     struct device *dev,
+				     struct class_device *cdev)
 {
 	struct scsi_target *starget = to_scsi_target(dev);
 
diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h
index af1010b6dab7..f54b05b052b3 100644
--- a/include/linux/attribute_container.h
+++ b/include/linux/attribute_container.h
@@ -11,10 +11,12 @@
 
 #include <linux/device.h>
 #include <linux/list.h>
+#include <linux/spinlock.h>
 
 struct attribute_container {
 	struct list_head	node;
 	struct list_head	containers;
+	spinlock_t		containers_lock;
 	struct class		*class;
 	struct class_device_attribute **attrs;
 	int (*match)(struct attribute_container *, struct device *);
@@ -62,12 +64,7 @@ int attribute_container_add_class_device_adapter(struct attribute_container *con
 						 struct class_device *classdev);
 void attribute_container_remove_attrs(struct class_device *classdev);
 void attribute_container_class_device_del(struct class_device *classdev);
-
-
-
-
-
-
+struct class_device *attribute_container_find_class_device(struct attribute_container *, struct device *);
 struct class_device_attribute **attribute_container_classdev_to_attrs(const struct class_device *classdev);
 
 #endif
diff --git a/include/linux/transport_class.h b/include/linux/transport_class.h
index 87d98d1faefb..1d6cc22e5f42 100644
--- a/include/linux/transport_class.h
+++ b/include/linux/transport_class.h
@@ -12,11 +12,16 @@
 #include <linux/device.h>
 #include <linux/attribute_container.h>
 
+struct transport_container;
+
 struct transport_class {
 	struct class class;
-	int (*setup)(struct device *);
-	int (*configure)(struct device *);
-	int (*remove)(struct device *);
+	int (*setup)(struct transport_container *, struct device *,
+		     struct class_device *);
+	int (*configure)(struct transport_container *, struct device *,
+			 struct class_device *);
+	int (*remove)(struct transport_container *, struct device *,
+		      struct class_device *);
 };
 
 #define DECLARE_TRANSPORT_CLASS(cls, nm, su, rm, cfg)			\
-- 
cgit v1.2.3


From caca1779870b1bcc0fb07e48ebd2403901f356b8 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Tue, 16 Aug 2005 17:26:10 -0500
Subject: [SCSI] add missing attribute container function prototype

attribute_container_classdev_to_container is an exported function of the
attribute_container.c file.  However, there's no prototype for it.  Now
I actually want to use it, so add one.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 include/linux/attribute_container.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h
index f54b05b052b3..ee83fe64a102 100644
--- a/include/linux/attribute_container.h
+++ b/include/linux/attribute_container.h
@@ -64,6 +64,7 @@ int attribute_container_add_class_device_adapter(struct attribute_container *con
 						 struct class_device *classdev);
 void attribute_container_remove_attrs(struct class_device *classdev);
 void attribute_container_class_device_del(struct class_device *classdev);
+struct attribute_container *attribute_container_classdev_to_container(struct class_device *);
 struct class_device *attribute_container_find_class_device(struct attribute_container *, struct device *);
 struct class_device_attribute **attribute_container_classdev_to_attrs(const struct class_device *classdev);
 
-- 
cgit v1.2.3


From 53c165e0a6c8a4ff7df316557528fa7a52d20711 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Mon, 22 Aug 2005 10:06:19 -0500
Subject: [SCSI] correct attribute_container list usage

One of the changes in the attribute_container code in the scsi-misc tree
was to add a lock to protect the list of devices per container.  This,
unfortunately, leads to potential scheduling while atomic problems if
there's a sleep in the function called by a trigger.

The correct solution is to use the kernel klist infrastructure instead
which allows lockless traversal of a list.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/base/attribute_container.c  | 51 +++++++++++++++++++++----------------
 include/linux/attribute_container.h |  4 +--
 2 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/attribute_container.c b/drivers/base/attribute_container.c
index ebcae5c34133..6c0f49340eb2 100644
--- a/drivers/base/attribute_container.c
+++ b/drivers/base/attribute_container.c
@@ -22,7 +22,7 @@
 /* This is a private structure used to tie the classdev and the
  * container .. it should never be visible outside this file */
 struct internal_container {
-	struct list_head node;
+	struct klist_node node;
 	struct attribute_container *cont;
 	struct class_device classdev;
 };
@@ -57,8 +57,7 @@ int
 attribute_container_register(struct attribute_container *cont)
 {
 	INIT_LIST_HEAD(&cont->node);
-	INIT_LIST_HEAD(&cont->containers);
-	spin_lock_init(&cont->containers_lock);
+	klist_init(&cont->containers);
 		
 	down(&attribute_container_mutex);
 	list_add_tail(&cont->node, &attribute_container_list);
@@ -78,13 +77,13 @@ attribute_container_unregister(struct attribute_container *cont)
 {
 	int retval = -EBUSY;
 	down(&attribute_container_mutex);
-	spin_lock(&cont->containers_lock);
-	if (!list_empty(&cont->containers))
+	spin_lock(&cont->containers.k_lock);
+	if (!list_empty(&cont->containers.k_list))
 		goto out;
 	retval = 0;
 	list_del(&cont->node);
  out:
-	spin_unlock(&cont->containers_lock);
+	spin_unlock(&cont->containers.k_lock);
 	up(&attribute_container_mutex);
 	return retval;
 		
@@ -143,7 +142,6 @@ attribute_container_add_device(struct device *dev,
 			continue;
 		}
 		memset(ic, 0, sizeof(struct internal_container));
-		INIT_LIST_HEAD(&ic->node);
 		ic->cont = cont;
 		class_device_initialize(&ic->classdev);
 		ic->classdev.dev = get_device(dev);
@@ -154,13 +152,22 @@ attribute_container_add_device(struct device *dev,
 			fn(cont, dev, &ic->classdev);
 		else
 			attribute_container_add_class_device(&ic->classdev);
-		spin_lock(&cont->containers_lock);
-		list_add_tail(&ic->node, &cont->containers);
-		spin_unlock(&cont->containers_lock);
+		klist_add_tail(&ic->node, &cont->containers);
 	}
 	up(&attribute_container_mutex);
 }
 
+/* FIXME: can't break out of this unless klist_iter_exit is also
+ * called before doing the break
+ */
+#define klist_for_each_entry(pos, head, member, iter) \
+	for (klist_iter_init(head, iter); (pos = ({ \
+		struct klist_node *n = klist_next(iter); \
+		n ? ({ klist_iter_exit(iter) ; NULL; }) : \
+			container_of(n, typeof(*pos), member);\
+	}) ) != NULL; )
+			
+
 /**
  * attribute_container_remove_device - make device eligible for removal.
  *
@@ -187,18 +194,19 @@ attribute_container_remove_device(struct device *dev,
 
 	down(&attribute_container_mutex);
 	list_for_each_entry(cont, &attribute_container_list, node) {
-		struct internal_container *ic, *tmp;
+		struct internal_container *ic;
+		struct klist_iter iter;
 
 		if (attribute_container_no_classdevs(cont))
 			continue;
 
 		if (!cont->match(cont, dev))
 			continue;
-		spin_lock(&cont->containers_lock);
-		list_for_each_entry_safe(ic, tmp, &cont->containers, node) {
+
+		klist_for_each_entry(ic, &cont->containers, node, &iter) {
 			if (dev != ic->classdev.dev)
 				continue;
-			list_del(&ic->node);
+			klist_remove(&ic->node);
 			if (fn)
 				fn(cont, dev, &ic->classdev);
 			else {
@@ -206,7 +214,6 @@ attribute_container_remove_device(struct device *dev,
 				class_device_unregister(&ic->classdev);
 			}
 		}
-		spin_unlock(&cont->containers_lock);
 	}
 	up(&attribute_container_mutex);
 }
@@ -232,7 +239,8 @@ attribute_container_device_trigger(struct device *dev,
 
 	down(&attribute_container_mutex);
 	list_for_each_entry(cont, &attribute_container_list, node) {
-		struct internal_container *ic, *tmp;
+		struct internal_container *ic;
+		struct klist_iter iter;
 
 		if (!cont->match(cont, dev))
 			continue;
@@ -242,12 +250,10 @@ attribute_container_device_trigger(struct device *dev,
 			continue;
 		}
 
-		spin_lock(&cont->containers_lock);
-		list_for_each_entry_safe(ic, tmp, &cont->containers, node) {
+		klist_for_each_entry(ic, &cont->containers, node, &iter) {
 			if (dev == ic->classdev.dev)
 				fn(cont, dev, &ic->classdev);
 		}
-		spin_unlock(&cont->containers_lock);
 	}
 	up(&attribute_container_mutex);
 }
@@ -397,15 +403,16 @@ attribute_container_find_class_device(struct attribute_container *cont,
 {
 	struct class_device *cdev = NULL;
 	struct internal_container *ic;
+	struct klist_iter iter;
 
-	spin_lock(&cont->containers_lock);
-	list_for_each_entry(ic, &cont->containers, node) {
+	klist_for_each_entry(ic, &cont->containers, node, &iter) {
 		if (ic->classdev.dev == dev) {
 			cdev = &ic->classdev;
+			/* FIXME: must exit iterator then break */
+			klist_iter_exit(&iter);
 			break;
 		}
 	}
-	spin_unlock(&cont->containers_lock);
 
 	return cdev;
 }
diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h
index ee83fe64a102..93bfb0beb62a 100644
--- a/include/linux/attribute_container.h
+++ b/include/linux/attribute_container.h
@@ -11,12 +11,12 @@
 
 #include <linux/device.h>
 #include <linux/list.h>
+#include <linux/klist.h>
 #include <linux/spinlock.h>
 
 struct attribute_container {
 	struct list_head	node;
-	struct list_head	containers;
-	spinlock_t		containers_lock;
+	struct klist		containers;
 	struct class		*class;
 	struct class_device_attribute **attrs;
 	int (*match)(struct attribute_container *, struct device *);
-- 
cgit v1.2.3


From 61a7afa2c476a3be261cf88a95b0dea0c3bd29d4 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Tue, 16 Aug 2005 18:27:34 -0500
Subject: [SCSI] embryonic RAID class

The idea behind a RAID class is to provide a uniform interface to all
RAID subsystems (both hardware and software) in the kernel.

To do that, I've made this class a transport class that's entirely
subsystem independent (although the matching routines have to match per
subsystem, as you'll see looking at the code).  I put it in the scsi
subdirectory purely because I needed somewhere to play with it, but it's
not a scsi specific module.

I used a fusion raid card as the test bed for this; with that kind of
card, this is the type of class output you get:

jejb@titanic> ls -l /sys/class/raid_devices/20\:0\:0\:0/
total 0
lrwxrwxrwx  1 root root     0 Aug 16 17:21 component-0 -> ../../../devices/pci0000:80/0000:80:04.0/host20/target20:1:0/20:1:0:0/
lrwxrwxrwx  1 root root     0 Aug 16 17:21 component-1 -> ../../../devices/pci0000:80/0000:80:04.0/host20/target20:1:1/20:1:1:0/
lrwxrwxrwx  1 root root     0 Aug 16 17:21 device -> ../../../devices/pci0000:80/0000:80:04.0/host20/target20:0:0/20:0:0:0/
-r--r--r--  1 root root 16384 Aug 16 17:21 level
-r--r--r--  1 root root 16384 Aug 16 17:21 resync
-r--r--r--  1 root root 16384 Aug 16 17:21 state

So it's really simple: for a SCSI device representing a hardware raid,
it shows the raid level, the array state, the resync % complete (if the
state is resyncing) and the underlying components of the RAID (these are
exposed in fusion on the virtual channel 1).

As you can see, this type of information can be exported by almost
anything, including software raid.

The more difficult trick, of course, is going to be getting it to
perform configuration type actions with writable attributes.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/Kconfig       |   6 ++
 drivers/scsi/Makefile      |   2 +
 drivers/scsi/raid_class.c  | 250 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/raid_class.h |  59 +++++++++++
 4 files changed, 317 insertions(+)
 create mode 100644 drivers/scsi/raid_class.c
 create mode 100644 include/linux/raid_class.h

(limited to 'include/linux')

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 96df148ed969..68adc3cc8ad2 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1,5 +1,11 @@
 menu "SCSI device support"
 
+config RAID_ATTRS
+	tristate "RAID Transport Class"
+	default n
+	---help---
+	  Provides RAID
+
 config SCSI
 	tristate "SCSI device support"
 	---help---
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 3746fb9fa2f5..85f9e6bb34b9 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -22,6 +22,8 @@ subdir-$(CONFIG_PCMCIA)		+= pcmcia
 
 obj-$(CONFIG_SCSI)		+= scsi_mod.o
 
+obj-$(CONFIG_RAID_ATTRS)	+= raid_class.o
+
 # --- NOTE ORDERING HERE ---
 # For kernel non-modular link, transport attributes need to
 # be initialised before drivers
diff --git a/drivers/scsi/raid_class.c b/drivers/scsi/raid_class.c
new file mode 100644
index 000000000000..f1ea5027865f
--- /dev/null
+++ b/drivers/scsi/raid_class.c
@@ -0,0 +1,250 @@
+/*
+ * RAID Attributes
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/raid_class.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_host.h>
+
+#define RAID_NUM_ATTRS	3
+
+struct raid_internal {
+	struct raid_template r;
+	struct raid_function_template *f;
+	/* The actual attributes */
+	struct class_device_attribute private_attrs[RAID_NUM_ATTRS];
+	/* The array of null terminated pointers to attributes 
+	 * needed by scsi_sysfs.c */
+	struct class_device_attribute *attrs[RAID_NUM_ATTRS + 1];
+};
+
+struct raid_component {
+	struct list_head node;
+	struct device *dev;
+	int num;
+};
+
+#define to_raid_internal(tmpl)	container_of(tmpl, struct raid_internal, r)
+
+#define tc_to_raid_internal(tcont) ({					\
+	struct raid_template *r =					\
+		container_of(tcont, struct raid_template, raid_attrs);	\
+	to_raid_internal(r);						\
+})
+
+#define ac_to_raid_internal(acont) ({					\
+	struct transport_container *tc =				\
+		container_of(acont, struct transport_container, ac);	\
+	tc_to_raid_internal(tc);					\
+})
+
+#define class_device_to_raid_internal(cdev) ({				\
+	struct attribute_container *ac =				\
+		attribute_container_classdev_to_container(cdev);	\
+	ac_to_raid_internal(ac);					\
+})
+	
+
+static int raid_match(struct attribute_container *cont, struct device *dev)
+{
+	/* We have to look for every subsystem that could house
+	 * emulated RAID devices, so start with SCSI */
+	struct raid_internal *i = ac_to_raid_internal(cont);
+
+	if (scsi_is_sdev_device(dev)) {
+		struct scsi_device *sdev = to_scsi_device(dev);
+
+		if (i->f->cookie != sdev->host->hostt)
+			return 0;
+
+		return i->f->is_raid(dev);
+	}
+	/* FIXME: look at other subsystems too */
+	return 0;
+}
+
+static int raid_setup(struct transport_container *tc, struct device *dev,
+		       struct class_device *cdev)
+{
+	struct raid_data *rd;
+
+	BUG_ON(class_get_devdata(cdev));
+
+	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return -ENOMEM;
+
+	memset(rd, 0, sizeof(*rd));
+	INIT_LIST_HEAD(&rd->component_list);
+	class_set_devdata(cdev, rd);
+		
+	return 0;
+}
+
+static int raid_remove(struct transport_container *tc, struct device *dev,
+		       struct class_device *cdev)
+{
+	struct raid_data *rd = class_get_devdata(cdev);
+	struct raid_component *rc, *next;
+	class_set_devdata(cdev, NULL);
+	list_for_each_entry_safe(rc, next, &rd->component_list, node) {
+		char buf[40];
+		snprintf(buf, sizeof(buf), "component-%d", rc->num);
+		list_del(&rc->node);
+		sysfs_remove_link(&cdev->kobj, buf);
+		kfree(rc);
+	}
+	kfree(class_get_devdata(cdev));
+	return 0;
+}
+
+static DECLARE_TRANSPORT_CLASS(raid_class,
+			       "raid_devices",
+			       raid_setup,
+			       raid_remove,
+			       NULL);
+
+static struct {
+	enum raid_state	value;
+	char		*name;
+} raid_states[] = {
+	{ RAID_ACTIVE, "active" },
+	{ RAID_DEGRADED, "degraded" },
+	{ RAID_RESYNCING, "resyncing" },
+	{ RAID_OFFLINE, "offline" },
+};
+
+static const char *raid_state_name(enum raid_state state)
+{
+	int i;
+	char *name = NULL;
+
+	for (i = 0; i < sizeof(raid_states)/sizeof(raid_states[0]); i++) {
+		if (raid_states[i].value == state) {
+			name = raid_states[i].name;
+			break;
+		}
+	}
+	return name;
+}
+
+
+#define raid_attr_show_internal(attr, fmt, var, code)			\
+static ssize_t raid_show_##attr(struct class_device *cdev, char *buf)	\
+{									\
+	struct raid_data *rd = class_get_devdata(cdev);			\
+	code								\
+	return snprintf(buf, 20, #fmt "\n", var);			\
+}
+
+#define raid_attr_ro_states(attr, states, code)				\
+raid_attr_show_internal(attr, %s, name,					\
+	const char *name;						\
+	code								\
+	name = raid_##states##_name(rd->attr);				\
+)									\
+static CLASS_DEVICE_ATTR(attr, S_IRUGO, raid_show_##attr, NULL)
+
+
+#define raid_attr_ro_internal(attr, code)				\
+raid_attr_show_internal(attr, %d, rd->attr, code)			\
+static CLASS_DEVICE_ATTR(attr, S_IRUGO, raid_show_##attr, NULL)
+
+#define ATTR_CODE(attr)							\
+	struct raid_internal *i = class_device_to_raid_internal(cdev);	\
+	if (i->f->get_##attr)						\
+		i->f->get_##attr(cdev->dev);
+
+#define raid_attr_ro(attr)	raid_attr_ro_internal(attr, )
+#define raid_attr_ro_fn(attr)	raid_attr_ro_internal(attr, ATTR_CODE(attr))
+#define raid_attr_ro_state(attr)	raid_attr_ro_states(attr, attr, ATTR_CODE(attr))
+
+raid_attr_ro(level);
+raid_attr_ro_fn(resync);
+raid_attr_ro_state(state);
+
+void raid_component_add(struct raid_template *r,struct device *raid_dev,
+			struct device *component_dev)
+{
+	struct class_device *cdev =
+		attribute_container_find_class_device(&r->raid_attrs.ac,
+						      raid_dev);
+	struct raid_component *rc;
+	struct raid_data *rd = class_get_devdata(cdev);
+	char buf[40];
+
+	rc = kmalloc(sizeof(*rc), GFP_KERNEL);
+	if (!rc)
+		return;
+
+	INIT_LIST_HEAD(&rc->node);
+	rc->dev = component_dev;
+	rc->num = rd->component_count++;
+
+	snprintf(buf, sizeof(buf), "component-%d", rc->num);
+	list_add_tail(&rc->node, &rd->component_list);
+	sysfs_create_link(&cdev->kobj, &component_dev->kobj, buf);
+}
+EXPORT_SYMBOL(raid_component_add);
+
+struct raid_template *
+raid_class_attach(struct raid_function_template *ft)
+{
+	struct raid_internal *i = kmalloc(sizeof(struct raid_internal),
+					  GFP_KERNEL);
+	int count = 0;
+
+	if (unlikely(!i))
+		return NULL;
+
+	memset(i, 0, sizeof(*i));
+
+	i->f = ft;
+
+	i->r.raid_attrs.ac.class = &raid_class.class;
+	i->r.raid_attrs.ac.match = raid_match;
+	i->r.raid_attrs.ac.attrs = &i->attrs[0];
+
+	attribute_container_register(&i->r.raid_attrs.ac);
+
+	i->attrs[count++] = &class_device_attr_level;
+	i->attrs[count++] = &class_device_attr_resync;
+	i->attrs[count++] = &class_device_attr_state;
+
+	i->attrs[count] = NULL;
+	BUG_ON(count > RAID_NUM_ATTRS);
+
+	return &i->r;
+}
+EXPORT_SYMBOL(raid_class_attach);
+
+void
+raid_class_release(struct raid_template *r)
+{
+	struct raid_internal *i = to_raid_internal(r);
+
+	attribute_container_unregister(&i->r.raid_attrs.ac);
+
+	kfree(i);
+}
+EXPORT_SYMBOL(raid_class_release);
+
+static __init int raid_init(void)
+{
+	return transport_class_register(&raid_class);
+}
+
+static __exit void raid_exit(void)
+{
+	transport_class_unregister(&raid_class);
+}
+
+MODULE_AUTHOR("James Bottomley");
+MODULE_DESCRIPTION("RAID device class");
+MODULE_LICENSE("GPL");
+
+module_init(raid_init);
+module_exit(raid_exit);
+
diff --git a/include/linux/raid_class.h b/include/linux/raid_class.h
new file mode 100644
index 000000000000..a71123c28272
--- /dev/null
+++ b/include/linux/raid_class.h
@@ -0,0 +1,59 @@
+/*
+ */
+#include <linux/transport_class.h>
+
+struct raid_template {
+	struct transport_container raid_attrs;
+};
+
+struct raid_function_template {
+	void *cookie;
+	int (*is_raid)(struct device *);
+	void (*get_resync)(struct device *);
+	void (*get_state)(struct device *);
+};
+
+enum raid_state {
+	RAID_ACTIVE = 1,
+	RAID_DEGRADED,
+	RAID_RESYNCING,
+	RAID_OFFLINE,
+};
+
+struct raid_data {
+	struct list_head component_list;
+	int component_count;
+	int level;
+	enum raid_state state;
+	int resync;
+};
+
+#define DEFINE_RAID_ATTRIBUTE(type, attr)				      \
+static inline void							      \
+raid_set_##attr(struct raid_template *r, struct device *dev, type value) {    \
+	struct class_device *cdev =					      \
+		attribute_container_find_class_device(&r->raid_attrs.ac, dev);\
+	struct raid_data *rd;						      \
+	BUG_ON(!cdev);							      \
+	rd = class_get_devdata(cdev);					      \
+	rd->attr = value;						      \
+}									      \
+static inline type							      \
+raid_get_##attr(struct raid_template *r, struct device *dev) {		      \
+	struct class_device *cdev =					      \
+		attribute_container_find_class_device(&r->raid_attrs.ac, dev);\
+	struct raid_data *rd;						      \
+	BUG_ON(!cdev);							      \
+	rd = class_get_devdata(cdev);					      \
+	return rd->attr;						      \
+}
+
+DEFINE_RAID_ATTRIBUTE(int, level)
+DEFINE_RAID_ATTRIBUTE(int, resync)
+DEFINE_RAID_ATTRIBUTE(enum raid_state, state)
+	
+struct raid_template *raid_class_attach(struct raid_function_template *);
+void raid_class_release(struct raid_template *);
+
+void raid_component_add(struct raid_template *, struct device *,
+			struct device *);
-- 
cgit v1.2.3


From 93c37f292110a37dd77e4cc0aaf1c341d79bf6aa Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 6 Sep 2005 13:57:08 -0700
Subject: [SERIAL]: Avoid 'statement with no effect' warnings.

When SUPPORT_SYSRQ is false, gcc can emit warnings for
the uart_handle_sysrq_char() that results.  Using an
empty inline returning zero kills the warning.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/serial_core.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index cf0f64ea2bc0..9b12fe731612 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -385,11 +385,11 @@ int uart_resume_port(struct uart_driver *reg, struct uart_port *port);
 /*
  * The following are helper functions for the low level drivers.
  */
-#ifdef SUPPORT_SYSRQ
 static inline int
 uart_handle_sysrq_char(struct uart_port *port, unsigned int ch,
 		       struct pt_regs *regs)
 {
+#ifdef SUPPORT_SYSRQ
 	if (port->sysrq) {
 		if (ch && time_before(jiffies, port->sysrq)) {
 			handle_sysrq(ch, regs, NULL);
@@ -398,11 +398,9 @@ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch,
 		}
 		port->sysrq = 0;
 	}
+#endif
 	return 0;
 }
-#else
-#define uart_handle_sysrq_char(port,ch,regs)	(0)
-#endif
 
 /*
  * We do the SysRQ and SAK checking like this...
-- 
cgit v1.2.3


From 2248bcfcd8fb622ec88b8587d0c1f139635ffd2e Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 6 Sep 2005 15:06:42 -0700
Subject: [NETFILTER]: Add support for permanent expectations

A permanent expectation exists until timeing out and can expect
multiple related connections.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h |  5 +++++
 net/ipv4/netfilter/ip_conntrack_amanda.c    |  1 +
 net/ipv4/netfilter/ip_conntrack_core.c      | 12 ++++++++----
 net/ipv4/netfilter/ip_conntrack_ftp.c       |  1 +
 net/ipv4/netfilter/ip_conntrack_irc.c       |  1 +
 net/ipv4/netfilter/ip_conntrack_netlink.c   |  1 +
 net/ipv4/netfilter/ip_conntrack_tftp.c      |  1 +
 7 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 088742befe49..7e033e9271a8 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -263,6 +263,9 @@ struct ip_conntrack_expect
 	/* Unique ID */
 	unsigned int id;
 
+	/* Flags */
+	unsigned int flags;
+
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 	/* This is the original per-proto part, used to map the
 	 * expected connection the way the recipient expects. */
@@ -272,6 +275,8 @@ struct ip_conntrack_expect
 #endif
 };
 
+#define IP_CT_EXPECT_PERMANENT	0x1
+
 static inline struct ip_conntrack *
 tuplehash_to_ctrack(const struct ip_conntrack_tuple_hash *hash)
 {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index be4c9eb3243f..dc20881004bc 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -108,6 +108,7 @@ static int help(struct sk_buff **pskb,
 		}
 
 		exp->expectfn = NULL;
+		exp->flags = 0;
 
 		exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
 		exp->tuple.src.u.tcp.port = 0;
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a0648600190e..e23e8ca476c0 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -264,10 +264,14 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
 		   master ct never got confirmed, we'd hold a reference to it
 		   and weird things would happen to future packets). */
 		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
-		    && is_confirmed(i->master)
-		    && del_timer(&i->timeout)) {
-			unlink_expect(i);
-			return i;
+		    && is_confirmed(i->master)) {
+			if (i->flags & IP_CT_EXPECT_PERMANENT) {
+				atomic_inc(&i->use);
+				return i;
+			} else if (del_timer(&i->timeout)) {
+				unlink_expect(i);
+				return i;
+			}
 		}
 	}
 	return NULL;
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 3a2627db1729..1b79ec36085f 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -421,6 +421,7 @@ static int help(struct sk_buff **pskb,
 		  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 
 	exp->expectfn = NULL;
+	exp->flags = 0;
 
 	/* Now, NAT might want to mangle the packet, and register the
 	 * (possibly changed) expectation itself. */
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 25438eec21a1..d7a8a98c05e1 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -221,6 +221,7 @@ static int help(struct sk_buff **pskb,
 				{ { 0, { 0 } },
 				  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 			exp->expectfn = NULL;
+			exp->flags = 0;
 			if (ip_nat_irc_hook)
 				ret = ip_nat_irc_hook(pskb, ctinfo, 
 						      addr_beg_p - ib_ptr,
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index a4e9278db4ed..3dc3a7bab3b4 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -1413,6 +1413,7 @@ ctnetlink_create_expect(struct nfattr *cda[])
 	}
 	
 	exp->expectfn = NULL;
+	exp->flags = 0;
 	exp->master = ct;
 	memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
 	memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index f8ff170f390a..d2b590533452 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -75,6 +75,7 @@ static int tftp_help(struct sk_buff **pskb,
 		exp->mask.dst.u.udp.port = 0xffff;
 		exp->mask.dst.protonum = 0xff;
 		exp->expectfn = NULL;
+		exp->flags = 0;
 
 		DEBUGP("expect: ");
 		DUMP_TUPLE(&exp->tuple);
-- 
cgit v1.2.3


From 03486a4f838c55481317fca5ac2e7d12550a4fb7 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 6 Sep 2005 15:09:43 -0700
Subject: [NETFILTER]: Handle NAT module load race

When the NAT module is loaded when connections are already confirmed
it must not change their tuples anymore. This is especially important
with CONFIG_NETFILTER_DEBUG, the netfilter listhelp functions will
refuse to remove an entry from a list when it can not be found on
the list, so when a changed tuple hashes to a new bucket the entry
is kept in the list until and after the conntrack is freed.

Allocate the exact conntrack tuple for NAT for already confirmed
connections or drop them if that fails.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_nat_rule.h |  5 +++++
 net/ipv4/netfilter/ip_nat_rule.c           | 21 +++++++++++++++++++++
 net/ipv4/netfilter/ip_nat_standalone.c     |  8 ++++++--
 3 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_nat_rule.h b/include/linux/netfilter_ipv4/ip_nat_rule.h
index fecd2a06dcd8..73b9552e6a89 100644
--- a/include/linux/netfilter_ipv4/ip_nat_rule.h
+++ b/include/linux/netfilter_ipv4/ip_nat_rule.h
@@ -19,5 +19,10 @@ extern unsigned int
 alloc_null_binding(struct ip_conntrack *conntrack,
 		   struct ip_nat_info *info,
 		   unsigned int hooknum);
+
+extern unsigned int
+alloc_null_binding_confirmed(struct ip_conntrack *conntrack,
+			     struct ip_nat_info *info,
+			     unsigned int hooknum);
 #endif
 #endif /* _IP_NAT_RULE_H */
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 60d70fa41a15..cb66b8bddeb3 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -255,6 +255,27 @@ alloc_null_binding(struct ip_conntrack *conntrack,
 	return ip_nat_setup_info(conntrack, &range, hooknum);
 }
 
+unsigned int
+alloc_null_binding_confirmed(struct ip_conntrack *conntrack,
+                             struct ip_nat_info *info,
+                             unsigned int hooknum)
+{
+	u_int32_t ip
+		= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+		   ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
+		   : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+	u_int16_t all
+		= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+		   ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all
+		   : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all);
+	struct ip_nat_range range
+		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } };
+
+	DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
+	       conntrack, NIPQUAD(ip));
+	return ip_nat_setup_info(conntrack, &range, hooknum);
+}
+
 int ip_nat_rule_find(struct sk_buff **pskb,
 		     unsigned int hooknum,
 		     const struct net_device *in,
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 89db052add81..0ff368b131f6 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -123,8 +123,12 @@ ip_nat_fn(unsigned int hooknum,
 		if (!ip_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			/* LOCAL_IN hook doesn't have a chain!  */
-			if (hooknum == NF_IP_LOCAL_IN)
+			if (unlikely(is_confirmed(ct)))
+				/* NAT module was loaded late */
+				ret = alloc_null_binding_confirmed(ct, info,
+				                                   hooknum);
+			else if (hooknum == NF_IP_LOCAL_IN)
+				/* LOCAL_IN hook doesn't have a chain!  */
 				ret = alloc_null_binding(ct, info, hooknum);
 			else
 				ret = ip_nat_rule_find(pskb, hooknum,
-- 
cgit v1.2.3


From 49719eb355d32fa07793017b4b46b1c02e88b275 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Sep 2005 15:10:46 -0700
Subject: [NETFILTER]: kill __ip_ct_expect_unlink_destroy

The following patch kills __ip_ct_expect_unlink_destroy and export
unlink_expect as ip_ct_unlink_expect. As it was discussed [1], the function
__ip_ct_expect_unlink_destroy is a bit confusing so better do the following
sequence: ip_ct_destroy_expect and ip_conntrack_expect_put.

[1] https://lists.netfilter.org/pipermail/netfilter-devel/2005-August/020794.html

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_core.h |  2 +-
 net/ipv4/netfilter/ip_conntrack_core.c           | 20 +++++++-------------
 net/ipv4/netfilter/ip_conntrack_netlink.c        | 12 ++++++++----
 net/ipv4/netfilter/ip_conntrack_standalone.c     |  2 +-
 4 files changed, 17 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h
index dc4d2a0575de..907d4f5ca5dc 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_core.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h
@@ -52,7 +52,7 @@ static inline int ip_conntrack_confirm(struct sk_buff **pskb)
 	return ret;
 }
 
-extern void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp);
+extern void ip_ct_unlink_expect(struct ip_conntrack_expect *exp);
 
 extern struct list_head *ip_conntrack_hash;
 extern struct list_head ip_conntrack_expect_list;
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index babce304c619..19cba16e6e1e 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -197,7 +197,7 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
 
 
 /* ip_conntrack_expect helper functions */
-static void unlink_expect(struct ip_conntrack_expect *exp)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
 {
 	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
 	IP_NF_ASSERT(!timer_pending(&exp->timeout));
@@ -207,18 +207,12 @@ static void unlink_expect(struct ip_conntrack_expect *exp)
 	ip_conntrack_expect_put(exp);
 }
 
-void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
-{
-	unlink_expect(exp);
-	ip_conntrack_expect_put(exp);
-}
-
 static void expectation_timed_out(unsigned long ul_expect)
 {
 	struct ip_conntrack_expect *exp = (void *)ul_expect;
 
 	write_lock_bh(&ip_conntrack_lock);
-	unlink_expect(exp);
+	ip_ct_unlink_expect(exp);
 	write_unlock_bh(&ip_conntrack_lock);
 	ip_conntrack_expect_put(exp);
 }
@@ -269,7 +263,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
 				atomic_inc(&i->use);
 				return i;
 			} else if (del_timer(&i->timeout)) {
-				unlink_expect(i);
+				ip_ct_unlink_expect(i);
 				return i;
 			}
 		}
@@ -288,7 +282,7 @@ void ip_ct_remove_expectations(struct ip_conntrack *ct)
 
 	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
 		if (i->master == ct && del_timer(&i->timeout)) {
-			unlink_expect(i);
+			ip_ct_unlink_expect(i);
 			ip_conntrack_expect_put(i);
 		}
 	}
@@ -929,7 +923,7 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 	/* choose the the oldest expectation to evict */
 	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
 		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
-			unlink_expect(i);
+			ip_ct_unlink_expect(i);
 			write_unlock_bh(&ip_conntrack_lock);
 			ip_conntrack_expect_put(i);
 			return;
@@ -986,7 +980,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
 	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
 		if (i->master == master) {
 			if (del_timer(&i->timeout)) {
-				unlink_expect(i);
+				ip_ct_unlink_expect(i);
 				ip_conntrack_expect_put(i);
 			}
 			break;
@@ -1103,7 +1097,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 	/* Get rid of expectations */
 	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
 		if (exp->master->helper == me && del_timer(&exp->timeout)) {
-			unlink_expect(exp);
+			ip_ct_unlink_expect(exp);
 			ip_conntrack_expect_put(exp);
 		}
 	}
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 3dc3a7bab3b4..15aef3564742 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -1349,8 +1349,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
 					 list) {
 			if (exp->master->helper == h 
-			    && del_timer(&exp->timeout))
-				__ip_ct_expect_unlink_destroy(exp);
+			    && del_timer(&exp->timeout)) {
+				ip_ct_unlink_expect(exp);
+				ip_conntrack_expect_put(exp);
+			}
 		}
 		write_unlock(&ip_conntrack_lock);
 	} else {
@@ -1358,8 +1360,10 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		write_lock_bh(&ip_conntrack_lock);
 		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
 					 list) {
-			if (del_timer(&exp->timeout))
-				__ip_ct_expect_unlink_destroy(exp);
+			if (del_timer(&exp->timeout)) {
+				ip_ct_unlink_expect(exp);
+				ip_conntrack_expect_put(exp);
+			}
 		}
 		write_unlock_bh(&ip_conntrack_lock);
 	}
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index ee5895afd0c3..ae3e3e655db5 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -998,7 +998,7 @@ EXPORT_SYMBOL(ip_conntrack_expect_related);
 EXPORT_SYMBOL(ip_conntrack_unexpect_related);
 EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
 EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
-EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy);
+EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
 
 EXPORT_SYMBOL(ip_conntrack_tuple_taken);
 EXPORT_SYMBOL(ip_ct_gather_frags);
-- 
cgit v1.2.3


From f2c383988d68c91a7d474b7cf26c0a2df49bbafe Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 6 Sep 2005 15:48:03 -0700
Subject: [NET]: skb_get/set_timestamp use const

The new timestamp get/set routines should have const attribute
on parameters (helps to indicate direction).

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 42edce6abe23..da7da9c0ed1b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1251,7 +1251,7 @@ extern void skb_add_mtu(int mtu);
  *	This function converts the offset back to a struct timeval and stores
  *	it in stamp.
  */
-static inline void skb_get_timestamp(struct sk_buff *skb, struct timeval *stamp)
+static inline void skb_get_timestamp(const struct sk_buff *skb, struct timeval *stamp)
 {
 	stamp->tv_sec  = skb->tstamp.off_sec;
 	stamp->tv_usec = skb->tstamp.off_usec;
@@ -1270,7 +1270,7 @@ static inline void skb_get_timestamp(struct sk_buff *skb, struct timeval *stamp)
  *	This function converts a struct timeval to an offset and stores
  *	it in the skb.
  */
-static inline void skb_set_timestamp(struct sk_buff *skb, struct timeval *stamp)
+static inline void skb_set_timestamp(struct sk_buff *skb, const struct timeval *stamp)
 {
 	skb->tstamp.off_sec  = stamp->tv_sec - skb_tv_base.tv_sec;
 	skb->tstamp.off_usec = stamp->tv_usec - skb_tv_base.tv_usec;
-- 
cgit v1.2.3


From aaec0fab5f8809fe1509fdc204e769bb35ebe41a Mon Sep 17 00:00:00 2001
From: Jens Osterkamp <Jens.Osterkamp@de.ibm.com>
Date: Mon, 5 Sep 2005 15:19:29 -0700
Subject: [PATCH] net: add driver for the NIC on Cell Blades

This patch adds a driver for a new 1000 Mbit ethernet NIC.  It is
integrated on the south bridge that is used for our Cell Blades.

The code gets the MAC address from the Open Firmware device tree, so it
won't compile on platforms other than ppc64.

This is the first public release, so I don't expect the first version to
get merged, but I'd aim for integration within the 2.6.13 time frame.

Cc: Utz Bacher <utz.bacher@de.ibm.com>
Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 drivers/net/Kconfig              |    7 +
 drivers/net/Makefile             |    2 +
 drivers/net/spider_net.c         | 2298 ++++++++++++++++++++++++++++++++++++++
 drivers/net/spider_net.h         |  469 ++++++++
 drivers/net/spider_net_ethtool.c |  107 ++
 include/linux/pci_ids.h          |    1 +
 6 files changed, 2884 insertions(+)
 create mode 100644 drivers/net/spider_net.c
 create mode 100644 drivers/net/spider_net.h
 create mode 100644 drivers/net/spider_net_ethtool.c

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index ae9e7a579b94..6bb9232514b4 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2058,6 +2058,13 @@ config BNX2
 	  To compile this driver as a module, choose M here: the module
 	  will be called bnx2.  This is recommended.
 
+config SPIDER_NET
+	tristate "Spider Gigabit Ethernet driver"
+	depends on PCI && PPC_BPA
+	help
+	  This driver supports the Gigabit Ethernet chips present on the
+	  Cell Processor-Based Blades from IBM.
+
 config GIANFAR
 	tristate "Gianfar Ethernet"
 	depends on 85xx || 83xx
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 5baafcd55610..8645c843cf4d 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -54,6 +54,8 @@ obj-$(CONFIG_STNIC) += stnic.o 8390.o
 obj-$(CONFIG_FEALNX) += fealnx.o
 obj-$(CONFIG_TIGON3) += tg3.o
 obj-$(CONFIG_BNX2) += bnx2.o
+spidernet-y += spider_net.o spider_net_ethtool.o sungem_phy.o
+obj-$(CONFIG_SPIDER_NET) += spidernet.o
 obj-$(CONFIG_TC35815) += tc35815.o
 obj-$(CONFIG_SKGE) += skge.o
 obj-$(CONFIG_SK98LIN) += sk98lin/
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
new file mode 100644
index 000000000000..692a0437fef7
--- /dev/null
+++ b/drivers/net/spider_net.c
@@ -0,0 +1,2298 @@
+/*
+ * Network device driver for Cell Processor-Based Blade
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Authors : Utz Bacher <utz.bacher@de.ibm.com>
+ *           Jens Osterkamp <Jens.Osterkamp@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+
+#include <linux/compiler.h>
+#include <linux/crc32.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/firmware.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/device.h>
+#include <linux/pci.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <asm/bitops.h>
+#include <asm/pci-bridge.h>
+#include <net/checksum.h>
+
+#include "spider_net.h"
+
+MODULE_AUTHOR("Utz Bacher <utz.bacher@de.ibm.com> and Jens Osterkamp " \
+	      "<Jens.Osterkamp@de.ibm.com>");
+MODULE_DESCRIPTION("Spider Southbridge Gigabit Ethernet driver");
+MODULE_LICENSE("GPL");
+
+static int rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_DEFAULT;
+static int tx_descriptors = SPIDER_NET_TX_DESCRIPTORS_DEFAULT;
+
+module_param(rx_descriptors, int, 0644);
+module_param(tx_descriptors, int, 0644);
+
+MODULE_PARM_DESC(rx_descriptors, "number of descriptors used " \
+		 "in rx chains");
+MODULE_PARM_DESC(tx_descriptors, "number of descriptors used " \
+		 "in tx chain");
+
+char spider_net_driver_name[] = "spidernet";
+
+static struct pci_device_id spider_net_pci_tbl[] = {
+	{ PCI_VENDOR_ID_TOSHIBA_2, PCI_DEVICE_ID_TOSHIBA_SPIDER_NET,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, spider_net_pci_tbl);
+
+/**
+ * spider_net_read_reg - reads an SMMIO register of a card
+ * @card: device structure
+ * @reg: register to read from
+ *
+ * returns the content of the specified SMMIO register.
+ */
+static u32
+spider_net_read_reg(struct spider_net_card *card, u32 reg)
+{
+	u32 value;
+
+	value = readl(card->regs + reg);
+	value = le32_to_cpu(value);
+
+	return value;
+}
+
+/**
+ * spider_net_write_reg - writes to an SMMIO register of a card
+ * @card: device structure
+ * @reg: register to write to
+ * @value: value to write into the specified SMMIO register
+ */
+static void
+spider_net_write_reg(struct spider_net_card *card, u32 reg, u32 value)
+{
+	value = cpu_to_le32(value);
+	writel(value, card->regs + reg);
+}
+
+/**
+ * spider_net_rx_irq_off - switch off rx irq on this spider card
+ * @card: device structure
+ *
+ * switches off rx irq by masking them out in the GHIINTnMSK register
+ */
+static void
+spider_net_rx_irq_off(struct spider_net_card *card)
+{
+	u32 regvalue;
+	unsigned long flags;
+
+	spin_lock_irqsave(&card->intmask_lock, flags);
+	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
+	regvalue &= ~SPIDER_NET_RXINT;
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, regvalue);
+	spin_unlock_irqrestore(&card->intmask_lock, flags);
+}
+
+/** spider_net_write_phy - write to phy register
+ * @netdev: adapter to be written to
+ * @mii_id: id of MII
+ * @reg: PHY register
+ * @val: value to be written to phy register
+ *
+ * spider_net_write_phy_register writes to an arbitrary PHY
+ * register via the spider GPCWOPCMD register. We assume the queue does
+ * not run full (not more than 15 commands outstanding).
+ **/
+static void
+spider_net_write_phy(struct net_device *netdev, int mii_id,
+		     int reg, int val)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	u32 writevalue;
+
+	writevalue = ((u32)mii_id << 21) |
+		((u32)reg << 16) | ((u32)val);
+
+	spider_net_write_reg(card, SPIDER_NET_GPCWOPCMD, writevalue);
+}
+
+/** spider_net_read_phy - read from phy register
+ * @netdev: network device to be read from
+ * @mii_id: id of MII
+ * @reg: PHY register
+ *
+ * Returns value read from PHY register
+ *
+ * spider_net_write_phy reads from an arbitrary PHY
+ * register via the spider GPCROPCMD register
+ **/
+static int
+spider_net_read_phy(struct net_device *netdev, int mii_id, int reg)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	u32 readvalue;
+
+	readvalue = ((u32)mii_id << 21) | ((u32)reg << 16);
+	spider_net_write_reg(card, SPIDER_NET_GPCROPCMD, readvalue);
+
+	/* we don't use semaphores to wait for an SPIDER_NET_GPROPCMPINT
+	 * interrupt, as we poll for the completion of the read operation
+	 * in spider_net_read_phy. Should take about 50 us */
+	do {
+		readvalue = spider_net_read_reg(card, SPIDER_NET_GPCROPCMD);
+	} while (readvalue & SPIDER_NET_GPREXEC);
+
+	readvalue &= SPIDER_NET_GPRDAT_MASK;
+
+	return readvalue;
+}
+
+/**
+ * spider_net_rx_irq_on - switch on rx irq on this spider card
+ * @card: device structure
+ *
+ * switches on rx irq by enabling them in the GHIINTnMSK register
+ */
+static void
+spider_net_rx_irq_on(struct spider_net_card *card)
+{
+	u32 regvalue;
+	unsigned long flags;
+
+	spin_lock_irqsave(&card->intmask_lock, flags);
+	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
+	regvalue |= SPIDER_NET_RXINT;
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, regvalue);
+	spin_unlock_irqrestore(&card->intmask_lock, flags);
+}
+
+/**
+ * spider_net_tx_irq_off - switch off tx irq on this spider card
+ * @card: device structure
+ *
+ * switches off tx irq by masking them out in the GHIINTnMSK register
+ */
+static void
+spider_net_tx_irq_off(struct spider_net_card *card)
+{
+	u32 regvalue;
+	unsigned long flags;
+
+	spin_lock_irqsave(&card->intmask_lock, flags);
+	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
+	regvalue &= ~SPIDER_NET_TXINT;
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, regvalue);
+	spin_unlock_irqrestore(&card->intmask_lock, flags);
+}
+
+/**
+ * spider_net_tx_irq_on - switch on tx irq on this spider card
+ * @card: device structure
+ *
+ * switches on tx irq by enabling them in the GHIINTnMSK register
+ */
+static void
+spider_net_tx_irq_on(struct spider_net_card *card)
+{
+	u32 regvalue;
+	unsigned long flags;
+
+	spin_lock_irqsave(&card->intmask_lock, flags);
+	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
+	regvalue |= SPIDER_NET_TXINT;
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, regvalue);
+	spin_unlock_irqrestore(&card->intmask_lock, flags);
+}
+
+/**
+ * spider_net_set_promisc - sets the unicast address or the promiscuous mode
+ * @card: card structure
+ *
+ * spider_net_set_promisc sets the unicast destination address filter and
+ * thus either allows for non-promisc mode or promisc mode
+ */
+static void
+spider_net_set_promisc(struct spider_net_card *card)
+{
+	u32 macu, macl;
+	struct net_device *netdev = card->netdev;
+
+	if (netdev->flags & IFF_PROMISC) {
+		/* clear destination entry 0 */
+		spider_net_write_reg(card, SPIDER_NET_GMRUAFILnR, 0);
+		spider_net_write_reg(card, SPIDER_NET_GMRUAFILnR + 0x04, 0);
+		spider_net_write_reg(card, SPIDER_NET_GMRUA0FIL15R,
+				     SPIDER_NET_PROMISC_VALUE);
+	} else {
+		macu = netdev->dev_addr[0];
+		macu <<= 8;
+		macu |= netdev->dev_addr[1];
+		memcpy(&macl, &netdev->dev_addr[2], sizeof(macl));
+
+		macu |= SPIDER_NET_UA_DESCR_VALUE;
+		spider_net_write_reg(card, SPIDER_NET_GMRUAFILnR, macu);
+		spider_net_write_reg(card, SPIDER_NET_GMRUAFILnR + 0x04, macl);
+		spider_net_write_reg(card, SPIDER_NET_GMRUA0FIL15R,
+				     SPIDER_NET_NONPROMISC_VALUE);
+	}
+}
+
+/**
+ * spider_net_get_mac_address - read mac address from spider card
+ * @card: device structure
+ *
+ * reads MAC address from GMACUNIMACU and GMACUNIMACL registers
+ */
+static int
+spider_net_get_mac_address(struct net_device *netdev)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	u32 macl, macu;
+
+	macl = spider_net_read_reg(card, SPIDER_NET_GMACUNIMACL);
+	macu = spider_net_read_reg(card, SPIDER_NET_GMACUNIMACU);
+
+	netdev->dev_addr[0] = (macu >> 24) & 0xff;
+	netdev->dev_addr[1] = (macu >> 16) & 0xff;
+	netdev->dev_addr[2] = (macu >> 8) & 0xff;
+	netdev->dev_addr[3] = macu & 0xff;
+	netdev->dev_addr[4] = (macl >> 8) & 0xff;
+	netdev->dev_addr[5] = macl & 0xff;
+
+	if (!is_valid_ether_addr(&netdev->dev_addr[0]))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * spider_net_get_descr_status -- returns the status of a descriptor
+ * @descr: descriptor to look at
+ *
+ * returns the status as in the dmac_cmd_status field of the descriptor
+ */
+static enum spider_net_descr_status
+spider_net_get_descr_status(struct spider_net_descr *descr)
+{
+	u32 cmd_status;
+	rmb();
+	cmd_status = descr->dmac_cmd_status;
+	rmb();
+	cmd_status >>= SPIDER_NET_DESCR_IND_PROC_SHIFT;
+	/* no need to mask out any bits, as cmd_status is 32 bits wide only
+	 * (and unsigned) */
+	return cmd_status;
+}
+
+/**
+ * spider_net_set_descr_status -- sets the status of a descriptor
+ * @descr: descriptor to change
+ * @status: status to set in the descriptor
+ *
+ * changes the status to the specified value. Doesn't change other bits
+ * in the status
+ */
+static void
+spider_net_set_descr_status(struct spider_net_descr *descr,
+			    enum spider_net_descr_status status)
+{
+	u32 cmd_status;
+	/* read the status */
+	mb();
+	cmd_status = descr->dmac_cmd_status;
+	/* clean the upper 4 bits */
+	cmd_status &= SPIDER_NET_DESCR_IND_PROC_MASKO;
+	/* add the status to it */
+	cmd_status |= ((u32)status)<<SPIDER_NET_DESCR_IND_PROC_SHIFT;
+	/* and write it back */
+	descr->dmac_cmd_status = cmd_status;
+	wmb();
+}
+
+/**
+ * spider_net_free_chain - free descriptor chain
+ * @card: card structure
+ * @chain: address of chain
+ *
+ */
+static void
+spider_net_free_chain(struct spider_net_card *card,
+		      struct spider_net_descr_chain *chain)
+{
+	struct spider_net_descr *descr;
+
+	for (descr = chain->tail; !descr->bus_addr; descr = descr->next) {
+		pci_unmap_single(card->pdev, descr->bus_addr,
+				 SPIDER_NET_DESCR_SIZE, PCI_DMA_BIDIRECTIONAL);
+		descr->bus_addr = 0;
+	}
+}
+
+/**
+ * spider_net_init_chain - links descriptor chain
+ * @card: card structure
+ * @chain: address of chain
+ * @start_descr: address of descriptor array
+ * @no: number of descriptors
+ *
+ * we manage a circular list that mirrors the hardware structure,
+ * except that the hardware uses bus addresses.
+ *
+ * returns 0 on success, <0 on failure
+ */
+static int
+spider_net_init_chain(struct spider_net_card *card,
+		       struct spider_net_descr_chain *chain,
+		       struct spider_net_descr *start_descr, int no)
+{
+	int i;
+	struct spider_net_descr *descr;
+
+	spin_lock_init(&card->chain_lock);
+
+	descr = start_descr;
+	memset(descr, 0, sizeof(*descr) * no);
+
+	/* set up the hardware pointers in each descriptor */
+	for (i=0; i<no; i++, descr++) {
+		spider_net_set_descr_status(descr, SPIDER_NET_DESCR_NOT_IN_USE);
+
+		descr->bus_addr =
+			pci_map_single(card->pdev, descr,
+				       SPIDER_NET_DESCR_SIZE,
+				       PCI_DMA_BIDIRECTIONAL);
+
+		if (descr->bus_addr == DMA_ERROR_CODE)
+			goto iommu_error;
+
+		descr->next = descr + 1;
+		descr->prev = descr - 1;
+
+	}
+	/* do actual circular list */
+	(descr-1)->next = start_descr;
+	start_descr->prev = descr-1;
+
+	descr = start_descr;
+	for (i=0; i < no; i++, descr++) {
+		descr->next_descr_addr = descr->next->bus_addr;
+	}
+
+	chain->head = start_descr;
+	chain->tail = start_descr;
+
+	return 0;
+
+iommu_error:
+	descr = start_descr;
+	for (i=0; i < no; i++, descr++)
+		if (descr->bus_addr)
+			pci_unmap_single(card->pdev, descr->bus_addr,
+					 SPIDER_NET_DESCR_SIZE, PCI_DMA_BIDIRECTIONAL);
+	return -ENOMEM;
+}
+
+/**
+ * spider_net_free_rx_chain_contents - frees descr contents in rx chain
+ * @card: card structure
+ *
+ * returns 0 on success, <0 on failure
+ */
+static void
+spider_net_free_rx_chain_contents(struct spider_net_card *card)
+{
+	struct spider_net_descr *descr;
+
+	descr = card->rx_chain.head;
+	while (descr->next != card->rx_chain.head) {
+		if (descr->skb) {
+			dev_kfree_skb(descr->skb);
+			pci_unmap_single(card->pdev, descr->buf_addr,
+					 SPIDER_NET_MAX_MTU,
+					 PCI_DMA_BIDIRECTIONAL);
+		}
+		descr = descr->next;
+	}
+}
+
+/**
+ * spider_net_prepare_rx_descr - reinitializes a rx descriptor
+ * @card: card structure
+ * @descr: descriptor to re-init
+ *
+ * return 0 on succes, <0 on failure
+ *
+ * allocates a new rx skb, iommu-maps it and attaches it to the descriptor.
+ * Activate the descriptor state-wise
+ */
+static int
+spider_net_prepare_rx_descr(struct spider_net_card *card,
+			    struct spider_net_descr *descr)
+{
+	int error = 0;
+	int offset;
+	int bufsize;
+
+	/* we need to round up the buffer size to a multiple of 128 */
+	bufsize = (SPIDER_NET_MAX_MTU + SPIDER_NET_RXBUF_ALIGN - 1) &
+		(~(SPIDER_NET_RXBUF_ALIGN - 1));
+
+	/* and we need to have it 128 byte aligned, therefore we allocate a
+	 * bit more */
+	/* allocate an skb */
+	descr->skb = dev_alloc_skb(bufsize + SPIDER_NET_RXBUF_ALIGN - 1);
+	if (!descr->skb) {
+		if (net_ratelimit())
+			if (netif_msg_rx_err(card))
+				pr_err("Not enough memory to allocate "
+					"rx buffer\n");
+		return -ENOMEM;
+	}
+	descr->buf_size = bufsize;
+	descr->result_size = 0;
+	descr->valid_size = 0;
+	descr->data_status = 0;
+	descr->data_error = 0;
+
+	offset = ((unsigned long)descr->skb->data) &
+		(SPIDER_NET_RXBUF_ALIGN - 1);
+	if (offset)
+		skb_reserve(descr->skb, SPIDER_NET_RXBUF_ALIGN - offset);
+	/* io-mmu-map the skb */
+	descr->buf_addr = pci_map_single(card->pdev, descr->skb->data,
+					 SPIDER_NET_MAX_MTU,
+					 PCI_DMA_BIDIRECTIONAL);
+	if (descr->buf_addr == DMA_ERROR_CODE) {
+		dev_kfree_skb_any(descr->skb);
+		if (netif_msg_rx_err(card))
+			pr_err("Could not iommu-map rx buffer\n");
+		spider_net_set_descr_status(descr, SPIDER_NET_DESCR_NOT_IN_USE);
+	} else {
+		descr->dmac_cmd_status = SPIDER_NET_DMAC_RX_CARDOWNED;
+	}
+
+	return error;
+}
+
+/**
+ * spider_net_enable_rxctails - sets RX dmac chain tail addresses
+ * @card: card structure
+ *
+ * spider_net_enable_rxctails sets the RX DMAC chain tail adresses in the
+ * chip by writing to the appropriate register. DMA is enabled in
+ * spider_net_enable_rxdmac.
+ */
+static void
+spider_net_enable_rxchtails(struct spider_net_card *card)
+{
+	/* assume chain is aligned correctly */
+	spider_net_write_reg(card, SPIDER_NET_GDADCHA ,
+			     card->rx_chain.tail->bus_addr);
+}
+
+/**
+ * spider_net_enable_rxdmac - enables a receive DMA controller
+ * @card: card structure
+ *
+ * spider_net_enable_rxdmac enables the DMA controller by setting RX_DMA_EN
+ * in the GDADMACCNTR register
+ */
+static void
+spider_net_enable_rxdmac(struct spider_net_card *card)
+{
+	spider_net_write_reg(card, SPIDER_NET_GDADMACCNTR,
+			     SPIDER_NET_DMA_RX_VALUE);
+}
+
+/**
+ * spider_net_refill_rx_chain - refills descriptors/skbs in the rx chains
+ * @card: card structure
+ *
+ * refills descriptors in all chains (last used chain first): allocates skbs
+ * and iommu-maps them.
+ */
+static void
+spider_net_refill_rx_chain(struct spider_net_card *card)
+{
+	struct spider_net_descr_chain *chain;
+	int count = 0;
+	unsigned long flags;
+
+	chain = &card->rx_chain;
+
+	spin_lock_irqsave(&card->chain_lock, flags);
+	while (spider_net_get_descr_status(chain->head) ==
+				SPIDER_NET_DESCR_NOT_IN_USE) {
+		if (spider_net_prepare_rx_descr(card, chain->head))
+			break;
+		count++;
+		chain->head = chain->head->next;
+	}
+	spin_unlock_irqrestore(&card->chain_lock, flags);
+
+	/* could be optimized, only do that, if we know the DMA processing
+	 * has terminated */
+	if (count)
+		spider_net_enable_rxdmac(card);
+}
+
+/**
+ * spider_net_alloc_rx_skbs - allocates rx skbs in rx descriptor chains
+ * @card: card structure
+ *
+ * returns 0 on success, <0 on failure
+ */
+static int
+spider_net_alloc_rx_skbs(struct spider_net_card *card)
+{
+	int result;
+	struct spider_net_descr_chain *chain;
+
+	result = -ENOMEM;
+
+	chain = &card->rx_chain;
+	/* put at least one buffer into the chain. if this fails,
+	 * we've got a problem. if not, spider_net_refill_rx_chain
+	 * will do the rest at the end of this function */
+	if (spider_net_prepare_rx_descr(card, chain->head))
+		goto error;
+	else
+		chain->head = chain->head->next;
+
+	/* this will allocate the rest of the rx buffers; if not, it's
+	 * business as usual later on */
+	spider_net_refill_rx_chain(card);
+	return 0;
+
+error:
+	spider_net_free_rx_chain_contents(card);
+	return result;
+}
+
+/**
+ * spider_net_release_tx_descr - processes a used tx descriptor
+ * @card: card structure
+ * @descr: descriptor to release
+ *
+ * releases a used tx descriptor (unmapping, freeing of skb)
+ */
+static void
+spider_net_release_tx_descr(struct spider_net_card *card,
+			    struct spider_net_descr *descr)
+{
+	struct sk_buff *skb;
+
+	/* unmap the skb */
+	skb = descr->skb;
+	pci_unmap_single(card->pdev, descr->buf_addr, skb->len,
+			 PCI_DMA_BIDIRECTIONAL);
+
+	dev_kfree_skb_any(skb);
+
+	/* set status to not used */
+	spider_net_set_descr_status(descr, SPIDER_NET_DESCR_NOT_IN_USE);
+}
+
+/**
+ * spider_net_release_tx_chain - processes sent tx descriptors
+ * @card: adapter structure
+ * @brutal: if set, don't care about whether descriptor seems to be in use
+ *
+ * releases the tx descriptors that spider has finished with (if non-brutal)
+ * or simply release tx descriptors (if brutal)
+ */
+static void
+spider_net_release_tx_chain(struct spider_net_card *card, int brutal)
+{
+	struct spider_net_descr_chain *tx_chain = &card->tx_chain;
+	enum spider_net_descr_status status;
+
+	spider_net_tx_irq_off(card);
+
+	/* no lock for chain needed, if this is only executed once at a time */
+again:
+	for (;;) {
+		status = spider_net_get_descr_status(tx_chain->tail);
+		switch (status) {
+		case SPIDER_NET_DESCR_CARDOWNED:
+			if (!brutal) goto out;
+			/* fallthrough, if we release the descriptors
+			 * brutally (then we don't care about
+			 * SPIDER_NET_DESCR_CARDOWNED) */
+		case SPIDER_NET_DESCR_RESPONSE_ERROR:
+		case SPIDER_NET_DESCR_PROTECTION_ERROR:
+		case SPIDER_NET_DESCR_FORCE_END:
+			if (netif_msg_tx_err(card))
+				pr_err("%s: forcing end of tx descriptor "
+				       "with status x%02x\n",
+				       card->netdev->name, status);
+			card->netdev_stats.tx_dropped++;
+			break;
+
+		case SPIDER_NET_DESCR_COMPLETE:
+			card->netdev_stats.tx_packets++;
+			card->netdev_stats.tx_bytes +=
+				tx_chain->tail->skb->len;
+			break;
+
+		default: /* any other value (== SPIDER_NET_DESCR_NOT_IN_USE) */
+			goto out;
+		}
+		spider_net_release_tx_descr(card, tx_chain->tail);
+		tx_chain->tail = tx_chain->tail->next;
+	}
+out:
+	netif_wake_queue(card->netdev);
+
+	if (!brutal) {
+		/* switch on tx irqs (while we are still in the interrupt
+		 * handler, so we don't get an interrupt), check again
+		 * for done descriptors. This results in fewer interrupts */
+		spider_net_tx_irq_on(card);
+		status = spider_net_get_descr_status(tx_chain->tail);
+		switch (status) {
+			case SPIDER_NET_DESCR_RESPONSE_ERROR:
+			case SPIDER_NET_DESCR_PROTECTION_ERROR:
+			case SPIDER_NET_DESCR_FORCE_END:
+			case SPIDER_NET_DESCR_COMPLETE:
+				goto again;
+			default:
+				break;
+		}
+	}
+
+}
+
+/**
+ * spider_net_get_multicast_hash - generates hash for multicast filter table
+ * @addr: multicast address
+ *
+ * returns the hash value.
+ *
+ * spider_net_get_multicast_hash calculates a hash value for a given multicast
+ * address, that is used to set the multicast filter tables
+ */
+static u8
+spider_net_get_multicast_hash(struct net_device *netdev, __u8 *addr)
+{
+	/* FIXME: an addr of 01:00:5e:00:00:01 must result in 0xa9,
+	 * ff:ff:ff:ff:ff:ff must result in 0xfd */
+	u32 crc;
+	u8 hash;
+
+	crc = crc32_be(~0, addr, netdev->addr_len);
+
+	hash = (crc >> 27);
+	hash <<= 3;
+	hash |= crc & 7;
+
+	return hash;
+}
+
+/**
+ * spider_net_set_multi - sets multicast addresses and promisc flags
+ * @netdev: interface device structure
+ *
+ * spider_net_set_multi configures multicast addresses as needed for the
+ * netdev interface. It also sets up multicast, allmulti and promisc
+ * flags appropriately
+ */
+static void
+spider_net_set_multi(struct net_device *netdev)
+{
+	struct dev_mc_list *mc;
+	u8 hash;
+	int i;
+	u32 reg;
+	struct spider_net_card *card = netdev_priv(netdev);
+	unsigned long bitmask[SPIDER_NET_MULTICAST_HASHES / BITS_PER_LONG] =
+		{0, };
+
+	spider_net_set_promisc(card);
+
+	if (netdev->flags & IFF_ALLMULTI) {
+		for (i = 0; i < SPIDER_NET_MULTICAST_HASHES; i++) {
+			set_bit(i, bitmask);
+		}
+		goto write_hash;
+	}
+
+	/* well, we know, what the broadcast hash value is: it's xfd
+	hash = spider_net_get_multicast_hash(netdev, netdev->broadcast); */
+	set_bit(0xfd, bitmask);
+
+	for (mc = netdev->mc_list; mc; mc = mc->next) {
+		hash = spider_net_get_multicast_hash(netdev, mc->dmi_addr);
+		set_bit(hash, bitmask);
+	}
+
+write_hash:
+	for (i = 0; i < SPIDER_NET_MULTICAST_HASHES / 4; i++) {
+		reg = 0;
+		if (test_bit(i * 4, bitmask))
+			reg += 0x08;
+		reg <<= 8;
+		if (test_bit(i * 4 + 1, bitmask))
+			reg += 0x08;
+		reg <<= 8;
+		if (test_bit(i * 4 + 2, bitmask))
+			reg += 0x08;
+		reg <<= 8;
+		if (test_bit(i * 4 + 3, bitmask))
+			reg += 0x08;
+
+		spider_net_write_reg(card, SPIDER_NET_GMRMHFILnR + i * 4, reg);
+	}
+}
+
+/**
+ * spider_net_disable_rxdmac - disables the receive DMA controller
+ * @card: card structure
+ *
+ * spider_net_disable_rxdmac terminates processing on the DMA controller by
+ * turing off DMA and issueing a force end
+ */
+static void
+spider_net_disable_rxdmac(struct spider_net_card *card)
+{
+	spider_net_write_reg(card, SPIDER_NET_GDADMACCNTR,
+			     SPIDER_NET_DMA_RX_FEND_VALUE);
+}
+
+/**
+ * spider_net_stop - called upon ifconfig down
+ * @netdev: interface device structure
+ *
+ * always returns 0
+ */
+int
+spider_net_stop(struct net_device *netdev)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+
+	netif_poll_disable(netdev);
+	netif_carrier_off(netdev);
+	netif_stop_queue(netdev);
+
+	/* disable/mask all interrupts */
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, 0);
+	spider_net_write_reg(card, SPIDER_NET_GHIINT1MSK, 0);
+	spider_net_write_reg(card, SPIDER_NET_GHIINT2MSK, 0);
+
+	spider_net_write_reg(card, SPIDER_NET_GDTDMACCNTR,
+			     SPIDER_NET_DMA_TX_FEND_VALUE);
+
+	/* turn off DMA, force end */
+	spider_net_disable_rxdmac(card);
+
+	/* release chains */
+	spider_net_release_tx_chain(card, 1);
+
+	/* switch off card */
+	spider_net_write_reg(card, SPIDER_NET_CKRCTRL,
+			     SPIDER_NET_CKRCTRL_STOP_VALUE);
+
+	spider_net_free_chain(card, &card->tx_chain);
+	spider_net_free_chain(card, &card->rx_chain);
+
+	return 0;
+}
+
+/**
+ * spider_net_get_next_tx_descr - returns the next available tx descriptor
+ * @card: device structure to get descriptor from
+ *
+ * returns the address of the next descriptor, or NULL if not available.
+ */
+static struct spider_net_descr *
+spider_net_get_next_tx_descr(struct spider_net_card *card)
+{
+	/* check, if head points to not-in-use descr */
+	if ( spider_net_get_descr_status(card->tx_chain.head) ==
+	     SPIDER_NET_DESCR_NOT_IN_USE ) {
+		return card->tx_chain.head;
+	} else {
+		return NULL;
+	}
+}
+
+/**
+ * spider_net_set_txdescr_cmdstat - sets the tx descriptor command field
+ * @descr: descriptor structure to fill out
+ * @skb: packet to consider
+ *
+ * fills out the command and status field of the descriptor structure,
+ * depending on hardware checksum settings. This function assumes a wmb()
+ * has executed before.
+ */
+static void
+spider_net_set_txdescr_cmdstat(struct spider_net_descr *descr,
+			       struct sk_buff *skb)
+{
+	if (skb->ip_summed != CHECKSUM_HW) {
+		descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_NOCS;
+		return;
+	}
+
+	/* is packet ip?
+	 * if yes: tcp? udp? */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		if (skb->nh.iph->protocol == IPPROTO_TCP) {
+			descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_TCPCS;
+		} else if (skb->nh.iph->protocol == IPPROTO_UDP) {
+			descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_UDPCS;
+		} else { /* the stack should checksum non-tcp and non-udp
+			    packets on his own: NETIF_F_IP_CSUM */
+			descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_NOCS;
+		}
+	}
+}
+
+/**
+ * spider_net_prepare_tx_descr - fill tx descriptor with skb data
+ * @card: card structure
+ * @descr: descriptor structure to fill out
+ * @skb: packet to use
+ *
+ * returns 0 on success, <0 on failure.
+ *
+ * fills out the descriptor structure with skb data and len. Copies data,
+ * if needed (32bit DMA!)
+ */
+static int
+spider_net_prepare_tx_descr(struct spider_net_card *card,
+			    struct spider_net_descr *descr,
+			    struct sk_buff *skb)
+{
+	descr->buf_addr = pci_map_single(card->pdev, skb->data,
+					 skb->len, PCI_DMA_BIDIRECTIONAL);
+	if (descr->buf_addr == DMA_ERROR_CODE) {
+		if (netif_msg_tx_err(card))
+			pr_err("could not iommu-map packet (%p, %i). "
+				  "Dropping packet\n", skb->data, skb->len);
+		return -ENOMEM;
+	}
+
+	descr->buf_size = skb->len;
+	descr->skb = skb;
+	descr->data_status = 0;
+
+	/* make sure the above values are in memory before we change the
+	 * status */
+	wmb();
+
+	spider_net_set_txdescr_cmdstat(descr,skb);
+
+	return 0;
+}
+
+/**
+ * spider_net_kick_tx_dma - enables TX DMA processing
+ * @card: card structure
+ * @descr: descriptor address to enable TX processing at
+ *
+ * spider_net_kick_tx_dma writes the current tx chain head as start address
+ * of the tx descriptor chain and enables the transmission DMA engine
+ */
+static void
+spider_net_kick_tx_dma(struct spider_net_card *card,
+		       struct spider_net_descr *descr)
+{
+	/* this is the only descriptor in the output chain.
+	 * Enable TX DMA */
+
+	spider_net_write_reg(card, SPIDER_NET_GDTDCHA,
+			     descr->bus_addr);
+
+	spider_net_write_reg(card, SPIDER_NET_GDTDMACCNTR,
+			     SPIDER_NET_DMA_TX_VALUE);
+}
+
+/**
+ * spider_net_xmit - transmits a frame over the device
+ * @skb: packet to send out
+ * @netdev: interface device structure
+ *
+ * returns 0 on success, <0 on failure
+ */
+static int
+spider_net_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	struct spider_net_descr *descr;
+	int result;
+
+	descr = spider_net_get_next_tx_descr(card);
+
+	if (!descr) {
+		netif_stop_queue(netdev);
+
+		descr = spider_net_get_next_tx_descr(card);
+		if (!descr)
+			goto error;
+		else
+			netif_start_queue(netdev);
+	}
+
+	result = spider_net_prepare_tx_descr(card, descr, skb);
+	if (result)
+		goto error;
+
+	card->tx_chain.head = card->tx_chain.head->next;
+
+	/* make sure the status from spider_net_prepare_tx_descr is in
+	 * memory before we check out the previous descriptor */
+	wmb();
+
+	if (spider_net_get_descr_status(descr->prev) !=
+	    SPIDER_NET_DESCR_CARDOWNED)
+		spider_net_kick_tx_dma(card, descr);
+
+	return NETDEV_TX_OK;
+
+error:
+	card->netdev_stats.tx_dropped++;
+	return NETDEV_TX_LOCKED;
+}
+
+/**
+ * spider_net_do_ioctl - called for device ioctls
+ * @netdev: interface device structure
+ * @ifr: request parameter structure for ioctl
+ * @cmd: command code for ioctl
+ *
+ * returns 0 on success, <0 on failure. Currently, we have no special ioctls.
+ * -EOPNOTSUPP is returned, if an unknown ioctl was requested
+ */
+static int
+spider_net_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+	switch (cmd) {
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/**
+ * spider_net_pass_skb_up - takes an skb from a descriptor and passes it on
+ * @descr: descriptor to process
+ * @card: card structure
+ *
+ * returns 1 on success, 0 if no packet was passed to the stack
+ *
+ * iommu-unmaps the skb, fills out skb structure and passes the data to the
+ * stack. The descriptor state is not changed.
+ */
+static int
+spider_net_pass_skb_up(struct spider_net_descr *descr,
+		       struct spider_net_card *card)
+{
+	struct sk_buff *skb;
+	struct net_device *netdev;
+	u32 data_status, data_error;
+
+	data_status = descr->data_status;
+	data_error = descr->data_error;
+
+	netdev = card->netdev;
+
+	/* check for errors in the data_error flag */
+	if ((data_error & SPIDER_NET_DATA_ERROR_MASK) &&
+	    netif_msg_rx_err(card))
+		pr_err("error in received descriptor found, "
+		       "data_status=x%08x, data_error=x%08x\n",
+		       data_status, data_error);
+
+	/* prepare skb, unmap descriptor */
+	skb = descr->skb;
+	pci_unmap_single(card->pdev, descr->buf_addr, SPIDER_NET_MAX_MTU,
+			 PCI_DMA_BIDIRECTIONAL);
+
+	/* the cases we'll throw away the packet immediately */
+	if (data_error & SPIDER_NET_DESTROY_RX_FLAGS)
+		return 0;
+
+	skb->dev = netdev;
+	skb_put(skb, descr->valid_size);
+
+	/* the card seems to add 2 bytes of junk in front
+	 * of the ethernet frame */
+#define SPIDER_MISALIGN		2
+	skb_pull(skb, SPIDER_MISALIGN);
+	skb->protocol = eth_type_trans(skb, netdev);
+
+	/* checksum offload */
+	if (card->options.rx_csum) {
+		if ( (data_status & SPIDER_NET_DATA_STATUS_CHK_MASK) &&
+		     (!(data_error & SPIDER_NET_DATA_ERROR_CHK_MASK)) )
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		else
+			skb->ip_summed = CHECKSUM_NONE;
+	} else {
+		skb->ip_summed = CHECKSUM_NONE;
+	}
+
+	if (data_status & SPIDER_NET_VLAN_PACKET) {
+		/* further enhancements: HW-accel VLAN
+		 * vlan_hwaccel_receive_skb
+		 */
+	}
+
+	/* pass skb up to stack */
+	netif_receive_skb(skb);
+
+	/* update netdevice statistics */
+	card->netdev_stats.rx_packets++;
+	card->netdev_stats.rx_bytes += skb->len;
+
+	return 1;
+}
+
+/**
+ * spider_net_decode_descr - processes an rx descriptor
+ * @card: card structure
+ *
+ * returns 1 if a packet has been sent to the stack, otherwise 0
+ *
+ * processes an rx descriptor by iommu-unmapping the data buffer and passing
+ * the packet up to the stack
+ */
+static int
+spider_net_decode_one_descr(struct spider_net_card *card)
+{
+	enum spider_net_descr_status status;
+	struct spider_net_descr *descr;
+	struct spider_net_descr_chain *chain;
+	int result;
+
+	chain = &card->rx_chain;
+	descr = chain->tail;
+
+	status = spider_net_get_descr_status(descr);
+
+	if (status == SPIDER_NET_DESCR_CARDOWNED) {
+		/* nothing in the descriptor yet */
+		return 0;
+	}
+
+	if (status == SPIDER_NET_DESCR_NOT_IN_USE) {
+		/* not initialized yet, I bet chain->tail == chain->head
+		 * and the ring is empty */
+		spider_net_refill_rx_chain(card);
+		return 0;
+	}
+
+	/* descriptor definitively used -- move on head */
+	chain->tail = descr->next;
+
+	result = 0;
+	if ( (status == SPIDER_NET_DESCR_RESPONSE_ERROR) ||
+	     (status == SPIDER_NET_DESCR_PROTECTION_ERROR) ||
+	     (status == SPIDER_NET_DESCR_FORCE_END) ) {
+		if (netif_msg_rx_err(card))
+			pr_err("%s: dropping RX descriptor with state %d\n",
+			       card->netdev->name, status);
+		card->netdev_stats.rx_dropped++;
+		goto refill;
+	}
+
+	if ( (status != SPIDER_NET_DESCR_COMPLETE) &&
+	     (status != SPIDER_NET_DESCR_FRAME_END) ) {
+		if (netif_msg_rx_err(card))
+			pr_err("%s: RX descriptor with state %d\n",
+			       card->netdev->name, status);
+		goto refill;
+	}
+
+	/* ok, we've got a packet in descr */
+	result = spider_net_pass_skb_up(descr, card);
+refill:
+	spider_net_set_descr_status(descr, SPIDER_NET_DESCR_NOT_IN_USE);
+	/* change the descriptor state: */
+	spider_net_refill_rx_chain(card);
+
+	return result;
+}
+
+/**
+ * spider_net_poll - NAPI poll function called by the stack to return packets
+ * @netdev: interface device structure
+ * @budget: number of packets we can pass to the stack at most
+ *
+ * returns 0 if no more packets available to the driver/stack. Returns 1,
+ * if the quota is exceeded, but the driver has still packets.
+ *
+ * spider_net_poll returns all packets from the rx descriptors to the stack
+ * (using netif_receive_skb). If all/enough packets are up, the driver
+ * reenables interrupts and returns 0. If not, 1 is returned.
+ */
+static int
+spider_net_poll(struct net_device *netdev, int *budget)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	int packets_to_do, packets_done = 0;
+	int no_more_packets = 0;
+
+	packets_to_do = min(*budget, netdev->quota);
+
+	while (packets_to_do) {
+		if (spider_net_decode_one_descr(card)) {
+			packets_done++;
+			packets_to_do--;
+		} else {
+			/* no more packets for the stack */
+			no_more_packets = 1;
+			break;
+		}
+	}
+
+	netdev->quota -= packets_done;
+	*budget -= packets_done;
+
+	/* if all packets are in the stack, enable interrupts and return 0 */
+	/* if not, return 1 */
+	if (no_more_packets) {
+		netif_rx_complete(netdev);
+		spider_net_rx_irq_on(card);
+		return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * spider_net_vlan_rx_reg - initializes VLAN structures in the driver and card
+ * @netdev: interface device structure
+ * @grp: vlan_group structure that is registered (NULL on destroying interface)
+ */
+static void
+spider_net_vlan_rx_reg(struct net_device *netdev, struct vlan_group *grp)
+{
+	/* further enhancement... yet to do */
+	return;
+}
+
+/**
+ * spider_net_vlan_rx_add - adds VLAN id to the card filter
+ * @netdev: interface device structure
+ * @vid: VLAN id to add
+ */
+static void
+spider_net_vlan_rx_add(struct net_device *netdev, uint16_t vid)
+{
+	/* further enhancement... yet to do */
+	/* add vid to card's VLAN filter table */
+	return;
+}
+
+/**
+ * spider_net_vlan_rx_kill - removes VLAN id to the card filter
+ * @netdev: interface device structure
+ * @vid: VLAN id to remove
+ */
+static void
+spider_net_vlan_rx_kill(struct net_device *netdev, uint16_t vid)
+{
+	/* further enhancement... yet to do */
+	/* remove vid from card's VLAN filter table */
+}
+
+/**
+ * spider_net_get_stats - get interface statistics
+ * @netdev: interface device structure
+ *
+ * returns the interface statistics residing in the spider_net_card struct
+ */
+static struct net_device_stats *
+spider_net_get_stats(struct net_device *netdev)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	struct net_device_stats *stats = &card->netdev_stats;
+	return stats;
+}
+
+/**
+ * spider_net_change_mtu - changes the MTU of an interface
+ * @netdev: interface device structure
+ * @new_mtu: new MTU value
+ *
+ * returns 0 on success, <0 on failure
+ */
+static int
+spider_net_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	/* no need to re-alloc skbs or so -- the max mtu is about 2.3k
+	 * and mtu is outbound only anyway */
+	if ( (new_mtu < SPIDER_NET_MIN_MTU ) ||
+		(new_mtu > SPIDER_NET_MAX_MTU) )
+		return -EINVAL;
+	netdev->mtu = new_mtu;
+	return 0;
+}
+
+/**
+ * spider_net_set_mac - sets the MAC of an interface
+ * @netdev: interface device structure
+ * @ptr: pointer to new MAC address
+ *
+ * Returns 0 on success, <0 on failure. Currently, we don't support this
+ * and will always return EOPNOTSUPP.
+ */
+static int
+spider_net_set_mac(struct net_device *netdev, void *p)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	u32 macl, macu;
+	struct sockaddr *addr = p;
+
+	/* GMACTPE and GMACRPE must be off, so we only allow this, if
+	 * the device is down */
+	if (netdev->flags & IFF_UP)
+		return -EBUSY;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	macu = (addr->sa_data[0]<<24) + (addr->sa_data[1]<<16) +
+		(addr->sa_data[2]<<8) + (addr->sa_data[3]);
+	macl = (addr->sa_data[4]<<8) + (addr->sa_data[5]);
+	spider_net_write_reg(card, SPIDER_NET_GMACUNIMACU, macu);
+	spider_net_write_reg(card, SPIDER_NET_GMACUNIMACL, macl);
+
+	spider_net_set_promisc(card);
+
+	/* look up, whether we have been successful */
+	if (spider_net_get_mac_address(netdev))
+		return -EADDRNOTAVAIL;
+	if (memcmp(netdev->dev_addr,addr->sa_data,netdev->addr_len))
+		return -EADDRNOTAVAIL;
+
+	return 0;
+}
+
+/**
+ * spider_net_enable_txdmac - enables a TX DMA controller
+ * @card: card structure
+ *
+ * spider_net_enable_txdmac enables the TX DMA controller by setting the
+ * descriptor chain tail address
+ */
+static void
+spider_net_enable_txdmac(struct spider_net_card *card)
+{
+	/* assume chain is aligned correctly */
+	spider_net_write_reg(card, SPIDER_NET_GDTDCHA,
+			     card->tx_chain.tail->bus_addr);
+}
+
+/**
+ * spider_net_handle_error_irq - handles errors raised by an interrupt
+ * @card: card structure
+ * @status_reg: interrupt status register 0 (GHIINT0STS)
+ *
+ * spider_net_handle_error_irq treats or ignores all error conditions
+ * found when an interrupt is presented
+ */
+static void
+spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg)
+{
+	u32 error_reg1, error_reg2;
+	u32 i;
+	int show_error = 1;
+
+	error_reg1 = spider_net_read_reg(card, SPIDER_NET_GHIINT1STS);
+	error_reg2 = spider_net_read_reg(card, SPIDER_NET_GHIINT2STS);
+
+	/* check GHIINT0STS ************************************/
+	if (status_reg)
+		for (i = 0; i < 32; i++)
+			if (status_reg & (1<<i))
+				switch (i)
+	{
+	/* let error_reg1 and error_reg2 evaluation decide, what to do
+	case SPIDER_NET_PHYINT:
+	case SPIDER_NET_GMAC2INT:
+	case SPIDER_NET_GMAC1INT:
+	case SPIDER_NET_GIPSINT:
+	case SPIDER_NET_GFIFOINT:
+	case SPIDER_NET_DMACINT:
+	case SPIDER_NET_GSYSINT:
+		break; */
+
+	case SPIDER_NET_GPWOPCMPINT:
+		/* PHY write operation completed */
+		show_error = 0;
+		break;
+	case SPIDER_NET_GPROPCMPINT:
+		/* PHY read operation completed */
+		/* we don't use semaphores, as we poll for the completion
+		 * of the read operation in spider_net_read_phy. Should take
+		 * about 50 us */
+		show_error = 0;
+		break;
+	case SPIDER_NET_GPWFFINT:
+		/* PHY command queue full */
+		if (netif_msg_intr(card))
+			pr_err("PHY write queue full\n");
+		show_error = 0;
+		break;
+
+	/* case SPIDER_NET_GRMDADRINT: not used. print a message */
+	/* case SPIDER_NET_GRMARPINT: not used. print a message */
+	/* case SPIDER_NET_GRMMPINT: not used. print a message */
+
+	case SPIDER_NET_GDTDEN0INT:
+		/* someone has set TX_DMA_EN to 0 */
+		show_error = 0;
+		break;
+
+	case SPIDER_NET_GDDDEN0INT: /* fallthrough */
+	case SPIDER_NET_GDCDEN0INT: /* fallthrough */
+	case SPIDER_NET_GDBDEN0INT: /* fallthrough */
+	case SPIDER_NET_GDADEN0INT:
+		/* someone has set RX_DMA_EN to 0 */
+		show_error = 0;
+		break;
+
+	/* RX interrupts */
+	case SPIDER_NET_GDDFDCINT:
+	case SPIDER_NET_GDCFDCINT:
+	case SPIDER_NET_GDBFDCINT:
+	case SPIDER_NET_GDAFDCINT:
+	/* case SPIDER_NET_GDNMINT: not used. print a message */
+	/* case SPIDER_NET_GCNMINT: not used. print a message */
+	/* case SPIDER_NET_GBNMINT: not used. print a message */
+	/* case SPIDER_NET_GANMINT: not used. print a message */
+	/* case SPIDER_NET_GRFNMINT: not used. print a message */
+		show_error = 0;
+		break;
+
+	/* TX interrupts */
+	case SPIDER_NET_GDTFDCINT:
+		show_error = 0;
+		break;
+	case SPIDER_NET_GTTEDINT:
+		show_error = 0;
+		break;
+	case SPIDER_NET_GDTDCEINT:
+		/* chain end. If a descriptor should be sent, kick off
+		 * tx dma
+		if (card->tx_chain.tail == card->tx_chain.head)
+			spider_net_kick_tx_dma(card);
+		show_error = 0; */
+		break;
+
+	/* case SPIDER_NET_G1TMCNTINT: not used. print a message */
+	/* case SPIDER_NET_GFREECNTINT: not used. print a message */
+	}
+
+	/* check GHIINT1STS ************************************/
+	if (error_reg1)
+		for (i = 0; i < 32; i++)
+			if (error_reg1 & (1<<i))
+				switch (i)
+	{
+	case SPIDER_NET_GTMFLLINT:
+		if (netif_msg_intr(card))
+			pr_err("Spider TX RAM full\n");
+		show_error = 0;
+		break;
+	case SPIDER_NET_GRMFLLINT:
+		if (netif_msg_intr(card))
+			pr_err("Spider RX RAM full, incoming packets "
+			       "might be discarded !\n");
+		netif_rx_schedule(card->netdev);
+		spider_net_enable_rxchtails(card);
+		spider_net_enable_rxdmac(card);
+		break;
+
+	/* case SPIDER_NET_GTMSHTINT: problem, print a message */
+	case SPIDER_NET_GDTINVDINT:
+		/* allrighty. tx from previous descr ok */
+		show_error = 0;
+		break;
+	/* case SPIDER_NET_GRFDFLLINT: print a message down there */
+	/* case SPIDER_NET_GRFCFLLINT: print a message down there */
+	/* case SPIDER_NET_GRFBFLLINT: print a message down there */
+	/* case SPIDER_NET_GRFAFLLINT: print a message down there */
+
+	/* chain end */
+	case SPIDER_NET_GDDDCEINT: /* fallthrough */
+	case SPIDER_NET_GDCDCEINT: /* fallthrough */
+	case SPIDER_NET_GDBDCEINT: /* fallthrough */
+	case SPIDER_NET_GDADCEINT:
+		if (netif_msg_intr(card))
+			pr_err("got descriptor chain end interrupt, "
+			       "restarting DMAC %c.\n",
+			       'D'+i-SPIDER_NET_GDDDCEINT);
+		spider_net_refill_rx_chain(card);
+		show_error = 0;
+		break;
+
+	/* invalid descriptor */
+	case SPIDER_NET_GDDINVDINT: /* fallthrough */
+	case SPIDER_NET_GDCINVDINT: /* fallthrough */
+	case SPIDER_NET_GDBINVDINT: /* fallthrough */
+	case SPIDER_NET_GDAINVDINT:
+		/* could happen when rx chain is full */
+		spider_net_refill_rx_chain(card);
+		show_error = 0;
+		break;
+
+	/* case SPIDER_NET_GDTRSERINT: problem, print a message */
+	/* case SPIDER_NET_GDDRSERINT: problem, print a message */
+	/* case SPIDER_NET_GDCRSERINT: problem, print a message */
+	/* case SPIDER_NET_GDBRSERINT: problem, print a message */
+	/* case SPIDER_NET_GDARSERINT: problem, print a message */
+	/* case SPIDER_NET_GDSERINT: problem, print a message */
+	/* case SPIDER_NET_GDTPTERINT: problem, print a message */
+	/* case SPIDER_NET_GDDPTERINT: problem, print a message */
+	/* case SPIDER_NET_GDCPTERINT: problem, print a message */
+	/* case SPIDER_NET_GDBPTERINT: problem, print a message */
+	/* case SPIDER_NET_GDAPTERINT: problem, print a message */
+	default:
+		show_error = 1;
+		break;
+	}
+
+	/* check GHIINT2STS ************************************/
+	if (error_reg2)
+		for (i = 0; i < 32; i++)
+			if (error_reg2 & (1<<i))
+				switch (i)
+	{
+	/* there is nothing we can (want  to) do at this time. Log a
+	 * message, we can switch on and off the specific values later on
+	case SPIDER_NET_GPROPERINT:
+	case SPIDER_NET_GMCTCRSNGINT:
+	case SPIDER_NET_GMCTLCOLINT:
+	case SPIDER_NET_GMCTTMOTINT:
+	case SPIDER_NET_GMCRCAERINT:
+	case SPIDER_NET_GMCRCALERINT:
+	case SPIDER_NET_GMCRALNERINT:
+	case SPIDER_NET_GMCROVRINT:
+	case SPIDER_NET_GMCRRNTINT:
+	case SPIDER_NET_GMCRRXERINT:
+	case SPIDER_NET_GTITCSERINT:
+	case SPIDER_NET_GTIFMTERINT:
+	case SPIDER_NET_GTIPKTRVKINT:
+	case SPIDER_NET_GTISPINGINT:
+	case SPIDER_NET_GTISADNGINT:
+	case SPIDER_NET_GTISPDNGINT:
+	case SPIDER_NET_GRIFMTERINT:
+	case SPIDER_NET_GRIPKTRVKINT:
+	case SPIDER_NET_GRISPINGINT:
+	case SPIDER_NET_GRISADNGINT:
+	case SPIDER_NET_GRISPDNGINT:
+		break;
+	*/
+		default:
+			break;
+	}
+
+	if ((show_error) && (netif_msg_intr(card)))
+		pr_err("Got error interrupt, GHIINT0STS = 0x%08x, "
+		       "GHIINT1STS = 0x%08x, GHIINT2STS = 0x%08x\n",
+		       status_reg, error_reg1, error_reg2);
+
+	/* clear interrupt sources */
+	spider_net_write_reg(card, SPIDER_NET_GHIINT1STS, error_reg1);
+	spider_net_write_reg(card, SPIDER_NET_GHIINT2STS, error_reg2);
+}
+
+/**
+ * spider_net_interrupt - interrupt handler for spider_net
+ * @irq: interupt number
+ * @ptr: pointer to net_device
+ * @regs: PU registers
+ *
+ * returns IRQ_HANDLED, if interrupt was for driver, or IRQ_NONE, if no
+ * interrupt found raised by card.
+ *
+ * This is the interrupt handler, that turns off
+ * interrupts for this device and makes the stack poll the driver
+ */
+static irqreturn_t
+spider_net_interrupt(int irq, void *ptr, struct pt_regs *regs)
+{
+	struct net_device *netdev = ptr;
+	struct spider_net_card *card = netdev_priv(netdev);
+	u32 status_reg;
+
+	status_reg = spider_net_read_reg(card, SPIDER_NET_GHIINT0STS);
+
+	if (!status_reg)
+		return IRQ_NONE;
+
+	if (status_reg & SPIDER_NET_TXINT)
+		spider_net_release_tx_chain(card, 0);
+
+	if (status_reg & SPIDER_NET_RXINT ) {
+		spider_net_rx_irq_off(card);
+		netif_rx_schedule(netdev);
+	}
+
+	/* we do this after rx and tx processing, as we want the tx chain
+	 * processed to see, whether we should restart tx dma processing */
+	spider_net_handle_error_irq(card, status_reg);
+
+	/* clear interrupt sources */
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0STS, status_reg);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/**
+ * spider_net_poll_controller - artificial interrupt for netconsole etc.
+ * @netdev: interface device structure
+ *
+ * see Documentation/networking/netconsole.txt
+ */
+static void
+spider_net_poll_controller(struct net_device *netdev)
+{
+	disable_irq(netdev->irq);
+	spider_net_interrupt(netdev->irq, netdev, NULL);
+	enable_irq(netdev->irq);
+}
+#endif /* CONFIG_NET_POLL_CONTROLLER */
+
+/**
+ * spider_net_init_card - initializes the card
+ * @card: card structure
+ *
+ * spider_net_init_card initializes the card so that other registers can
+ * be used
+ */
+static void
+spider_net_init_card(struct spider_net_card *card)
+{
+	spider_net_write_reg(card, SPIDER_NET_CKRCTRL,
+			     SPIDER_NET_CKRCTRL_STOP_VALUE);
+
+	spider_net_write_reg(card, SPIDER_NET_CKRCTRL,
+			     SPIDER_NET_CKRCTRL_RUN_VALUE);
+}
+
+/**
+ * spider_net_enable_card - enables the card by setting all kinds of regs
+ * @card: card structure
+ *
+ * spider_net_enable_card sets a lot of SMMIO registers to enable the device
+ */
+static void
+spider_net_enable_card(struct spider_net_card *card)
+{
+	int i;
+	/* the following array consists of (register),(value) pairs
+	 * that are set in this function. A register of 0 ends the list */
+	u32 regs[][2] = {
+		{ SPIDER_NET_GRESUMINTNUM, 0 },
+		{ SPIDER_NET_GREINTNUM, 0 },
+
+		/* set interrupt frame number registers */
+		/* clear the single DMA engine registers first */
+		{ SPIDER_NET_GFAFRMNUM, SPIDER_NET_GFXFRAMES_VALUE },
+		{ SPIDER_NET_GFBFRMNUM, SPIDER_NET_GFXFRAMES_VALUE },
+		{ SPIDER_NET_GFCFRMNUM, SPIDER_NET_GFXFRAMES_VALUE },
+		{ SPIDER_NET_GFDFRMNUM, SPIDER_NET_GFXFRAMES_VALUE },
+		/* then set, what we really need */
+		{ SPIDER_NET_GFFRMNUM, SPIDER_NET_FRAMENUM_VALUE },
+
+		/* timer counter registers and stuff */
+		{ SPIDER_NET_GFREECNNUM, 0 },
+		{ SPIDER_NET_GONETIMENUM, 0 },
+		{ SPIDER_NET_GTOUTFRMNUM, 0 },
+
+		/* RX mode setting */
+		{ SPIDER_NET_GRXMDSET, SPIDER_NET_RXMODE_VALUE },
+		/* TX mode setting */
+		{ SPIDER_NET_GTXMDSET, SPIDER_NET_TXMODE_VALUE },
+		/* IPSEC mode setting */
+		{ SPIDER_NET_GIPSECINIT, SPIDER_NET_IPSECINIT_VALUE },
+
+		{ SPIDER_NET_GFTRESTRT, SPIDER_NET_RESTART_VALUE },
+
+		{ SPIDER_NET_GMRWOLCTRL, 0 },
+		{ SPIDER_NET_GTESTMD, 0 },
+
+		{ SPIDER_NET_GMACINTEN, 0 },
+
+		/* flow control stuff */
+		{ SPIDER_NET_GMACAPAUSE, SPIDER_NET_MACAPAUSE_VALUE },
+		{ SPIDER_NET_GMACTXPAUSE, SPIDER_NET_TXPAUSE_VALUE },
+
+		{ SPIDER_NET_GMACBSTLMT, SPIDER_NET_BURSTLMT_VALUE },
+		{ 0, 0}
+	};
+
+	i = 0;
+	while (regs[i][0]) {
+		spider_net_write_reg(card, regs[i][0], regs[i][1]);
+		i++;
+	}
+
+	/* clear unicast filter table entries 1 to 14 */
+	for (i = 1; i <= 14; i++) {
+		spider_net_write_reg(card,
+				     SPIDER_NET_GMRUAFILnR + i * 8,
+				     0x00080000);
+		spider_net_write_reg(card,
+				     SPIDER_NET_GMRUAFILnR + i * 8 + 4,
+				     0x00000000);
+	}
+
+	spider_net_write_reg(card, SPIDER_NET_GMRUA0FIL15R, 0x08080000);
+
+	spider_net_write_reg(card, SPIDER_NET_ECMODE, SPIDER_NET_ECMODE_VALUE);
+
+	/* set chain tail adress for RX chains and
+	 * enable DMA */
+	spider_net_enable_rxchtails(card);
+	spider_net_enable_rxdmac(card);
+
+	spider_net_write_reg(card, SPIDER_NET_GRXDMAEN, SPIDER_NET_WOL_VALUE);
+
+	/* set chain tail adress for TX chain */
+	spider_net_enable_txdmac(card);
+
+	spider_net_write_reg(card, SPIDER_NET_GMACLENLMT,
+			     SPIDER_NET_LENLMT_VALUE);
+	spider_net_write_reg(card, SPIDER_NET_GMACMODE,
+			     SPIDER_NET_MACMODE_VALUE);
+	spider_net_write_reg(card, SPIDER_NET_GMACOPEMD,
+			     SPIDER_NET_OPMODE_VALUE);
+
+	/* set interrupt mask registers */
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK,
+			     SPIDER_NET_INT0_MASK_VALUE);
+	spider_net_write_reg(card, SPIDER_NET_GHIINT1MSK,
+			     SPIDER_NET_INT1_MASK_VALUE);
+	spider_net_write_reg(card, SPIDER_NET_GHIINT2MSK,
+			     SPIDER_NET_INT2_MASK_VALUE);
+}
+
+/**
+ * spider_net_open - called upon ifonfig up
+ * @netdev: interface device structure
+ *
+ * returns 0 on success, <0 on failure
+ *
+ * spider_net_open allocates all the descriptors and memory needed for
+ * operation, sets up multicast list and enables interrupts
+ */
+int
+spider_net_open(struct net_device *netdev)
+{
+	struct spider_net_card *card = netdev_priv(netdev);
+	int result;
+
+	result = -ENOMEM;
+	if (spider_net_init_chain(card, &card->tx_chain,
+			  card->descr, tx_descriptors))
+		goto alloc_tx_failed;
+	if (spider_net_init_chain(card, &card->rx_chain,
+			  card->descr + tx_descriptors, rx_descriptors))
+		goto alloc_rx_failed;
+
+	/* allocate rx skbs */
+	if (spider_net_alloc_rx_skbs(card))
+		goto alloc_skbs_failed;
+
+	spider_net_set_multi(netdev);
+
+	/* further enhancement: setup hw vlan, if needed */
+
+	result = -EBUSY;
+	if (request_irq(netdev->irq, spider_net_interrupt,
+			     SA_SHIRQ, netdev->name, netdev))
+		goto register_int_failed;
+
+	spider_net_enable_card(card);
+
+	return 0;
+
+register_int_failed:
+	spider_net_free_rx_chain_contents(card);
+alloc_skbs_failed:
+	spider_net_free_chain(card, &card->rx_chain);
+alloc_rx_failed:
+	spider_net_free_chain(card, &card->tx_chain);
+alloc_tx_failed:
+	return result;
+}
+
+/**
+ * spider_net_setup_phy - setup PHY
+ * @card: card structure
+ *
+ * returns 0 on success, <0 on failure
+ *
+ * spider_net_setup_phy is used as part of spider_net_probe. Sets
+ * the PHY to 1000 Mbps
+ **/
+static int
+spider_net_setup_phy(struct spider_net_card *card)
+{
+	struct mii_phy *phy = &card->phy;
+
+	spider_net_write_reg(card, SPIDER_NET_GDTDMASEL,
+			     SPIDER_NET_DMASEL_VALUE);
+	spider_net_write_reg(card, SPIDER_NET_GPCCTRL,
+			     SPIDER_NET_PHY_CTRL_VALUE);
+	phy->mii_id = 1;
+	phy->dev = card->netdev;
+	phy->mdio_read = spider_net_read_phy;
+	phy->mdio_write = spider_net_write_phy;
+
+	mii_phy_probe(phy, phy->mii_id);
+
+	if (phy->def->ops->setup_forced)
+		phy->def->ops->setup_forced(phy, SPEED_1000, DUPLEX_FULL);
+
+	/* the following two writes could be moved to sungem_phy.c */
+	/* enable fiber mode */
+	spider_net_write_phy(card->netdev, 1, MII_NCONFIG, 0x9020);
+	/* LEDs active in both modes, autosense prio = fiber */
+	spider_net_write_phy(card->netdev, 1, MII_NCONFIG, 0x945f);
+
+	phy->def->ops->read_link(phy);
+	pr_info("Found %s with %i Mbps, %s-duplex.\n", phy->def->name,
+		phy->speed, phy->duplex==1 ? "Full" : "Half");
+
+	return 0;
+}
+
+/**
+ * spider_net_download_firmware - loads firmware into the adapter
+ * @card: card structure
+ * @firmware: firmware pointer
+ *
+ * spider_net_download_firmware loads the firmware opened by
+ * spider_net_init_firmware into the adapter.
+ */
+static void
+spider_net_download_firmware(struct spider_net_card *card,
+			     const struct firmware *firmware)
+{
+	int sequencer, i;
+	u32 *fw_ptr = (u32 *)firmware->data;
+
+	/* stop sequencers */
+	spider_net_write_reg(card, SPIDER_NET_GSINIT,
+			     SPIDER_NET_STOP_SEQ_VALUE);
+
+	for (sequencer = 0; sequencer < 6; sequencer++) {
+		spider_net_write_reg(card,
+				     SPIDER_NET_GSnPRGADR + sequencer * 8, 0);
+		for (i = 0; i < SPIDER_NET_FIRMWARE_LEN; i++) {
+			spider_net_write_reg(card, SPIDER_NET_GSnPRGDAT +
+					     sequencer * 8, *fw_ptr);
+			fw_ptr++;
+		}
+	}
+
+	spider_net_write_reg(card, SPIDER_NET_GSINIT,
+			     SPIDER_NET_RUN_SEQ_VALUE);
+}
+
+/**
+ * spider_net_init_firmware - reads in firmware parts
+ * @card: card structure
+ *
+ * Returns 0 on success, <0 on failure
+ *
+ * spider_net_init_firmware opens the sequencer firmware and does some basic
+ * checks. This function opens and releases the firmware structure. A call
+ * to download the firmware is performed before the release.
+ *
+ * Firmware format
+ * ===============
+ * spider_fw.bin is expected to be a file containing 6*1024*4 bytes, 4k being
+ * the program for each sequencer. Use the command
+ *    tail -q -n +2 Seq_code1_0x088.txt Seq_code2_0x090.txt              \
+ *         Seq_code3_0x098.txt Seq_code4_0x0A0.txt Seq_code5_0x0A8.txt   \
+ *         Seq_code6_0x0B0.txt | xxd -r -p -c4 > spider_fw.bin
+ *
+ * to generate spider_fw.bin, if you have sequencer programs with something
+ * like the following contents for each sequencer:
+ *    <ONE LINE COMMENT>
+ *    <FIRST 4-BYTES-WORD FOR SEQUENCER>
+ *    <SECOND 4-BYTES-WORD FOR SEQUENCER>
+ *     ...
+ *    <1024th 4-BYTES-WORD FOR SEQUENCER>
+ */
+static int
+spider_net_init_firmware(struct spider_net_card *card)
+{
+	const struct firmware *firmware;
+	int err = -EIO;
+
+	if (request_firmware(&firmware,
+			     SPIDER_NET_FIRMWARE_NAME, &card->pdev->dev) < 0) {
+		if (netif_msg_probe(card))
+			pr_err("Couldn't read in sequencer data file %s.\n",
+			       SPIDER_NET_FIRMWARE_NAME);
+		firmware = NULL;
+		goto out;
+	}
+
+	if (firmware->size != 6 * SPIDER_NET_FIRMWARE_LEN * sizeof(u32)) {
+		if (netif_msg_probe(card))
+			pr_err("Invalid size of sequencer data file %s.\n",
+			       SPIDER_NET_FIRMWARE_NAME);
+		goto out;
+	}
+
+	spider_net_download_firmware(card, firmware);
+
+	err = 0;
+out:
+	release_firmware(firmware);
+
+	return err;
+}
+
+/**
+ * spider_net_workaround_rxramfull - work around firmware bug
+ * @card: card structure
+ *
+ * no return value
+ **/
+static void
+spider_net_workaround_rxramfull(struct spider_net_card *card)
+{
+	int i, sequencer = 0;
+
+	/* cancel reset */
+	spider_net_write_reg(card, SPIDER_NET_CKRCTRL,
+			     SPIDER_NET_CKRCTRL_RUN_VALUE);
+
+	/* empty sequencer data */
+	for (sequencer = 0; sequencer < 6; sequencer++) {
+		spider_net_write_reg(card, SPIDER_NET_GSnPRGDAT +
+				     sequencer * 8, 0x0);
+		for (i = 0; i < SPIDER_NET_FIRMWARE_LEN; i++) {
+			spider_net_write_reg(card, SPIDER_NET_GSnPRGDAT +
+					     sequencer * 8, 0x0);
+		}
+	}
+
+	/* set sequencer operation */
+	spider_net_write_reg(card, SPIDER_NET_GSINIT, 0x000000fe);
+
+	/* reset */
+	spider_net_write_reg(card, SPIDER_NET_CKRCTRL,
+			     SPIDER_NET_CKRCTRL_STOP_VALUE);
+}
+
+/**
+ * spider_net_tx_timeout_task - task scheduled by the watchdog timeout
+ * function (to be called not under interrupt status)
+ * @data: data, is interface device structure
+ *
+ * called as task when tx hangs, resets interface (if interface is up)
+ */
+static void
+spider_net_tx_timeout_task(void *data)
+{
+	struct net_device *netdev = data;
+	struct spider_net_card *card = netdev_priv(netdev);
+
+	if (!(netdev->flags & IFF_UP))
+		goto out;
+
+	netif_device_detach(netdev);
+	spider_net_stop(netdev);
+
+	spider_net_workaround_rxramfull(card);
+	spider_net_init_card(card);
+
+	if (spider_net_setup_phy(card))
+		goto out;
+	if (spider_net_init_firmware(card))
+		goto out;
+
+	spider_net_open(netdev);
+	spider_net_kick_tx_dma(card, card->tx_chain.head);
+	netif_device_attach(netdev);
+
+out:
+	atomic_dec(&card->tx_timeout_task_counter);
+}
+
+/**
+ * spider_net_tx_timeout - called when the tx timeout watchdog kicks in.
+ * @netdev: interface device structure
+ *
+ * called, if tx hangs. Schedules a task that resets the interface
+ */
+static void
+spider_net_tx_timeout(struct net_device *netdev)
+{
+	struct spider_net_card *card;
+
+	card = netdev_priv(netdev);
+	atomic_inc(&card->tx_timeout_task_counter);
+	if (netdev->flags & IFF_UP)
+		schedule_work(&card->tx_timeout_task);
+	else
+		atomic_dec(&card->tx_timeout_task_counter);
+}
+
+/**
+ * spider_net_setup_netdev_ops - initialization of net_device operations
+ * @netdev: net_device structure
+ *
+ * fills out function pointers in the net_device structure
+ */
+static void
+spider_net_setup_netdev_ops(struct net_device *netdev)
+{
+	netdev->open = &spider_net_open;
+	netdev->stop = &spider_net_stop;
+	netdev->hard_start_xmit = &spider_net_xmit;
+	netdev->get_stats = &spider_net_get_stats;
+	netdev->set_multicast_list = &spider_net_set_multi;
+	netdev->set_mac_address = &spider_net_set_mac;
+	netdev->change_mtu = &spider_net_change_mtu;
+	netdev->do_ioctl = &spider_net_do_ioctl;
+	/* tx watchdog */
+	netdev->tx_timeout = &spider_net_tx_timeout;
+	netdev->watchdog_timeo = SPIDER_NET_WATCHDOG_TIMEOUT;
+	/* NAPI */
+	netdev->poll = &spider_net_poll;
+	netdev->weight = SPIDER_NET_NAPI_WEIGHT;
+	/* HW VLAN */
+	netdev->vlan_rx_register = &spider_net_vlan_rx_reg;
+	netdev->vlan_rx_add_vid = &spider_net_vlan_rx_add;
+	netdev->vlan_rx_kill_vid = &spider_net_vlan_rx_kill;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	/* poll controller */
+	netdev->poll_controller = &spider_net_poll_controller;
+#endif /* CONFIG_NET_POLL_CONTROLLER */
+	/* ethtool ops */
+	netdev->ethtool_ops = &spider_net_ethtool_ops;
+}
+
+/**
+ * spider_net_setup_netdev - initialization of net_device
+ * @card: card structure
+ *
+ * Returns 0 on success or <0 on failure
+ *
+ * spider_net_setup_netdev initializes the net_device structure
+ **/
+static int
+spider_net_setup_netdev(struct spider_net_card *card)
+{
+	int result;
+	struct net_device *netdev = card->netdev;
+	struct device_node *dn;
+	struct sockaddr addr;
+	u8 *mac;
+
+	SET_MODULE_OWNER(netdev);
+	SET_NETDEV_DEV(netdev, &card->pdev->dev);
+
+	pci_set_drvdata(card->pdev, netdev);
+	spin_lock_init(&card->intmask_lock);
+	netdev->irq = card->pdev->irq;
+
+	card->options.rx_csum = SPIDER_NET_RX_CSUM_DEFAULT;
+
+	spider_net_setup_netdev_ops(netdev);
+
+	netdev->features = 0;
+	/* some time: NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX |
+	 *		NETIF_F_HW_VLAN_FILTER */
+
+	netdev->irq = card->pdev->irq;
+
+	dn = pci_device_to_OF_node(card->pdev);
+	mac = (u8 *)get_property(dn, "local-mac-address", NULL);
+	memcpy(addr.sa_data, mac, ETH_ALEN);
+
+	result = spider_net_set_mac(netdev, &addr);
+	if ((result) && (netif_msg_probe(card)))
+		pr_err("Failed to set MAC address: %i\n", result);
+
+	result = register_netdev(netdev);
+	if (result) {
+		if (netif_msg_probe(card))
+			pr_err("Couldn't register net_device: %i\n",
+				  result);
+		return result;
+	}
+
+	if (netif_msg_probe(card))
+		pr_info("Initialized device %s.\n", netdev->name);
+
+	return 0;
+}
+
+/**
+ * spider_net_alloc_card - allocates net_device and card structure
+ *
+ * returns the card structure or NULL in case of errors
+ *
+ * the card and net_device structures are linked to each other
+ */
+static struct spider_net_card *
+spider_net_alloc_card(void)
+{
+	struct net_device *netdev;
+	struct spider_net_card *card;
+	size_t alloc_size;
+
+	alloc_size = sizeof (*card) +
+		sizeof (struct spider_net_descr) * rx_descriptors +
+		sizeof (struct spider_net_descr) * tx_descriptors;
+	netdev = alloc_etherdev(alloc_size);
+	if (!netdev)
+		return NULL;
+
+	card = netdev_priv(netdev);
+	card->netdev = netdev;
+	card->msg_enable = SPIDER_NET_DEFAULT_MSG;
+	INIT_WORK(&card->tx_timeout_task, spider_net_tx_timeout_task, netdev);
+	init_waitqueue_head(&card->waitq);
+	atomic_set(&card->tx_timeout_task_counter, 0);
+
+	return card;
+}
+
+/**
+ * spider_net_undo_pci_setup - releases PCI ressources
+ * @card: card structure
+ *
+ * spider_net_undo_pci_setup releases the mapped regions
+ */
+static void
+spider_net_undo_pci_setup(struct spider_net_card *card)
+{
+	iounmap(card->regs);
+	pci_release_regions(card->pdev);
+}
+
+/**
+ * spider_net_setup_pci_dev - sets up the device in terms of PCI operations
+ * @card: card structure
+ * @pdev: PCI device
+ *
+ * Returns the card structure or NULL if any errors occur
+ *
+ * spider_net_setup_pci_dev initializes pdev and together with the
+ * functions called in spider_net_open configures the device so that
+ * data can be transferred over it
+ * The net_device structure is attached to the card structure, if the
+ * function returns without error.
+ **/
+static struct spider_net_card *
+spider_net_setup_pci_dev(struct pci_dev *pdev)
+{
+	struct spider_net_card *card;
+	unsigned long mmio_start, mmio_len;
+
+	if (pci_enable_device(pdev)) {
+		pr_err("Couldn't enable PCI device\n");
+		return NULL;
+	}
+
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
+		pr_err("Couldn't find proper PCI device base address.\n");
+		goto out_disable_dev;
+	}
+
+	if (pci_request_regions(pdev, spider_net_driver_name)) {
+		pr_err("Couldn't obtain PCI resources, aborting.\n");
+		goto out_disable_dev;
+	}
+
+	pci_set_master(pdev);
+
+	card = spider_net_alloc_card();
+	if (!card) {
+		pr_err("Couldn't allocate net_device structure, "
+			  "aborting.\n");
+		goto out_release_regions;
+	}
+	card->pdev = pdev;
+
+	/* fetch base address and length of first resource */
+	mmio_start = pci_resource_start(pdev, 0);
+	mmio_len = pci_resource_len(pdev, 0);
+
+	card->netdev->mem_start = mmio_start;
+	card->netdev->mem_end = mmio_start + mmio_len;
+	card->regs = ioremap(mmio_start, mmio_len);
+
+	if (!card->regs) {
+		pr_err("Couldn't obtain PCI resources, aborting.\n");
+		goto out_release_regions;
+	}
+
+	return card;
+
+out_release_regions:
+	pci_release_regions(pdev);
+out_disable_dev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return NULL;
+}
+
+/**
+ * spider_net_probe - initialization of a device
+ * @pdev: PCI device
+ * @ent: entry in the device id list
+ *
+ * Returns 0 on success, <0 on failure
+ *
+ * spider_net_probe initializes pdev and registers a net_device
+ * structure for it. After that, the device can be ifconfig'ed up
+ **/
+static int __devinit
+spider_net_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int err = -EIO;
+	struct spider_net_card *card;
+
+	card = spider_net_setup_pci_dev(pdev);
+	if (!card)
+		goto out;
+
+	spider_net_workaround_rxramfull(card);
+	spider_net_init_card(card);
+
+	err = spider_net_setup_phy(card);
+	if (err)
+		goto out_undo_pci;
+
+	err = spider_net_init_firmware(card);
+	if (err)
+		goto out_undo_pci;
+
+	err = spider_net_setup_netdev(card);
+	if (err)
+		goto out_undo_pci;
+
+	return 0;
+
+out_undo_pci:
+	spider_net_undo_pci_setup(card);
+	free_netdev(card->netdev);
+out:
+	return err;
+}
+
+/**
+ * spider_net_remove - removal of a device
+ * @pdev: PCI device
+ *
+ * Returns 0 on success, <0 on failure
+ *
+ * spider_net_remove is called to remove the device and unregisters the
+ * net_device
+ **/
+static void __devexit
+spider_net_remove(struct pci_dev *pdev)
+{
+	struct net_device *netdev;
+	struct spider_net_card *card;
+
+	netdev = pci_get_drvdata(pdev);
+	card = netdev_priv(netdev);
+
+	wait_event(card->waitq,
+		   atomic_read(&card->tx_timeout_task_counter) == 0);
+
+	unregister_netdev(netdev);
+	spider_net_undo_pci_setup(card);
+	free_netdev(netdev);
+
+	free_irq(to_pci_dev(netdev->class_dev.dev)->irq, netdev);
+}
+
+static struct pci_driver spider_net_driver = {
+	.owner		= THIS_MODULE,
+	.name		= spider_net_driver_name,
+	.id_table	= spider_net_pci_tbl,
+	.probe		= spider_net_probe,
+	.remove		= __devexit_p(spider_net_remove)
+};
+
+/**
+ * spider_net_init - init function when the driver is loaded
+ *
+ * spider_net_init registers the device driver
+ */
+static int __init spider_net_init(void)
+{
+	if (rx_descriptors < SPIDER_NET_RX_DESCRIPTORS_MIN) {
+		rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_MIN;
+		pr_info("adjusting rx descriptors to %i.\n", rx_descriptors);
+	}
+	if (rx_descriptors > SPIDER_NET_RX_DESCRIPTORS_MAX) {
+		rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_MAX;
+		pr_info("adjusting rx descriptors to %i.\n", rx_descriptors);
+	}
+	if (tx_descriptors < SPIDER_NET_TX_DESCRIPTORS_MIN) {
+		tx_descriptors = SPIDER_NET_TX_DESCRIPTORS_MIN;
+		pr_info("adjusting tx descriptors to %i.\n", tx_descriptors);
+	}
+	if (tx_descriptors > SPIDER_NET_TX_DESCRIPTORS_MAX) {
+		tx_descriptors = SPIDER_NET_TX_DESCRIPTORS_MAX;
+		pr_info("adjusting tx descriptors to %i.\n", tx_descriptors);
+	}
+
+	return pci_register_driver(&spider_net_driver);
+}
+
+/**
+ * spider_net_cleanup - exit function when driver is unloaded
+ *
+ * spider_net_cleanup unregisters the device driver
+ */
+static void __exit spider_net_cleanup(void)
+{
+	pci_unregister_driver(&spider_net_driver);
+}
+
+module_init(spider_net_init);
+module_exit(spider_net_cleanup);
diff --git a/drivers/net/spider_net.h b/drivers/net/spider_net.h
new file mode 100644
index 000000000000..22b2f2347351
--- /dev/null
+++ b/drivers/net/spider_net.h
@@ -0,0 +1,469 @@
+/*
+ * Network device driver for Cell Processor-Based Blade
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Authors : Utz Bacher <utz.bacher@de.ibm.com>
+ *           Jens Osterkamp <Jens.Osterkamp@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _SPIDER_NET_H
+#define _SPIDER_NET_H
+
+#include "sungem_phy.h"
+
+extern int spider_net_stop(struct net_device *netdev);
+extern int spider_net_open(struct net_device *netdev);
+
+extern struct ethtool_ops spider_net_ethtool_ops;
+
+extern char spider_net_driver_name[];
+
+#define SPIDER_NET_MAX_MTU			2308
+#define SPIDER_NET_MIN_MTU			64
+
+#define SPIDER_NET_RXBUF_ALIGN			128
+
+#define SPIDER_NET_RX_DESCRIPTORS_DEFAULT	64
+#define SPIDER_NET_RX_DESCRIPTORS_MIN		16
+#define SPIDER_NET_RX_DESCRIPTORS_MAX		256
+
+#define SPIDER_NET_TX_DESCRIPTORS_DEFAULT	64
+#define SPIDER_NET_TX_DESCRIPTORS_MIN		16
+#define SPIDER_NET_TX_DESCRIPTORS_MAX		256
+
+#define SPIDER_NET_RX_CSUM_DEFAULT		1
+
+#define SPIDER_NET_WATCHDOG_TIMEOUT 5*HZ
+#define SPIDER_NET_NAPI_WEIGHT 64
+
+#define SPIDER_NET_FIRMWARE_LEN		1024
+#define SPIDER_NET_FIRMWARE_NAME	"spider_fw.bin"
+
+/** spider_net SMMIO registers */
+#define SPIDER_NET_GHIINT0STS		0x00000000
+#define SPIDER_NET_GHIINT1STS		0x00000004
+#define SPIDER_NET_GHIINT2STS		0x00000008
+#define SPIDER_NET_GHIINT0MSK		0x00000010
+#define SPIDER_NET_GHIINT1MSK		0x00000014
+#define SPIDER_NET_GHIINT2MSK		0x00000018
+
+#define SPIDER_NET_GRESUMINTNUM		0x00000020
+#define SPIDER_NET_GREINTNUM		0x00000024
+
+#define SPIDER_NET_GFFRMNUM		0x00000028
+#define SPIDER_NET_GFAFRMNUM		0x0000002c
+#define SPIDER_NET_GFBFRMNUM		0x00000030
+#define SPIDER_NET_GFCFRMNUM		0x00000034
+#define SPIDER_NET_GFDFRMNUM		0x00000038
+
+/* clear them (don't use it) */
+#define SPIDER_NET_GFREECNNUM		0x0000003c
+#define SPIDER_NET_GONETIMENUM		0x00000040
+
+#define SPIDER_NET_GTOUTFRMNUM		0x00000044
+
+#define SPIDER_NET_GTXMDSET		0x00000050
+#define SPIDER_NET_GPCCTRL		0x00000054
+#define SPIDER_NET_GRXMDSET		0x00000058
+#define SPIDER_NET_GIPSECINIT		0x0000005c
+#define SPIDER_NET_GFTRESTRT		0x00000060
+#define SPIDER_NET_GRXDMAEN		0x00000064
+#define SPIDER_NET_GMRWOLCTRL		0x00000068
+#define SPIDER_NET_GPCWOPCMD		0x0000006c
+#define SPIDER_NET_GPCROPCMD		0x00000070
+#define SPIDER_NET_GTTFRMCNT		0x00000078
+#define SPIDER_NET_GTESTMD		0x0000007c
+
+#define SPIDER_NET_GSINIT		0x00000080
+#define SPIDER_NET_GSnPRGADR		0x00000084
+#define SPIDER_NET_GSnPRGDAT		0x00000088
+
+#define SPIDER_NET_GMACOPEMD		0x00000100
+#define SPIDER_NET_GMACLENLMT		0x00000108
+#define SPIDER_NET_GMACINTEN		0x00000118
+#define SPIDER_NET_GMACPHYCTRL		0x00000120
+
+#define SPIDER_NET_GMACAPAUSE		0x00000154
+#define SPIDER_NET_GMACTXPAUSE		0x00000164
+
+#define SPIDER_NET_GMACMODE		0x000001b0
+#define SPIDER_NET_GMACBSTLMT		0x000001b4
+
+#define SPIDER_NET_GMACUNIMACU		0x000001c0
+#define SPIDER_NET_GMACUNIMACL		0x000001c8
+
+#define SPIDER_NET_GMRMHFILnR		0x00000400
+#define SPIDER_NET_MULTICAST_HASHES	256
+
+#define SPIDER_NET_GMRUAFILnR		0x00000500
+#define SPIDER_NET_GMRUA0FIL15R		0x00000578
+
+/* RX DMA controller registers, all 0x00000a.. are for DMA controller A,
+ * 0x00000b.. for DMA controller B, etc. */
+#define SPIDER_NET_GDADCHA		0x00000a00
+#define SPIDER_NET_GDADMACCNTR		0x00000a04
+#define SPIDER_NET_GDACTDPA		0x00000a08
+#define SPIDER_NET_GDACTDCNT		0x00000a0c
+#define SPIDER_NET_GDACDBADDR		0x00000a20
+#define SPIDER_NET_GDACDBSIZE		0x00000a24
+#define SPIDER_NET_GDACNEXTDA		0x00000a28
+#define SPIDER_NET_GDACCOMST		0x00000a2c
+#define SPIDER_NET_GDAWBCOMST		0x00000a30
+#define SPIDER_NET_GDAWBRSIZE		0x00000a34
+#define SPIDER_NET_GDAWBVSIZE		0x00000a38
+#define SPIDER_NET_GDAWBTRST		0x00000a3c
+#define SPIDER_NET_GDAWBTRERR		0x00000a40
+
+/* TX DMA controller registers */
+#define SPIDER_NET_GDTDCHA		0x00000e00
+#define SPIDER_NET_GDTDMACCNTR		0x00000e04
+#define SPIDER_NET_GDTCDPA		0x00000e08
+#define SPIDER_NET_GDTDMASEL		0x00000e14
+
+#define SPIDER_NET_ECMODE		0x00000f00
+/* clock and reset control register */
+#define SPIDER_NET_CKRCTRL		0x00000ff0
+
+/** SCONFIG registers */
+#define SPIDER_NET_SCONFIG_IOACTE	0x00002810
+
+/** hardcoded register values */
+#define SPIDER_NET_INT0_MASK_VALUE	0x3f7fe3ff
+#define SPIDER_NET_INT1_MASK_VALUE	0xffffffff
+/* no MAC aborts -> auto retransmission */
+#define SPIDER_NET_INT2_MASK_VALUE	0xfffffff1
+
+/* clear counter when interrupt sources are cleared
+#define SPIDER_NET_FRAMENUM_VALUE	0x0001f001 */
+/* we rely on flagged descriptor interrupts */
+#define SPIDER_NET_FRAMENUM_VALUE	0x00000000
+/* set this first, then the FRAMENUM_VALUE */
+#define SPIDER_NET_GFXFRAMES_VALUE	0x00000000
+
+#define SPIDER_NET_STOP_SEQ_VALUE	0x00000000
+#define SPIDER_NET_RUN_SEQ_VALUE	0x0000007e
+
+#define SPIDER_NET_PHY_CTRL_VALUE	0x00040040
+/* #define SPIDER_NET_PHY_CTRL_VALUE	0x01070080*/
+#define SPIDER_NET_RXMODE_VALUE		0x00000011
+/* auto retransmission in case of MAC aborts */
+#define SPIDER_NET_TXMODE_VALUE		0x00010000
+#define SPIDER_NET_RESTART_VALUE	0x00000000
+#define SPIDER_NET_WOL_VALUE		0x00001111
+#if 0
+#define SPIDER_NET_WOL_VALUE		0x00000000
+#endif
+#define SPIDER_NET_IPSECINIT_VALUE	0x00f000f8
+
+/* pause frames: automatic, no upper retransmission count */
+/* outside loopback mode: ETOMOD signal dont matter, not connected */
+#define SPIDER_NET_OPMODE_VALUE		0x00000063
+/*#define SPIDER_NET_OPMODE_VALUE		0x001b0062*/
+#define SPIDER_NET_LENLMT_VALUE		0x00000908
+
+#define SPIDER_NET_MACAPAUSE_VALUE	0x00000800 /* about 1 ms */
+#define SPIDER_NET_TXPAUSE_VALUE	0x00000000
+
+#define SPIDER_NET_MACMODE_VALUE	0x00000001
+#define SPIDER_NET_BURSTLMT_VALUE	0x00000200 /* about 16 us */
+
+/* 1(0)					enable r/tx dma
+ *  0000000				fixed to 0
+ *
+ *         000000			fixed to 0
+ *               0(1)			en/disable descr writeback on force end
+ *                0(1)			force end
+ *
+ *                 000000		fixed to 0
+ *                       00		burst alignment: 128 bytes
+ *
+ *                         00000	fixed to 0
+ *                              0	descr writeback size 32 bytes
+ *                               0(1)	descr chain end interrupt enable
+ *                                0(1)	descr status writeback enable */
+
+/* to set RX_DMA_EN */
+#define SPIDER_NET_DMA_RX_VALUE		0x80000000
+#define SPIDER_NET_DMA_RX_FEND_VALUE	0x00030003
+/* to set TX_DMA_EN */
+#define SPIDER_NET_DMA_TX_VALUE		0x80000000
+#define SPIDER_NET_DMA_TX_FEND_VALUE	0x00030003
+
+/* SPIDER_NET_UA_DESCR_VALUE is OR'ed with the unicast address */
+#define SPIDER_NET_UA_DESCR_VALUE	0x00080000
+#define SPIDER_NET_PROMISC_VALUE	0x00080000
+#define SPIDER_NET_NONPROMISC_VALUE	0x00000000
+
+#define SPIDER_NET_DMASEL_VALUE		0x00000001
+
+#define SPIDER_NET_ECMODE_VALUE		0x00000000
+
+#define SPIDER_NET_CKRCTRL_RUN_VALUE	0x1fff010f
+#define SPIDER_NET_CKRCTRL_STOP_VALUE	0x0000010f
+
+#define SPIDER_NET_SBIMSTATE_VALUE	0x00000000
+#define SPIDER_NET_SBTMSTATE_VALUE	0x00000000
+
+/* SPIDER_NET_GHIINT0STS bits, in reverse order so that they can be used
+ * with 1 << SPIDER_NET_... */
+enum spider_net_int0_status {
+	SPIDER_NET_GPHYINT = 0,
+	SPIDER_NET_GMAC2INT,
+	SPIDER_NET_GMAC1INT,
+	SPIDER_NET_GIPSINT,
+	SPIDER_NET_GFIFOINT,
+	SPIDER_NET_GDMACINT,
+	SPIDER_NET_GSYSINT,
+	SPIDER_NET_GPWOPCMPINT,
+	SPIDER_NET_GPROPCMPINT,
+	SPIDER_NET_GPWFFINT,
+	SPIDER_NET_GRMDADRINT,
+	SPIDER_NET_GRMARPINT,
+	SPIDER_NET_GRMMPINT,
+	SPIDER_NET_GDTDEN0INT,
+	SPIDER_NET_GDDDEN0INT,
+	SPIDER_NET_GDCDEN0INT,
+	SPIDER_NET_GDBDEN0INT,
+	SPIDER_NET_GDADEN0INT,
+	SPIDER_NET_GDTFDCINT,
+	SPIDER_NET_GDDFDCINT,
+	SPIDER_NET_GDCFDCINT,
+	SPIDER_NET_GDBFDCINT,
+	SPIDER_NET_GDAFDCINT,
+	SPIDER_NET_GTTEDINT,
+	SPIDER_NET_GDTDCEINT,
+	SPIDER_NET_GRFDNMINT,
+	SPIDER_NET_GRFCNMINT,
+	SPIDER_NET_GRFBNMINT,
+	SPIDER_NET_GRFANMINT,
+	SPIDER_NET_GRFNMINT,
+	SPIDER_NET_G1TMCNTINT,
+	SPIDER_NET_GFREECNTINT
+};
+/* GHIINT1STS bits */
+enum spider_net_int1_status {
+	SPIDER_NET_GTMFLLINT = 0,
+	SPIDER_NET_GRMFLLINT,
+	SPIDER_NET_GTMSHTINT,
+	SPIDER_NET_GDTINVDINT,
+	SPIDER_NET_GRFDFLLINT,
+	SPIDER_NET_GDDDCEINT,
+	SPIDER_NET_GDDINVDINT,
+	SPIDER_NET_GRFCFLLINT,
+	SPIDER_NET_GDCDCEINT,
+	SPIDER_NET_GDCINVDINT,
+	SPIDER_NET_GRFBFLLINT,
+	SPIDER_NET_GDBDCEINT,
+	SPIDER_NET_GDBINVDINT,
+	SPIDER_NET_GRFAFLLINT,
+	SPIDER_NET_GDADCEINT,
+	SPIDER_NET_GDAINVDINT,
+	SPIDER_NET_GDTRSERINT,
+	SPIDER_NET_GDDRSERINT,
+	SPIDER_NET_GDCRSERINT,
+	SPIDER_NET_GDBRSERINT,
+	SPIDER_NET_GDARSERINT,
+	SPIDER_NET_GDSERINT,
+	SPIDER_NET_GDTPTERINT,
+	SPIDER_NET_GDDPTERINT,
+	SPIDER_NET_GDCPTERINT,
+	SPIDER_NET_GDBPTERINT,
+	SPIDER_NET_GDAPTERINT
+};
+/* GHIINT2STS bits */
+enum spider_net_int2_status {
+	SPIDER_NET_GPROPERINT = 0,
+	SPIDER_NET_GMCTCRSNGINT,
+	SPIDER_NET_GMCTLCOLINT,
+	SPIDER_NET_GMCTTMOTINT,
+	SPIDER_NET_GMCRCAERINT,
+	SPIDER_NET_GMCRCALERINT,
+	SPIDER_NET_GMCRALNERINT,
+	SPIDER_NET_GMCROVRINT,
+	SPIDER_NET_GMCRRNTINT,
+	SPIDER_NET_GMCRRXERINT,
+	SPIDER_NET_GTITCSERINT,
+	SPIDER_NET_GTIFMTERINT,
+	SPIDER_NET_GTIPKTRVKINT,
+	SPIDER_NET_GTISPINGINT,
+	SPIDER_NET_GTISADNGINT,
+	SPIDER_NET_GTISPDNGINT,
+	SPIDER_NET_GRIFMTERINT,
+	SPIDER_NET_GRIPKTRVKINT,
+	SPIDER_NET_GRISPINGINT,
+	SPIDER_NET_GRISADNGINT,
+	SPIDER_NET_GRISPDNGINT
+};
+
+#define SPIDER_NET_TXINT	( (1 << SPIDER_NET_GTTEDINT) | \
+				  (1 << SPIDER_NET_GDTDCEINT) | \
+				  (1 << SPIDER_NET_GDTFDCINT) )
+
+/* we rely on flagged descriptor interrupts*/
+#define SPIDER_NET_RXINT	( (1 << SPIDER_NET_GDAFDCINT) | \
+				  (1 << SPIDER_NET_GRMFLLINT) )
+
+#define SPIDER_NET_GPREXEC		0x80000000
+#define SPIDER_NET_GPRDAT_MASK		0x0000ffff
+
+/* descriptor bits
+ *
+ * 1010					descriptor ready
+ *     0				descr in middle of chain
+ *      000				fixed to 0
+ *
+ *         0				no interrupt on completion
+ *          000				fixed to 0
+ *             1			no ipsec processing
+ *              1			last descriptor for this frame
+ *               00			no checksum
+ *               10			tcp checksum
+ *               11			udp checksum
+ *
+ *                 00			fixed to 0
+ *                   0			fixed to 0
+ *                    0			no interrupt on response errors
+ *                     0		no interrupt on invalid descr
+ *                      0		no interrupt on dma process termination
+ *                       0		no interrupt on descr chain end
+ *                        0		no interrupt on descr complete
+ *
+ *                         000		fixed to 0
+ *                            0		response error interrupt status
+ *                             0	invalid descr status
+ *                              0	dma termination status
+ *                               0	descr chain end status
+ *                                0	descr complete status */
+#define SPIDER_NET_DMAC_CMDSTAT_NOCS	0xa00c0000
+#define SPIDER_NET_DMAC_CMDSTAT_TCPCS	0xa00e0000
+#define SPIDER_NET_DMAC_CMDSTAT_UDPCS	0xa00f0000
+#define SPIDER_NET_DESCR_IND_PROC_SHIFT	28
+#define SPIDER_NET_DESCR_IND_PROC_MASKO	0x0fffffff
+
+/* descr ready, descr is in middle of chain, get interrupt on completion */
+#define SPIDER_NET_DMAC_RX_CARDOWNED	0xa0800000
+
+/* multicast is no problem */
+#define SPIDER_NET_DATA_ERROR_MASK	0xffffbfff
+
+enum spider_net_descr_status {
+	SPIDER_NET_DESCR_COMPLETE		= 0x00, /* used in rx and tx */
+	SPIDER_NET_DESCR_RESPONSE_ERROR		= 0x01, /* used in rx and tx */
+	SPIDER_NET_DESCR_PROTECTION_ERROR	= 0x02, /* used in rx and tx */
+	SPIDER_NET_DESCR_FRAME_END		= 0x04, /* used in rx */
+	SPIDER_NET_DESCR_FORCE_END		= 0x05, /* used in rx and tx */
+	SPIDER_NET_DESCR_CARDOWNED		= 0x0a, /* used in rx and tx */
+	SPIDER_NET_DESCR_NOT_IN_USE /* any other value */
+};
+
+struct spider_net_descr {
+	/* as defined by the hardware */
+	dma_addr_t buf_addr;
+	u32 buf_size;
+	dma_addr_t next_descr_addr;
+	u32 dmac_cmd_status;
+	u32 result_size;
+	u32 valid_size;	/* all zeroes for tx */
+	u32 data_status;
+	u32 data_error;	/* all zeroes for tx */
+
+	/* used in the driver */
+	struct sk_buff *skb;
+	dma_addr_t bus_addr;
+	struct spider_net_descr *next;
+	struct spider_net_descr *prev;
+} __attribute__((aligned(32)));
+
+struct spider_net_descr_chain {
+	/* we walk from tail to head */
+	struct spider_net_descr *head;
+	struct spider_net_descr *tail;
+};
+
+/* descriptor data_status bits */
+#define SPIDER_NET_RXIPCHK		29
+#define SPIDER_NET_TCPUDPIPCHK		28
+#define SPIDER_NET_DATA_STATUS_CHK_MASK	(1 << SPIDER_NET_RXIPCHK | \
+					 1 << SPIDER_NET_TCPUDPIPCHK)
+
+#define SPIDER_NET_VLAN_PACKET		21
+
+/* descriptor data_error bits */
+#define SPIDER_NET_RXIPCHKERR		27
+#define SPIDER_NET_RXTCPCHKERR		26
+#define SPIDER_NET_DATA_ERROR_CHK_MASK	(1 << SPIDER_NET_RXIPCHKERR | \
+					 1 << SPIDER_NET_RXTCPCHKERR)
+
+/* the cases we don't pass the packet to the stack */
+#define SPIDER_NET_DESTROY_RX_FLAGS	0x70138000
+
+#define SPIDER_NET_DESCR_SIZE		32
+
+/* this will be bigger some time */
+struct spider_net_options {
+	int rx_csum; /* for rx: if 0 ip_summed=NONE,
+			if 1 and hw has verified, ip_summed=UNNECESSARY */
+};
+
+#define SPIDER_NET_DEFAULT_MSG		( NETIF_MSG_DRV | \
+					  NETIF_MSG_PROBE | \
+					  NETIF_MSG_LINK | \
+					  NETIF_MSG_TIMER | \
+					  NETIF_MSG_IFDOWN | \
+					  NETIF_MSG_IFUP | \
+					  NETIF_MSG_RX_ERR | \
+					  NETIF_MSG_TX_ERR | \
+					  NETIF_MSG_TX_QUEUED | \
+					  NETIF_MSG_INTR | \
+					  NETIF_MSG_TX_DONE | \
+					  NETIF_MSG_RX_STATUS | \
+					  NETIF_MSG_PKTDATA | \
+					  NETIF_MSG_HW | \
+					  NETIF_MSG_WOL )
+
+struct spider_net_card {
+	struct net_device *netdev;
+	struct pci_dev *pdev;
+	struct mii_phy phy;
+
+	void __iomem *regs;
+
+	struct spider_net_descr_chain tx_chain;
+	struct spider_net_descr_chain rx_chain;
+	spinlock_t chain_lock;
+
+	struct net_device_stats netdev_stats;
+
+	struct spider_net_options options;
+
+	spinlock_t intmask_lock;
+
+	struct work_struct tx_timeout_task;
+	atomic_t tx_timeout_task_counter;
+	wait_queue_head_t waitq;
+
+	/* for ethtool */
+	int msg_enable;
+
+	struct spider_net_descr descr[0];
+};
+
+#define pr_err(fmt,arg...) \
+	printk(KERN_ERR fmt ,##arg)
+
+#endif
diff --git a/drivers/net/spider_net_ethtool.c b/drivers/net/spider_net_ethtool.c
new file mode 100644
index 000000000000..9447c2ccd70a
--- /dev/null
+++ b/drivers/net/spider_net_ethtool.c
@@ -0,0 +1,107 @@
+/*
+ * Network device driver for Cell Processor-Based Blade
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Authors : Utz Bacher <utz.bacher@de.ibm.com>
+ *           Jens Osterkamp <Jens.Osterkamp@de.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/pci.h>
+
+#include "spider_net.h"
+
+static void
+spider_net_ethtool_get_drvinfo(struct net_device *netdev,
+			       struct ethtool_drvinfo *drvinfo)
+{
+	struct spider_net_card *card;
+	card = netdev_priv(netdev);
+
+	/* clear and fill out info */
+	memset(drvinfo, 0, sizeof(struct ethtool_drvinfo));
+	strncpy(drvinfo->driver, spider_net_driver_name, 32);
+	strncpy(drvinfo->version, "0.1", 32);
+	strcpy(drvinfo->fw_version, "no information");
+	strncpy(drvinfo->bus_info, pci_name(card->pdev), 32);
+}
+
+static void
+spider_net_ethtool_get_wol(struct net_device *netdev,
+			   struct ethtool_wolinfo *wolinfo)
+{
+	/* no support for wol */
+	wolinfo->supported = 0;
+	wolinfo->wolopts = 0;
+}
+
+static u32
+spider_net_ethtool_get_msglevel(struct net_device *netdev)
+{
+	struct spider_net_card *card;
+	card = netdev_priv(netdev);
+	return card->msg_enable;
+}
+
+static void
+spider_net_ethtool_set_msglevel(struct net_device *netdev,
+				u32 level)
+{
+	struct spider_net_card *card;
+	card = netdev_priv(netdev);
+	card->msg_enable = level;
+}
+
+static int
+spider_net_ethtool_nway_reset(struct net_device *netdev)
+{
+	if (netif_running(netdev)) {
+		spider_net_stop(netdev);
+		spider_net_open(netdev);
+	}
+	return 0;
+}
+
+static u32
+spider_net_ethtool_get_rx_csum(struct net_device *netdev)
+{
+	struct spider_net_card *card = netdev->priv;
+
+	return card->options.rx_csum;
+}
+
+static int
+spider_net_ethtool_set_rx_csum(struct net_device *netdev, u32 n)
+{
+	struct spider_net_card *card = netdev->priv;
+
+	card->options.rx_csum = n;
+	return 0;
+}
+
+struct ethtool_ops spider_net_ethtool_ops = {
+	.get_drvinfo		= spider_net_ethtool_get_drvinfo,
+	.get_wol		= spider_net_ethtool_get_wol,
+	.get_msglevel		= spider_net_ethtool_get_msglevel,
+	.set_msglevel		= spider_net_ethtool_set_msglevel,
+	.nway_reset		= spider_net_ethtool_nway_reset,
+	.get_rx_csum		= spider_net_ethtool_get_rx_csum,
+	.set_rx_csum		= spider_net_ethtool_set_rx_csum,
+};
+
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 95c941f8c747..ee0ab7a5f91b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1612,6 +1612,7 @@
 #define PCI_DEVICE_ID_TOSHIBA_TC35815CF	0x0030
 #define PCI_DEVICE_ID_TOSHIBA_TX4927	0x0180
 #define PCI_DEVICE_ID_TOSHIBA_TC86C001_MISC	0x0108
+#define PCI_DEVICE_ID_TOSHIBA_SPIDER_NET 0x01b3
 
 #define PCI_VENDOR_ID_RICOH		0x1180
 #define PCI_DEVICE_ID_RICOH_RL5C465	0x0465
-- 
cgit v1.2.3


From 6582c164f2b3b6e58d1f13c1c031b19ee691eb14 Mon Sep 17 00:00:00 2001
From: Jean Tourrilhes <jt@hpl.hp.com>
Date: Fri, 2 Sep 2005 11:32:28 -0700
Subject: [PATCH] WE-19 for kernel 2.6.13

	Hi Jeff,

	This is version 19 of the Wireless Extensions. It was supposed
to be the fallback of the WPA API changes, but people seem quite happy
about it (especially Jouni), so the patch is rather small.
	The patch has been fully tested with 2.6.13 and various
wireless drivers, and is in its final version. Would you mind pushing
that into Linus's kernel so that the driver and the apps can take
advantage ot it ?

	It includes :
	o iwstat improvement (explicit dBm). This is the result of
long discussions with Dan Williams, the authors of
NetworkManager. Thanks to him for all the fruitful feedback.
	o remove pointer from event stream. I was not totally sure if
this pointer was 32-64 bits clean, so I'd rather remove it and be at
peace with it.
	o remove linux header from wireless.h. This has long been
requested by people writting user space apps, now it's done, and it
was not even painful.
	o final deprecation of spy_offset. You did not like it, it's
now gone for good.
	o Start deprecating dev->get_wireless_stats -> debloat netdev
	o Add "check" version of event macros for ieee802.11
stack. Jiri Benc doesn't like the current macros, we aim to please ;-)
	All those changes, except the last one, have been bit-roting on
my web pages for a while...

	Patches for most kernel drivers will follow. Patches for the
Orinoco and the HostAP drivers have been sent to their respective
maintainers.

	Have fun...

	Jean
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 include/linux/wireless.h |  47 ++++++++++++------
 include/net/iw_handler.h | 123 +++++++++++++++++++++++++++++++++++++++++------
 net/core/wireless.c      |  58 +++++++++++++---------
 3 files changed, 175 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wireless.h b/include/linux/wireless.h
index ae485f9c916e..dab5afdaf71c 100644
--- a/include/linux/wireless.h
+++ b/include/linux/wireless.h
@@ -1,7 +1,7 @@
 /*
  * This file define a set of standard wireless extensions
  *
- * Version :	18	12.3.05
+ * Version :	19	18.3.05
  *
  * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
  * Copyright (c) 1997-2005 Jean Tourrilhes, All Rights Reserved.
@@ -69,11 +69,12 @@
 
 /***************************** INCLUDES *****************************/
 
-/* To minimise problems in user space, I might remove those headers
- * at some point. Jean II */
-#include <linux/types.h>		/* for "caddr_t" et al		*/
-#include <linux/socket.h>		/* for "struct sockaddr" et al	*/
-#include <linux/if.h>			/* for IFNAMSIZ and co... */
+/* Do not put any header in this file, this creates a mess when
+ * exported to user space. Most users have included all the
+ * relevant headers anyway... Jean II */
+/*#include <linux/types.h>*/		/* for "caddr_t" et al		*/
+/*#include <linux/socket.h>*/		/* for "struct sockaddr" et al	*/
+/*#include <linux/if.h>*/		/* for IFNAMSIZ and co... */
 
 /***************************** VERSION *****************************/
 /*
@@ -82,7 +83,7 @@
  * (there is some stuff that will be added in the future...)
  * I just plan to increment with each new version.
  */
-#define WIRELESS_EXT	18
+#define WIRELESS_EXT	19
 
 /*
  * Changes :
@@ -197,6 +198,15 @@
  *	  related parameters (extensible up to 4096 parameter values)
  *	- Add wireless events: IWEVGENIE, IWEVMICHAELMICFAILURE,
  *	  IWEVASSOCREQIE, IWEVASSOCRESPIE, IWEVPMKIDCAND
+ *
+ * V18 to V19
+ * ----------
+ *	- Remove (struct iw_point *)->pointer from events and streams
+ *	- Remove header includes to help user space
+ *	- Increase IW_ENCODING_TOKEN_MAX from 32 to 64
+ *	- Add IW_QUAL_ALL_UPDATED and IW_QUAL_ALL_INVALID macros
+ *	- Add explicit flag to tell stats are in dBm : IW_QUAL_DBM
+ *	- Add IW_IOCTL_IDX() and IW_EVENT_IDX() macros
  */
 
 /**************************** CONSTANTS ****************************/
@@ -322,6 +332,7 @@
 /* The first and the last (range) */
 #define SIOCIWFIRST	0x8B00
 #define SIOCIWLAST	SIOCIWLASTPRIV		/* 0x8BFF */
+#define IW_IOCTL_IDX(cmd)	((cmd) - SIOCIWFIRST)
 
 /* Even : get (world access), odd : set (root access) */
 #define IW_IS_SET(cmd)	(!((cmd) & 0x1))
@@ -366,6 +377,7 @@
 					 * (struct iw_pmkid_cand) */
 
 #define IWEVFIRST	0x8C00
+#define IW_EVENT_IDX(cmd)	((cmd) - IWEVFIRST)
 
 /* ------------------------- PRIVATE INFO ------------------------- */
 /*
@@ -427,12 +439,15 @@
 #define IW_MODE_MONITOR	6	/* Passive monitor (listen only) */
 
 /* Statistics flags (bitmask in updated) */
-#define IW_QUAL_QUAL_UPDATED	0x1	/* Value was updated since last read */
-#define IW_QUAL_LEVEL_UPDATED	0x2
-#define IW_QUAL_NOISE_UPDATED	0x4
+#define IW_QUAL_QUAL_UPDATED	0x01	/* Value was updated since last read */
+#define IW_QUAL_LEVEL_UPDATED	0x02
+#define IW_QUAL_NOISE_UPDATED	0x04
+#define IW_QUAL_ALL_UPDATED	0x07
+#define IW_QUAL_DBM		0x08	/* Level + Noise are dBm */
 #define IW_QUAL_QUAL_INVALID	0x10	/* Driver doesn't provide value */
 #define IW_QUAL_LEVEL_INVALID	0x20
 #define IW_QUAL_NOISE_INVALID	0x40
+#define IW_QUAL_ALL_INVALID	0x70
 
 /* Frequency flags */
 #define IW_FREQ_AUTO		0x00	/* Let the driver decides */
@@ -443,7 +458,7 @@
 #define IW_MAX_ENCODING_SIZES	8
 
 /* Maximum size of the encoding token in bytes */
-#define IW_ENCODING_TOKEN_MAX	32	/* 256 bits (for now) */
+#define IW_ENCODING_TOKEN_MAX	64	/* 512 bits (for now) */
 
 /* Flags for encoding (along with the token) */
 #define IW_ENCODE_INDEX		0x00FF	/* Token index (if needed) */
@@ -1039,12 +1054,16 @@ struct iw_event
 #define IW_EV_CHAR_LEN	(IW_EV_LCP_LEN + IFNAMSIZ)
 #define IW_EV_UINT_LEN	(IW_EV_LCP_LEN + sizeof(__u32))
 #define IW_EV_FREQ_LEN	(IW_EV_LCP_LEN + sizeof(struct iw_freq))
-#define IW_EV_POINT_LEN	(IW_EV_LCP_LEN + sizeof(struct iw_point))
 #define IW_EV_PARAM_LEN	(IW_EV_LCP_LEN + sizeof(struct iw_param))
 #define IW_EV_ADDR_LEN	(IW_EV_LCP_LEN + sizeof(struct sockaddr))
 #define IW_EV_QUAL_LEN	(IW_EV_LCP_LEN + sizeof(struct iw_quality))
 
-/* Note : in the case of iw_point, the extra data will come at the
- * end of the event */
+/* iw_point events are special. First, the payload (extra data) come at
+ * the end of the event, so they are bigger than IW_EV_POINT_LEN. Second,
+ * we omit the pointer, so start at an offset. */
+#define IW_EV_POINT_OFF (((char *) &(((struct iw_point *) NULL)->length)) - \
+			  (char *) NULL)
+#define IW_EV_POINT_LEN	(IW_EV_LCP_LEN + sizeof(struct iw_point) - \
+			 IW_EV_POINT_OFF)
 
 #endif	/* _LINUX_WIRELESS_H */
diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h
index 44edd48f1234..d67c8393a343 100644
--- a/include/net/iw_handler.h
+++ b/include/net/iw_handler.h
@@ -1,10 +1,10 @@
 /*
  * This file define the new driver API for Wireless Extensions
  *
- * Version :	6	21.6.04
+ * Version :	7	18.3.05
  *
  * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 2001-2004 Jean Tourrilhes, All Rights Reserved.
+ * Copyright (c) 2001-2005 Jean Tourrilhes, All Rights Reserved.
  */
 
 #ifndef _IW_HANDLER_H
@@ -207,7 +207,7 @@
  * will be needed...
  * I just plan to increment with each new version.
  */
-#define IW_HANDLER_VERSION	6
+#define IW_HANDLER_VERSION	7
 
 /*
  * Changes :
@@ -232,6 +232,13 @@
  *	- Remove spy #ifdef, they are always on -> cleaner code
  *	- Add IW_DESCR_FLAG_NOMAX flag for very large requests
  *	- Start migrating get_wireless_stats to struct iw_handler_def
+ *
+ * V6 to V7
+ * --------
+ *	- Add struct ieee80211_device pointer in struct iw_public_data
+ *	- Remove (struct iw_point *)->pointer from events and streams
+ *	- Remove spy_offset from struct iw_handler_def
+ *	- Add "check" version of event macros for ieee802.11 stack
  */
 
 /**************************** CONSTANTS ****************************/
@@ -334,9 +341,6 @@ struct iw_handler_def
 	 * We will automatically export that to user space... */
 	const struct iw_priv_args *	private_args;
 
-	/* This field will be *removed* in the next version of WE */
-	long			spy_offset;	/* DO NOT USE */
-
 	/* New location of get_wireless_stats, to de-bloat struct net_device.
 	 * The old pointer in struct net_device will be gradually phased
 	 * out, and drivers are encouraged to use this one... */
@@ -400,16 +404,21 @@ struct iw_spy_data
 /* --------------------- DEVICE WIRELESS DATA --------------------- */
 /*
  * This is all the wireless data specific to a device instance that
- * is managed by the core of Wireless Extensions.
+ * is managed by the core of Wireless Extensions or the 802.11 layer.
  * We only keep pointer to those structures, so that a driver is free
  * to share them between instances.
  * This structure should be initialised before registering the device.
  * Access to this data follow the same rules as any other struct net_device
  * data (i.e. valid as long as struct net_device exist, same locking rules).
  */
+/* Forward declaration */
+struct ieee80211_device;
+/* The struct */
 struct iw_public_data {
 	/* Driver enhanced spy support */
-	struct iw_spy_data *	spy_data;
+	struct iw_spy_data *		spy_data;
+	/* Structure managed by the in-kernel IEEE 802.11 layer */
+	struct ieee80211_device *	ieee80211;
 };
 
 /**************************** PROTOTYPES ****************************/
@@ -424,7 +433,7 @@ struct iw_public_data {
 extern int dev_get_wireless_info(char * buffer, char **start, off_t offset,
 				 int length);
 
-/* Handle IOCTLs, called in net/code/dev.c */
+/* Handle IOCTLs, called in net/core/dev.c */
 extern int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd);
 
 /* Second : functions that may be called by driver modules */
@@ -479,7 +488,7 @@ iwe_stream_add_event(char *	stream,		/* Stream of events */
 		     int	event_len)	/* Real size of payload */
 {
 	/* Check if it's possible */
-	if((stream + event_len) < ends) {
+	if(likely((stream + event_len) < ends)) {
 		iwe->len = event_len;
 		memcpy(stream, (char *) iwe, event_len);
 		stream += event_len;
@@ -495,14 +504,17 @@ iwe_stream_add_event(char *	stream,		/* Stream of events */
 static inline char *
 iwe_stream_add_point(char *	stream,		/* Stream of events */
 		     char *	ends,		/* End of stream */
-		     struct iw_event *iwe,	/* Payload */
-		     char *	extra)
+		     struct iw_event *iwe,	/* Payload length + flags */
+		     char *	extra)		/* More payload */
 {
 	int	event_len = IW_EV_POINT_LEN + iwe->u.data.length;
 	/* Check if it's possible */
-	if((stream + event_len) < ends) {
+	if(likely((stream + event_len) < ends)) {
 		iwe->len = event_len;
-		memcpy(stream, (char *) iwe, IW_EV_POINT_LEN);
+		memcpy(stream, (char *) iwe, IW_EV_LCP_LEN);
+		memcpy(stream + IW_EV_LCP_LEN,
+		       ((char *) iwe) + IW_EV_LCP_LEN + IW_EV_POINT_OFF,
+		       IW_EV_POINT_LEN - IW_EV_LCP_LEN);
 		memcpy(stream + IW_EV_POINT_LEN, extra, iwe->u.data.length);
 		stream += event_len;
 	}
@@ -526,7 +538,7 @@ iwe_stream_add_value(char *	event,		/* Event in the stream */
 	event_len -= IW_EV_LCP_LEN;
 
 	/* Check if it's possible */
-	if((value + event_len) < ends) {
+	if(likely((value + event_len) < ends)) {
 		/* Add new value */
 		memcpy(value, (char *) iwe + IW_EV_LCP_LEN, event_len);
 		value += event_len;
@@ -537,4 +549,85 @@ iwe_stream_add_value(char *	event,		/* Event in the stream */
 	return value;
 }
 
+/*------------------------------------------------------------------*/
+/*
+ * Wrapper to add an Wireless Event to a stream of events.
+ * Same as above, with explicit error check...
+ */
+static inline char *
+iwe_stream_check_add_event(char *	stream,		/* Stream of events */
+			   char *	ends,		/* End of stream */
+			   struct iw_event *iwe,	/* Payload */
+			   int		event_len,	/* Size of payload */
+			   int *	perr)		/* Error report */
+{
+	/* Check if it's possible, set error if not */
+	if(likely((stream + event_len) < ends)) {
+		iwe->len = event_len;
+		memcpy(stream, (char *) iwe, event_len);
+		stream += event_len;
+	} else
+		*perr = -E2BIG;
+	return stream;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Wrapper to add an short Wireless Event containing a pointer to a
+ * stream of events.
+ * Same as above, with explicit error check...
+ */
+static inline char *
+iwe_stream_check_add_point(char *	stream,		/* Stream of events */
+			   char *	ends,		/* End of stream */
+			   struct iw_event *iwe,	/* Payload length + flags */
+			   char *	extra,		/* More payload */
+			   int *	perr)		/* Error report */
+{
+	int	event_len = IW_EV_POINT_LEN + iwe->u.data.length;
+	/* Check if it's possible */
+	if(likely((stream + event_len) < ends)) {
+		iwe->len = event_len;
+		memcpy(stream, (char *) iwe, IW_EV_LCP_LEN);
+		memcpy(stream + IW_EV_LCP_LEN,
+		       ((char *) iwe) + IW_EV_LCP_LEN + IW_EV_POINT_OFF,
+		       IW_EV_POINT_LEN - IW_EV_LCP_LEN);
+		memcpy(stream + IW_EV_POINT_LEN, extra, iwe->u.data.length);
+		stream += event_len;
+	} else
+		*perr = -E2BIG;
+	return stream;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Wrapper to add a value to a Wireless Event in a stream of events.
+ * Be careful, this one is tricky to use properly :
+ * At the first run, you need to have (value = event + IW_EV_LCP_LEN).
+ * Same as above, with explicit error check...
+ */
+static inline char *
+iwe_stream_check_add_value(char *	event,		/* Event in the stream */
+			   char *	value,		/* Value in event */
+			   char *	ends,		/* End of stream */
+			   struct iw_event *iwe,	/* Payload */
+			   int		event_len,	/* Size of payload */
+			   int *	perr)		/* Error report */
+{
+	/* Don't duplicate LCP */
+	event_len -= IW_EV_LCP_LEN;
+
+	/* Check if it's possible */
+	if(likely((value + event_len) < ends)) {
+		/* Add new value */
+		memcpy(value, (char *) iwe + IW_EV_LCP_LEN, event_len);
+		value += event_len;
+		/* Patch LCP */
+		iwe->len = value - event;
+		memcpy(event, (char *) iwe, IW_EV_LCP_LEN);
+	} else
+		*perr = -E2BIG;
+	return value;
+}
+
 #endif	/* _IW_HANDLER_H */
diff --git a/net/core/wireless.c b/net/core/wireless.c
index 5caae2399f3a..d17f1583ea3e 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -58,6 +58,13 @@
  *	o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus
  * Based on patch from Pavel Roskin <proski@gnu.org> :
  *	o Fix kernel data leak to user space in private handler handling
+ *
+ * v7 - 18.3.05 - Jean II
+ *	o Remove (struct iw_point *)->pointer from events and streams
+ *	o Remove spy_offset from struct iw_handler_def
+ *	o Start deprecating dev->get_wireless_stats, output a warning
+ *	o If IW_QUAL_DBM is set, show dBm values in /proc/net/wireless
+ *	o Don't loose INVALID/DBM flags when clearing UPDATED flags (iwstats)
  */
 
 /***************************** INCLUDES *****************************/
@@ -446,10 +453,14 @@ static inline struct iw_statistics *get_wireless_stats(struct net_device *dev)
 	   (dev->wireless_handlers->get_wireless_stats != NULL))
 		return dev->wireless_handlers->get_wireless_stats(dev);
 
-	/* Old location, will be phased out in next WE */
-	return (dev->get_wireless_stats ?
-		dev->get_wireless_stats(dev) :
-		(struct iw_statistics *) NULL);
+	/* Old location, field to be removed in next WE */
+	if(dev->get_wireless_stats) {
+		printk(KERN_DEBUG "%s (WE) : Driver using old /proc/net/wireless support, please fix driver !\n",
+		       dev->name);
+		return dev->get_wireless_stats(dev);
+	}
+	/* Not found */
+	return (struct iw_statistics *) NULL;
 }
 
 /* ---------------------------------------------------------------- */
@@ -541,16 +552,18 @@ static __inline__ void wireless_seq_printf_stats(struct seq_file *seq,
 			   dev->name, stats->status, stats->qual.qual,
 			   stats->qual.updated & IW_QUAL_QUAL_UPDATED
 			   ? '.' : ' ',
-			   ((__u8) stats->qual.level),
+			   ((__s32) stats->qual.level) - 
+			   ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
 			   stats->qual.updated & IW_QUAL_LEVEL_UPDATED
 			   ? '.' : ' ',
-			   ((__u8) stats->qual.noise),
+			   ((__s32) stats->qual.noise) - 
+			   ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
 			   stats->qual.updated & IW_QUAL_NOISE_UPDATED
 			   ? '.' : ' ',
 			   stats->discard.nwid, stats->discard.code,
 			   stats->discard.fragment, stats->discard.retries,
 			   stats->discard.misc, stats->miss.beacon);
-		stats->qual.updated = 0;
+		stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
 	}
 }
 
@@ -593,6 +606,7 @@ static struct file_operations wireless_seq_fops = {
 
 int __init wireless_proc_init(void)
 {
+	/* Create /proc/net/wireless entry */
 	if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops))
 		return -ENOMEM;
 
@@ -627,9 +641,9 @@ static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr)
 				sizeof(struct iw_statistics)))
 			return -EFAULT;
 
-		/* Check if we need to clear the update flag */
+		/* Check if we need to clear the updated flag */
 		if(wrq->u.data.flags != 0)
-			stats->qual.updated = 0;
+			stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
 		return 0;
 	} else
 		return -EOPNOTSUPP;
@@ -1161,10 +1175,11 @@ void wireless_send_event(struct net_device *	dev,
 	struct iw_event  *event;		/* Mallocated whole event */
 	int event_len;				/* Its size */
 	int hdr_len;				/* Size of the event header */
+	int wrqu_off = 0;			/* Offset in wrqu */
 	/* Don't "optimise" the following variable, it will crash */
 	unsigned	cmd_index;		/* *MUST* be unsigned */
 
-	/* Get the description of the IOCTL */
+	/* Get the description of the Event */
 	if(cmd <= SIOCIWLAST) {
 		cmd_index = cmd - SIOCIWFIRST;
 		if(cmd_index < standard_ioctl_num)
@@ -1207,6 +1222,8 @@ void wireless_send_event(struct net_device *	dev,
 		/* Calculate extra_len - extra is NULL for restricted events */
 		if(extra != NULL)
 			extra_len = wrqu->data.length * descr->token_size;
+		/* Always at an offset in wrqu */
+		wrqu_off = IW_EV_POINT_OFF;
 #ifdef WE_EVENT_DEBUG
 		printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len);
 #endif	/* WE_EVENT_DEBUG */
@@ -1217,7 +1234,7 @@ void wireless_send_event(struct net_device *	dev,
 	event_len = hdr_len + extra_len;
 
 #ifdef WE_EVENT_DEBUG
-	printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, event_len %d\n", dev->name, cmd, hdr_len, event_len);
+	printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, wrqu_off %d, event_len %d\n", dev->name, cmd, hdr_len, wrqu_off, event_len);
 #endif	/* WE_EVENT_DEBUG */
 
 	/* Create temporary buffer to hold the event */
@@ -1228,7 +1245,7 @@ void wireless_send_event(struct net_device *	dev,
 	/* Fill event */
 	event->len = event_len;
 	event->cmd = cmd;
-	memcpy(&event->u, wrqu, hdr_len - IW_EV_LCP_LEN);
+	memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN);
 	if(extra != NULL)
 		memcpy(((char *) event) + hdr_len, extra, extra_len);
 
@@ -1249,7 +1266,7 @@ void wireless_send_event(struct net_device *	dev,
  * Now, the driver can delegate this task to Wireless Extensions.
  * It needs to use those standard spy iw_handler in struct iw_handler_def,
  * push data to us via wireless_spy_update() and include struct iw_spy_data
- * in its private part (and advertise it in iw_handler_def->spy_offset).
+ * in its private part (and export it in net_device->wireless_data->spy_data).
  * One of the main advantage of centralising spy support here is that
  * it becomes much easier to improve and extend it without having to touch
  * the drivers. One example is the addition of the Spy-Threshold events.
@@ -1266,10 +1283,7 @@ static inline struct iw_spy_data * get_spydata(struct net_device *dev)
 	/* This is the new way */
 	if(dev->wireless_data)
 		return(dev->wireless_data->spy_data);
-
-	/* This is the old way. Doesn't work for multi-headed drivers.
-	 * It will be removed in the next version of WE. */
-	return (dev->priv + dev->wireless_handlers->spy_offset);
+	return NULL;
 }
 
 /*------------------------------------------------------------------*/
@@ -1284,10 +1298,6 @@ int iw_handler_set_spy(struct net_device *	dev,
 	struct iw_spy_data *	spydata = get_spydata(dev);
 	struct sockaddr *	address = (struct sockaddr *) extra;
 
-	if(!dev->wireless_data)
-		/* Help user know that driver needs updating */
-		printk(KERN_DEBUG "%s (WE) : Driver using old/buggy spy support, please fix driver !\n",
-		       dev->name);
 	/* Make sure driver is not buggy or using the old API */
 	if(!spydata)
 		return -EOPNOTSUPP;
@@ -1318,7 +1328,7 @@ int iw_handler_set_spy(struct net_device *	dev,
 		       sizeof(struct iw_quality) * IW_MAX_SPY);
 
 #ifdef WE_SPY_DEBUG
-		printk(KERN_DEBUG "iw_handler_set_spy() :  offset %ld, spydata %p, num %d\n", dev->wireless_handlers->spy_offset, spydata, wrqu->data.length);
+		printk(KERN_DEBUG "iw_handler_set_spy() :  wireless_data %p, spydata %p, num %d\n", dev->wireless_data, spydata, wrqu->data.length);
 		for (i = 0; i < wrqu->data.length; i++)
 			printk(KERN_DEBUG
 			       "%02X:%02X:%02X:%02X:%02X:%02X \n",
@@ -1371,7 +1381,7 @@ int iw_handler_get_spy(struct net_device *	dev,
 		       sizeof(struct iw_quality) * spydata->spy_number);
 	/* Reset updated flags. */
 	for(i = 0; i < spydata->spy_number; i++)
-		spydata->spy_stat[i].updated = 0;
+		spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED;
 	return 0;
 }
 
@@ -1486,7 +1496,7 @@ void wireless_spy_update(struct net_device *	dev,
 		return;
 
 #ifdef WE_SPY_DEBUG
-	printk(KERN_DEBUG "wireless_spy_update() :  offset %ld, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_handlers->spy_offset, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
+	printk(KERN_DEBUG "wireless_spy_update() :  wireless_data %p, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_data, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
 #endif	/* WE_SPY_DEBUG */
 
 	/* Update all records that match */
-- 
cgit v1.2.3


From bbeec90b98a3066f6f2b8d41c80561f5665e4631 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Wed, 7 Sep 2005 00:27:54 -0400
Subject: [wireless] build fixes after merging WE-19

---
 drivers/net/wireless/airo.c   | 2 +-
 drivers/net/wireless/ray_cs.c | 1 +
 include/linux/wireless.h      | 9 +++------
 net/ieee80211/ieee80211_wx.c  | 5 +++--
 4 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 89c2ff9570d5..2be65d308fbe 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -6867,7 +6867,7 @@ static inline char *airo_translate_scan(struct net_device *dev,
 	} else {
 		iwe.u.qual.level = (bss->dBm + 321) / 2;
 		iwe.u.qual.qual = 0;
-		iwe.u.qual.updated = IW_QUAL_QUAL_INVALID;
+		iwe.u.qual.updated = IW_QUAL_QUAL_INVALID
 				| IW_QUAL_LEVEL_UPDATED
 				| IW_QUAL_DBM;
 	}
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index 74d66eeddef2..e9c5ea0f5535 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -53,6 +53,7 @@
 #include <pcmcia/ds.h>
 #include <pcmcia/mem_op.h>
 
+#include <net/ieee80211.h>
 #include <linux/wireless.h>
 
 #include <asm/io.h>
diff --git a/include/linux/wireless.h b/include/linux/wireless.h
index dab5afdaf71c..a555a0f7a7b4 100644
--- a/include/linux/wireless.h
+++ b/include/linux/wireless.h
@@ -69,12 +69,9 @@
 
 /***************************** INCLUDES *****************************/
 
-/* Do not put any header in this file, this creates a mess when
- * exported to user space. Most users have included all the
- * relevant headers anyway... Jean II */
-/*#include <linux/types.h>*/		/* for "caddr_t" et al		*/
-/*#include <linux/socket.h>*/		/* for "struct sockaddr" et al	*/
-/*#include <linux/if.h>*/		/* for IFNAMSIZ and co... */
+#include <linux/types.h>		/* for "caddr_t" et al		*/
+#include <linux/socket.h>		/* for "struct sockaddr" et al	*/
+#include <linux/if.h>			/* for IFNAMSIZ and co... */
 
 /***************************** VERSION *****************************/
 /*
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
index 2cd571c525a9..510a1716a4f0 100644
--- a/net/ieee80211/ieee80211_wx.c
+++ b/net/ieee80211/ieee80211_wx.c
@@ -29,12 +29,13 @@
   Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
 
 ******************************************************************************/
-#include <linux/wireless.h>
-#include <linux/version.h>
+
 #include <linux/kmod.h>
 #include <linux/module.h>
 
 #include <net/ieee80211.h>
+#include <linux/wireless.h>
+
 static const char *ieee80211_modes[] = {
 	"?", "a", "b", "ab", "g", "ag", "bg", "abg"
 };
-- 
cgit v1.2.3


From 54d5d42404e7705cf3804593189e963350d470e5 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Tue, 6 Sep 2005 15:16:15 -0700
Subject: [PATCH] x86/x86_64: deferred handling of writes to
 /proc/irqxx/smp_affinity

When handling writes to /proc/irq, current code is re-programming rte
entries directly. This is not recommended and could potentially cause
chipset's to lockup, or cause missing interrupts.

CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the
interrupt is pending. The same needs to be done for /proc/irq handling as well.
Otherwise user space irq balancers are really not doing the right thing.

- Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for
  lack of a generic name.
- added move_irq out of IRQ_BALANCE, and added this same to X86_64
- Added new proc handler for write, so we can do deferred write at irq
  handling time.
- Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead
  it now shows only active cpu masks, or exactly what was set.
- Provided a common move_irq implementation, instead of duplicating
  when using generic irq framework.

Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off.
Tested UP builds as well.

MSI testing: tbd: I have cards, need to look for a x-over cable, although I
did test an earlier version of this patch.  Will test in a couple days.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Zwane Mwaikambo <zwane@holomorphy.com>
Grudgingly-acked-by: Andi Kleen <ak@muc.de>
Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig            |   5 ++
 arch/i386/kernel/io_apic.c   |  55 ++++++++++---------
 arch/ia64/Kconfig            |   5 ++
 arch/ia64/kernel/irq.c       |  39 +-------------
 arch/x86_64/Kconfig          |   5 ++
 arch/x86_64/kernel/io_apic.c | 102 ++++++++++++++++++++++-------------
 drivers/pci/msi.c            |  17 ++----
 drivers/pci/msi.h            |   5 --
 include/asm-ia64/hw_irq.h    |   7 ---
 include/asm-ia64/irq.h       |   6 ---
 include/linux/irq.h          | 123 +++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/manage.c          |   4 ++
 kernel/irq/proc.c            |  14 ++++-
 13 files changed, 253 insertions(+), 134 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 3b3b017e1c15..4b7de3e1e57b 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1318,6 +1318,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 config X86_SMP
 	bool
 	depends on SMP && !X86_VOYAGER
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 6578f40bd501..4a5940431579 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -33,6 +33,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
+
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
@@ -222,13 +223,21 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 {
 	unsigned long flags;
 	int pin;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int apicid_value;
+	cpumask_t tmp;
 	
+	cpus_and(tmp, cpumask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(cpumask, tmp, CPU_MASK_ALL);
+
 	apicid_value = cpu_mask_to_apicid(cpumask);
 	/* Prepare to do the io_apic_write */
 	apicid_value = apicid_value << 24;
@@ -242,6 +251,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 			break;
 		entry = irq_2_pin + entry->next;
 	}
+	set_irq_info(irq, cpumask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -259,7 +269,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 #  define Dprintk(x...) 
 # endif
 
-cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
 
 #define IRQBALANCE_CHECK_ARCH -999
 static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
@@ -328,12 +337,7 @@ static inline void balance_irq(int cpu, int irq)
 	cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
 	if (cpu != new_cpu) {
-		irq_desc_t *desc = irq_desc + irq;
-		unsigned long flags;
-
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
 	}
 }
 
@@ -528,16 +532,12 @@ tryanotherirq:
 	cpus_and(tmp, target_cpu_mask, allowed_mask);
 
 	if (!cpus_empty(tmp)) {
-		irq_desc_t *desc = irq_desc + selected_irq;
-		unsigned long flags;
 
 		Dprintk("irq = %d moved to cpu = %d\n",
 				selected_irq, min_loaded);
 		/* mark for change destination */
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[selected_irq] =
-					cpumask_of_cpu(min_loaded);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
+
 		/* Since we made a change, come back sooner to 
 		 * check for more variation.
 		 */
@@ -568,7 +568,8 @@ static int balanced_irq(void *unused)
 	
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < NR_IRQS ; i++) {
-		pending_irq_balance_cpumask[i] = cpumask_of_cpu(0);
+		pending_irq_cpumask[i] = cpumask_of_cpu(0);
+		set_pending_irq(i, cpumask_of_cpu(0));
 	}
 
 	for ( ; ; ) {
@@ -647,20 +648,9 @@ int __init irqbalance_disable(char *str)
 
 __setup("noirqbalance", irqbalance_disable);
 
-static inline void move_irq(int irq)
-{
-	/* note - we hold the desc->lock */
-	if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) {
-		set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]);
-		cpus_clear(pending_irq_balance_cpumask[irq]);
-	}
-}
-
 late_initcall(balanced_irq_init);
-
-#else /* !CONFIG_IRQBALANCE */
-static inline void move_irq(int irq) { }
 #endif /* CONFIG_IRQBALANCE */
+#endif /* CONFIG_SMP */
 
 #ifndef CONFIG_SMP
 void fastcall send_IPI_self(int vector)
@@ -820,6 +810,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -838,6 +829,7 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
 
 /*
  * EISA Edge/Level control register, ELCR
@@ -1249,6 +1241,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1944,6 +1937,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1958,6 +1952,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1975,14 +1970,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
 #endif
+#endif
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -2000,7 +1998,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -2011,7 +2011,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -2569,6 +2571,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3deced637f07..17b5dbf8c311 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -434,6 +434,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 source "arch/ia64/hp/sim/Kconfig"
 
 source "arch/ia64/oprofile/Kconfig"
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 28f2aadc38d0..205d98028261 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -91,23 +91,8 @@ skip:
 }
 
 #ifdef CONFIG_SMP
-/*
- * This is updated when the user sets irq affinity via /proc
- */
-static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
-static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)];
-
 static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
 
-/*
- * Arch specific routine for deferred write to iosapic rte to reprogram
- * intr destination.
- */
-void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
-{
-	pending_irq_cpumask[irq] = mask_val;
-}
-
 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
 	cpumask_t mask = CPU_MASK_NONE;
@@ -116,32 +101,10 @@ void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 
 	if (irq < NR_IRQS) {
 		irq_affinity[irq] = mask;
+		set_irq_info(irq, mask);
 		irq_redir[irq] = (char) (redir & 0xff);
 	}
 }
-
-
-void move_irq(int irq)
-{
-	/* note - we hold desc->lock */
-	cpumask_t tmp;
-	irq_desc_t *desc = irq_descp(irq);
-	int redir = test_bit(irq, pending_irq_redir);
-
-	if (unlikely(!desc->handler->set_affinity))
-		return;
-
-	if (!cpus_empty(pending_irq_cpumask[irq])) {
-		cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
-		if (unlikely(!cpus_empty(tmp))) {
-			desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0),
-						    pending_irq_cpumask[irq]);
-		}
-		cpus_clear(pending_irq_cpumask[irq]);
-	}
-}
-
-
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 75e52c57f19c..251ce7cf1a38 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -441,6 +441,11 @@ config ISA_DMA_API
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 menu "Power management options"
 
 source kernel/power/Kconfig
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index d206d7e49cf5..76bcc4e6979d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -78,6 +78,54 @@ int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
 #define vector_to_irq(vector)	(vector)
 #endif
 
+#define __DO_ACTION(R, ACTION, FINAL)					\
+									\
+{									\
+	int pin;							\
+	struct irq_pin_list *entry = irq_2_pin + irq;			\
+									\
+	for (;;) {							\
+		unsigned int reg;					\
+		pin = entry->pin;					\
+		if (pin == -1)						\
+			break;						\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+		reg ACTION;						\
+		io_apic_modify(entry->apic, reg);			\
+		if (!entry->next)					\
+			break;						\
+		entry = irq_2_pin + entry->next;			\
+	}								\
+	FINAL;								\
+}
+
+#ifdef CONFIG_SMP
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	unsigned long flags;
+	unsigned int dest;
+	cpumask_t tmp;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(mask, tmp, CPU_MASK_ALL);
+
+	dest = cpu_mask_to_apicid(mask);
+
+	/*
+	 * Only the high 8 bits are valid.
+	 */
+	dest = SET_APIC_LOGICAL_ID(dest);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__DO_ACTION(1, = dest, )
+	set_irq_info(irq, mask);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#endif
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -101,26 +149,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
-#define __DO_ACTION(R, ACTION, FINAL)					\
-									\
-{									\
-	int pin;							\
-	struct irq_pin_list *entry = irq_2_pin + irq;			\
-									\
-	for (;;) {							\
-		unsigned int reg;					\
-		pin = entry->pin;					\
-		if (pin == -1)						\
-			break;						\
-		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
-		reg ACTION;						\
-		io_apic_modify(entry->apic, reg);			\
-		if (!entry->next)					\
-			break;						\
-		entry = irq_2_pin + entry->next;			\
-	}								\
-	FINAL;								\
-}
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
@@ -767,6 +795,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1314,6 +1343,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq)
  */
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
+	move_irq(irq);
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
@@ -1343,26 +1373,10 @@ static unsigned int startup_level_ioapic_irq (unsigned int irq)
 
 static void end_level_ioapic_irq (unsigned int irq)
 {
+	move_irq(irq);
 	ack_APIC_irq();
 }
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
-{
-	unsigned long flags;
-	unsigned int dest;
-
-	dest = cpu_mask_to_apicid(mask);
-
-	/*
-	 * Only the high 8 bits are valid.
-	 */
-	dest = SET_APIC_LOGICAL_ID(dest);
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = dest, )
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 #ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
@@ -1375,6 +1389,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1389,6 +1404,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1406,14 +1422,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
-#endif
+#endif // CONFIG_SMP
+#endif // CONFIG_PCI_MSI
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1432,7 +1451,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -1443,7 +1464,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -1918,6 +1941,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
@@ -1931,6 +1955,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -1949,3 +1974,4 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 2b85aa39f954..532f73bb2224 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -91,6 +91,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 {
 	struct msi_desc *entry;
 	struct msg_address address;
+	unsigned int irq = vector;
 
 	entry = (struct msi_desc *)msi_desc[vector];
 	if (!entry || !entry->dev)
@@ -112,6 +113,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		pci_write_config_dword(entry->dev, msi_lower_address_reg(pos),
 			address.lo_address.value);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	case PCI_CAP_ID_MSIX:
@@ -125,22 +127,13 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 			MSI_TARGET_CPU_SHIFT);
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		writel(address.lo_address.value, entry->mask_base + offset);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	default:
 		break;
 	}
 }
-
-#ifdef CONFIG_IRQBALANCE
-static inline void move_msi(int vector)
-{
-	if (!cpus_empty(pending_irq_balance_cpumask[vector])) {
-		set_msi_affinity(vector, pending_irq_balance_cpumask[vector]);
-		cpus_clear(pending_irq_balance_cpumask[vector]);
-	}
-}
-#endif /* CONFIG_IRQBALANCE */
 #endif /* CONFIG_SMP */
 
 static void mask_MSI_irq(unsigned int vector)
@@ -191,13 +184,13 @@ static void shutdown_msi_irq(unsigned int vector)
 
 static void end_msi_irq_wo_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	ack_APIC_irq();
 }
 
 static void end_msi_irq_w_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	unmask_MSI_irq(vector);
 	ack_APIC_irq();
 }
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 390f1851c0f1..402136a5c9e4 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -19,7 +19,6 @@
 #define NR_HP_RESERVED_VECTORS 	20
 
 extern int vector_irq[NR_VECTORS];
-extern cpumask_t pending_irq_balance_cpumask[NR_IRQS];
 extern void (*interrupt[NR_IRQS])(void);
 extern int pci_vector_resources(int last, int nr_released);
 
@@ -29,10 +28,6 @@ extern int pci_vector_resources(int last, int nr_released);
 #define set_msi_irq_affinity	NULL
 #endif
 
-#ifndef CONFIG_IRQBALANCE
-static inline void move_msi(int vector) {}
-#endif
-
 /*
  * MSI-X Address Register
  */
diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h
index 041ab8c51a64..0cf119b42f7d 100644
--- a/include/asm-ia64/hw_irq.h
+++ b/include/asm-ia64/hw_irq.h
@@ -116,13 +116,6 @@ __ia64_local_vector_to_irq (ia64_vector vec)
  * and to obtain the irq descriptor for a given irq number.
  */
 
-/* Return a pointer to the irq descriptor for IRQ.  */
-static inline irq_desc_t *
-irq_descp (int irq)
-{
-	return irq_desc + irq;
-}
-
 /* Extract the IA-64 vector that corresponds to IRQ.  */
 static inline ia64_vector
 irq_to_vector (int irq)
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index bd07d11d9f37..5d930fdc0bea 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -30,12 +30,6 @@ extern void disable_irq_nosync (unsigned int);
 extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 
-#ifdef CONFIG_SMP
-extern void move_irq(int irq);
-#else
-#define move_irq(irq)
-#endif
-
 struct irqaction;
 struct pt_regs;
 int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 069d3b84d311..4a362b9ec966 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,16 +71,139 @@ typedef struct irq_desc {
 	unsigned int irq_count;		/* For detecting broken interrupts */
 	unsigned int irqs_unhandled;
 	spinlock_t lock;
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+	unsigned int move_irq;		/* Flag need to re-target intr dest*/
+#endif
 } ____cacheline_aligned irq_desc_t;
 
 extern irq_desc_t irq_desc [NR_IRQS];
 
+/* Return a pointer to the irq descriptor for IRQ.  */
+static inline irq_desc_t *
+irq_descp (int irq)
+{
+	return irq_desc + irq;
+}
+
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
 extern int setup_irq(unsigned int irq, struct irqaction * new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 extern cpumask_t irq_affinity[NR_IRQS];
+
+#ifdef CONFIG_SMP
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+	irq_affinity[irq] = mask;
+}
+#else
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+extern cpumask_t pending_irq_cpumask[NR_IRQS];
+
+static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->move_irq = 1;
+	pending_irq_cpumask[irq] = mask;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+static inline void
+move_native_irq(int irq)
+{
+	cpumask_t tmp;
+	irq_desc_t *desc = irq_descp(irq);
+
+	if (likely (!desc->move_irq))
+		return;
+
+	desc->move_irq = 0;
+
+	if (likely(cpus_empty(pending_irq_cpumask[irq])))
+		return;
+
+	if (!desc->handler->set_affinity)
+		return;
+
+	/* note - we hold the desc->lock */
+	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+
+	/*
+	 * If there was a valid mask to work with, please
+	 * do the disable, re-program, enable sequence.
+	 * This is *not* particularly important for level triggered
+	 * but in a edge trigger case, we might be setting rte
+	 * when an active trigger is comming in. This could
+	 * cause some ioapics to mal-function.
+	 * Being paranoid i guess!
+	 */
+	if (unlikely(!cpus_empty(tmp))) {
+		desc->handler->disable(irq);
+		desc->handler->set_affinity(irq,tmp);
+		desc->handler->enable(irq);
+	}
+	cpus_clear(pending_irq_cpumask[irq]);
+}
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Wonder why these are dummies?
+ * For e.g the set_ioapic_affinity_vector() calls the set_ioapic_affinity_irq()
+ * counter part after translating the vector to irq info. We need to perform
+ * this operation on the real irq, when we dont use vector, i.e when
+ * pci_use_vector() is false.
+ */
+static inline void move_irq(int irq)
+{
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+}
+
+#else // CONFIG_PCI_MSI
+
+static inline void move_irq(int irq)
+{
+	move_native_irq(irq);
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+#endif // CONFIG_PCI_MSI
+
+#else	// CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE
+
+#define move_irq(x)
+#define move_native_irq(x)
+#define set_pending_irq(x,y)
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+
+#endif // CONFIG_GENERIC_PENDING_IRQ
+
+#else // CONFIG_SMP
+
+#define move_irq(x)
+#define move_native_irq(x)
+
+#endif // CONFIG_SMP
+
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
 
 cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
+#endif
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
  */
 static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
 
-void __attribute__((weak))
-proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+	/*
+	 * Save these away for later use. Re-progam when the
+	 * interrupt is pending
+	 */
+	set_pending_irq(irq, mask_val);
+}
+#else
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
 	irq_affinity[irq] = mask_val;
 	irq_desc[irq].handler->set_affinity(irq, mask_val);
 }
+#endif
 
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
-- 
cgit v1.2.3


From 4732efbeb997189d9f9b04708dc26bf8613ed721 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Tue, 6 Sep 2005 15:16:25 -0700
Subject: [PATCH] FUTEX_WAKE_OP: pthread_cond_signal() speedup

ATM pthread_cond_signal is unnecessarily slow, because it wakes one waiter
(which at least on UP usually means an immediate context switch to one of
the waiter threads).  This waiter wakes up and after a few instructions it
attempts to acquire the cv internal lock, but that lock is still held by
the thread calling pthread_cond_signal.  So it goes to sleep and eventually
the signalling thread is scheduled in, unlocks the internal lock and wakes
the waiter again.

Now, before 2003-09-21 NPTL was using FUTEX_REQUEUE in pthread_cond_signal
to avoid this performance issue, but it was removed when locks were
redesigned to the 3 state scheme (unlocked, locked uncontended, locked
contended).

Following scenario shows why simply using FUTEX_REQUEUE in
pthread_cond_signal together with using lll_mutex_unlock_force in place of
lll_mutex_unlock is not enough and probably why it has been disabled at
that time:

The number is value in cv->__data.__lock.
        thr1            thr2            thr3
0       pthread_cond_wait
1       lll_mutex_lock (cv->__data.__lock)
0       lll_mutex_unlock (cv->__data.__lock)
0       lll_futex_wait (&cv->__data.__futex, futexval)
0                       pthread_cond_signal
1                       lll_mutex_lock (cv->__data.__lock)
1                                       pthread_cond_signal
2                                       lll_mutex_lock (cv->__data.__lock)
2                                         lll_futex_wait (&cv->__data.__lock, 2)
2                       lll_futex_requeue (&cv->__data.__futex, 0, 1, &cv->__data.__lock)
                          # FUTEX_REQUEUE, not FUTEX_CMP_REQUEUE
2                       lll_mutex_unlock_force (cv->__data.__lock)
0                         cv->__data.__lock = 0
0                         lll_futex_wake (&cv->__data.__lock, 1)
1       lll_mutex_lock (cv->__data.__lock)
0       lll_mutex_unlock (cv->__data.__lock)
          # Here, lll_mutex_unlock doesn't know there are threads waiting
          # on the internal cv's lock

Now, I believe it is possible to use FUTEX_REQUEUE in pthread_cond_signal,
but it will cost us not one, but 2 extra syscalls and, what's worse, one of
these extra syscalls will be done for every single waiting loop in
pthread_cond_*wait.

We would need to use lll_mutex_unlock_force in pthread_cond_signal after
requeue and lll_mutex_cond_lock in pthread_cond_*wait after lll_futex_wait.

Another alternative is to do the unlocking pthread_cond_signal needs to do
(the lock can't be unlocked before lll_futex_wake, as that is racy) in the
kernel.

I have implemented both variants, futex-requeue-glibc.patch is the first
one and futex-wake_op{,-glibc}.patch is the unlocking inside of the kernel.
 The kernel interface allows userland to specify how exactly an unlocking
operation should look like (some atomic arithmetic operation with optional
constant argument and comparison of the previous futex value with another
constant).

It has been implemented just for ppc*, x86_64 and i?86, for other
architectures I'm including just a stub header which can be used as a
starting point by maintainers to write support for their arches and ATM
will just return -ENOSYS for FUTEX_WAKE_OP.  The requeue patch has been
(lightly) tested just on x86_64, the wake_op patch on ppc64 kernel running
32-bit and 64-bit NPTL and x86_64 kernel running 32-bit and 64-bit NPTL.

With the following benchmark on UP x86-64 I get:

for i in nptl-orig nptl-requeue nptl-wake_op; do echo time elf/ld.so --library-path .:$i /tmp/bench; \
for j in 1 2; do echo ( time elf/ld.so --library-path .:$i /tmp/bench ) 2>&1; done; done
time elf/ld.so --library-path .:nptl-orig /tmp/bench
real 0m0.655s user 0m0.253s sys 0m0.403s
real 0m0.657s user 0m0.269s sys 0m0.388s
time elf/ld.so --library-path .:nptl-requeue /tmp/bench
real 0m0.496s user 0m0.225s sys 0m0.271s
real 0m0.531s user 0m0.242s sys 0m0.288s
time elf/ld.so --library-path .:nptl-wake_op /tmp/bench
real 0m0.380s user 0m0.176s sys 0m0.204s
real 0m0.382s user 0m0.175s sys 0m0.207s

The benchmark is at:
http://sourceware.org/ml/libc-alpha/2005-03/txt00001.txt
Older futex-requeue-glibc.patch version is at:
http://sourceware.org/ml/libc-alpha/2005-03/txt00002.txt
Older futex-wake_op-glibc.patch version is at:
http://sourceware.org/ml/libc-alpha/2005-03/txt00003.txt
Will post a new version (just x86-64 fixes so that the patch
applies against pthread_cond_signal.S) to libc-hacker ml soon.

Attached is the kernel FUTEX_WAKE_OP patch as well as a simple-minded
testcase that will not test the atomicity of the operation, but at least
check if the threads that should have been woken up are woken up and
whether the arithmetic operation in the kernel gave the expected results.

Acked-by: Ingo Molnar <mingo@redhat.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Jamie Lokier <jamie@shareable.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Yoichi Yuasa <yuasa@hh.iij4u.or.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/futex.h     |  53 +++++++++++++++++++
 include/asm-arm/futex.h       |  53 +++++++++++++++++++
 include/asm-arm26/futex.h     |  53 +++++++++++++++++++
 include/asm-cris/futex.h      |  53 +++++++++++++++++++
 include/asm-frv/futex.h       |  53 +++++++++++++++++++
 include/asm-h8300/futex.h     |  53 +++++++++++++++++++
 include/asm-i386/futex.h      | 108 +++++++++++++++++++++++++++++++++++++++
 include/asm-ia64/futex.h      |  53 +++++++++++++++++++
 include/asm-m32r/futex.h      |  53 +++++++++++++++++++
 include/asm-m68k/futex.h      |  53 +++++++++++++++++++
 include/asm-m68knommu/futex.h |  53 +++++++++++++++++++
 include/asm-mips/futex.h      |  53 +++++++++++++++++++
 include/asm-parisc/futex.h    |  53 +++++++++++++++++++
 include/asm-ppc/futex.h       |  53 +++++++++++++++++++
 include/asm-ppc64/futex.h     |  83 ++++++++++++++++++++++++++++++
 include/asm-ppc64/memory.h    |   2 +
 include/asm-s390/futex.h      |  53 +++++++++++++++++++
 include/asm-sh/futex.h        |  53 +++++++++++++++++++
 include/asm-sh64/futex.h      |  53 +++++++++++++++++++
 include/asm-sparc/futex.h     |  53 +++++++++++++++++++
 include/asm-sparc64/futex.h   |  53 +++++++++++++++++++
 include/asm-um/futex.h        |  53 +++++++++++++++++++
 include/asm-v850/futex.h      |  53 +++++++++++++++++++
 include/asm-x86_64/futex.h    |  98 +++++++++++++++++++++++++++++++++++
 include/linux/futex.h         |  36 +++++++++++--
 kernel/futex.c                | 116 ++++++++++++++++++++++++++++++++++++++++++
 26 files changed, 1498 insertions(+), 5 deletions(-)
 create mode 100644 include/asm-alpha/futex.h
 create mode 100644 include/asm-arm/futex.h
 create mode 100644 include/asm-arm26/futex.h
 create mode 100644 include/asm-cris/futex.h
 create mode 100644 include/asm-frv/futex.h
 create mode 100644 include/asm-h8300/futex.h
 create mode 100644 include/asm-i386/futex.h
 create mode 100644 include/asm-ia64/futex.h
 create mode 100644 include/asm-m32r/futex.h
 create mode 100644 include/asm-m68k/futex.h
 create mode 100644 include/asm-m68knommu/futex.h
 create mode 100644 include/asm-mips/futex.h
 create mode 100644 include/asm-parisc/futex.h
 create mode 100644 include/asm-ppc/futex.h
 create mode 100644 include/asm-ppc64/futex.h
 create mode 100644 include/asm-s390/futex.h
 create mode 100644 include/asm-sh/futex.h
 create mode 100644 include/asm-sh64/futex.h
 create mode 100644 include/asm-sparc/futex.h
 create mode 100644 include/asm-sparc64/futex.h
 create mode 100644 include/asm-um/futex.h
 create mode 100644 include/asm-v850/futex.h
 create mode 100644 include/asm-x86_64/futex.h

(limited to 'include/linux')

diff --git a/include/asm-alpha/futex.h b/include/asm-alpha/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-alpha/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-arm/futex.h b/include/asm-arm/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-arm/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-arm26/futex.h b/include/asm-arm26/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-arm26/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-cris/futex.h b/include/asm-cris/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-cris/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-frv/futex.h b/include/asm-frv/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-frv/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-h8300/futex.h b/include/asm-h8300/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-h8300/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-i386/futex.h b/include/asm-i386/futex.h
new file mode 100644
index 000000000000..44b9db806474
--- /dev/null
+++ b/include/asm-i386/futex.h
@@ -0,0 +1,108 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	" insn "\n"						\
+"2:	.section .fixup,\"ax\"\n\
+3:	mov	%3, %1\n\
+	jmp	2b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.long	1b,3b\n\
+	.previous"						\
+	: "=r" (oldval), "=r" (ret), "=m" (*uaddr)		\
+	: "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	movl	%2, %0\n\
+	movl	%0, %3\n"					\
+	insn "\n"						\
+"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n\
+	jnz	1b\n\
+3:	.section .fixup,\"ax\"\n\
+4:	mov	%5, %1\n\
+	jmp	3b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.long	1b,4b,2b,4b\n\
+	.previous"						\
+	: "=&a" (oldval), "=&r" (ret), "=m" (*uaddr),		\
+	  "=&r" (tem)						\
+	: "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	if (op == FUTEX_OP_SET)
+		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+	else {
+#ifndef CONFIG_X86_BSWAP
+		if (boot_cpu_data.x86 == 3)
+			ret = -ENOSYS;
+		else
+#endif
+		switch (op) {
+		case FUTEX_OP_ADD:
+			__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
+					   oldval, uaddr, oparg);
+			break;
+		case FUTEX_OP_OR:
+			__futex_atomic_op2("orl %4, %3", ret, oldval, uaddr,
+					   oparg);
+			break;
+		case FUTEX_OP_ANDN:
+			__futex_atomic_op2("andl %4, %3", ret, oldval, uaddr,
+					   ~oparg);
+			break;
+		case FUTEX_OP_XOR:
+			__futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr,
+					   oparg);
+			break;
+		default:
+			ret = -ENOSYS;
+		}
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ia64/futex.h b/include/asm-ia64/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-ia64/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-m32r/futex.h b/include/asm-m32r/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-m32r/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-m68k/futex.h b/include/asm-m68k/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-m68k/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-m68knommu/futex.h b/include/asm-m68knommu/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-m68knommu/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-mips/futex.h b/include/asm-mips/futex.h
new file mode 100644
index 000000000000..9feff4ce1424
--- /dev/null
+++ b/include/asm-mips/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-parisc/futex.h b/include/asm-parisc/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-parisc/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ppc/futex.h b/include/asm-ppc/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-ppc/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ppc64/futex.h b/include/asm-ppc64/futex.h
new file mode 100644
index 000000000000..cb2640b3a408
--- /dev/null
+++ b/include/asm-ppc64/futex.h
@@ -0,0 +1,83 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/memory.h>
+#include <asm/uaccess.h>
+
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (SYNC_ON_SMP				\
+"1:	lwarx	%0,0,%2\n"					\
+	insn							\
+"2:	stwcx.	%1,0,%2\n\
+	bne-	1b\n\
+	li	%1,0\n\
+3:	.section .fixup,\"ax\"\n\
+4:	li	%1,%3\n\
+	b	3b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align 3\n\
+	.llong	1b,4b,2b,4b\n\
+	.previous"						\
+	: "=&r" (oldval), "=&r" (ret)				\
+	: "b" (uaddr), "i" (-EFAULT), "1" (oparg)		\
+	: "cr0", "memory")
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op("", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op("add %1,%0,%1\n", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op("or %1,%0,%1\n", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op("andc %1,%0,%1\n", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op("xor %1,%0,%1\n", ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-ppc64/memory.h b/include/asm-ppc64/memory.h
index 56e09face9a8..af53ffb55726 100644
--- a/include/asm-ppc64/memory.h
+++ b/include/asm-ppc64/memory.h
@@ -18,9 +18,11 @@
 #ifdef CONFIG_SMP
 #define EIEIO_ON_SMP	"eieio\n"
 #define ISYNC_ON_SMP	"\n\tisync"
+#define SYNC_ON_SMP	"lwsync\n\t"
 #else
 #define EIEIO_ON_SMP
 #define ISYNC_ON_SMP
+#define SYNC_ON_SMP
 #endif
 
 static inline void eieio(void)
diff --git a/include/asm-s390/futex.h b/include/asm-s390/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-s390/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sh/futex.h b/include/asm-sh/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sh/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sh64/futex.h b/include/asm-sh64/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sh64/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sparc/futex.h b/include/asm-sparc/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sparc/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-sparc64/futex.h b/include/asm-sparc64/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-sparc64/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-um/futex.h b/include/asm-um/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-um/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-v850/futex.h b/include/asm-v850/futex.h
new file mode 100644
index 000000000000..2cac5ecd9d00
--- /dev/null
+++ b/include/asm-v850/futex.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+	case FUTEX_OP_ADD:
+	case FUTEX_OP_OR:
+	case FUTEX_OP_ANDN:
+	case FUTEX_OP_XOR:
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/asm-x86_64/futex.h b/include/asm-x86_64/futex.h
new file mode 100644
index 000000000000..8602c09bf89e
--- /dev/null
+++ b/include/asm-x86_64/futex.h
@@ -0,0 +1,98 @@
+#ifndef _ASM_FUTEX_H
+#define _ASM_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <asm/errno.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	" insn "\n"						\
+"2:	.section .fixup,\"ax\"\n\
+3:	mov	%3, %1\n\
+	jmp	2b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.quad	1b,3b\n\
+	.previous"						\
+	: "=r" (oldval), "=r" (ret), "=m" (*uaddr)		\
+	: "i" (-EFAULT), "m" (*uaddr), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+  __asm__ __volatile (						\
+"1:	movl	%2, %0\n\
+	movl	%0, %3\n"					\
+	insn "\n"						\
+"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n\
+	jnz	1b\n\
+3:	.section .fixup,\"ax\"\n\
+4:	mov	%5, %1\n\
+	jmp	3b\n\
+	.previous\n\
+	.section __ex_table,\"a\"\n\
+	.align	8\n\
+	.quad	1b,4b,2b,4b\n\
+	.previous"						\
+	: "=&a" (oldval), "=&r" (ret), "=m" (*uaddr),		\
+	  "=&r" (tem)						\
+	: "r" (oparg), "i" (-EFAULT), "m" (*uaddr), "1" (0))
+
+static inline int
+futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret, tem;
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	inc_preempt_count();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+				   uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	dec_preempt_count();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
+		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
+		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
+		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
+		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
+		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
+		default: ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+#endif
+#endif
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 65d6cfdb6d39..10f96c31971e 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -4,14 +4,40 @@
 /* Second argument to futex syscall */
 
 
-#define FUTEX_WAIT (0)
-#define FUTEX_WAKE (1)
-#define FUTEX_FD (2)
-#define FUTEX_REQUEUE (3)
-#define FUTEX_CMP_REQUEUE (4)
+#define FUTEX_WAIT		0
+#define FUTEX_WAKE		1
+#define FUTEX_FD		2
+#define FUTEX_REQUEUE		3
+#define FUTEX_CMP_REQUEUE	4
+#define FUTEX_WAKE_OP		5
 
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
+#define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
+#define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
+#define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
+#define FUTEX_OP_ANDN		3	/* *(int *)UADDR2 &= ~OPARG; */
+#define FUTEX_OP_XOR		4	/* *(int *)UADDR2 ^= OPARG; */
+
+#define FUTEX_OP_OPARG_SHIFT	8	/* Use (1 << OPARG) instead of OPARG.  */
+
+#define FUTEX_OP_CMP_EQ		0	/* if (oldval == CMPARG) wake */
+#define FUTEX_OP_CMP_NE		1	/* if (oldval != CMPARG) wake */
+#define FUTEX_OP_CMP_LT		2	/* if (oldval < CMPARG) wake */
+#define FUTEX_OP_CMP_LE		3	/* if (oldval <= CMPARG) wake */
+#define FUTEX_OP_CMP_GT		4	/* if (oldval > CMPARG) wake */
+#define FUTEX_OP_CMP_GE		5	/* if (oldval >= CMPARG) wake */
+
+/* FUTEX_WAKE_OP will perform atomically
+   int oldval = *(int *)UADDR2;
+   *(int *)UADDR2 = oldval OP OPARG;
+   if (oldval CMP CMPARG)
+     wake UADDR2;  */
+
+#define FUTEX_OP(op, oparg, cmp, cmparg) \
+  (((op & 0xf) << 28) | ((cmp & 0xf) << 24)		\
+   | ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
+
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..07ba87de9658 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <asm/futex.h>
 
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
@@ -326,6 +327,118 @@ out:
 	return ret;
 }
 
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+{
+	union futex_key key1, key2;
+	struct futex_hash_bucket *bh1, *bh2;
+	struct list_head *head;
+	struct futex_q *this, *next;
+	int ret, op_ret, attempt = 0;
+
+retryfull:
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr1, &key1);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	bh1 = hash_futex(&key1);
+	bh2 = hash_futex(&key2);
+
+retry:
+	if (bh1 < bh2)
+		spin_lock(&bh1->lock);
+	spin_lock(&bh2->lock);
+	if (bh1 > bh2)
+		spin_lock(&bh1->lock);
+
+	op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+	if (unlikely(op_ret < 0)) {
+		int dummy;
+
+		spin_unlock(&bh1->lock);
+		if (bh1 != bh2)
+			spin_unlock(&bh2->lock);
+
+		/* futex_atomic_op_inuser needs to both read and write
+		 * *(int __user *)uaddr2, but we can't modify it
+		 * non-atomically.  Therefore, if get_user below is not
+		 * enough, we need to handle the fault ourselves, while
+		 * still holding the mmap_sem.  */
+		if (attempt++) {
+			struct vm_area_struct * vma;
+			struct mm_struct *mm = current->mm;
+
+			ret = -EFAULT;
+			if (attempt >= 2 ||
+			    !(vma = find_vma(mm, uaddr2)) ||
+			    vma->vm_start > uaddr2 ||
+			    !(vma->vm_flags & VM_WRITE))
+				goto out;
+
+			switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+			case VM_FAULT_MINOR:
+				current->min_flt++;
+				break;
+			case VM_FAULT_MAJOR:
+				current->maj_flt++;
+				break;
+			default:
+				goto out;
+			}
+			goto retry;
+		}
+
+		/* If we would have faulted, release mmap_sem,
+		 * fault it in and start all over again.  */
+		up_read(&current->mm->mmap_sem);
+
+		ret = get_user(dummy, (int __user *)uaddr2);
+		if (ret)
+			return ret;
+
+		goto retryfull;
+	}
+
+	head = &bh1->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key1)) {
+			wake_futex(this);
+			if (++ret >= nr_wake)
+				break;
+		}
+	}
+
+	if (op_ret > 0) {
+		head = &bh2->chain;
+
+		op_ret = 0;
+		list_for_each_entry_safe(this, next, head, list) {
+			if (match_futex (&this->key, &key2)) {
+				wake_futex(this);
+				if (++op_ret >= nr_wake2)
+					break;
+			}
+		}
+		ret += op_ret;
+	}
+
+	spin_unlock(&bh1->lock);
+	if (bh1 != bh2)
+		spin_unlock(&bh2->lock);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
 /*
  * Requeue all waiters hashed on one physical page to another
  * physical page.
@@ -740,6 +853,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 	case FUTEX_CMP_REQUEUE:
 		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
 		break;
+	case FUTEX_WAKE_OP:
+		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
-- 
cgit v1.2.3


From 8446f1d391f3d27e6bf9c43d4cbcdac0ca720417 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 6 Sep 2005 15:16:27 -0700
Subject: [PATCH] detect soft lockups

This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP.

When enabled then per-CPU watchdog threads are started, which try to run
once per second.  If they get delayed for more than 10 seconds then a
callback from the timer interrupt detects this condition and prints out a
warning message and a stack dump (once per lockup incident).  The feature
is otherwise non-intrusive, it doesnt try to unlock the box in any way, it
only gets the debug info out, automatically, and on all CPUs affected by
the lockup.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-Off-By: Matthias Urlichs <smurf@smurf.noris.de>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c       |   5 ++
 arch/i386/kernel/time.c      |   1 +
 arch/x86_64/kernel/nmi.c     |   2 +
 arch/x86_64/kernel/time.c    |   1 +
 drivers/mtd/nand/nand_base.c |   1 +
 include/linux/sched.h        |  17 +++++
 init/main.c                  |   1 +
 kernel/Makefile              |   1 +
 kernel/power/swsusp.c        |   1 +
 kernel/softlockup.c          | 151 +++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c               |   1 +
 lib/Kconfig.debug            |  19 ++++++
 12 files changed, 201 insertions(+)
 create mode 100644 kernel/softlockup.c

(limited to 'include/linux')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 8bbdbda07a2d..0178457db721 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -478,6 +478,11 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		alert_counter[i] = 0;
+
+	/*
+	 * Tickle the softlockup detector too:
+	 */
+	touch_softlockup_watchdog();
 }
 
 extern void die_nmi(struct pt_regs *, const char *msg);
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 6f794a78ee1e..b0c5ee2b3446 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev)
 		last_timer->resume();
 	cur_timer = last_timer;
 	last_timer = NULL;
+	touch_softlockup_watchdog();
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 64a8e05d5811..84cae81fff8b 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -463,6 +463,8 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		per_cpu(nmi_touch, i) = 1;
+
+ 	touch_softlockup_watchdog();
 }
 
 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 66bf6ddeb0c3..2b5d9da912a2 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev)
 	write_sequnlock_irqrestore(&xtime_lock,flags);
 	jiffies += sleep_length;
 	wall_jiffies += sleep_length;
+	touch_softlockup_watchdog();
 	return 0;
 }
 
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index eee5115658c8..04e54318bc6a 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd)
 	do {
 		if (this->dev_ready(mtd))
 			return;
+		touch_softlockup_watchdog();
 	} while (time_before(jiffies, timeo));	
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dec5827c7742..5fb31bede103 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,6 +176,23 @@ extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+extern void softlockup_tick(struct pt_regs *regs);
+extern void spawn_softlockup_task(void);
+extern void touch_softlockup_watchdog(void);
+#else
+static inline void softlockup_tick(struct pt_regs *regs)
+{
+}
+static inline void spawn_softlockup_task(void)
+{
+}
+static inline void touch_softlockup_watchdog(void)
+{
+}
+#endif
+
+
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 /* Is this address in the __sched functions? */
diff --git a/init/main.c b/init/main.c
index ff410063e4e1..a29fb2ac7240 100644
--- a/init/main.c
+++ b/init/main.c
@@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void)
 	migration_init();
 #endif
 	spawn_ksoftirqd();
+	spawn_softlockup_task();
 }
 
 static void run_init_process(char *init_filename)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..8d57a2f1226b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index eaacd5cb5889..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1059,6 +1059,7 @@ int swsusp_resume(void)
 	BUG_ON(!error);
 	restore_processor_state();
 	restore_highmem();
+	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
 	return error;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
+/*
+ * Detect Soft Lockups
+ *
+ * started by Ingo Molnar, (C) 2005, Red Hat
+ *
+ * this code detects soft lockups: incidents in where on a CPU
+ * the kernel does not reschedule for 10 seconds or more.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+static DEFINE_SPINLOCK(print_lock);
+
+static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+
+static int did_panic = 0;
+static int softlock_panic(struct notifier_block *this, unsigned long event,
+				void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = softlock_panic,
+};
+
+void touch_softlockup_watchdog(void)
+{
+	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+/*
+ * This callback runs from the timer interrupt, and checks
+ * whether the watchdog thread has hung or not:
+ */
+void softlockup_tick(struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long timestamp = per_cpu(timestamp, this_cpu);
+
+	if (per_cpu(print_timestamp, this_cpu) == timestamp)
+		return;
+
+	/* Do not cause a second panic when there already was one */
+	if (did_panic)
+		return;
+
+	if (time_after(jiffies, timestamp + 10*HZ)) {
+		per_cpu(print_timestamp, this_cpu) = timestamp;
+
+		spin_lock(&print_lock);
+		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
+			this_cpu);
+		show_regs(regs);
+		spin_unlock(&print_lock);
+	}
+}
+
+/*
+ * The watchdog thread - runs every second and touches the timestamp.
+ */
+static int watchdog(void * __bind_cpu)
+{
+	struct sched_param param = { .sched_priority = 99 };
+	int this_cpu = (long) __bind_cpu;
+
+	printk("softlockup thread %d started up.\n", this_cpu);
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	current->flags |= PF_NOFREEZE;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/*
+	 * Run briefly once per second - if this gets delayed for
+	 * more than 10 seconds then the debug-printout triggers
+	 * in softlockup_tick():
+	 */
+	while (!kthread_should_stop()) {
+		msleep_interruptible(1000);
+		touch_softlockup_watchdog();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __devinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		BUG_ON(per_cpu(watchdog_task, hotcpu));
+		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("watchdog for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+  		per_cpu(watchdog_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+ 		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(watchdog_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+		p = per_cpu(watchdog_task, hotcpu);
+		per_cpu(watchdog_task, hotcpu) = NULL;
+		kthread_stop(p);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ 	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init void spawn_softlockup_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+
+	notifier_chain_register(&panic_notifier_list, &panic_block);
+}
+
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..1433d87f46b3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
 	update_times();
+	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 299f7f3b5b08..3754c9a8f5c8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -46,6 +46,25 @@ config LOG_BUF_SHIFT
 		     13 =>  8 KB
 		     12 =>  4 KB
 
+config DETECT_SOFTLOCKUP
+	bool "Detect Soft Lockups"
+	depends on DEBUG_KERNEL
+	default y
+	help
+	  Say Y here to enable the kernel to detect "soft lockups",
+	  which are bugs that cause the kernel to loop in kernel
+	  mode for more than 10 seconds, without giving other tasks a
+	  chance to run.
+
+	  When a soft-lockup is detected, the kernel will print the
+	  current stack trace (which you should report), but the
+	  system will stay locked up. This feature has negligible
+	  overhead.
+
+	  (Note that "hard lockups" are separate type of bugs that
+	   can be detected via the NMI-watchdog, on platforms that
+	   support it.)
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
-- 
cgit v1.2.3


From e82894f84dbba130ab46c97748c03647f8204f92 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Tue, 6 Sep 2005 15:16:30 -0700
Subject: [PATCH] relayfs

Here's the latest version of relayfs, against linux-2.6.11-mm2.  I'm hoping
you'll consider putting this version back into your tree - the previous
rounds of comment seem to have shaken out all the API issues and the number
of comments on the code itself have also steadily dwindled.

This patch is essentially the same as the relayfs redux part 5 patch, with
some minor changes based on reviewer comments.  Thanks again to Pekka
Enberg for those.  The patch size without documentation is now a little
smaller at just over 40k.  Here's a detailed list of the changes:

- removed the attribute_flags in relay open and changed it to a
  boolean specifying either overwrite or no-overwrite mode, and removed
  everything referencing the attribute flags.
- added a check for NULL names in relayfs_create_entry()
- got rid of the unnecessary multiple labels in relay_create_buf()
- some minor simplification of relay_alloc_buf() which got rid of a
  couple params
- updated the Documentation

In addition, this version (through code contained in the relay-apps tarball
linked to below, not as part of the relayfs patch) tries to make it as easy
as possible to create the cooperating kernel/user pieces of a typical and
common type of logging application, one where kernel logging is kicked off
when a user space data collection app starts and stops when the collection
app exits, with the data being automatically logged to disk in between.  To
create this type of application, you basically just include a header file
(relay-app.h, included in the relay-apps tarball) in your kernel module,
define a couple of callbacks and call an initialization function, and on
the user side call a single function that sets up and continuously monitors
the buffers, and writes data to files as it becomes available.  Channels
are created when the collection app is started and destroyed when it exits,
not when the kernel module is inserted, so different channel buffer sizes
can be specified for each separate run via command-line options.  See the
README in the relay-apps tarball for details.

Also included in the relay-apps tarball are a couple examples
demonstrating how you can use this to create quick and dirty kernel
logging/debugging applications.  They are:

- tprintk, short for 'tee printk', which temporarily puts a kprobe on
  printk() and writes a duplicate stream of printk output to a relayfs
  channel.  This could be used anywhere there's printk() debugging code
  in the kernel which you'd like to exercise, but would rather not have
  your system logs cluttered with debugging junk.  You'd probably want
  to kill klogd while you do this, otherwise there wouldn't be much
  point (since putting a kprobe on printk() doesn't change the output
  of printk()).  I've used this method to temporarily divert the packet
  logging output of the iptables LOG target from the system logs to
  relayfs files instead, for instance.

- klog, which just provides a printk-like formatted logging function
  on top of relayfs.  Again, you can use this to keep stuff out of your
  system logs if used in place of printk.

The example applications can be found here:

http://prdownloads.sourceforge.net/dprobes/relay-apps.tar.gz?download

From: Christoph Hellwig <hch@lst.de>

  avoid lookup_hash usage in relayfs

Signed-off-by: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/relayfs.txt | 362 ++++++++++++++++++++
 fs/Kconfig                            |  12 +
 fs/Makefile                           |   1 +
 fs/relayfs/Makefile                   |   4 +
 fs/relayfs/buffers.c                  | 189 +++++++++++
 fs/relayfs/buffers.h                  |  12 +
 fs/relayfs/inode.c                    | 609 ++++++++++++++++++++++++++++++++++
 fs/relayfs/relay.c                    | 431 ++++++++++++++++++++++++
 fs/relayfs/relay.h                    |  12 +
 include/linux/relayfs_fs.h            | 255 ++++++++++++++
 10 files changed, 1887 insertions(+)
 create mode 100644 Documentation/filesystems/relayfs.txt
 create mode 100644 fs/relayfs/Makefile
 create mode 100644 fs/relayfs/buffers.c
 create mode 100644 fs/relayfs/buffers.h
 create mode 100644 fs/relayfs/inode.c
 create mode 100644 fs/relayfs/relay.c
 create mode 100644 fs/relayfs/relay.h
 create mode 100644 include/linux/relayfs_fs.h

(limited to 'include/linux')

diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt
new file mode 100644
index 000000000000..d24e1b0d4f39
--- /dev/null
+++ b/Documentation/filesystems/relayfs.txt
@@ -0,0 +1,362 @@
+
+relayfs - a high-speed data relay filesystem
+============================================
+
+relayfs is a filesystem designed to provide an efficient mechanism for
+tools and facilities to relay large and potentially sustained streams
+of data from kernel space to user space.
+
+The main abstraction of relayfs is the 'channel'.  A channel consists
+of a set of per-cpu kernel buffers each represented by a file in the
+relayfs filesystem.  Kernel clients write into a channel using
+efficient write functions which automatically log to the current cpu's
+channel buffer.  User space applications mmap() the per-cpu files and
+retrieve the data as it becomes available.
+
+The format of the data logged into the channel buffers is completely
+up to the relayfs client; relayfs does however provide hooks which
+allow clients to impose some stucture on the buffer data.  Nor does
+relayfs implement any form of data filtering - this also is left to
+the client.  The purpose is to keep relayfs as simple as possible.
+
+This document provides an overview of the relayfs API.  The details of
+the function parameters are documented along with the functions in the
+filesystem code - please see that for details.
+
+Semantics
+=========
+
+Each relayfs channel has one buffer per CPU, each buffer has one or
+more sub-buffers. Messages are written to the first sub-buffer until
+it is too full to contain a new message, in which case it it is
+written to the next (if available).  Messages are never split across
+sub-buffers.  At this point, userspace can be notified so it empties
+the first sub-buffer, while the kernel continues writing to the next.
+
+When notified that a sub-buffer is full, the kernel knows how many
+bytes of it are padding i.e. unused.  Userspace can use this knowledge
+to copy only valid data.
+
+After copying it, userspace can notify the kernel that a sub-buffer
+has been consumed.
+
+relayfs can operate in a mode where it will overwrite data not yet
+collected by userspace, and not wait for it to consume it.
+
+relayfs itself does not provide for communication of such data between
+userspace and kernel, allowing the kernel side to remain simple and not
+impose a single interface on userspace. It does provide a separate
+helper though, described below.
+
+klog, relay-app & librelay
+==========================
+
+relayfs itself is ready to use, but to make things easier, two
+additional systems are provided.  klog is a simple wrapper to make
+writing formatted text or raw data to a channel simpler, regardless of
+whether a channel to write into exists or not, or whether relayfs is
+compiled into the kernel or is configured as a module.  relay-app is
+the kernel counterpart of userspace librelay.c, combined these two
+files provide glue to easily stream data to disk, without having to
+bother with housekeeping.  klog and relay-app can be used together,
+with klog providing high-level logging functions to the kernel and
+relay-app taking care of kernel-user control and disk-logging chores.
+
+It is possible to use relayfs without relay-app & librelay, but you'll
+have to implement communication between userspace and kernel, allowing
+both to convey the state of buffers (full, empty, amount of padding).
+
+klog, relay-app and librelay can be found in the relay-apps tarball on
+http://relayfs.sourceforge.net
+
+The relayfs user space API
+==========================
+
+relayfs implements basic file operations for user space access to
+relayfs channel buffer data.  Here are the file operations that are
+available and some comments regarding their behavior:
+
+open()	 enables user to open an _existing_ buffer.
+
+mmap()	 results in channel buffer being mapped into the caller's
+	 memory space. Note that you can't do a partial mmap - you must
+	 map the entire file, which is NRBUF * SUBBUFSIZE.
+
+read()	 read the contents of a channel buffer.  The bytes read are
+	 'consumed' by the reader i.e. they won't be available again
+	 to subsequent reads.  If the channel is being used in
+	 no-overwrite mode (the default), it can be read at any time
+	 even if there's an active kernel writer.  If the channel is
+	 being used in overwrite mode and there are active channel
+	 writers, results may be unpredictable - users should make
+	 sure that all logging to the channel has ended before using
+	 read() with overwrite mode.
+
+poll()	 POLLIN/POLLRDNORM/POLLERR supported.  User applications are
+	 notified when sub-buffer boundaries are crossed.
+
+close() decrements the channel buffer's refcount.  When the refcount
+	reaches 0 i.e. when no process or kernel client has the buffer
+	open, the channel buffer is freed.
+
+
+In order for a user application to make use of relayfs files, the
+relayfs filesystem must be mounted.  For example,
+
+	mount -t relayfs relayfs /mnt/relay
+
+NOTE:	relayfs doesn't need to be mounted for kernel clients to create
+	or use channels - it only needs to be mounted when user space
+	applications need access to the buffer data.
+
+
+The relayfs kernel API
+======================
+
+Here's a summary of the API relayfs provides to in-kernel clients:
+
+
+  channel management functions:
+
+    relay_open(base_filename, parent, subbuf_size, n_subbufs,
+               callbacks)
+    relay_close(chan)
+    relay_flush(chan)
+    relay_reset(chan)
+    relayfs_create_dir(name, parent)
+    relayfs_remove_dir(dentry)
+
+  channel management typically called on instigation of userspace:
+
+    relay_subbufs_consumed(chan, cpu, subbufs_consumed)
+
+  write functions:
+
+    relay_write(chan, data, length)
+    __relay_write(chan, data, length)
+    relay_reserve(chan, length)
+
+  callbacks:
+
+    subbuf_start(buf, subbuf, prev_subbuf, prev_padding)
+    buf_mapped(buf, filp)
+    buf_unmapped(buf, filp)
+
+  helper functions:
+
+    relay_buf_full(buf)
+    subbuf_start_reserve(buf, length)
+
+
+Creating a channel
+------------------
+
+relay_open() is used to create a channel, along with its per-cpu
+channel buffers.  Each channel buffer will have an associated file
+created for it in the relayfs filesystem, which can be opened and
+mmapped from user space if desired.  The files are named
+basename0...basenameN-1 where N is the number of online cpus, and by
+default will be created in the root of the filesystem.  If you want a
+directory structure to contain your relayfs files, you can create it
+with relayfs_create_dir() and pass the parent directory to
+relay_open().  Clients are responsible for cleaning up any directory
+structure they create when the channel is closed - use
+relayfs_remove_dir() for that.
+
+The total size of each per-cpu buffer is calculated by multiplying the
+number of sub-buffers by the sub-buffer size passed into relay_open().
+The idea behind sub-buffers is that they're basically an extension of
+double-buffering to N buffers, and they also allow applications to
+easily implement random-access-on-buffer-boundary schemes, which can
+be important for some high-volume applications.  The number and size
+of sub-buffers is completely dependent on the application and even for
+the same application, different conditions will warrant different
+values for these parameters at different times.  Typically, the right
+values to use are best decided after some experimentation; in general,
+though, it's safe to assume that having only 1 sub-buffer is a bad
+idea - you're guaranteed to either overwrite data or lose events
+depending on the channel mode being used.
+
+Channel 'modes'
+---------------
+
+relayfs channels can be used in either of two modes - 'overwrite' or
+'no-overwrite'.  The mode is entirely determined by the implementation
+of the subbuf_start() callback, as described below.  In 'overwrite'
+mode, also known as 'flight recorder' mode, writes continuously cycle
+around the buffer and will never fail, but will unconditionally
+overwrite old data regardless of whether it's actually been consumed.
+In no-overwrite mode, writes will fail i.e. data will be lost, if the
+number of unconsumed sub-buffers equals the total number of
+sub-buffers in the channel.  It should be clear that if there is no
+consumer or if the consumer can't consume sub-buffers fast enought,
+data will be lost in either case; the only difference is whether data
+is lost from the beginning or the end of a buffer.
+
+As explained above, a relayfs channel is made of up one or more
+per-cpu channel buffers, each implemented as a circular buffer
+subdivided into one or more sub-buffers.  Messages are written into
+the current sub-buffer of the channel's current per-cpu buffer via the
+write functions described below.  Whenever a message can't fit into
+the current sub-buffer, because there's no room left for it, the
+client is notified via the subbuf_start() callback that a switch to a
+new sub-buffer is about to occur.  The client uses this callback to 1)
+initialize the next sub-buffer if appropriate 2) finalize the previous
+sub-buffer if appropriate and 3) return a boolean value indicating
+whether or not to actually go ahead with the sub-buffer switch.
+
+To implement 'no-overwrite' mode, the userspace client would provide
+an implementation of the subbuf_start() callback something like the
+following:
+
+static int subbuf_start(struct rchan_buf *buf,
+                        void *subbuf,
+			void *prev_subbuf,
+			unsigned int prev_padding)
+{
+	if (prev_subbuf)
+		*((unsigned *)prev_subbuf) = prev_padding;
+
+	if (relay_buf_full(buf))
+		return 0;
+
+	subbuf_start_reserve(buf, sizeof(unsigned int));
+
+	return 1;
+}
+
+If the current buffer is full i.e. all sub-buffers remain unconsumed,
+the callback returns 0 to indicate that the buffer switch should not
+occur yet i.e. until the consumer has had a chance to read the current
+set of ready sub-buffers.  For the relay_buf_full() function to make
+sense, the consumer is reponsible for notifying relayfs when
+sub-buffers have been consumed via relay_subbufs_consumed().  Any
+subsequent attempts to write into the buffer will again invoke the
+subbuf_start() callback with the same parameters; only when the
+consumer has consumed one or more of the ready sub-buffers will
+relay_buf_full() return 0, in which case the buffer switch can
+continue.
+
+The implementation of the subbuf_start() callback for 'overwrite' mode
+would be very similar:
+
+static int subbuf_start(struct rchan_buf *buf,
+                        void *subbuf,
+			void *prev_subbuf,
+			unsigned int prev_padding)
+{
+	if (prev_subbuf)
+		*((unsigned *)prev_subbuf) = prev_padding;
+
+	subbuf_start_reserve(buf, sizeof(unsigned int));
+
+	return 1;
+}
+
+In this case, the relay_buf_full() check is meaningless and the
+callback always returns 1, causing the buffer switch to occur
+unconditionally.  It's also meaningless for the client to use the
+relay_subbufs_consumed() function in this mode, as it's never
+consulted.
+
+The default subbuf_start() implementation, used if the client doesn't
+define any callbacks, or doesn't define the subbuf_start() callback,
+implements the simplest possible 'no-overwrite' mode i.e. it does
+nothing but return 0.
+
+Header information can be reserved at the beginning of each sub-buffer
+by calling the subbuf_start_reserve() helper function from within the
+subbuf_start() callback.  This reserved area can be used to store
+whatever information the client wants.  In the example above, room is
+reserved in each sub-buffer to store the padding count for that
+sub-buffer.  This is filled in for the previous sub-buffer in the
+subbuf_start() implementation; the padding value for the previous
+sub-buffer is passed into the subbuf_start() callback along with a
+pointer to the previous sub-buffer, since the padding value isn't
+known until a sub-buffer is filled.  The subbuf_start() callback is
+also called for the first sub-buffer when the channel is opened, to
+give the client a chance to reserve space in it.  In this case the
+previous sub-buffer pointer passed into the callback will be NULL, so
+the client should check the value of the prev_subbuf pointer before
+writing into the previous sub-buffer.
+
+Writing to a channel
+--------------------
+
+kernel clients write data into the current cpu's channel buffer using
+relay_write() or __relay_write().  relay_write() is the main logging
+function - it uses local_irqsave() to protect the buffer and should be
+used if you might be logging from interrupt context.  If you know
+you'll never be logging from interrupt context, you can use
+__relay_write(), which only disables preemption.  These functions
+don't return a value, so you can't determine whether or not they
+failed - the assumption is that you wouldn't want to check a return
+value in the fast logging path anyway, and that they'll always succeed
+unless the buffer is full and no-overwrite mode is being used, in
+which case you can detect a failed write in the subbuf_start()
+callback by calling the relay_buf_full() helper function.
+
+relay_reserve() is used to reserve a slot in a channel buffer which
+can be written to later.  This would typically be used in applications
+that need to write directly into a channel buffer without having to
+stage data in a temporary buffer beforehand.  Because the actual write
+may not happen immediately after the slot is reserved, applications
+using relay_reserve() can keep a count of the number of bytes actually
+written, either in space reserved in the sub-buffers themselves or as
+a separate array.  See the 'reserve' example in the relay-apps tarball
+at http://relayfs.sourceforge.net for an example of how this can be
+done.  Because the write is under control of the client and is
+separated from the reserve, relay_reserve() doesn't protect the buffer
+at all - it's up to the client to provide the appropriate
+synchronization when using relay_reserve().
+
+Closing a channel
+-----------------
+
+The client calls relay_close() when it's finished using the channel.
+The channel and its associated buffers are destroyed when there are no
+longer any references to any of the channel buffers.  relay_flush()
+forces a sub-buffer switch on all the channel buffers, and can be used
+to finalize and process the last sub-buffers before the channel is
+closed.
+
+Misc
+----
+
+Some applications may want to keep a channel around and re-use it
+rather than open and close a new channel for each use.  relay_reset()
+can be used for this purpose - it resets a channel to its initial
+state without reallocating channel buffer memory or destroying
+existing mappings.  It should however only be called when it's safe to
+do so i.e. when the channel isn't currently being written to.
+
+Finally, there are a couple of utility callbacks that can be used for
+different purposes.  buf_mapped() is called whenever a channel buffer
+is mmapped from user space and buf_unmapped() is called when it's
+unmapped.  The client can use this notification to trigger actions
+within the kernel application, such as enabling/disabling logging to
+the channel.
+
+
+Resources
+=========
+
+For news, example code, mailing list, etc. see the relayfs homepage:
+
+    http://relayfs.sourceforge.net
+
+
+Credits
+=======
+
+The ideas and specs for relayfs came about as a result of discussions
+on tracing involving the following:
+
+Michel Dagenais		<michel.dagenais@polymtl.ca>
+Richard Moore		<richardj_moore@uk.ibm.com>
+Bob Wisniewski		<bob@watson.ibm.com>
+Karim Yaghmour		<karim@opersys.com>
+Tom Zanussi		<zanussi@us.ibm.com>
+
+Also thanks to Hubertus Franke for a lot of useful suggestions and bug
+reports.
diff --git a/fs/Kconfig b/fs/Kconfig
index ed78d24ee426..740d6ff0367d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -816,6 +816,18 @@ config RAMFS
 	  To compile this as a module, choose M here: the module will be called
 	  ramfs.
 
+config RELAYFS_FS
+	tristate "Relayfs file system support"
+	---help---
+	  Relayfs is a high-speed data relay filesystem designed to provide
+	  an efficient mechanism for tools and facilities to relay large
+	  amounts of data from kernel space to user space.
+
+	  To compile this code as a module, choose M here: the module will be
+	  called relayfs.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index cf95eb894fd5..15158309dee4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_UDF_FS)		+= udf/
+obj-$(CONFIG_RELAYFS_FS)	+= relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
new file mode 100644
index 000000000000..e76e182cdb38
--- /dev/null
+++ b/fs/relayfs/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_RELAYFS_FS) += relayfs.o
+
+relayfs-y := relay.o inode.o buffers.o
+
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
new file mode 100644
index 000000000000..2aa8e2719999
--- /dev/null
+++ b/fs/relayfs/buffers.c
@@ -0,0 +1,189 @@
+/*
+ * RelayFS buffer management code.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+
+/*
+ * close() vm_op implementation for relayfs file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = vma->vm_private_data;
+	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * nopage() vm_op implementation for relayfs file mapping.
+ */
+static struct page *relay_buf_nopage(struct vm_area_struct *vma,
+				     unsigned long address,
+				     int *type)
+{
+	struct page *page;
+	struct rchan_buf *buf = vma->vm_private_data;
+	unsigned long offset = address - vma->vm_start;
+
+	if (address > vma->vm_end)
+		return NOPAGE_SIGBUS; /* Disallow mremap */
+	if (!buf)
+		return NOPAGE_OOM;
+
+	page = vmalloc_to_page(buf->start + offset);
+	if (!page)
+		return NOPAGE_OOM;
+	get_page(page);
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	return page;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+	.nopage = relay_buf_nopage,
+	.close = relay_file_mmap_close,
+};
+
+/**
+ *	relay_mmap_buf: - mmap channel buffer to process address space
+ *	@buf: relay channel buffer
+ *	@vma: vm_area_struct describing memory to be mapped
+ *
+ *	Returns 0 if ok, negative on error
+ *
+ *	Caller should already have grabbed mmap_sem.
+ */
+int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+	unsigned long length = vma->vm_end - vma->vm_start;
+	struct file *filp = vma->vm_file;
+
+	if (!buf)
+		return -EBADF;
+
+	if (length != (unsigned long)buf->chan->alloc_size)
+		return -EINVAL;
+
+	vma->vm_ops = &relay_file_mmap_ops;
+	vma->vm_private_data = buf;
+	buf->chan->cb->buf_mapped(buf, filp);
+
+	return 0;
+}
+
+/**
+ *	relay_alloc_buf - allocate a channel buffer
+ *	@buf: the buffer struct
+ *	@size: total size of the buffer
+ *
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+{
+	void *mem;
+	unsigned int i, j, n_pages;
+
+	size = PAGE_ALIGN(size);
+	n_pages = size >> PAGE_SHIFT;
+
+	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!buf->page_array)
+		return NULL;
+
+	for (i = 0; i < n_pages; i++) {
+		buf->page_array[i] = alloc_page(GFP_KERNEL);
+		if (unlikely(!buf->page_array[i]))
+			goto depopulate;
+	}
+	mem = vmap(buf->page_array, n_pages, GFP_KERNEL, PAGE_KERNEL);
+	if (!mem)
+		goto depopulate;
+
+	memset(mem, 0, size);
+	buf->page_count = n_pages;
+	return mem;
+
+depopulate:
+	for (j = 0; j < i; j++)
+		__free_page(buf->page_array[j]);
+	kfree(buf->page_array);
+	return NULL;
+}
+
+/**
+ *	relay_create_buf - allocate and initialize a channel buffer
+ *	@alloc_size: size of the buffer to allocate
+ *	@n_subbufs: number of sub-buffers in the channel
+ *
+ *	Returns channel buffer if successful, NULL otherwise
+ */
+struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+	if (!buf->padding)
+		goto free_buf;
+
+	buf->start = relay_alloc_buf(buf, chan->alloc_size);
+	if (!buf->start)
+		goto free_buf;
+
+	buf->chan = chan;
+	kref_get(&buf->chan->kref);
+	return buf;
+
+free_buf:
+	kfree(buf->padding);
+	kfree(buf);
+	return NULL;
+}
+
+/**
+ *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ *	@buf: the buffer struct
+ */
+void relay_destroy_buf(struct rchan_buf *buf)
+{
+	struct rchan *chan = buf->chan;
+	unsigned int i;
+
+	if (likely(buf->start)) {
+		vunmap(buf->start);
+		for (i = 0; i < buf->page_count; i++)
+			__free_page(buf->page_array[i]);
+		kfree(buf->page_array);
+	}
+	kfree(buf->padding);
+	kfree(buf);
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_remove_buf - remove a channel buffer
+ *
+ *	Removes the file from the relayfs fileystem, which also frees the
+ *	rchan_buf_struct and the channel buffer.  Should only be called from
+ *	kref_put().
+ */
+void relay_remove_buf(struct kref *kref)
+{
+	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+	relayfs_remove(buf->dentry);
+}
diff --git a/fs/relayfs/buffers.h b/fs/relayfs/buffers.h
new file mode 100644
index 000000000000..37a12493f641
--- /dev/null
+++ b/fs/relayfs/buffers.h
@@ -0,0 +1,12 @@
+#ifndef _BUFFERS_H
+#define _BUFFERS_H
+
+/* This inspired by rtai/shmem */
+#define FIX_SIZE(x) (((x) - 1) & PAGE_MASK) + PAGE_SIZE
+
+extern int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma);
+extern struct rchan_buf *relay_create_buf(struct rchan *chan);
+extern void relay_destroy_buf(struct rchan_buf *buf);
+extern void relay_remove_buf(struct kref *kref);
+
+#endif/* _BUFFERS_H */
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
new file mode 100644
index 000000000000..0f7f88d067ad
--- /dev/null
+++ b/fs/relayfs/inode.c
@@ -0,0 +1,609 @@
+/*
+ * VFS-related code for RelayFS, a high-speed data relay filesystem.
+ *
+ * Copyright (C) 2003-2005 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
+ * Copyright (C) 2003-2005 - Karim Yaghmour <karim@opersys.com>
+ *
+ * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+
+#define RELAYFS_MAGIC			0xF0B4A981
+
+static struct vfsmount *		relayfs_mount;
+static int				relayfs_mount_count;
+static kmem_cache_t *			relayfs_inode_cachep;
+
+static struct backing_dev_info		relayfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
+				       struct rchan *chan)
+{
+	struct rchan_buf *buf = NULL;
+	struct inode *inode;
+
+	if (S_ISREG(mode)) {
+		BUG_ON(!chan);
+		buf = relay_create_buf(chan);
+		if (!buf)
+			return NULL;
+	}
+
+	inode = new_inode(sb);
+	if (!inode) {
+		relay_destroy_buf(buf);
+		return NULL;
+	}
+
+	inode->i_mode = mode;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_blksize = PAGE_CACHE_SIZE;
+	inode->i_blocks = 0;
+	inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_fop = &relayfs_file_operations;
+		RELAYFS_I(inode)->buf = buf;
+		break;
+	case S_IFDIR:
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+		break;
+	default:
+		break;
+	}
+
+	return inode;
+}
+
+/**
+ *	relayfs_create_entry - create a relayfs directory or file
+ *	@name: the name of the file to create
+ *	@parent: parent directory
+ *	@mode: mode
+ *	@chan: relay channel associated with the file
+ *
+ *	Returns the new dentry, NULL on failure
+ *
+ *	Creates a file or directory with the specifed permissions.
+ */
+static struct dentry *relayfs_create_entry(const char *name,
+					   struct dentry *parent,
+					   int mode,
+					   struct rchan *chan)
+{
+	struct dentry *d;
+	struct inode *inode;
+	int error = 0;
+
+	BUG_ON(!name || !(S_ISREG(mode) || S_ISDIR(mode)));
+
+	error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
+	if (error) {
+		printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
+		return NULL;
+	}
+
+	if (!parent && relayfs_mount && relayfs_mount->mnt_sb)
+		parent = relayfs_mount->mnt_sb->s_root;
+
+	if (!parent) {
+		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+		return NULL;
+	}
+
+	parent = dget(parent);
+	down(&parent->d_inode->i_sem);
+	d = lookup_one_len(name, parent, strlen(name));
+	if (IS_ERR(d)) {
+		d = NULL;
+		goto release_mount;
+	}
+
+	if (d->d_inode) {
+		d = NULL;
+		goto release_mount;
+	}
+
+	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, chan);
+	if (!inode) {
+		d = NULL;
+		goto release_mount;
+	}
+
+	d_instantiate(d, inode);
+	dget(d);	/* Extra count - pin the dentry in core */
+
+	if (S_ISDIR(mode))
+		parent->d_inode->i_nlink++;
+
+	goto exit;
+
+release_mount:
+	simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+
+exit:
+	up(&parent->d_inode->i_sem);
+	dput(parent);
+	return d;
+}
+
+/**
+ *	relayfs_create_file - create a file in the relay filesystem
+ *	@name: the name of the file to create
+ *	@parent: parent directory
+ *	@mode: mode, if not specied the default perms are used
+ *	@chan: channel associated with the file
+ *
+ *	Returns file dentry if successful, NULL otherwise.
+ *
+ *	The file will be created user r on behalf of current user.
+ */
+struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
+				   int mode, struct rchan *chan)
+{
+	if (!mode)
+		mode = S_IRUSR;
+	mode = (mode & S_IALLUGO) | S_IFREG;
+
+	return relayfs_create_entry(name, parent, mode, chan);
+}
+
+/**
+ *	relayfs_create_dir - create a directory in the relay filesystem
+ *	@name: the name of the directory to create
+ *	@parent: parent directory, NULL if parent should be fs root
+ *
+ *	Returns directory dentry if successful, NULL otherwise.
+ *
+ *	The directory will be created world rwx on behalf of current user.
+ */
+struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
+{
+	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	return relayfs_create_entry(name, parent, mode, NULL);
+}
+
+/**
+ *	relayfs_remove - remove a file or directory in the relay filesystem
+ *	@dentry: file or directory dentry
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int relayfs_remove(struct dentry *dentry)
+{
+	struct dentry *parent;
+	int error = 0;
+
+	if (!dentry)
+		return -EINVAL;
+	parent = dentry->d_parent;
+	if (!parent)
+		return -EINVAL;
+
+	parent = dget(parent);
+	down(&parent->d_inode->i_sem);
+	if (dentry->d_inode) {
+		if (S_ISDIR(dentry->d_inode->i_mode))
+			error = simple_rmdir(parent->d_inode, dentry);
+		else
+			error = simple_unlink(parent->d_inode, dentry);
+		if (!error)
+			d_delete(dentry);
+	}
+	if (!error)
+		dput(dentry);
+	up(&parent->d_inode->i_sem);
+	dput(parent);
+
+	if (!error)
+		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+
+	return error;
+}
+
+/**
+ *	relayfs_remove_dir - remove a directory in the relay filesystem
+ *	@dentry: directory dentry
+ *
+ *	Returns 0 if successful, negative otherwise.
+ */
+int relayfs_remove_dir(struct dentry *dentry)
+{
+	return relayfs_remove(dentry);
+}
+
+/**
+ *	relayfs_open - open file op for relayfs files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Increments the channel buffer refcount.
+ */
+static int relayfs_open(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	kref_get(&buf->kref);
+
+	return 0;
+}
+
+/**
+ *	relayfs_mmap - mmap file op for relayfs files
+ *	@filp: the file
+ *	@vma: the vma describing what to map
+ *
+ *	Calls upon relay_mmap_buf to map the file into user space.
+ */
+static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	return relay_mmap_buf(RELAYFS_I(inode)->buf, vma);
+}
+
+/**
+ *	relayfs_poll - poll file op for relayfs files
+ *	@filp: the file
+ *	@wait: poll table
+ *
+ *	Poll implemention.
+ */
+static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask = 0;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+
+	if (buf->finalized)
+		return POLLERR;
+
+	if (filp->f_mode & FMODE_READ) {
+		poll_wait(filp, &buf->read_wait, wait);
+		if (!relay_buf_empty(buf))
+			mask |= POLLIN | POLLRDNORM;
+	}
+
+	return mask;
+}
+
+/**
+ *	relayfs_release - release file op for relayfs files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Decrements the channel refcount, as the filesystem is
+ *	no longer using it.
+ */
+static int relayfs_release(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	kref_put(&buf->kref, relay_remove_buf);
+
+	return 0;
+}
+
+/**
+ *	relayfs_read_consume - update the consumed count for the buffer
+ */
+static void relayfs_read_consume(struct rchan_buf *buf,
+				 size_t read_pos,
+				 size_t bytes_consumed)
+{
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t read_subbuf;
+
+	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+
+	buf->bytes_consumed += bytes_consumed;
+	read_subbuf = read_pos / buf->chan->subbuf_size;
+	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+		    (buf->offset == subbuf_size))
+			return;
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+}
+
+/**
+ *	relayfs_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+	size_t bytes_produced, bytes_consumed, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t produced = buf->subbufs_produced % n_subbufs;
+	size_t consumed = buf->subbufs_consumed % n_subbufs;
+
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+
+	if (consumed > produced) {
+		if ((produced > n_subbufs) &&
+		    (produced + n_subbufs - consumed <= n_subbufs))
+			produced += n_subbufs;
+	} else if (consumed == produced) {
+		if (buf->offset > subbuf_size) {
+			produced += n_subbufs;
+			if (buf->subbufs_produced == buf->subbufs_consumed)
+				consumed += n_subbufs;
+		}
+	}
+
+	if (buf->offset > subbuf_size)
+		bytes_produced = (produced - 1) * subbuf_size + write_offset;
+	else
+		bytes_produced = produced * subbuf_size + write_offset;
+	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
+
+	if (bytes_produced == bytes_consumed)
+		return 0;
+
+	relayfs_read_consume(buf, read_pos, 0);
+
+	return 1;
+}
+
+/**
+ *	relayfs_read_subbuf_avail - return bytes available in sub-buffer
+ */
+static size_t relayfs_read_subbuf_avail(size_t read_pos,
+					struct rchan_buf *buf)
+{
+	size_t padding, avail = 0;
+	size_t read_subbuf, read_offset, write_subbuf, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+
+	write_subbuf = (buf->data - buf->start) / subbuf_size;
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+	read_subbuf = read_pos / subbuf_size;
+	read_offset = read_pos % subbuf_size;
+	padding = buf->padding[read_subbuf];
+
+	if (read_subbuf == write_subbuf) {
+		if (read_offset + padding < write_offset)
+			avail = write_offset - (read_offset + padding);
+	} else
+		avail = (subbuf_size - padding) - read_offset;
+
+	return avail;
+}
+
+/**
+ *	relayfs_read_start_pos - find the first available byte to read
+ *
+ *	If the read_pos is in the middle of padding, return the
+ *	position of the first actually available byte, otherwise
+ *	return the original value.
+ */
+static size_t relayfs_read_start_pos(size_t read_pos,
+				     struct rchan_buf *buf)
+{
+	size_t read_subbuf, padding, padding_start, padding_end;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	padding_start = (read_subbuf + 1) * subbuf_size - padding;
+	padding_end = (read_subbuf + 1) * subbuf_size;
+	if (read_pos >= padding_start && read_pos < padding_end) {
+		read_subbuf = (read_subbuf + 1) % n_subbufs;
+		read_pos = read_subbuf * subbuf_size;
+	}
+
+	return read_pos;
+}
+
+/**
+ *	relayfs_read_end_pos - return the new read position
+ */
+static size_t relayfs_read_end_pos(struct rchan_buf *buf,
+				   size_t read_pos,
+				   size_t count)
+{
+	size_t read_subbuf, padding, end_pos;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	if (read_pos % subbuf_size + count + padding == subbuf_size)
+		end_pos = (read_subbuf + 1) * subbuf_size;
+	else
+		end_pos = read_pos + count;
+	if (end_pos >= subbuf_size * n_subbufs)
+		end_pos = 0;
+
+	return end_pos;
+}
+
+/**
+ *	relayfs_read - read file op for relayfs files
+ *	@filp: the file
+ *	@buffer: the userspace buffer
+ *	@count: number of bytes to read
+ *	@ppos: position to read from
+ *
+ *	Reads count bytes or the number of bytes available in the
+ *	current sub-buffer being read, whichever is smaller.
+ */
+static ssize_t relayfs_read(struct file *filp,
+			    char __user *buffer,
+			    size_t count,
+			    loff_t *ppos)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+	size_t read_start, avail;
+	ssize_t ret = 0;
+	void *from;
+
+	down(&inode->i_sem);
+	if(!relayfs_read_avail(buf, *ppos))
+		goto out;
+
+	read_start = relayfs_read_start_pos(*ppos, buf);
+	avail = relayfs_read_subbuf_avail(read_start, buf);
+	if (!avail)
+		goto out;
+
+	from = buf->start + read_start;
+	ret = count = min(count, avail);
+	if (copy_to_user(buffer, from, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	relayfs_read_consume(buf, read_start, count);
+	*ppos = relayfs_read_end_pos(buf, read_start, count);
+out:
+	up(&inode->i_sem);
+	return ret;
+}
+
+/**
+ *	relayfs alloc_inode() implementation
+ */
+static struct inode *relayfs_alloc_inode(struct super_block *sb)
+{
+	struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
+	if (!p)
+		return NULL;
+	p->buf = NULL;
+
+	return &p->vfs_inode;
+}
+
+/**
+ *	relayfs destroy_inode() implementation
+ */
+static void relayfs_destroy_inode(struct inode *inode)
+{
+	if (RELAYFS_I(inode)->buf)
+		relay_destroy_buf(RELAYFS_I(inode)->buf);
+
+	kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode));
+}
+
+static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct relayfs_inode_info *i = p;
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&i->vfs_inode);
+}
+
+struct file_operations relayfs_file_operations = {
+	.open		= relayfs_open,
+	.poll		= relayfs_poll,
+	.mmap		= relayfs_mmap,
+	.read		= relayfs_read,
+	.llseek		= no_llseek,
+	.release	= relayfs_release,
+};
+
+static struct super_operations relayfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.alloc_inode	= relayfs_alloc_inode,
+	.destroy_inode	= relayfs_destroy_inode,
+};
+
+static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = RELAYFS_MAGIC;
+	sb->s_op = &relayfs_ops;
+	inode = relayfs_get_inode(sb, mode, NULL);
+
+	if (!inode)
+		return -ENOMEM;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+
+	return 0;
+}
+
+static struct super_block * relayfs_get_sb(struct file_system_type *fs_type,
+					   int flags, const char *dev_name,
+					   void *data)
+{
+	return get_sb_single(fs_type, flags, data, relayfs_fill_super);
+}
+
+static struct file_system_type relayfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "relayfs",
+	.get_sb		= relayfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+static int __init init_relayfs_fs(void)
+{
+	int err;
+
+	relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache",
+				sizeof(struct relayfs_inode_info), 0,
+				0, init_once, NULL);
+	if (!relayfs_inode_cachep)
+		return -ENOMEM;
+
+	err = register_filesystem(&relayfs_fs_type);
+	if (err)
+		kmem_cache_destroy(relayfs_inode_cachep);
+
+	return err;
+}
+
+static void __exit exit_relayfs_fs(void)
+{
+	unregister_filesystem(&relayfs_fs_type);
+	kmem_cache_destroy(relayfs_inode_cachep);
+}
+
+module_init(init_relayfs_fs)
+module_exit(exit_relayfs_fs)
+
+EXPORT_SYMBOL_GPL(relayfs_file_operations);
+EXPORT_SYMBOL_GPL(relayfs_create_dir);
+EXPORT_SYMBOL_GPL(relayfs_remove_dir);
+
+MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
+MODULE_DESCRIPTION("Relay Filesystem");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
new file mode 100644
index 000000000000..16446a15c96d
--- /dev/null
+++ b/fs/relayfs/relay.c
@@ -0,0 +1,431 @@
+/*
+ * Public API and common code for RelayFS.
+ *
+ * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+
+/**
+ *	relay_buf_empty - boolean, is the channel buffer empty?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is empty, 0 otherwise.
+ */
+int relay_buf_empty(struct rchan_buf *buf)
+{
+	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+
+/**
+ *	relay_buf_full - boolean, is the channel buffer full?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+
+/*
+ * High-level relayfs kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback.  Does nothing.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+					  void *subbuf,
+					  void *prev_subbuf,
+					  size_t prev_padding)
+{
+	if (relay_buf_full(buf))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * buf_mapped() default callback.  Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+					struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback.  Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+					  struct file *filp)
+{
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+	.subbuf_start = subbuf_start_default_callback,
+	.buf_mapped = buf_mapped_default_callback,
+	.buf_unmapped = buf_unmapped_default_callback,
+};
+
+/**
+ *	wakeup_readers - wake up readers waiting on a channel
+ *	@private: the channel buffer
+ *
+ *	This is the work function used to defer reader waking.  The
+ *	reason waking is deferred is that calling directly from write
+ *	causes problems if you're writing from say the scheduler.
+ */
+static void wakeup_readers(void *private)
+{
+	struct rchan_buf *buf = private;
+	wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ *	__relay_reset - reset a channel buffer
+ *	@buf: the channel buffer
+ *	@init: 1 if this is a first-time initialization
+ *
+ *	See relay_reset for description of effect.
+ */
+static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+	size_t i;
+
+	if (init) {
+		init_waitqueue_head(&buf->read_wait);
+		kref_init(&buf->kref);
+		INIT_WORK(&buf->wake_readers, NULL, NULL);
+	} else {
+		cancel_delayed_work(&buf->wake_readers);
+		flush_scheduled_work();
+	}
+
+	buf->subbufs_produced = 0;
+	buf->subbufs_consumed = 0;
+	buf->bytes_consumed = 0;
+	buf->finalized = 0;
+	buf->data = buf->start;
+	buf->offset = 0;
+
+	for (i = 0; i < buf->chan->n_subbufs; i++)
+		buf->padding[i] = 0;
+
+	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ *	relay_reset - reset the channel
+ *	@chan: the channel
+ *
+ *	This has the effect of erasing all data from all channel buffers
+ *	and restarting the channel in its initial state.  The buffers
+ *	are not freed, so any mappings are still in effect.
+ *
+ *	NOTE: Care should be taken that the channel isn't actually
+ *	being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			continue;
+		__relay_reset(chan->buf[i], 0);
+	}
+}
+
+/**
+ *	relay_open_buf - create a new channel buffer in relayfs
+ *
+ *	Internal - used by relay_open().
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan,
+					const char *filename,
+					struct dentry *parent)
+{
+	struct rchan_buf *buf;
+	struct dentry *dentry;
+
+	/* Create file in fs */
+	dentry = relayfs_create_file(filename, parent, S_IRUSR, chan);
+	if (!dentry)
+		return NULL;
+
+	buf = RELAYFS_I(dentry->d_inode)->buf;
+	buf->dentry = dentry;
+	__relay_reset(buf, 1);
+
+	return buf;
+}
+
+/**
+ *	relay_close_buf - close a channel buffer
+ *	@buf: channel buffer
+ *
+ *	Marks the buffer finalized and restores the default callbacks.
+ *	The channel buffer and channel buffer data structure are then freed
+ *	automatically when the last reference is given up.
+ */
+static inline void relay_close_buf(struct rchan_buf *buf)
+{
+	buf->finalized = 1;
+	buf->chan->cb = &default_channel_callbacks;
+	cancel_delayed_work(&buf->wake_readers);
+	flush_scheduled_work();
+	kref_put(&buf->kref, relay_remove_buf);
+}
+
+static inline void setup_callbacks(struct rchan *chan,
+				   struct rchan_callbacks *cb)
+{
+	if (!cb) {
+		chan->cb = &default_channel_callbacks;
+		return;
+	}
+
+	if (!cb->subbuf_start)
+		cb->subbuf_start = subbuf_start_default_callback;
+	if (!cb->buf_mapped)
+		cb->buf_mapped = buf_mapped_default_callback;
+	if (!cb->buf_unmapped)
+		cb->buf_unmapped = buf_unmapped_default_callback;
+	chan->cb = cb;
+}
+
+/**
+ *	relay_open - create a new relayfs channel
+ *	@base_filename: base name of files to create
+ *	@parent: dentry of parent directory, NULL for root directory
+ *	@subbuf_size: size of sub-buffers
+ *	@n_subbufs: number of sub-buffers
+ *	@cb: client callback functions
+ *
+ *	Returns channel pointer if successful, NULL otherwise.
+ *
+ *	Creates a channel buffer for each cpu using the sizes and
+ *	attributes specified.  The created channel buffer files
+ *	will be named base_filename0...base_filenameN-1.  File
+ *	permissions will be S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb)
+{
+	unsigned int i;
+	struct rchan *chan;
+	char *tmpname;
+
+	if (!base_filename)
+		return NULL;
+
+	if (!(subbuf_size && n_subbufs))
+		return NULL;
+
+	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+	if (!chan)
+		return NULL;
+
+	chan->version = RELAYFS_CHANNEL_VERSION;
+	chan->n_subbufs = n_subbufs;
+	chan->subbuf_size = subbuf_size;
+	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+	setup_callbacks(chan, cb);
+	kref_init(&chan->kref);
+
+	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!tmpname)
+		goto free_chan;
+
+	for_each_online_cpu(i) {
+		sprintf(tmpname, "%s%d", base_filename, i);
+		chan->buf[i] = relay_open_buf(chan, tmpname, parent);
+		chan->buf[i]->cpu = i;
+		if (!chan->buf[i])
+			goto free_bufs;
+	}
+
+	kfree(tmpname);
+	return chan;
+
+free_bufs:
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			break;
+		relay_close_buf(chan->buf[i]);
+	}
+	kfree(tmpname);
+
+free_chan:
+	kref_put(&chan->kref, relay_destroy_channel);
+	return NULL;
+}
+
+/**
+ *	relay_switch_subbuf - switch to a new sub-buffer
+ *	@buf: channel buffer
+ *	@length: size of current event
+ *
+ *	Returns either the length passed in or 0 if full.
+
+ *	Performs sub-buffer-switch tasks such as invoking callbacks,
+ *	updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+	void *old, *new;
+	size_t old_subbuf, new_subbuf;
+
+	if (unlikely(length > buf->chan->subbuf_size))
+		goto toobig;
+
+	if (buf->offset != buf->chan->subbuf_size + 1) {
+		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+		buf->padding[old_subbuf] = buf->prev_padding;
+		buf->subbufs_produced++;
+		if (waitqueue_active(&buf->read_wait)) {
+			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+			schedule_delayed_work(&buf->wake_readers, 1);
+		}
+	}
+
+	old = buf->data;
+	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+	new = buf->start + new_subbuf * buf->chan->subbuf_size;
+	buf->offset = 0;
+	if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+		buf->offset = buf->chan->subbuf_size + 1;
+		return 0;
+	}
+	buf->data = new;
+	buf->padding[new_subbuf] = 0;
+
+	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+		goto toobig;
+
+	return length;
+
+toobig:
+	printk(KERN_WARNING "relayfs: event too large (%Zd)\n", length);
+	WARN_ON(1);
+	return 0;
+}
+
+/**
+ *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ *	@chan: the channel
+ *	@cpu: the cpu associated with the channel buffer to update
+ *	@subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ *	Adds to the channel buffer's consumed sub-buffer count.
+ *	subbufs_consumed should be the number of sub-buffers newly consumed,
+ *	not the total consumed.
+ *
+ *	NOTE: kernel clients don't need to call this function if the channel
+ *	mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+			    unsigned int cpu,
+			    size_t subbufs_consumed)
+{
+	struct rchan_buf *buf;
+
+	if (!chan)
+		return;
+
+	if (cpu >= NR_CPUS || !chan->buf[cpu])
+		return;
+
+	buf = chan->buf[cpu];
+	buf->subbufs_consumed += subbufs_consumed;
+	if (buf->subbufs_consumed > buf->subbufs_produced)
+		buf->subbufs_consumed = buf->subbufs_produced;
+}
+
+/**
+ *	relay_destroy_channel - free the channel struct
+ *
+ *	Should only be called from kref_put().
+ */
+void relay_destroy_channel(struct kref *kref)
+{
+	struct rchan *chan = container_of(kref, struct rchan, kref);
+	kfree(chan);
+}
+
+/**
+ *	relay_close - close the channel
+ *	@chan: the channel
+ *
+ *	Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			continue;
+		relay_close_buf(chan->buf[i]);
+	}
+
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_flush - close the channel
+ *	@chan: the channel
+ *
+ *	Flushes all channel buffers i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			continue;
+		relay_switch_subbuf(chan->buf[i], 0);
+	}
+}
+
+EXPORT_SYMBOL_GPL(relay_open);
+EXPORT_SYMBOL_GPL(relay_close);
+EXPORT_SYMBOL_GPL(relay_flush);
+EXPORT_SYMBOL_GPL(relay_reset);
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+EXPORT_SYMBOL_GPL(relay_buf_full);
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
new file mode 100644
index 000000000000..703503fa22b6
--- /dev/null
+++ b/fs/relayfs/relay.h
@@ -0,0 +1,12 @@
+#ifndef _RELAY_H
+#define _RELAY_H
+
+struct dentry *relayfs_create_file(const char *name,
+				   struct dentry *parent,
+				   int mode,
+				   struct rchan *chan);
+extern int relayfs_remove(struct dentry *dentry);
+extern int relay_buf_empty(struct rchan_buf *buf);
+extern void relay_destroy_channel(struct kref *kref);
+
+#endif /* _RELAY_H */
diff --git a/include/linux/relayfs_fs.h b/include/linux/relayfs_fs.h
new file mode 100644
index 000000000000..cfafc3e76bc2
--- /dev/null
+++ b/include/linux/relayfs_fs.h
@@ -0,0 +1,255 @@
+/*
+ * linux/include/linux/relayfs_fs.h
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * RelayFS definitions and declarations
+ */
+
+#ifndef _LINUX_RELAYFS_FS_H
+#define _LINUX_RELAYFS_FS_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/kref.h>
+
+/*
+ * Tracks changes to rchan_buf struct
+ */
+#define RELAYFS_CHANNEL_VERSION		5
+
+/*
+ * Per-cpu relay channel buffer
+ */
+struct rchan_buf
+{
+	void *start;			/* start of channel buffer */
+	void *data;			/* start of current sub-buffer */
+	size_t offset;			/* current offset into sub-buffer */
+	size_t subbufs_produced;	/* count of sub-buffers produced */
+	size_t subbufs_consumed;	/* count of sub-buffers consumed */
+	struct rchan *chan;		/* associated channel */
+	wait_queue_head_t read_wait;	/* reader wait queue */
+	struct work_struct wake_readers; /* reader wake-up work struct */
+	struct dentry *dentry;		/* channel file dentry */
+	struct kref kref;		/* channel buffer refcount */
+	struct page **page_array;	/* array of current buffer pages */
+	unsigned int page_count;	/* number of current buffer pages */
+	unsigned int finalized;		/* buffer has been finalized */
+	size_t *padding;		/* padding counts per sub-buffer */
+	size_t prev_padding;		/* temporary variable */
+	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
+	unsigned int cpu;		/* this buf's cpu */
+} ____cacheline_aligned;
+
+/*
+ * Relay channel data structure
+ */
+struct rchan
+{
+	u32 version;			/* the version of this struct */
+	size_t subbuf_size;		/* sub-buffer size */
+	size_t n_subbufs;		/* number of sub-buffers per buffer */
+	size_t alloc_size;		/* total buffer size allocated */
+	struct rchan_callbacks *cb;	/* client callbacks */
+	struct kref kref;		/* channel refcount */
+	void *private_data;		/* for user-defined data */
+	struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
+};
+
+/*
+ * Relayfs inode
+ */
+struct relayfs_inode_info
+{
+	struct inode vfs_inode;
+	struct rchan_buf *buf;
+};
+
+static inline struct relayfs_inode_info *RELAYFS_I(struct inode *inode)
+{
+	return container_of(inode, struct relayfs_inode_info, vfs_inode);
+}
+
+/*
+ * Relay channel client callbacks
+ */
+struct rchan_callbacks
+{
+	/*
+	 * subbuf_start - called on buffer-switch to a new sub-buffer
+	 * @buf: the channel buffer containing the new sub-buffer
+	 * @subbuf: the start of the new sub-buffer
+	 * @prev_subbuf: the start of the previous sub-buffer
+	 * @prev_padding: unused space at the end of previous sub-buffer
+	 *
+	 * The client should return 1 to continue logging, 0 to stop
+	 * logging.
+	 *
+	 * NOTE: subbuf_start will also be invoked when the buffer is
+	 *       created, so that the first sub-buffer can be initialized
+	 *       if necessary.  In this case, prev_subbuf will be NULL.
+	 *
+	 * NOTE: the client can reserve bytes at the beginning of the new
+	 *       sub-buffer by calling subbuf_start_reserve() in this callback.
+	 */
+	int (*subbuf_start) (struct rchan_buf *buf,
+			     void *subbuf,
+			     void *prev_subbuf,
+			     size_t prev_padding);
+
+	/*
+	 * buf_mapped - relayfs buffer mmap notification
+	 * @buf: the channel buffer
+	 * @filp: relayfs file pointer
+	 *
+	 * Called when a relayfs file is successfully mmapped
+	 */
+        void (*buf_mapped)(struct rchan_buf *buf,
+			   struct file *filp);
+
+	/*
+	 * buf_unmapped - relayfs buffer unmap notification
+	 * @buf: the channel buffer
+	 * @filp: relayfs file pointer
+	 *
+	 * Called when a relayfs file is successfully unmapped
+	 */
+        void (*buf_unmapped)(struct rchan_buf *buf,
+			     struct file *filp);
+};
+
+/*
+ * relayfs kernel API, fs/relayfs/relay.c
+ */
+
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb);
+extern void relay_close(struct rchan *chan);
+extern void relay_flush(struct rchan *chan);
+extern void relay_subbufs_consumed(struct rchan *chan,
+				   unsigned int cpu,
+				   size_t consumed);
+extern void relay_reset(struct rchan *chan);
+extern int relay_buf_full(struct rchan_buf *buf);
+
+extern size_t relay_switch_subbuf(struct rchan_buf *buf,
+				  size_t length);
+extern struct dentry *relayfs_create_dir(const char *name,
+					 struct dentry *parent);
+extern int relayfs_remove_dir(struct dentry *dentry);
+
+/**
+ *	relay_write - write data into the channel
+ *	@chan: relay channel
+ *	@data: data to be written
+ *	@length: number of bytes to write
+ *
+ *	Writes data into the current cpu's channel buffer.
+ *
+ *	Protects the buffer by disabling interrupts.  Use this
+ *	if you might be logging from interrupt context.  Try
+ *	__relay_write() if you know you	won't be logging from
+ *	interrupt context.
+ */
+static inline void relay_write(struct rchan *chan,
+			       const void *data,
+			       size_t length)
+{
+	unsigned long flags;
+	struct rchan_buf *buf;
+
+	local_irq_save(flags);
+	buf = chan->buf[smp_processor_id()];
+	if (unlikely(buf->offset + length > chan->subbuf_size))
+		length = relay_switch_subbuf(buf, length);
+	memcpy(buf->data + buf->offset, data, length);
+	buf->offset += length;
+	local_irq_restore(flags);
+}
+
+/**
+ *	__relay_write - write data into the channel
+ *	@chan: relay channel
+ *	@data: data to be written
+ *	@length: number of bytes to write
+ *
+ *	Writes data into the current cpu's channel buffer.
+ *
+ *	Protects the buffer by disabling preemption.  Use
+ *	relay_write() if you might be logging from interrupt
+ *	context.
+ */
+static inline void __relay_write(struct rchan *chan,
+				 const void *data,
+				 size_t length)
+{
+	struct rchan_buf *buf;
+
+	buf = chan->buf[get_cpu()];
+	if (unlikely(buf->offset + length > buf->chan->subbuf_size))
+		length = relay_switch_subbuf(buf, length);
+	memcpy(buf->data + buf->offset, data, length);
+	buf->offset += length;
+	put_cpu();
+}
+
+/**
+ *	relay_reserve - reserve slot in channel buffer
+ *	@chan: relay channel
+ *	@length: number of bytes to reserve
+ *
+ *	Returns pointer to reserved slot, NULL if full.
+ *
+ *	Reserves a slot in the current cpu's channel buffer.
+ *	Does not protect the buffer at all - caller must provide
+ *	appropriate synchronization.
+ */
+static inline void *relay_reserve(struct rchan *chan, size_t length)
+{
+	void *reserved;
+	struct rchan_buf *buf = chan->buf[smp_processor_id()];
+
+	if (unlikely(buf->offset + length > buf->chan->subbuf_size)) {
+		length = relay_switch_subbuf(buf, length);
+		if (!length)
+			return NULL;
+	}
+	reserved = buf->data + buf->offset;
+	buf->offset += length;
+
+	return reserved;
+}
+
+/**
+ *	subbuf_start_reserve - reserve bytes at the start of a sub-buffer
+ *	@buf: relay channel buffer
+ *	@length: number of bytes to reserve
+ *
+ *	Helper function used to reserve bytes at the beginning of
+ *	a sub-buffer in the subbuf_start() callback.
+ */
+static inline void subbuf_start_reserve(struct rchan_buf *buf,
+					size_t length)
+{
+	BUG_ON(length >= buf->chan->subbuf_size - 1);
+	buf->offset = length;
+}
+
+/*
+ * exported relayfs file operations, fs/relayfs/inode.c
+ */
+
+extern struct file_operations relayfs_file_operations;
+
+#endif /* _LINUX_RELAYFS_FS_H */
+
-- 
cgit v1.2.3


From 202e5979af4d91c7ca05892641131dee22653259 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 6 Sep 2005 15:16:40 -0700
Subject: [PATCH] compat: be more consistent about [ug]id_t

When I first wrote the compat layer patches, I was somewhat cavalier about
the definition of compat_uid_t and compat_gid_t (or maybe I just
misunderstood :-)).  This patch makes the compat types much more consistent
with the types we are being compatible with and hopefully will fix a few
bugs along the way.

	compat type		type in compat arch
	__compat_[ug]id_t	__kernel_[ug]id_t
	__compat_[ug]id32_t	__kernel_[ug]id32_t
	compat_[ug]id_t		[ug]id_t

The difference is that compat_uid_t is always 32 bits (for the archs we
care about) but __compat_uid_t may be 16 bits on some.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/kernel/linux32.c   | 16 ++++++++--------
 fs/compat.c                  | 16 ++++++++--------
 include/asm-ia64/compat.h    | 20 ++++++++++----------
 include/asm-mips/compat.h    | 10 ++++++----
 include/asm-parisc/compat.h  | 10 ++++++----
 include/asm-ppc64/compat.h   | 18 ++++++++++--------
 include/asm-s390/compat.h    | 20 ++++++++++----------
 include/asm-sparc64/compat.h | 18 ++++++++++--------
 include/asm-x86_64/compat.h  | 20 ++++++++++----------
 include/linux/compat.h       |  3 +++
 ipc/compat.c                 | 12 ++++++------
 11 files changed, 87 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index 4613219dd73e..ece4564919d8 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -546,20 +546,20 @@ struct msgbuf32 { s32 mtype; char mtext[1]; };
 struct ipc_perm32
 {
 	key_t    	  key;
-        compat_uid_t  uid;
-        compat_gid_t  gid;
-        compat_uid_t  cuid;
-        compat_gid_t  cgid;
+        __compat_uid_t  uid;
+        __compat_gid_t  gid;
+        __compat_uid_t  cuid;
+        __compat_gid_t  cgid;
         compat_mode_t	mode;
         unsigned short  seq;
 };
 
 struct ipc64_perm32 {
 	key_t key;
-	compat_uid_t uid;
-	compat_gid_t gid;
-	compat_uid_t cuid;
-	compat_gid_t cgid;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
+	__compat_uid_t cuid;
+	__compat_gid_t cgid;
 	compat_mode_t	mode;
 	unsigned short	seq;
 	unsigned short __pad1;
diff --git a/fs/compat.c b/fs/compat.c
index 6b06b6bae35e..8e03d31eec3b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -720,14 +720,14 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
 struct compat_ncp_mount_data {
 	compat_int_t version;
 	compat_uint_t ncp_fd;
-	compat_uid_t mounted_uid;
+	__compat_uid_t mounted_uid;
 	compat_pid_t wdog_pid;
 	unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
 	compat_uint_t time_out;
 	compat_uint_t retry_count;
 	compat_uint_t flags;
-	compat_uid_t uid;
-	compat_gid_t gid;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
 	compat_mode_t file_mode;
 	compat_mode_t dir_mode;
 };
@@ -784,9 +784,9 @@ static void *do_ncp_super_data_conv(void *raw_data)
 
 struct compat_smb_mount_data {
 	compat_int_t version;
-	compat_uid_t mounted_uid;
-	compat_uid_t uid;
-	compat_gid_t gid;
+	__compat_uid_t mounted_uid;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
 	compat_mode_t file_mode;
 	compat_mode_t dir_mode;
 };
@@ -1808,8 +1808,8 @@ struct compat_nfsctl_export {
 	compat_dev_t	ex32_dev;
 	compat_ino_t	ex32_ino;
 	compat_int_t	ex32_flags;
-	compat_uid_t	ex32_anon_uid;
-	compat_gid_t	ex32_anon_gid;
+	__compat_uid_t	ex32_anon_uid;
+	__compat_gid_t	ex32_anon_gid;
 };
 
 struct compat_nfsctl_fdparm {
diff --git a/include/asm-ia64/compat.h b/include/asm-ia64/compat.h
index 0c05e5bad8a0..aaf11f4e9169 100644
--- a/include/asm-ia64/compat.h
+++ b/include/asm-ia64/compat.h
@@ -13,10 +13,10 @@ typedef s32		compat_time_t;
 typedef s32		compat_clock_t;
 typedef s32		compat_key_t;
 typedef s32		compat_pid_t;
-typedef u16		compat_uid_t;
-typedef u16		compat_gid_t;
-typedef u32		compat_uid32_t;
-typedef u32		compat_gid32_t;
+typedef u16		__compat_uid_t;
+typedef u16		__compat_gid_t;
+typedef u32		__compat_uid32_t;
+typedef u32		__compat_gid32_t;
 typedef u16		compat_mode_t;
 typedef u32		compat_ino_t;
 typedef u16		compat_dev_t;
@@ -50,8 +50,8 @@ struct compat_stat {
 	compat_ino_t	st_ino;
 	compat_mode_t	st_mode;
 	compat_nlink_t	st_nlink;
-	compat_uid_t	st_uid;
-	compat_gid_t	st_gid;
+	__compat_uid_t	st_uid;
+	__compat_gid_t	st_gid;
 	compat_dev_t	st_rdev;
 	u16		__pad2;
 	u32		st_size;
@@ -120,10 +120,10 @@ typedef u32		compat_sigset_word;
 
 struct compat_ipc64_perm {
 	compat_key_t key;
-	compat_uid32_t uid;
-	compat_gid32_t gid;
-	compat_uid32_t cuid;
-	compat_gid32_t cgid;
+	__compat_uid32_t uid;
+	__compat_gid32_t gid;
+	__compat_uid32_t cuid;
+	__compat_gid32_t cgid;
 	unsigned short mode;
 	unsigned short __pad1;
 	unsigned short seq;
diff --git a/include/asm-mips/compat.h b/include/asm-mips/compat.h
index d78002afb1e1..2c084cd4bc0a 100644
--- a/include/asm-mips/compat.h
+++ b/include/asm-mips/compat.h
@@ -15,8 +15,10 @@ typedef s32		compat_clock_t;
 typedef s32		compat_suseconds_t;
 
 typedef s32		compat_pid_t;
-typedef s32		compat_uid_t;
-typedef s32		compat_gid_t;
+typedef u32		__compat_uid_t;
+typedef u32		__compat_gid_t;
+typedef u32		__compat_uid32_t;
+typedef u32		__compat_gid32_t;
 typedef u32		compat_mode_t;
 typedef u32		compat_ino_t;
 typedef u32		compat_dev_t;
@@ -52,8 +54,8 @@ struct compat_stat {
 	compat_ino_t	st_ino;
 	compat_mode_t	st_mode;
 	compat_nlink_t	st_nlink;
-	compat_uid_t	st_uid;
-	compat_gid_t	st_gid;
+	__compat_uid32_t	st_uid;
+	__compat_gid32_t	st_gid;
 	compat_dev_t	st_rdev;
 	s32		st_pad2[2];
 	compat_off_t	st_size;
diff --git a/include/asm-parisc/compat.h b/include/asm-parisc/compat.h
index 7630d1ad2391..38b918feead9 100644
--- a/include/asm-parisc/compat.h
+++ b/include/asm-parisc/compat.h
@@ -13,8 +13,10 @@ typedef s32	compat_ssize_t;
 typedef s32	compat_time_t;
 typedef s32	compat_clock_t;
 typedef s32	compat_pid_t;
-typedef u32	compat_uid_t;
-typedef u32	compat_gid_t;
+typedef u32	__compat_uid_t;
+typedef u32	__compat_gid_t;
+typedef u32	__compat_uid32_t;
+typedef u32	__compat_gid32_t;
 typedef u16	compat_mode_t;
 typedef u32	compat_ino_t;
 typedef u32	compat_dev_t;
@@ -67,8 +69,8 @@ struct compat_stat {
 	compat_dev_t		st_realdev;
 	u16			st_basemode;
 	u16			st_spareshort;
-	compat_uid_t		st_uid;
-	compat_gid_t		st_gid;
+	__compat_uid32_t	st_uid;
+	__compat_gid32_t	st_gid;
 	u32			st_spare4[3];
 };
 
diff --git a/include/asm-ppc64/compat.h b/include/asm-ppc64/compat.h
index 12414f5fc666..6ec62cd2d1d1 100644
--- a/include/asm-ppc64/compat.h
+++ b/include/asm-ppc64/compat.h
@@ -13,8 +13,10 @@ typedef s32		compat_ssize_t;
 typedef s32		compat_time_t;
 typedef s32		compat_clock_t;
 typedef s32		compat_pid_t;
-typedef u32		compat_uid_t;
-typedef u32		compat_gid_t;
+typedef u32		__compat_uid_t;
+typedef u32		__compat_gid_t;
+typedef u32		__compat_uid32_t;
+typedef u32		__compat_gid32_t;
 typedef u32		compat_mode_t;
 typedef u32		compat_ino_t;
 typedef u32		compat_dev_t;
@@ -48,8 +50,8 @@ struct compat_stat {
 	compat_ino_t	st_ino;
 	compat_mode_t	st_mode;
 	compat_nlink_t	st_nlink;	
-	compat_uid_t	st_uid;
-	compat_gid_t	st_gid;
+	__compat_uid32_t	st_uid;
+	__compat_gid32_t	st_gid;
 	compat_dev_t	st_rdev;
 	compat_off_t	st_size;
 	compat_off_t	st_blksize;
@@ -144,10 +146,10 @@ static inline void __user *compat_alloc_user_space(long len)
  */
 struct compat_ipc64_perm {
 	compat_key_t key;
-	compat_uid_t uid;
-	compat_gid_t gid;
-	compat_uid_t cuid;
-	compat_gid_t cgid;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
+	__compat_uid_t cuid;
+	__compat_gid_t cgid;
 	compat_mode_t mode;
 	unsigned int seq;
 	unsigned int __pad2;
diff --git a/include/asm-s390/compat.h b/include/asm-s390/compat.h
index 7f8f544eb262..a007715f4aea 100644
--- a/include/asm-s390/compat.h
+++ b/include/asm-s390/compat.h
@@ -13,10 +13,10 @@ typedef s32		compat_ssize_t;
 typedef s32		compat_time_t;
 typedef s32		compat_clock_t;
 typedef s32		compat_pid_t;
-typedef u16		compat_uid_t;
-typedef u16		compat_gid_t;
-typedef u32		compat_uid32_t;
-typedef u32		compat_gid32_t;
+typedef u16		__compat_uid_t;
+typedef u16		__compat_gid_t;
+typedef u32		__compat_uid32_t;
+typedef u32		__compat_gid32_t;
 typedef u16		compat_mode_t;
 typedef u32		compat_ino_t;
 typedef u16		compat_dev_t;
@@ -51,8 +51,8 @@ struct compat_stat {
 	compat_ino_t	st_ino;
 	compat_mode_t	st_mode;
 	compat_nlink_t	st_nlink;
-	compat_uid_t	st_uid;
-	compat_gid_t	st_gid;
+	__compat_uid_t	st_uid;
+	__compat_gid_t	st_gid;
 	compat_dev_t	st_rdev;
 	u16		__pad2;
 	u32		st_size;
@@ -140,10 +140,10 @@ static inline void __user *compat_alloc_user_space(long len)
 
 struct compat_ipc64_perm {
 	compat_key_t key;
-	compat_uid32_t uid;
-	compat_gid32_t gid;
-	compat_uid32_t cuid;
-	compat_gid32_t cgid;
+	__compat_uid32_t uid;
+	__compat_gid32_t gid;
+	__compat_uid32_t cuid;
+	__compat_gid32_t cgid;
 	compat_mode_t mode;
 	unsigned short __pad1;
 	unsigned short seq;
diff --git a/include/asm-sparc64/compat.h b/include/asm-sparc64/compat.h
index b59122dd176d..c73935dc7ba1 100644
--- a/include/asm-sparc64/compat.h
+++ b/include/asm-sparc64/compat.h
@@ -12,8 +12,10 @@ typedef s32		compat_ssize_t;
 typedef s32		compat_time_t;
 typedef s32		compat_clock_t;
 typedef s32		compat_pid_t;
-typedef u16		compat_uid_t;
-typedef u16		compat_gid_t;
+typedef u16		__compat_uid_t;
+typedef u16		__compat_gid_t;
+typedef u32		__compat_uid32_t;
+typedef u32		__compat_gid32_t;
 typedef u16		compat_mode_t;
 typedef u32		compat_ino_t;
 typedef u16		compat_dev_t;
@@ -47,8 +49,8 @@ struct compat_stat {
 	compat_ino_t	st_ino;
 	compat_mode_t	st_mode;
 	compat_nlink_t	st_nlink;
-	compat_uid_t	st_uid;
-	compat_gid_t	st_gid;
+	__compat_uid_t	st_uid;
+	__compat_gid_t	st_gid;
 	compat_dev_t	st_rdev;
 	compat_off_t	st_size;
 	compat_time_t	st_atime;
@@ -177,10 +179,10 @@ static __inline__ void __user *compat_alloc_user_space(long len)
 
 struct compat_ipc64_perm {
 	compat_key_t key;
-	__kernel_uid_t uid;
-	__kernel_gid_t gid;
-	__kernel_uid_t cuid;
-	__kernel_gid_t cgid;
+	__compat_uid32_t uid;
+	__compat_gid32_t gid;
+	__compat_uid32_t cuid;
+	__compat_gid32_t cgid;
 	unsigned short __pad1;
 	compat_mode_t mode;
 	unsigned short __pad2;
diff --git a/include/asm-x86_64/compat.h b/include/asm-x86_64/compat.h
index d0f453c5adfc..f0155c38f639 100644
--- a/include/asm-x86_64/compat.h
+++ b/include/asm-x86_64/compat.h
@@ -14,10 +14,10 @@ typedef s32		compat_ssize_t;
 typedef s32		compat_time_t;
 typedef s32		compat_clock_t;
 typedef s32		compat_pid_t;
-typedef u16		compat_uid_t;
-typedef u16		compat_gid_t;
-typedef u32		compat_uid32_t;
-typedef u32		compat_gid32_t;
+typedef u16		__compat_uid_t;
+typedef u16		__compat_gid_t;
+typedef u32		__compat_uid32_t;
+typedef u32		__compat_gid32_t;
 typedef u16		compat_mode_t;
 typedef u32		compat_ino_t;
 typedef u16		compat_dev_t;
@@ -52,8 +52,8 @@ struct compat_stat {
 	compat_ino_t	st_ino;
 	compat_mode_t	st_mode;
 	compat_nlink_t	st_nlink;
-	compat_uid_t	st_uid;
-	compat_gid_t	st_gid;
+	__compat_uid_t	st_uid;
+	__compat_gid_t	st_gid;
 	compat_dev_t	st_rdev;
 	u16		__pad2;
 	u32		st_size;
@@ -122,10 +122,10 @@ typedef u32               compat_sigset_word;
 
 struct compat_ipc64_perm {
 	compat_key_t key;
-	compat_uid32_t uid;
-	compat_gid32_t gid;
-	compat_uid32_t cuid;
-	compat_gid32_t cgid;
+	__compat_uid32_t uid;
+	__compat_gid32_t gid;
+	__compat_uid32_t cuid;
+	__compat_gid32_t cgid;
 	unsigned short mode;
 	unsigned short __pad1;
 	unsigned short seq;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index b58b7d6f2fdb..f9ca534787e2 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -18,6 +18,9 @@
 #define compat_jiffies_to_clock_t(x)	\
 		(((unsigned long)(x) * COMPAT_USER_HZ) / HZ)
 
+typedef __compat_uid32_t	compat_uid_t;
+typedef __compat_gid32_t	compat_gid_t;
+
 struct rusage;
 
 struct compat_itimerspec { 
diff --git a/ipc/compat.c b/ipc/compat.c
index 3881d564c668..1fe95f6659dd 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -42,10 +42,10 @@ struct compat_msgbuf {
 
 struct compat_ipc_perm {
 	key_t key;
-	compat_uid_t uid;
-	compat_gid_t gid;
-	compat_uid_t cuid;
-	compat_gid_t cgid;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
+	__compat_uid_t cuid;
+	__compat_gid_t cgid;
 	compat_mode_t mode;
 	unsigned short seq;
 };
@@ -174,8 +174,8 @@ static inline int __put_compat_ipc_perm(struct ipc64_perm *p,
 					struct compat_ipc_perm __user *up)
 {
 	int err;
-	compat_uid_t u;
-	compat_gid_t g;
+	__compat_uid_t u;
+	__compat_gid_t g;
 
 	err  = __put_user(p->key, &up->key);
 	SET_UID(u, p->uid);
-- 
cgit v1.2.3


From 022a4a7bbdefdedc2706a13c81c832d8c3173c6d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 6 Sep 2005 15:16:41 -0700
Subject: [PATCH] fs/jbd/: cleanups

This patch contains the following cleanups:
- make needlessly global functions static
- journal.c: remove the unused global function __journal_internal_check
             and move the check to journal_init
- remove the following write-only global variable:
  - journal.c: current_journal
- remove the following unneeded EXPORT_SYMBOL:
  - journal.c: journal_recover

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/journal.c    | 34 ++++++++++++++--------------------
 fs/jbd/revoke.c     |  3 ++-
 include/linux/jbd.h |  1 -
 3 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 5e7b43949517..71cfe25d716e 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -65,7 +65,6 @@ EXPORT_SYMBOL(journal_set_features);
 EXPORT_SYMBOL(journal_create);
 EXPORT_SYMBOL(journal_load);
 EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_recover);
 EXPORT_SYMBOL(journal_update_superblock);
 EXPORT_SYMBOL(journal_abort);
 EXPORT_SYMBOL(journal_errno);
@@ -81,6 +80,7 @@ EXPORT_SYMBOL(journal_try_to_free_buffers);
 EXPORT_SYMBOL(journal_force_commit);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+static void __journal_abort_soft (journal_t *journal, int errno);
 
 /*
  * Helper function used to manage commit timeouts
@@ -93,16 +93,6 @@ static void commit_timeout(unsigned long __data)
 	wake_up_process(p);
 }
 
-/* Static check for data structure consistency.  There's no code
- * invoked --- we'll just get a linker failure if things aren't right.
- */
-void __journal_internal_check(void)
-{
-	extern void journal_bad_superblock_size(void);
-	if (sizeof(struct journal_superblock_s) != 1024)
-		journal_bad_superblock_size();
-}
-
 /*
  * kjournald: The main thread function used to manage a logging device
  * journal.
@@ -119,16 +109,12 @@ void __journal_internal_check(void)
  *    known as checkpointing, and this thread is responsible for that job.
  */
 
-journal_t *current_journal;		// AKPM: debug
-
-int kjournald(void *arg)
+static int kjournald(void *arg)
 {
 	journal_t *journal = (journal_t *) arg;
 	transaction_t *transaction;
 	struct timer_list timer;
 
-	current_journal = journal;
-
 	daemonize("kjournald");
 
 	/* Set up an interval timer which can be used to trigger a
@@ -1439,7 +1425,7 @@ int journal_wipe(journal_t *journal, int write)
  * device this journal is present.
  */
 
-const char *journal_dev_name(journal_t *journal, char *buffer)
+static const char *journal_dev_name(journal_t *journal, char *buffer)
 {
 	struct block_device *bdev;
 
@@ -1485,7 +1471,7 @@ void __journal_abort_hard(journal_t *journal)
 
 /* Soft abort: record the abort error status in the journal superblock,
  * but don't do any other IO. */
-void __journal_abort_soft (journal_t *journal, int errno)
+static void __journal_abort_soft (journal_t *journal, int errno)
 {
 	if (journal->j_flags & JFS_ABORT)
 		return;
@@ -1880,7 +1866,7 @@ EXPORT_SYMBOL(journal_enable_debug);
 
 static struct proc_dir_entry *proc_jbd_debug;
 
-int read_jbd_debug(char *page, char **start, off_t off,
+static int read_jbd_debug(char *page, char **start, off_t off,
 			  int count, int *eof, void *data)
 {
 	int ret;
@@ -1890,7 +1876,7 @@ int read_jbd_debug(char *page, char **start, off_t off,
 	return ret;
 }
 
-int write_jbd_debug(struct file *file, const char __user *buffer,
+static int write_jbd_debug(struct file *file, const char __user *buffer,
 			   unsigned long count, void *data)
 {
 	char buf[32];
@@ -1979,6 +1965,14 @@ static int __init journal_init(void)
 {
 	int ret;
 
+/* Static check for data structure consistency.  There's no code
+ * invoked --- we'll just get a linker failure if things aren't right.
+ */
+	extern void journal_bad_superblock_size(void);
+	if (sizeof(struct journal_superblock_s) != 1024)
+		journal_bad_superblock_size();
+
+
 	ret = journal_init_caches();
 	if (ret != 0)
 		journal_destroy_caches();
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index d327a598f861..93b9f45eebda 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -116,7 +116,8 @@ static inline int hash(journal_t *journal, unsigned long block)
 		(block << (hash_shift - 12))) & (table->hash_size - 1);
 }
 
-int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
+static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+			      tid_t seq)
 {
 	struct list_head *hash_list;
 	struct jbd_revoke_record_s *record;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 593407e865b1..84321a4cac93 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -914,7 +914,6 @@ extern int	   journal_wipe       (journal_t *, int);
 extern int	   journal_skip_recovery	(journal_t *);
 extern void	   journal_update_superblock	(journal_t *, int);
 extern void	   __journal_abort_hard	(journal_t *);
-extern void	   __journal_abort_soft	(journal_t *, int);
 extern void	   journal_abort      (journal_t *, int);
 extern int	   journal_errno      (journal_t *);
 extern void	   journal_ack_err    (journal_t *);
-- 
cgit v1.2.3


From 3676347a5e216a7fec7f8eedbbcf8bed6b9c4e40 Mon Sep 17 00:00:00 2001
From: Peter Osterlund <petero2@telia.com>
Date: Tue, 6 Sep 2005 15:16:42 -0700
Subject: [PATCH] kill bio->bi_set

Jens:

->bi_set is totally unnecessary bloat of struct bio.  Just define a proper
destructor for the bio and it already knows what bio_set it belongs too.

Peter:

Fixed the bugs.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Peter Osterlund <petero2@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-io.c  |  6 ++++++
 drivers/md/dm.c     |  6 ++++++
 fs/bio.c            | 32 +++++++++++++++++++++-----------
 include/linux/bio.h |  2 +-
 4 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 45754bb6a799..9de000131a8a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -239,6 +239,11 @@ static void vm_dp_init(struct dpages *dp, void *data)
 	dp->context_ptr = data;
 }
 
+static void dm_bio_destructor(struct bio *bio)
+{
+	bio_free(bio, _bios);
+}
+
 /*-----------------------------------------------------------------
  * IO routines that accept a list of pages.
  *---------------------------------------------------------------*/
@@ -263,6 +268,7 @@ static void do_region(int rw, unsigned int region, struct io_region *where,
 		bio->bi_bdev = where->bdev;
 		bio->bi_end_io = endio;
 		bio->bi_private = io;
+		bio->bi_destructor = dm_bio_destructor;
 		bio_set_region(bio, region);
 
 		/*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d487d9deb98e..930b9fc27953 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -399,6 +399,11 @@ struct clone_info {
 	unsigned short idx;
 };
 
+static void dm_bio_destructor(struct bio *bio)
+{
+	bio_free(bio, dm_set);
+}
+
 /*
  * Creates a little bio that is just does part of a bvec.
  */
@@ -410,6 +415,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 	struct bio_vec *bv = bio->bi_io_vec + idx;
 
 	clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
+	clone->bi_destructor = dm_bio_destructor;
 	*clone->bi_io_vec = *bv;
 
 	clone->bi_sector = sector;
diff --git a/fs/bio.c b/fs/bio.c
index 1f2d4649b188..bf3ec9d2b54c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -104,18 +104,22 @@ static inline struct bio_vec *bvec_alloc_bs(unsigned int __nocast gfp_mask, int
 	return bvl;
 }
 
-/*
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_destructor(struct bio *bio)
+void bio_free(struct bio *bio, struct bio_set *bio_set)
 {
 	const int pool_idx = BIO_POOL_IDX(bio);
-	struct bio_set *bs = bio->bi_set;
 
 	BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
 
-	mempool_free(bio->bi_io_vec, bs->bvec_pools[pool_idx]);
-	mempool_free(bio, bs->bio_pool);
+	mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
+	mempool_free(bio, bio_set->bio_pool);
+}
+
+/*
+ * default destructor for a bio allocated with bio_alloc_bioset()
+ */
+static void bio_fs_destructor(struct bio *bio)
+{
+	bio_free(bio, fs_bio_set);
 }
 
 inline void bio_init(struct bio *bio)
@@ -171,8 +175,6 @@ struct bio *bio_alloc_bioset(unsigned int __nocast gfp_mask, int nr_iovecs, stru
 			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
 		}
 		bio->bi_io_vec = bvl;
-		bio->bi_destructor = bio_destructor;
-		bio->bi_set = bs;
 	}
 out:
 	return bio;
@@ -180,7 +182,12 @@ out:
 
 struct bio *bio_alloc(unsigned int __nocast gfp_mask, int nr_iovecs)
 {
-	return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+
+	if (bio)
+		bio->bi_destructor = bio_fs_destructor;
+
+	return bio;
 }
 
 void zero_fill_bio(struct bio *bio)
@@ -273,8 +280,10 @@ struct bio *bio_clone(struct bio *bio, unsigned int __nocast gfp_mask)
 {
 	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
 
-	if (b)
+	if (b) {
+		b->bi_destructor = bio_fs_destructor;
 		__bio_clone(b, bio);
+	}
 
 	return b;
 }
@@ -1075,6 +1084,7 @@ subsys_initcall(init_bio);
 
 EXPORT_SYMBOL(bio_alloc);
 EXPORT_SYMBOL(bio_put);
+EXPORT_SYMBOL(bio_free);
 EXPORT_SYMBOL(bio_endio);
 EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 36ef29fa0d8b..69e047989f1c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -111,7 +111,6 @@ struct bio {
 	void			*bi_private;
 
 	bio_destructor_t	*bi_destructor;	/* destructor */
-	struct bio_set		*bi_set;	/* memory pools set */
 };
 
 /*
@@ -280,6 +279,7 @@ extern void bioset_free(struct bio_set *);
 extern struct bio *bio_alloc(unsigned int __nocast, int);
 extern struct bio *bio_alloc_bioset(unsigned int __nocast, int, struct bio_set *);
 extern void bio_put(struct bio *);
+extern void bio_free(struct bio *, struct bio_set *);
 
 extern void bio_endio(struct bio *, unsigned int, int);
 struct request_queue;
-- 
cgit v1.2.3


From 5dd42c262bd742fa3602180bbe5550b4828de8f3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 6 Sep 2005 15:16:43 -0700
Subject: [PATCH] remove register_ioctl32_conversion and
 unregister_ioctl32_conversion

All users have been converted.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/feature-removal-schedule.txt |  8 ---
 fs/compat.c                                | 90 ------------------------------
 include/linux/ioctl32.h                    | 22 --------
 3 files changed, 120 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 363909056e46..95e744353120 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -51,14 +51,6 @@ Who:	Adrian Bunk <bunk@stusta.de>
 
 ---------------------------
 
-What:	register_ioctl32_conversion() / unregister_ioctl32_conversion()
-When:	April 2005
-Why:	Replaced by ->compat_ioctl in file_operations and other method
-	vecors.
-Who:	Andi Kleen <ak@muc.de>, Christoph Hellwig <hch@lst.de>
-
----------------------------
-
 What:	RCU API moves to EXPORT_SYMBOL_GPL
 When:	April 2006
 Files:	include/linux/rcupdate.h, kernel/rcupdate.c
diff --git a/fs/compat.c b/fs/compat.c
index 8e03d31eec3b..2eb03c49b07c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -310,96 +310,6 @@ static int __init init_sys32_ioctl(void)
 
 __initcall(init_sys32_ioctl);
 
-int register_ioctl32_conversion(unsigned int cmd,
-				ioctl_trans_handler_t handler)
-{
-	struct ioctl_trans *t;
-	struct ioctl_trans *new_t;
-	unsigned long hash = ioctl32_hash(cmd);
-
-	new_t = kmalloc(sizeof(*new_t), GFP_KERNEL);
-	if (!new_t)
-		return -ENOMEM;
-
-	down_write(&ioctl32_sem);
-	for (t = ioctl32_hash_table[hash]; t; t = t->next) {
-		if (t->cmd == cmd) {
-			printk(KERN_ERR "Trying to register duplicated ioctl32 "
-					"handler %x\n", cmd);
-			up_write(&ioctl32_sem);
-			kfree(new_t);
-			return -EINVAL; 
-		}
-	}
-	new_t->next = NULL;
-	new_t->cmd = cmd;
-	new_t->handler = handler;
-	ioctl32_insert_translation(new_t);
-
-	up_write(&ioctl32_sem);
-	return 0;
-}
-EXPORT_SYMBOL(register_ioctl32_conversion);
-
-static inline int builtin_ioctl(struct ioctl_trans *t)
-{ 
-	return t >= ioctl_start && t < (ioctl_start + ioctl_table_size);
-} 
-
-/* Problem: 
-   This function cannot unregister duplicate ioctls, because they are not
-   unique.
-   When they happen we need to extend the prototype to pass the handler too. */
-
-int unregister_ioctl32_conversion(unsigned int cmd)
-{
-	unsigned long hash = ioctl32_hash(cmd);
-	struct ioctl_trans *t, *t1;
-
-	down_write(&ioctl32_sem);
-
-	t = ioctl32_hash_table[hash];
-	if (!t) { 
-		up_write(&ioctl32_sem);
-		return -EINVAL;
-	} 
-
-	if (t->cmd == cmd) { 
-		if (builtin_ioctl(t)) {
-			printk("%p tried to unregister builtin ioctl %x\n",
-			       __builtin_return_address(0), cmd);
-		} else { 
-			ioctl32_hash_table[hash] = t->next;
-			up_write(&ioctl32_sem);
-			kfree(t);
-			return 0;
-		}
-	} 
-	while (t->next) {
-		t1 = t->next;
-		if (t1->cmd == cmd) { 
-			if (builtin_ioctl(t1)) {
-				printk("%p tried to unregister builtin "
-					"ioctl %x\n",
-					__builtin_return_address(0), cmd);
-				goto out;
-			} else { 
-				t->next = t1->next;
-				up_write(&ioctl32_sem);
-				kfree(t1);
-				return 0;
-			}
-		}
-		t = t1;
-	}
-	printk(KERN_ERR "Trying to free unknown 32bit ioctl handler %x\n",
-				cmd);
-out:
-	up_write(&ioctl32_sem);
-	return -EINVAL;
-}
-EXPORT_SYMBOL(unregister_ioctl32_conversion); 
-
 static void compat_ioctl_error(struct file *filp, unsigned int fd,
 		unsigned int cmd, unsigned long arg)
 {
diff --git a/include/linux/ioctl32.h b/include/linux/ioctl32.h
index e8c4af32b3bb..948809d99917 100644
--- a/include/linux/ioctl32.h
+++ b/include/linux/ioctl32.h
@@ -14,26 +14,4 @@ struct ioctl_trans {
 	struct ioctl_trans *next;
 };
 
-/* 
- * Register an 32bit ioctl translation handler for ioctl cmd.
- *
- * handler == NULL: use 64bit ioctl handler.
- * arguments to handler:  fd: file descriptor
- *                        cmd: ioctl command.
- *                        arg: ioctl argument
- *                        struct file *file: file descriptor pointer.
- */ 
-
-#ifdef CONFIG_COMPAT
-extern int __deprecated register_ioctl32_conversion(unsigned int cmd,
-				ioctl_trans_handler_t handler);
-extern int __deprecated unregister_ioctl32_conversion(unsigned int cmd);
-
-#else
-
-#define register_ioctl32_conversion(cmd, handler)	({ 0; })
-#define unregister_ioctl32_conversion(cmd)		({ 0; })
-
-#endif
-
 #endif
-- 
cgit v1.2.3


From 36d57ac4a818cb4aa3edbdf63ad2ebc31106f925 Mon Sep 17 00:00:00 2001
From: "H. J. Lu" <hjl@lucon.org>
Date: Tue, 6 Sep 2005 15:16:49 -0700
Subject: [PATCH] auxiliary vector cleanups

The size of auxiliary vector is fixed at 42 in linux/sched.h.  But it isn't
very obvious when looking at linux/elf.h.  This patch adds AT_VECTOR_SIZE
so that we can change it if necessary when a new vector is added.

Because of include file ordering problems, doing this necessitated the
extraction of the AT_* symbols into a standalone header file.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/auxvec.h     | 24 ++++++++++++++++++++++++
 include/asm-alpha/elf.h        | 22 ++--------------------
 include/asm-arm/auxvec.h       |  4 ++++
 include/asm-arm26/auxvec.h     |  4 ++++
 include/asm-cris/auxvec.h      |  4 ++++
 include/asm-frv/auxvec.h       |  4 ++++
 include/asm-h8300/auxvec.h     |  4 ++++
 include/asm-i386/auxvec.h      | 11 +++++++++++
 include/asm-i386/elf.h         |  8 +-------
 include/asm-ia64/auxvec.h      | 11 +++++++++++
 include/asm-ia64/elf.h         |  8 +-------
 include/asm-m32r/auxvec.h      |  4 ++++
 include/asm-m68k/auxvec.h      |  4 ++++
 include/asm-m68knommu/auxvec.h |  4 ++++
 include/asm-mips/auxvec.h      |  4 ++++
 include/asm-parisc/auxvec.h    |  4 ++++
 include/asm-ppc/auxvec.h       | 14 ++++++++++++++
 include/asm-ppc/elf.h          | 11 +----------
 include/asm-ppc64/auxvec.h     | 19 +++++++++++++++++++
 include/asm-ppc64/elf.h        | 16 +---------------
 include/asm-s390/auxvec.h      |  4 ++++
 include/asm-sh/auxvec.h        |  4 ++++
 include/asm-sh64/auxvec.h      |  4 ++++
 include/asm-sparc/auxvec.h     |  4 ++++
 include/asm-sparc64/auxvec.h   |  4 ++++
 include/asm-um/auxvec.h        |  4 ++++
 include/asm-v850/auxvec.h      |  4 ++++
 include/asm-x86_64/auxvec.h    |  4 ++++
 include/asm-xtensa/auxvec.h    |  4 ++++
 include/linux/auxvec.h         | 31 +++++++++++++++++++++++++++++++
 include/linux/elf.h            | 24 +-----------------------
 include/linux/sched.h          |  4 +++-
 32 files changed, 196 insertions(+), 83 deletions(-)
 create mode 100644 include/asm-alpha/auxvec.h
 create mode 100644 include/asm-arm/auxvec.h
 create mode 100644 include/asm-arm26/auxvec.h
 create mode 100644 include/asm-cris/auxvec.h
 create mode 100644 include/asm-frv/auxvec.h
 create mode 100644 include/asm-h8300/auxvec.h
 create mode 100644 include/asm-i386/auxvec.h
 create mode 100644 include/asm-ia64/auxvec.h
 create mode 100644 include/asm-m32r/auxvec.h
 create mode 100644 include/asm-m68k/auxvec.h
 create mode 100644 include/asm-m68knommu/auxvec.h
 create mode 100644 include/asm-mips/auxvec.h
 create mode 100644 include/asm-parisc/auxvec.h
 create mode 100644 include/asm-ppc/auxvec.h
 create mode 100644 include/asm-ppc64/auxvec.h
 create mode 100644 include/asm-s390/auxvec.h
 create mode 100644 include/asm-sh/auxvec.h
 create mode 100644 include/asm-sh64/auxvec.h
 create mode 100644 include/asm-sparc/auxvec.h
 create mode 100644 include/asm-sparc64/auxvec.h
 create mode 100644 include/asm-um/auxvec.h
 create mode 100644 include/asm-v850/auxvec.h
 create mode 100644 include/asm-x86_64/auxvec.h
 create mode 100644 include/asm-xtensa/auxvec.h
 create mode 100644 include/linux/auxvec.h

(limited to 'include/linux')

diff --git a/include/asm-alpha/auxvec.h b/include/asm-alpha/auxvec.h
new file mode 100644
index 000000000000..e96fe880e310
--- /dev/null
+++ b/include/asm-alpha/auxvec.h
@@ -0,0 +1,24 @@
+#ifndef __ASM_ALPHA_AUXVEC_H
+#define __ASM_ALPHA_AUXVEC_H
+
+/* Reserve these numbers for any future use of a VDSO.  */
+#if 0
+#define AT_SYSINFO		32
+#define AT_SYSINFO_EHDR		33
+#endif
+
+/* More complete cache descriptions than AT_[DIU]CACHEBSIZE.  If the
+   value is -1, then the cache doesn't exist.  Otherwise:
+
+      bit 0-3:	  Cache set-associativity; 0 means fully associative.
+      bit 4-7:	  Log2 of cacheline size.
+      bit 8-31:	  Size of the entire cache >> 8.
+      bit 32-63:  Reserved.
+*/
+
+#define AT_L1I_CACHESHAPE	34
+#define AT_L1D_CACHESHAPE	35
+#define AT_L2_CACHESHAPE	36
+#define AT_L3_CACHESHAPE	37
+
+#endif /* __ASM_ALPHA_AUXVEC_H */
diff --git a/include/asm-alpha/elf.h b/include/asm-alpha/elf.h
index e94a945a2314..6c2d78fba264 100644
--- a/include/asm-alpha/elf.h
+++ b/include/asm-alpha/elf.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_ALPHA_ELF_H
 #define __ASM_ALPHA_ELF_H
 
+#include <asm/auxvec.h>
+
 /* Special values for the st_other field in the symbol table.  */
 
 #define STO_ALPHA_NOPV		0x80
@@ -142,26 +144,6 @@ extern int dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task);
 	: amask (AMASK_CIX) ? "ev6" : "ev67");	\
 })
 
-/* Reserve these numbers for any future use of a VDSO.  */
-#if 0
-#define AT_SYSINFO		32
-#define AT_SYSINFO_EHDR		33
-#endif
-
-/* More complete cache descriptions than AT_[DIU]CACHEBSIZE.  If the
-   value is -1, then the cache doesn't exist.  Otherwise:
-
-      bit 0-3:	  Cache set-associativity; 0 means fully associative.
-      bit 4-7:	  Log2 of cacheline size.
-      bit 8-31:	  Size of the entire cache >> 8.
-      bit 32-63:  Reserved.
-*/
-
-#define AT_L1I_CACHESHAPE	34
-#define AT_L1D_CACHESHAPE	35
-#define AT_L2_CACHESHAPE	36
-#define AT_L3_CACHESHAPE	37
-
 #ifdef __KERNEL__
 
 #define SET_PERSONALITY(EX, IBCS2)				\
diff --git a/include/asm-arm/auxvec.h b/include/asm-arm/auxvec.h
new file mode 100644
index 000000000000..c0536f6b29a7
--- /dev/null
+++ b/include/asm-arm/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMARM_AUXVEC_H
+#define __ASMARM_AUXVEC_H
+
+#endif
diff --git a/include/asm-arm26/auxvec.h b/include/asm-arm26/auxvec.h
new file mode 100644
index 000000000000..c0536f6b29a7
--- /dev/null
+++ b/include/asm-arm26/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMARM_AUXVEC_H
+#define __ASMARM_AUXVEC_H
+
+#endif
diff --git a/include/asm-cris/auxvec.h b/include/asm-cris/auxvec.h
new file mode 100644
index 000000000000..cb30b01bf19f
--- /dev/null
+++ b/include/asm-cris/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMCRIS_AUXVEC_H
+#define __ASMCRIS_AUXVEC_H
+
+#endif
diff --git a/include/asm-frv/auxvec.h b/include/asm-frv/auxvec.h
new file mode 100644
index 000000000000..07710778fa10
--- /dev/null
+++ b/include/asm-frv/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __FRV_AUXVEC_H
+#define __FRV_AUXVEC_H
+
+#endif
diff --git a/include/asm-h8300/auxvec.h b/include/asm-h8300/auxvec.h
new file mode 100644
index 000000000000..1d36fe38b088
--- /dev/null
+++ b/include/asm-h8300/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMH8300_AUXVEC_H
+#define __ASMH8300_AUXVEC_H
+
+#endif
diff --git a/include/asm-i386/auxvec.h b/include/asm-i386/auxvec.h
new file mode 100644
index 000000000000..395e13016bfb
--- /dev/null
+++ b/include/asm-i386/auxvec.h
@@ -0,0 +1,11 @@
+#ifndef __ASMi386_AUXVEC_H
+#define __ASMi386_AUXVEC_H
+
+/*
+ * Architecture-neutral AT_ values in 0-17, leave some room
+ * for more of them, start the x86-specific ones at 32.
+ */
+#define AT_SYSINFO		32
+#define AT_SYSINFO_EHDR		33
+
+#endif
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 130bdc8c68cf..fa11117d3cfa 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -9,6 +9,7 @@
 #include <asm/user.h>
 #include <asm/processor.h>
 #include <asm/system.h>		/* for savesegment */
+#include <asm/auxvec.h>
 
 #include <linux/utsname.h>
 
@@ -109,13 +110,6 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
 
 #define ELF_PLATFORM  (system_utsname.machine)
 
-/*
- * Architecture-neutral AT_ values in 0-17, leave some room
- * for more of them, start the x86-specific ones at 32.
- */
-#define AT_SYSINFO		32
-#define AT_SYSINFO_EHDR		33
-
 #ifdef __KERNEL__
 #define SET_PERSONALITY(ex, ibcs2) do { } while (0)
 
diff --git a/include/asm-ia64/auxvec.h b/include/asm-ia64/auxvec.h
new file mode 100644
index 000000000000..23cebe5685b9
--- /dev/null
+++ b/include/asm-ia64/auxvec.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_IA64_AUXVEC_H
+#define _ASM_IA64_AUXVEC_H
+
+/*
+ * Architecture-neutral AT_ values are in the range 0-17.  Leave some room for more of
+ * them, start the architecture-specific ones at 32.
+ */
+#define AT_SYSINFO	32
+#define AT_SYSINFO_EHDR	33
+
+#endif /* _ASM_IA64_AUXVEC_H */
diff --git a/include/asm-ia64/elf.h b/include/asm-ia64/elf.h
index 7d4ccc4b976e..446fce036fd9 100644
--- a/include/asm-ia64/elf.h
+++ b/include/asm-ia64/elf.h
@@ -12,6 +12,7 @@
 
 #include <asm/fpu.h>
 #include <asm/page.h>
+#include <asm/auxvec.h>
 
 /*
  * This is used to ensure we don't load something for the wrong architecture.
@@ -177,13 +178,6 @@ extern void ia64_elf_core_copy_regs (struct pt_regs *src, elf_gregset_t dst);
    relevant until we have real hardware to play with... */
 #define ELF_PLATFORM	NULL
 
-/*
- * Architecture-neutral AT_ values are in the range 0-17.  Leave some room for more of
- * them, start the architecture-specific ones at 32.
- */
-#define AT_SYSINFO	32
-#define AT_SYSINFO_EHDR	33
-
 #ifdef __KERNEL__
 #define SET_PERSONALITY(ex, ibcs2)	set_personality(PER_LINUX)
 #define elf_read_implies_exec(ex, executable_stack)					\
diff --git a/include/asm-m32r/auxvec.h b/include/asm-m32r/auxvec.h
new file mode 100644
index 000000000000..f76dcc860fae
--- /dev/null
+++ b/include/asm-m32r/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_M32R__AUXVEC_H
+#define _ASM_M32R__AUXVEC_H
+
+#endif  /* _ASM_M32R__AUXVEC_H */
diff --git a/include/asm-m68k/auxvec.h b/include/asm-m68k/auxvec.h
new file mode 100644
index 000000000000..844d6d52204b
--- /dev/null
+++ b/include/asm-m68k/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMm68k_AUXVEC_H
+#define __ASMm68k_AUXVEC_H
+
+#endif
diff --git a/include/asm-m68knommu/auxvec.h b/include/asm-m68knommu/auxvec.h
new file mode 100644
index 000000000000..844d6d52204b
--- /dev/null
+++ b/include/asm-m68knommu/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMm68k_AUXVEC_H
+#define __ASMm68k_AUXVEC_H
+
+#endif
diff --git a/include/asm-mips/auxvec.h b/include/asm-mips/auxvec.h
new file mode 100644
index 000000000000..7cf7f2d21943
--- /dev/null
+++ b/include/asm-mips/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_AUXVEC_H
+#define _ASM_AUXVEC_H
+
+#endif /* _ASM_AUXVEC_H */
diff --git a/include/asm-parisc/auxvec.h b/include/asm-parisc/auxvec.h
new file mode 100644
index 000000000000..9c3ac4b89dc9
--- /dev/null
+++ b/include/asm-parisc/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMPARISC_AUXVEC_H
+#define __ASMPARISC_AUXVEC_H
+
+#endif
diff --git a/include/asm-ppc/auxvec.h b/include/asm-ppc/auxvec.h
new file mode 100644
index 000000000000..172358df29c8
--- /dev/null
+++ b/include/asm-ppc/auxvec.h
@@ -0,0 +1,14 @@
+#ifndef __PPC_AUXVEC_H
+#define __PPC_AUXVEC_H
+
+/*
+ * We need to put in some extra aux table entries to tell glibc what
+ * the cache block size is, so it can use the dcbz instruction safely.
+ */
+#define AT_DCACHEBSIZE		19
+#define AT_ICACHEBSIZE		20
+#define AT_UCACHEBSIZE		21
+/* A special ignored type value for PPC, for glibc compatibility.  */
+#define AT_IGNOREPPC		22
+
+#endif
diff --git a/include/asm-ppc/elf.h b/include/asm-ppc/elf.h
index 2c056966efd3..c25cc35e6ab5 100644
--- a/include/asm-ppc/elf.h
+++ b/include/asm-ppc/elf.h
@@ -7,6 +7,7 @@
 #include <asm/types.h>
 #include <asm/ptrace.h>
 #include <asm/cputable.h>
+#include <asm/auxvec.h>
 
 /* PowerPC relocations defined by the ABIs */
 #define R_PPC_NONE		0
@@ -122,16 +123,6 @@ extern int dump_task_fpu(struct task_struct *t, elf_fpregset_t *fpu);
 
 #define SET_PERSONALITY(ex, ibcs2) set_personality((ibcs2)?PER_SVR4:PER_LINUX)
 
-/*
- * We need to put in some extra aux table entries to tell glibc what
- * the cache block size is, so it can use the dcbz instruction safely.
- */
-#define AT_DCACHEBSIZE		19
-#define AT_ICACHEBSIZE		20
-#define AT_UCACHEBSIZE		21
-/* A special ignored type value for PPC, for glibc compatibility.  */
-#define AT_IGNOREPPC		22
-
 extern int dcache_bsize;
 extern int icache_bsize;
 extern int ucache_bsize;
diff --git a/include/asm-ppc64/auxvec.h b/include/asm-ppc64/auxvec.h
new file mode 100644
index 000000000000..ac6381a106e1
--- /dev/null
+++ b/include/asm-ppc64/auxvec.h
@@ -0,0 +1,19 @@
+#ifndef __PPC64_AUXVEC_H
+#define __PPC64_AUXVEC_H
+
+/*
+ * We need to put in some extra aux table entries to tell glibc what
+ * the cache block size is, so it can use the dcbz instruction safely.
+ */
+#define AT_DCACHEBSIZE		19
+#define AT_ICACHEBSIZE		20
+#define AT_UCACHEBSIZE		21
+/* A special ignored type value for PPC, for glibc compatibility.  */
+#define AT_IGNOREPPC		22
+
+/* The vDSO location. We have to use the same value as x86 for glibc's
+ * sake :-)
+ */
+#define AT_SYSINFO_EHDR		33
+
+#endif /* __PPC64_AUXVEC_H */
diff --git a/include/asm-ppc64/elf.h b/include/asm-ppc64/elf.h
index 085eedb956fe..c919a89343db 100644
--- a/include/asm-ppc64/elf.h
+++ b/include/asm-ppc64/elf.h
@@ -4,6 +4,7 @@
 #include <asm/types.h>
 #include <asm/ptrace.h>
 #include <asm/cputable.h>
+#include <asm/auxvec.h>
 
 /* PowerPC relocations defined by the ABIs */
 #define R_PPC_NONE		0
@@ -237,21 +238,6 @@ do {								\
 
 #endif
 
-/*
- * We need to put in some extra aux table entries to tell glibc what
- * the cache block size is, so it can use the dcbz instruction safely.
- */
-#define AT_DCACHEBSIZE		19
-#define AT_ICACHEBSIZE		20
-#define AT_UCACHEBSIZE		21
-/* A special ignored type value for PPC, for glibc compatibility.  */
-#define AT_IGNOREPPC		22
-
-/* The vDSO location. We have to use the same value as x86 for glibc's
- * sake :-)
- */
-#define AT_SYSINFO_EHDR		33
-
 extern int dcache_bsize;
 extern int icache_bsize;
 extern int ucache_bsize;
diff --git a/include/asm-s390/auxvec.h b/include/asm-s390/auxvec.h
new file mode 100644
index 000000000000..0d340720fd99
--- /dev/null
+++ b/include/asm-s390/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMS390_AUXVEC_H
+#define __ASMS390_AUXVEC_H
+
+#endif
diff --git a/include/asm-sh/auxvec.h b/include/asm-sh/auxvec.h
new file mode 100644
index 000000000000..fc21e4db5881
--- /dev/null
+++ b/include/asm-sh/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASM_SH_AUXVEC_H
+#define __ASM_SH_AUXVEC_H
+
+#endif /* __ASM_SH_AUXVEC_H */
diff --git a/include/asm-sh64/auxvec.h b/include/asm-sh64/auxvec.h
new file mode 100644
index 000000000000..1ad5a44bdc76
--- /dev/null
+++ b/include/asm-sh64/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASM_SH64_AUXVEC_H
+#define __ASM_SH64_AUXVEC_H
+
+#endif /* __ASM_SH64_AUXVEC_H */
diff --git a/include/asm-sparc/auxvec.h b/include/asm-sparc/auxvec.h
new file mode 100644
index 000000000000..ad6f360261f6
--- /dev/null
+++ b/include/asm-sparc/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASMSPARC_AUXVEC_H
+#define __ASMSPARC_AUXVEC_H
+
+#endif /* !(__ASMSPARC_AUXVEC_H) */
diff --git a/include/asm-sparc64/auxvec.h b/include/asm-sparc64/auxvec.h
new file mode 100644
index 000000000000..436a29129828
--- /dev/null
+++ b/include/asm-sparc64/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASM_SPARC64_AUXVEC_H
+#define __ASM_SPARC64_AUXVEC_H
+
+#endif /* !(__ASM_SPARC64_AUXVEC_H) */
diff --git a/include/asm-um/auxvec.h b/include/asm-um/auxvec.h
new file mode 100644
index 000000000000..1e5e1c2fc9b1
--- /dev/null
+++ b/include/asm-um/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __UM_AUXVEC_H
+#define __UM_AUXVEC_H
+
+#endif
diff --git a/include/asm-v850/auxvec.h b/include/asm-v850/auxvec.h
new file mode 100644
index 000000000000..f493232d0224
--- /dev/null
+++ b/include/asm-v850/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __V850_AUXVEC_H__
+#define __V850_AUXVEC_H__
+
+#endif /* __V850_AUXVEC_H__ */
diff --git a/include/asm-x86_64/auxvec.h b/include/asm-x86_64/auxvec.h
new file mode 100644
index 000000000000..2403c4cfced2
--- /dev/null
+++ b/include/asm-x86_64/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASM_X86_64_AUXVEC_H
+#define __ASM_X86_64_AUXVEC_H
+
+#endif
diff --git a/include/asm-xtensa/auxvec.h b/include/asm-xtensa/auxvec.h
new file mode 100644
index 000000000000..257dec75c5af
--- /dev/null
+++ b/include/asm-xtensa/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __XTENSA_AUXVEC_H
+#define __XTENSA_AUXVEC_H
+
+#endif
diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h
new file mode 100644
index 000000000000..9a7b374c9fb4
--- /dev/null
+++ b/include/linux/auxvec.h
@@ -0,0 +1,31 @@
+#ifndef _LINUX_AUXVEC_H
+#define _LINUX_AUXVEC_H
+
+#include <asm/auxvec.h>
+
+/* Symbolic values for the entries in the auxiliary table
+   put on the initial stack */
+#define AT_NULL   0	/* end of vector */
+#define AT_IGNORE 1	/* entry should be ignored */
+#define AT_EXECFD 2	/* file descriptor of program */
+#define AT_PHDR   3	/* program headers for program */
+#define AT_PHENT  4	/* size of program header entry */
+#define AT_PHNUM  5	/* number of program headers */
+#define AT_PAGESZ 6	/* system page size */
+#define AT_BASE   7	/* base address of interpreter */
+#define AT_FLAGS  8	/* flags */
+#define AT_ENTRY  9	/* entry point of program */
+#define AT_NOTELF 10	/* program is not ELF */
+#define AT_UID    11	/* real uid */
+#define AT_EUID   12	/* effective uid */
+#define AT_GID    13	/* real gid */
+#define AT_EGID   14	/* effective gid */
+#define AT_PLATFORM 15  /* string identifying CPU for optimizations */
+#define AT_HWCAP  16    /* arch dependent hints at CPU capabilities */
+#define AT_CLKTCK 17	/* frequency at which times() increments */
+
+#define AT_SECURE 23   /* secure mode boolean */
+
+#define AT_VECTOR_SIZE  42 /* Size of auxiliary table.  */
+
+#endif /* _LINUX_AUXVEC_H */
diff --git a/include/linux/elf.h b/include/linux/elf.h
index f5b3ba5a317d..ff955dbf510d 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -2,6 +2,7 @@
 #define _LINUX_ELF_H
 
 #include <linux/types.h>
+#include <linux/auxvec.h>
 #include <asm/elf.h>
 
 #ifndef elf_read_implies_exec
@@ -158,29 +159,6 @@ typedef __s64	Elf64_Sxword;
 #define ELF64_ST_BIND(x)	ELF_ST_BIND(x)
 #define ELF64_ST_TYPE(x)	ELF_ST_TYPE(x)
 
-/* Symbolic values for the entries in the auxiliary table
-   put on the initial stack */
-#define AT_NULL   0	/* end of vector */
-#define AT_IGNORE 1	/* entry should be ignored */
-#define AT_EXECFD 2	/* file descriptor of program */
-#define AT_PHDR   3	/* program headers for program */
-#define AT_PHENT  4	/* size of program header entry */
-#define AT_PHNUM  5	/* number of program headers */
-#define AT_PAGESZ 6	/* system page size */
-#define AT_BASE   7	/* base address of interpreter */
-#define AT_FLAGS  8	/* flags */
-#define AT_ENTRY  9	/* entry point of program */
-#define AT_NOTELF 10	/* program is not ELF */
-#define AT_UID    11	/* real uid */
-#define AT_EUID   12	/* effective uid */
-#define AT_GID    13	/* real gid */
-#define AT_EGID   14	/* effective gid */
-#define AT_PLATFORM 15  /* string identifying CPU for optimizations */
-#define AT_HWCAP  16    /* arch dependent hints at CPU capabilities */
-#define AT_CLKTCK 17	/* frequency at which times() increments */
-
-#define AT_SECURE 23   /* secure mode boolean */
-
 typedef struct dynamic{
   Elf32_Sword d_tag;
   union{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5fb31bede103..b5a22ea80045 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,8 @@
 #include <linux/topology.h>
 #include <linux/seccomp.h>
 
+#include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
+
 struct exec_domain;
 
 /*
@@ -261,7 +263,7 @@ struct mm_struct {
 	mm_counter_t _rss;
 	mm_counter_t _anon_rss;
 
-	unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
+	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
 	unsigned dumpable:2;
 	cpumask_t cpu_vm_mask;
-- 
cgit v1.2.3


From 8fc2751beb0941966d3a97b26544e8585e428c08 Mon Sep 17 00:00:00 2001
From: Mark Bellon <mbellon@mvista.com>
Date: Tue, 6 Sep 2005 15:16:54 -0700
Subject: [PATCH] disk quotas fail when /etc/mtab is symlinked to /proc/mounts

If /etc/mtab is a regular file all of the mount options (of a file system)
are written to /etc/mtab by the mount command.  The quota tools look there
for the quota strings for their operation.  If, however, /etc/mtab is a
symlink to /proc/mounts (a "good thing" in some environments) the tools
don't write anything - they assume the kernel will take care of things.

While the quota options are sent down to the kernel via the mount system
call and the file system codes handle them properly unfortunately there is
no code to echo the quota strings into /proc/mounts and the quota tools
fail in the symlink case.

The attached patchs modify the EXT[2|3] and JFS codes to add the necessary
hooks.  The show_options function of each file system in these patches
currently deal with only those things that seemed related to quotas;
especially in the EXT3 case more can be done (later?).

Jan Kara also noted the difficulty in moving these changes above the FS
codes responding similarly to myself to Andrew's comment about possible
VFS migration. Issue summary:

 - FS codes have to process the entire string of options anyway.

 - Only FS codes that use quotas must have a show_options function (for
   quotas to work properly) however quotas are only used in a small number
   of FS.

 - Since most of the quota using FS support other options these FS codes
   should have the a show_options function to show those options - and the
   quota echoing becomes virtually negligible.

Based on feedback I have modified my patches from the original:

   JFS a missing patch has been restored to the posting
   EXT[2|3] and JFS always use the show_options function
       - Each FS has at least one FS specific option displayed
       - QUOTA output is under a CONFIG_QUOTA ifdef
       - a follow-on patch will add a multitude of options for each FS
   EXT[2|3] and JFS "quota" is treated as "usrquota"
   EXT3 journalled data check for journalled quota removed
   EXT[2|3] mount when quota specified but not compiled in

 - no changes from my original patch.  I tested the patch and the codes
   warn but

 - still mount.  With all due respection I believe the comments
   otherwise were a

 - misread of the patch.  Please reread/test and comment.  XFS patch
   removed - the XFS team already made the necessary changes EXT3 mixing
   old and new quotas are handled differently (not purely exclusive)

 - if old and new quotas for the same type are used together the old
   type is silently depricated for compatability (e.g.  usrquota and
   usrjquota)

 - mixing of old and new quotas is an error (e.g.  usrjquota and
   grpquota)

Signed-off-by: Mark Bellon <mbellon@mvista.com>
Acked-by: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/super.c         | 59 +++++++++++++++++++++++++++----
 fs/ext3/super.c         | 92 +++++++++++++++++++++++++++++++++++++++++++------
 fs/jfs/jfs_filsys.h     |  3 ++
 fs/jfs/super.c          | 48 ++++++++++++++++++++++++--
 include/linux/ext2_fs.h |  3 ++
 include/linux/ext3_fs.h |  2 ++
 6 files changed, 186 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index dcfe331dc4c4..3c0c7c6a5b44 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -19,6 +19,7 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
@@ -27,6 +28,8 @@
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include "ext2.h"
 #include "xattr.h"
@@ -201,6 +204,26 @@ static void ext2_clear_inode(struct inode *inode)
 #endif
 }
 
+static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(vfs->mnt_sb);
+
+	if (sbi->s_mount_opt & EXT2_MOUNT_GRPID)
+		seq_puts(seq, ",grpid");
+	else
+		seq_puts(seq, ",nogrpid");
+
+#if defined(CONFIG_QUOTA)
+	if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+	return 0;
+}
+
 #ifdef CONFIG_QUOTA
 static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
 static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
@@ -218,6 +241,7 @@ static struct super_operations ext2_sops = {
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.clear_inode	= ext2_clear_inode,
+	.show_options	= ext2_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext2_quota_read,
 	.quota_write	= ext2_quota_write,
@@ -256,10 +280,11 @@ static unsigned long get_sb_block(void **data)
 
 enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
-	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh,
-	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_xip,
-	Opt_ignore, Opt_err,
+	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
+	Opt_err_ro, Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug,
+	Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
+	Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota,
+	Opt_usrquota, Opt_grpquota
 };
 
 static match_table_t tokens = {
@@ -288,10 +313,10 @@ static match_table_t tokens = {
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
 	{Opt_xip, "xip"},
-	{Opt_ignore, "grpquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_ignore, "noquota"},
-	{Opt_ignore, "quota"},
-	{Opt_ignore, "usrquota"},
+	{Opt_quota, "quota"},
+	{Opt_usrquota, "usrquota"},
 	{Opt_err, NULL}
 };
 
@@ -406,6 +431,26 @@ static int parse_options (char * options,
 			printk("EXT2 xip option not supported\n");
 #endif
 			break;
+
+#if defined(CONFIG_QUOTA)
+		case Opt_quota:
+		case Opt_usrquota:
+			set_opt(sbi->s_mount_opt, USRQUOTA);
+			break;
+
+		case Opt_grpquota:
+			set_opt(sbi->s_mount_opt, GRPQUOTA);
+			break;
+#else
+		case Opt_quota:
+		case Opt_usrquota:
+		case Opt_grpquota:
+			printk(KERN_ERR
+				"EXT2-fs: quota operations not supported.\n");
+
+			break;
+#endif
+
 		case Opt_ignore:
 			break;
 		default:
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c3c6e399fb3..a93c3609025d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/quotaops.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include "xattr.h"
 #include "acl.h"
@@ -509,8 +510,41 @@ static void ext3_clear_inode(struct inode *inode)
 	kfree(rsv);
 }
 
-#ifdef CONFIG_QUOTA
+static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(vfs->mnt_sb);
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_JOURNAL_DATA)
+		seq_puts(seq, ",data=journal");
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_ORDERED_DATA)
+		seq_puts(seq, ",data=ordered");
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_WRITEBACK_DATA)
+		seq_puts(seq, ",data=writeback");
+
+#if defined(CONFIG_QUOTA)
+	if (sbi->s_jquota_fmt)
+		seq_printf(seq, ",jqfmt=%s",
+		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+
+	if (sbi->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+	if (sbi->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
 
+	if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+	return 0;
+}
+
+#ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
@@ -569,6 +603,7 @@ static struct super_operations ext3_sops = {
 	.statfs		= ext3_statfs,
 	.remount_fs	= ext3_remount,
 	.clear_inode	= ext3_clear_inode,
+	.show_options	= ext3_show_options,
 #ifdef CONFIG_QUOTA
 	.quota_read	= ext3_quota_read,
 	.quota_write	= ext3_quota_write,
@@ -590,7 +625,8 @@ enum {
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-	Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+	Opt_grpquota
 };
 
 static match_table_t tokens = {
@@ -634,10 +670,10 @@ static match_table_t tokens = {
 	{Opt_grpjquota, "grpjquota=%s"},
 	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
 	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
-	{Opt_quota, "grpquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_noquota, "noquota"},
 	{Opt_quota, "quota"},
-	{Opt_quota, "usrquota"},
+	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
@@ -903,7 +939,13 @@ clear_qf_name:
 			sbi->s_jquota_fmt = QFMT_VFS_V0;
 			break;
 		case Opt_quota:
+		case Opt_usrquota:
 			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, USRQUOTA);
+			break;
+		case Opt_grpquota:
+			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 		case Opt_noquota:
 			if (sb_any_quota_enabled(sb)) {
@@ -912,8 +954,13 @@ clear_qf_name:
 				return 0;
 			}
 			clear_opt(sbi->s_mount_opt, QUOTA);
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 #else
+		case Opt_quota:
+		case Opt_usrquota:
+		case Opt_grpquota:
 		case Opt_usrjquota:
 		case Opt_grpjquota:
 		case Opt_offusrjquota:
@@ -924,7 +971,6 @@ clear_qf_name:
 				"EXT3-fs: journalled quota options not "
 				"supported.\n");
 			break;
-		case Opt_quota:
 		case Opt_noquota:
 			break;
 #endif
@@ -962,14 +1008,38 @@ clear_qf_name:
 		}
 	}
 #ifdef CONFIG_QUOTA
-	if (!sbi->s_jquota_fmt && (sbi->s_qf_names[USRQUOTA] ||
-	    sbi->s_qf_names[GRPQUOTA])) {
-		printk(KERN_ERR
-			"EXT3-fs: journalled quota format not specified.\n");
-		return 0;
+	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+		if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
+		     sbi->s_qf_names[USRQUOTA])
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+
+		if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
+		     sbi->s_qf_names[GRPQUOTA])
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
+
+		if ((sbi->s_qf_names[USRQUOTA] &&
+				(sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
+		    (sbi->s_qf_names[GRPQUOTA] &&
+				(sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
+			printk(KERN_ERR "EXT3-fs: old and new quota "
+					"format mixing.\n");
+			return 0;
+		}
+
+		if (!sbi->s_jquota_fmt) {
+			printk(KERN_ERR "EXT3-fs: journalled quota format "
+					"not specified.\n");
+			return 0;
+		}
+	} else {
+		if (sbi->s_jquota_fmt) {
+			printk(KERN_ERR "EXT3-fs: journalled quota format "
+					"specified with no journalling "
+					"enabled.\n");
+			return 0;
+		}
 	}
 #endif
-
 	return 1;
 }
 
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 86ccac80f0ab..72a5588faeca 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -37,6 +37,9 @@
 #define JFS_ERR_CONTINUE   0x00000004   /* continue */
 #define JFS_ERR_PANIC      0x00000008   /* panic */
 
+#define	JFS_USRQUOTA	0x00000010
+#define	JFS_GRPQUOTA	0x00000020
+
 /* platform option (conditional compilation) */
 #define JFS_AIX		0x80000000	/* AIX support */
 /*	POSIX name/directory  support */
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9ff89720f93b..71bc34b96b2b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -23,9 +23,11 @@
 #include <linux/parser.h>
 #include <linux/completion.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 #include <linux/moduleparam.h>
 #include <linux/posix_acl.h>
 #include <asm/uaccess.h>
+#include <linux/seq_file.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -192,7 +194,8 @@ static void jfs_put_super(struct super_block *sb)
 
 enum {
 	Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
-	Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err,
+	Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota,
+	Opt_usrquota, Opt_grpquota
 };
 
 static match_table_t tokens = {
@@ -204,8 +207,8 @@ static match_table_t tokens = {
 	{Opt_errors, "errors=%s"},
 	{Opt_ignore, "noquota"},
 	{Opt_ignore, "quota"},
-	{Opt_ignore, "usrquota"},
-	{Opt_ignore, "grpquota"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_err, NULL}
 };
 
@@ -293,6 +296,24 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 			}
 			break;
 		}
+
+#if defined(CONFIG_QUOTA)
+		case Opt_quota:
+		case Opt_usrquota:
+			*flag |= JFS_USRQUOTA;
+			break;
+		case Opt_grpquota:
+			*flag |= JFS_GRPQUOTA;
+			break;
+#else
+		case Opt_usrquota:
+		case Opt_grpquota:
+		case Opt_quota:
+			printk(KERN_ERR
+			       "JFS: quota operations not supported\n");
+			break;
+#endif
+
 		default:
 			printk("jfs: Unrecognized mount option \"%s\" "
 					" or missing value\n", p);
@@ -539,6 +560,26 @@ static int jfs_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct jfs_sb_info *sbi = JFS_SBI(vfs->mnt_sb);
+
+	if (sbi->flag & JFS_NOINTEGRITY)
+		seq_puts(seq, ",nointegrity");
+	else
+		seq_puts(seq, ",integrity");
+
+#if defined(CONFIG_QUOTA)
+	if (sbi->flag & JFS_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->flag & JFS_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+
+	return 0;
+}
+
 static struct super_operations jfs_super_operations = {
 	.alloc_inode	= jfs_alloc_inode,
 	.destroy_inode	= jfs_destroy_inode,
@@ -552,6 +593,7 @@ static struct super_operations jfs_super_operations = {
 	.unlockfs       = jfs_unlockfs,
 	.statfs		= jfs_statfs,
 	.remount_fs	= jfs_remount,
+	.show_options	= jfs_show_options
 };
 
 static struct export_operations jfs_export_operations = {
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index a657130ba03a..f7bd1c7ebefb 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -313,6 +313,9 @@ struct ext2_inode {
 #define EXT2_MOUNT_XATTR_USER		0x004000  /* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL		0x008000  /* POSIX Access Control Lists */
 #define EXT2_MOUNT_XIP			0x010000  /* Execute in place */
+#define EXT2_MOUNT_USRQUOTA		0x020000 /* user quota */
+#define EXT2_MOUNT_GRPQUOTA		0x040000 /* group quota */
+
 
 #define clear_opt(o, opt)		o &= ~EXT2_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT2_MOUNT_##opt
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index c16662836c58..c0272d73ab20 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -373,6 +373,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT3_MOUNT_NOBH			0x40000 /* No bufferheads */
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
+#define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
+#define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
-- 
cgit v1.2.3


From 2865cf001878d22d5fd12e5215621dffbcad76dc Mon Sep 17 00:00:00 2001
From: Zhigang Huo <zghuo@ncic.ac.cn>
Date: Tue, 6 Sep 2005 15:17:00 -0700
Subject: [PATCH] remove pipe definitions

These no longer have any users.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pipe_fs_i.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 36725e7c02c6..1767073df26f 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -39,9 +39,6 @@ struct pipe_inode_info {
 
 #define PIPE_SEM(inode)		(&(inode).i_sem)
 #define PIPE_WAIT(inode)	(&(inode).i_pipe->wait)
-#define PIPE_BASE(inode)	((inode).i_pipe->base)
-#define PIPE_START(inode)	((inode).i_pipe->start)
-#define PIPE_LEN(inode)		((inode).i_pipe->len)
 #define PIPE_READERS(inode)	((inode).i_pipe->readers)
 #define PIPE_WRITERS(inode)	((inode).i_pipe->writers)
 #define PIPE_WAITING_WRITERS(inode)	((inode).i_pipe->waiting_writers)
-- 
cgit v1.2.3


From d2052c1676a39cae101a81f3da8a4ade8b668c88 Mon Sep 17 00:00:00 2001
From: Erik Waling <erikw@acc.umu.se>
Date: Tue, 6 Sep 2005 15:17:02 -0700
Subject: [PATCH] sonypi SPIC initialisation fix

Newer Sony VAIO models (VGN-S480, VGN-S460, VGN-S3XP etc) use a new method to
initialize the SPIC device.  The new way to initialize (and disable) the
device comes directly from the AML code in the _CRS, _SRS and _DIS methods
from the DSDT table.  This patch adds support for the new models.

Signed-off-by: Erik Waling <erikw@acc.umu.se>
Signed-off-by: Stelian Pop <stelian@popies.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sonypi.txt |  10 ++++
 drivers/char/sonypi.c    | 117 +++++++++++++++++++++++++++++++++++++++--------
 include/linux/sonypi.h   |   2 +
 3 files changed, 109 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sonypi.txt b/Documentation/sonypi.txt
index 0f3b2405d09e..c1237a925505 100644
--- a/Documentation/sonypi.txt
+++ b/Documentation/sonypi.txt
@@ -99,6 +99,7 @@ statically linked into the kernel). Those options are:
 				SONYPI_MEYE_MASK		0x0400
 				SONYPI_MEMORYSTICK_MASK		0x0800
 				SONYPI_BATTERY_MASK		0x1000
+				SONYPI_WIRELESS_MASK		0x2000
 
 	useinput:	if set (which is the default) two input devices are
 			created, one which interprets the jogdial events as
@@ -137,6 +138,15 @@ Bugs:
 	  speed handling etc). Use ACPI instead of APM if it works on your
 	  laptop.
 
+	- sonypi lacks the ability to distinguish between certain key
+	  events on some models.
+
+	- some models with the nvidia card (geforce go 6200 tc) uses a
+	  different way to adjust the backlighting of the screen. There
+	  is a userspace utility to adjust the brightness on those models,
+	  which can be downloaded from
+	  http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2
+
 	- since all development was done by reverse engineering, there is
 	  _absolutely no guarantee_ that this driver will not crash your
 	  laptop. Permanently.
diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c
index cefbe985e55c..35cf1edbc179 100644
--- a/drivers/char/sonypi.c
+++ b/drivers/char/sonypi.c
@@ -98,12 +98,13 @@ MODULE_PARM_DESC(useinput,
 
 #define SONYPI_DEVICE_MODEL_TYPE1	1
 #define SONYPI_DEVICE_MODEL_TYPE2	2
+#define SONYPI_DEVICE_MODEL_TYPE3	3
 
 /* type1 models use those */
 #define SONYPI_IRQ_PORT			0x8034
 #define SONYPI_IRQ_SHIFT		22
-#define SONYPI_BASE			0x50
-#define SONYPI_G10A			(SONYPI_BASE+0x14)
+#define SONYPI_TYPE1_BASE		0x50
+#define SONYPI_G10A			(SONYPI_TYPE1_BASE+0x14)
 #define SONYPI_TYPE1_REGION_SIZE	0x08
 #define SONYPI_TYPE1_EVTYPE_OFFSET	0x04
 
@@ -114,6 +115,13 @@ MODULE_PARM_DESC(useinput,
 #define SONYPI_TYPE2_REGION_SIZE	0x20
 #define SONYPI_TYPE2_EVTYPE_OFFSET	0x12
 
+/* type3 series specifics */
+#define SONYPI_TYPE3_BASE		0x40
+#define SONYPI_TYPE3_GID2		(SONYPI_TYPE3_BASE+0x48) /* 16 bits */
+#define SONYPI_TYPE3_MISC		(SONYPI_TYPE3_BASE+0x6d) /* 8 bits  */
+#define SONYPI_TYPE3_REGION_SIZE	0x20
+#define SONYPI_TYPE3_EVTYPE_OFFSET	0x12
+
 /* battery / brightness addresses */
 #define SONYPI_BAT_FLAGS	0x81
 #define SONYPI_LCD_LIGHT	0x96
@@ -159,6 +167,10 @@ static struct sonypi_ioport_list sonypi_type2_ioport_list[] = {
 	{ 0x0, 0x0 }
 };
 
+/* same as in type 2 models */
+static struct sonypi_ioport_list *sonypi_type3_ioport_list =
+	sonypi_type2_ioport_list;
+
 /* The set of possible interrupts */
 struct sonypi_irq_list {
 	u16	irq;
@@ -180,6 +192,9 @@ static struct sonypi_irq_list sonypi_type2_irq_list[] = {
 	{  0, 0x00 }	/* no IRQ, 0x00 in SIRQ in AML */
 };
 
+/* same as in type2 models */
+static struct sonypi_irq_list *sonypi_type3_irq_list = sonypi_type2_irq_list;
+
 #define SONYPI_CAMERA_BRIGHTNESS		0
 #define SONYPI_CAMERA_CONTRAST			1
 #define SONYPI_CAMERA_HUE			2
@@ -223,6 +238,7 @@ static struct sonypi_irq_list sonypi_type2_irq_list[] = {
 #define SONYPI_MEYE_MASK			0x00000400
 #define SONYPI_MEMORYSTICK_MASK			0x00000800
 #define SONYPI_BATTERY_MASK			0x00001000
+#define SONYPI_WIRELESS_MASK			0x00002000
 
 struct sonypi_event {
 	u8	data;
@@ -305,6 +321,13 @@ static struct sonypi_event sonypi_blueev[] = {
 	{ 0, 0 }
 };
 
+/* The set of possible wireless events */
+static struct sonypi_event sonypi_wlessev[] = {
+	{ 0x59, SONYPI_EVENT_WIRELESS_ON },
+	{ 0x5a, SONYPI_EVENT_WIRELESS_OFF },
+	{ 0, 0 }
+};
+
 /* The set of possible back button events */
 static struct sonypi_event sonypi_backev[] = {
 	{ 0x20, SONYPI_EVENT_BACK_PRESSED },
@@ -391,6 +414,12 @@ static struct sonypi_eventtypes {
 	{ SONYPI_DEVICE_MODEL_TYPE2, 0x41, SONYPI_BATTERY_MASK, sonypi_batteryev },
 	{ SONYPI_DEVICE_MODEL_TYPE2, 0x31, SONYPI_PKEY_MASK, sonypi_pkeyev },
 
+	{ SONYPI_DEVICE_MODEL_TYPE3, 0, 0xffffffff, sonypi_releaseev },
+	{ SONYPI_DEVICE_MODEL_TYPE3, 0x21, SONYPI_FNKEY_MASK, sonypi_fnkeyev },
+	{ SONYPI_DEVICE_MODEL_TYPE3, 0x31, SONYPI_WIRELESS_MASK, sonypi_wlessev },
+	{ SONYPI_DEVICE_MODEL_TYPE3, 0x31, SONYPI_MEMORYSTICK_MASK, sonypi_memorystickev },
+	{ SONYPI_DEVICE_MODEL_TYPE3, 0x41, SONYPI_BATTERY_MASK, sonypi_batteryev },
+	{ SONYPI_DEVICE_MODEL_TYPE3, 0x31, SONYPI_PKEY_MASK, sonypi_pkeyev },
 	{ 0 }
 };
 
@@ -563,6 +592,23 @@ static void sonypi_type2_srs(void)
 	udelay(10);
 }
 
+static void sonypi_type3_srs(void)
+{
+	u16 v16;
+	u8  v8;
+
+	/* This model type uses the same initialiazation of
+	 * the embedded controller as the type2 models. */
+	sonypi_type2_srs();
+
+	/* Initialization of PCI config space of the LPC interface bridge. */
+	v16 = (sonypi_device.ioport1 & 0xFFF0) | 0x01;
+	pci_write_config_word(sonypi_device.dev, SONYPI_TYPE3_GID2, v16);
+	pci_read_config_byte(sonypi_device.dev, SONYPI_TYPE3_MISC, &v8);
+	v8 = (v8 & 0xCF) | 0x10;
+	pci_write_config_byte(sonypi_device.dev, SONYPI_TYPE3_MISC, v8);
+}
+
 /* Disables the device - this comes from the AML code in the ACPI bios */
 static void sonypi_type1_dis(void)
 {
@@ -587,6 +633,13 @@ static void sonypi_type2_dis(void)
 		printk(KERN_WARNING "ec_write failed\n");
 }
 
+static void sonypi_type3_dis(void)
+{
+	sonypi_type2_dis();
+	udelay(10);
+	pci_write_config_word(sonypi_device.dev, SONYPI_TYPE3_GID2, 0);
+}
+
 static u8 sonypi_call1(u8 dev)
 {
 	u8 v1, v2;
@@ -1067,10 +1120,17 @@ static struct miscdevice sonypi_misc_device = {
 
 static void sonypi_enable(unsigned int camera_on)
 {
-	if (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE2)
-		sonypi_type2_srs();
-	else
+	switch (sonypi_device.model) {
+	case SONYPI_DEVICE_MODEL_TYPE1:
 		sonypi_type1_srs();
+		break;
+	case SONYPI_DEVICE_MODEL_TYPE2:
+		sonypi_type2_srs();
+		break;
+	case SONYPI_DEVICE_MODEL_TYPE3:
+		sonypi_type3_srs();
+		break;
+	}
 
 	sonypi_call1(0x82);
 	sonypi_call2(0x81, 0xff);
@@ -1094,10 +1154,18 @@ static int sonypi_disable(void)
 	if (!SONYPI_ACPI_ACTIVE && fnkeyinit)
 		outb(0xf1, 0xb2);
 
-	if (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE2)
-		sonypi_type2_dis();
-	else
+	switch (sonypi_device.model) {
+	case SONYPI_DEVICE_MODEL_TYPE1:
 		sonypi_type1_dis();
+		break;
+	case SONYPI_DEVICE_MODEL_TYPE2:
+		sonypi_type2_dis();
+		break;
+	case SONYPI_DEVICE_MODEL_TYPE3:
+		sonypi_type3_dis();
+		break;
+	}
+
 	return 0;
 }
 
@@ -1143,12 +1211,16 @@ static int __devinit sonypi_probe(void)
 	struct sonypi_irq_list *irq_list;
 	struct pci_dev *pcidev;
 
-	pcidev = pci_get_device(PCI_VENDOR_ID_INTEL,
-				PCI_DEVICE_ID_INTEL_82371AB_3, NULL);
+	if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL,
+				     PCI_DEVICE_ID_INTEL_82371AB_3, NULL)))
+		sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE1;
+	else if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL,
+					  PCI_DEVICE_ID_INTEL_ICH6_1, NULL)))
+		sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE3;
+	else
+		sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE2;
 
 	sonypi_device.dev = pcidev;
-	sonypi_device.model = pcidev ?
-		SONYPI_DEVICE_MODEL_TYPE1 : SONYPI_DEVICE_MODEL_TYPE2;
 
 	spin_lock_init(&sonypi_device.fifo_lock);
 	sonypi_device.fifo = kfifo_alloc(SONYPI_BUF_SIZE, GFP_KERNEL,
@@ -1176,16 +1248,22 @@ static int __devinit sonypi_probe(void)
 		goto out_miscreg;
 	}
 
-	if (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE2) {
+
+	if (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE1) {
+		ioport_list = sonypi_type1_ioport_list;
+		sonypi_device.region_size = SONYPI_TYPE1_REGION_SIZE;
+		sonypi_device.evtype_offset = SONYPI_TYPE1_EVTYPE_OFFSET;
+		irq_list = sonypi_type1_irq_list;
+	} else if (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE2) {
 		ioport_list = sonypi_type2_ioport_list;
 		sonypi_device.region_size = SONYPI_TYPE2_REGION_SIZE;
 		sonypi_device.evtype_offset = SONYPI_TYPE2_EVTYPE_OFFSET;
 		irq_list = sonypi_type2_irq_list;
 	} else {
-		ioport_list = sonypi_type1_ioport_list;
-		sonypi_device.region_size = SONYPI_TYPE1_REGION_SIZE;
-		sonypi_device.evtype_offset = SONYPI_TYPE1_EVTYPE_OFFSET;
-		irq_list = sonypi_type1_irq_list;
+		ioport_list = sonypi_type3_ioport_list;
+		sonypi_device.region_size = SONYPI_TYPE3_REGION_SIZE;
+		sonypi_device.evtype_offset = SONYPI_TYPE3_EVTYPE_OFFSET;
+		irq_list = sonypi_type3_irq_list;
 	}
 
 	for (i = 0; ioport_list[i].port1; i++) {
@@ -1274,11 +1352,10 @@ static int __devinit sonypi_probe(void)
 
 	printk(KERN_INFO "sonypi: Sony Programmable I/O Controller Driver"
 	       "v%s.\n", SONYPI_DRIVER_VERSION);
-	printk(KERN_INFO "sonypi: detected %s model, "
+	printk(KERN_INFO "sonypi: detected type%d model, "
 	       "verbose = %d, fnkeyinit = %s, camera = %s, "
 	       "compat = %s, mask = 0x%08lx, useinput = %s, acpi = %s\n",
-	       (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE1) ?
-			"type1" : "type2",
+	       sonypi_device.model,
 	       verbose,
 	       fnkeyinit ? "on" : "off",
 	       camera ? "on" : "off",
diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h
index 768cbba617d0..f56d24734950 100644
--- a/include/linux/sonypi.h
+++ b/include/linux/sonypi.h
@@ -99,6 +99,8 @@
 #define SONYPI_EVENT_BATTERY_INSERT		57
 #define SONYPI_EVENT_BATTERY_REMOVE		58
 #define SONYPI_EVENT_FNKEY_RELEASED		59
+#define SONYPI_EVENT_WIRELESS_ON		60
+#define SONYPI_EVENT_WIRELESS_OFF		61
 
 /* get/set brightness */
 #define SONYPI_IOCGBRT		_IOR('v', 0, __u8)
-- 
cgit v1.2.3


From e139aa595c5d3bd01699530cbe017dec75fdb07f Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Tue, 6 Sep 2005 15:17:05 -0700
Subject: [PATCH] PNP: make pnp_dbg conditional directly on CONFIG_PNP_DEBUG

Seems pointless to require .c files to test CONFIG_PNP_DEBUG and
conditionally define DEBUG before including <linux/pnp.h>.  Just test
CONFIG_PNP_DEBUG directly in pnp.h.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Adam Belay <ambx1@neo.rr.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/pnp/card.c         | 7 -------
 drivers/pnp/driver.c       | 7 -------
 drivers/pnp/manager.c      | 7 -------
 drivers/pnp/pnpacpi/core.c | 1 +
 drivers/pnp/quirks.c       | 7 -------
 drivers/pnp/support.c      | 7 -------
 include/linux/pnp.h        | 2 +-
 7 files changed, 2 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pnp/card.c b/drivers/pnp/card.c
index 6e5229e92fbc..e95ed67d4f05 100644
--- a/drivers/pnp/card.c
+++ b/drivers/pnp/card.c
@@ -8,13 +8,6 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-
-#ifdef CONFIG_PNP_DEBUG
-	#define DEBUG
-#else
-	#undef DEBUG
-#endif
-
 #include <linux/pnp.h>
 #include "base.h"
 
diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index 1d037c2a82ac..33da25f3213f 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -11,13 +11,6 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
-
-#ifdef CONFIG_PNP_DEBUG
-	#define DEBUG
-#else
-	#undef DEBUG
-#endif
-
 #include <linux/pnp.h>
 #include "base.h"
 
diff --git a/drivers/pnp/manager.c b/drivers/pnp/manager.c
index 6c510c19ad7d..94442ffd4aed 100644
--- a/drivers/pnp/manager.c
+++ b/drivers/pnp/manager.c
@@ -11,13 +11,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
-
-#ifdef CONFIG_PNP_DEBUG
-	#define DEBUG
-#else
-	#undef DEBUG
-#endif
-
 #include <linux/pnp.h>
 #include "base.h"
 
diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c
index d3d292ea5876..1a8915e74160 100644
--- a/drivers/pnp/pnpacpi/core.c
+++ b/drivers/pnp/pnpacpi/core.c
@@ -19,6 +19,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
  
+#include <linux/config.h>
 #include <linux/acpi.h>
 #include <linux/pnp.h>
 #include <acpi/acpi_bus.h>
diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c
index 596a02d7e03d..8936b0cb2ec3 100644
--- a/drivers/pnp/quirks.c
+++ b/drivers/pnp/quirks.c
@@ -16,13 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-
-#ifdef CONFIG_PNP_DEBUG
-	#define DEBUG
-#else
-	#undef DEBUG
-#endif
-
 #include <linux/pnp.h>
 #include "base.h"
 
diff --git a/drivers/pnp/support.c b/drivers/pnp/support.c
index b952aec49189..61fe998944bd 100644
--- a/drivers/pnp/support.c
+++ b/drivers/pnp/support.c
@@ -8,13 +8,6 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
-
-#ifdef CONFIG_PNP_DEBUG
-	#define DEBUG
-#else
-	#undef DEBUG
-#endif
-
 #include <linux/pnp.h>
 #include "base.h"
 
diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index 5ec2bd0c2848..aadbac29103c 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -443,7 +443,7 @@ static inline void pnp_unregister_driver(struct pnp_driver *drv) { ; }
 #define pnp_info(format, arg...) printk(KERN_INFO "pnp: " format "\n" , ## arg)
 #define pnp_warn(format, arg...) printk(KERN_WARNING "pnp: " format "\n" , ## arg)
 
-#ifdef DEBUG
+#ifdef CONFIG_PNP_DEBUG
 #define pnp_dbg(format, arg...) printk(KERN_DEBUG "pnp: " format "\n" , ## arg)
 #else
 #define pnp_dbg(format, arg...) do {} while (0)
-- 
cgit v1.2.3


From f35279d3f713e5c97b98cbdbf47d98f79942c11f Mon Sep 17 00:00:00 2001
From: Bruce Allan <bwa@us.ibm.com>
Date: Tue, 6 Sep 2005 15:17:08 -0700
Subject: [PATCH] sunrpc: cache_register can use wrong module reference

When registering an RPC cache, cache_register() always sets the owner as the
sunrpc module.  However, there are RPC caches owned by other modules.  With
the incorrect owner setting, the real owning module can be removed potentially
with an open reference to the cache from userspace.

For example, if one were to stop the nfs server and unmount the nfsd
filesystem, the nfsd module could be removed eventhough rpc.idmapd had
references to the idtoname and nametoid caches (i.e.
/proc/net/rpc/nfs4.<cachename>/channel is still open).  This resulted in a
system panic on one of our machines when attempting to restart the nfs
services after reloading the nfsd module.

The following patch adds a 'struct module *owner' field in struct
cache_detail.  The owner is further assigned to the struct proc_dir_entry
in cache_register() so that the module cannot be unloaded while user-space
daemons have an open reference on the associated file under /proc.

Signed-off-by: Bruce Allan <bwa@us.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c                  | 3 +++
 fs/nfsd/nfs4idmap.c               | 8 ++++++--
 include/linux/sunrpc/cache.h      | 1 +
 net/sunrpc/auth_gss/svcauth_gss.c | 8 ++++++--
 net/sunrpc/cache.c                | 8 ++++----
 net/sunrpc/sunrpc_syms.c          | 6 ++++--
 net/sunrpc/svcauth.c              | 1 +
 net/sunrpc/svcauth_unix.c         | 1 +
 8 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 9a11aa39e2e4..057aff745506 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/hash.h>
+#include <linux/module.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -221,6 +222,7 @@ static int expkey_show(struct seq_file *m,
 }
 	
 struct cache_detail svc_expkey_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= EXPKEY_HASHMAX,
 	.hash_table	= expkey_table,
 	.name		= "nfsd.fh",
@@ -456,6 +458,7 @@ static int svc_export_show(struct seq_file *m,
 	return 0;
 }
 struct cache_detail svc_export_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= EXPORT_HASHMAX,
 	.hash_table	= export_table,
 	.name		= "nfsd.export",
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5605a26efc57..13369650cdf9 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -187,6 +187,7 @@ static int         idtoname_parse(struct cache_detail *, char *, int);
 static struct ent *idtoname_lookup(struct ent *, int);
 
 static struct cache_detail idtoname_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= ENT_HASHMAX,
 	.hash_table	= idtoname_table,
 	.name		= "nfs4.idtoname",
@@ -320,6 +321,7 @@ static struct ent *nametoid_lookup(struct ent *, int);
 static int         nametoid_parse(struct cache_detail *, char *, int);
 
 static struct cache_detail nametoid_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= ENT_HASHMAX,
 	.hash_table	= nametoid_table,
 	.name		= "nfs4.nametoid",
@@ -404,8 +406,10 @@ nfsd_idmap_init(void)
 void
 nfsd_idmap_shutdown(void)
 {
-	cache_unregister(&idtoname_cache);
-	cache_unregister(&nametoid_cache);
+	if (cache_unregister(&idtoname_cache))
+		printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
+	if (cache_unregister(&nametoid_cache))
+		printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
 }
 
 /*
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index 6864063d1b9f..c4e3ea7cf154 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -60,6 +60,7 @@ struct cache_head {
 #define	CACHE_NEW_EXPIRY 120	/* keep new things pending confirmation for 120 seconds */
 
 struct cache_detail {
+	struct module *		owner;
 	int			hash_size;
 	struct cache_head **	hash_table;
 	rwlock_t		hash_lock;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 5c8fe3bfc494..e3308195374e 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -250,6 +250,7 @@ out:
 }
 
 static struct cache_detail rsi_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= RSI_HASHMAX,
 	.hash_table     = rsi_table,
 	.name           = "auth.rpcsec.init",
@@ -436,6 +437,7 @@ out:
 }
 
 static struct cache_detail rsc_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= RSC_HASHMAX,
 	.hash_table	= rsc_table,
 	.name		= "auth.rpcsec.context",
@@ -1074,7 +1076,9 @@ gss_svc_init(void)
 void
 gss_svc_shutdown(void)
 {
-	cache_unregister(&rsc_cache);
-	cache_unregister(&rsi_cache);
+	if (cache_unregister(&rsc_cache))
+		printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n");
+	if (cache_unregister(&rsi_cache))
+		printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
 	svc_auth_unregister(RPC_AUTH_GSS);
 }
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 900f5bc7e336..f509e9992767 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -177,7 +177,7 @@ void cache_register(struct cache_detail *cd)
 	cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
 	if (cd->proc_ent) {
 		struct proc_dir_entry *p;
-		cd->proc_ent->owner = THIS_MODULE;
+		cd->proc_ent->owner = cd->owner;
 		cd->channel_ent = cd->content_ent = NULL;
 		
  		p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR,
@@ -185,7 +185,7 @@ void cache_register(struct cache_detail *cd)
 		cd->flush_ent =  p;
  		if (p) {
  			p->proc_fops = &cache_flush_operations;
- 			p->owner = THIS_MODULE;
+ 			p->owner = cd->owner;
  			p->data = cd;
  		}
  
@@ -195,7 +195,7 @@ void cache_register(struct cache_detail *cd)
 			cd->channel_ent = p;
 			if (p) {
 				p->proc_fops = &cache_file_operations;
-				p->owner = THIS_MODULE;
+				p->owner = cd->owner;
 				p->data = cd;
 			}
 		}
@@ -205,7 +205,7 @@ void cache_register(struct cache_detail *cd)
 			cd->content_ent = p;
  			if (p) {
  				p->proc_fops = &content_file_operations;
- 				p->owner = THIS_MODULE;
+ 				p->owner = cd->owner;
  				p->data = cd;
  			}
  		}
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 62a073495276..ed48ff022d35 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -176,8 +176,10 @@ cleanup_sunrpc(void)
 {
 	unregister_rpc_pipefs();
 	rpc_destroy_mempool();
-	cache_unregister(&auth_domain_cache);
-	cache_unregister(&ip_map_cache);
+	if (cache_unregister(&auth_domain_cache))
+		printk(KERN_ERR "sunrpc: failed to unregister auth_domain cache\n");
+	if (cache_unregister(&ip_map_cache))
+		printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n");
 #ifdef RPC_DEBUG
 	rpc_unregister_sysctl();
 #endif
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index bde8147ef2db..dda4f0c63511 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -143,6 +143,7 @@ static void auth_domain_drop(struct cache_head *item, struct cache_detail *cd)
 
 
 struct cache_detail auth_domain_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= DN_HASHMAX,
 	.hash_table	= auth_domain_table,
 	.name		= "auth.domain",
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index d6baf6fdf8a9..cac2e774dd81 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -242,6 +242,7 @@ static int ip_map_show(struct seq_file *m,
 	
 
 struct cache_detail ip_map_cache = {
+	.owner		= THIS_MODULE,
 	.hash_size	= IP_HASHMAX,
 	.hash_table	= ip_table,
 	.name		= "auth.unix.ip",
-- 
cgit v1.2.3


From 19b4946ca9d1e35d4c641dcebe27378de34f3ddd Mon Sep 17 00:00:00 2001
From: Mike Waychison <mikew@google.com>
Date: Tue, 6 Sep 2005 15:17:10 -0700
Subject: [PATCH] ipc: convert /proc/sysvipc/* to generic seq_file interface

Change the /proc/sysvipc/shm|sem|msg files to use the generic seq_file
implementation for struct ipc_ids.

Signed-off-by: Mike Waychison <mikew@google.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/msg.h |  1 +
 include/linux/sem.h |  1 +
 ipc/msg.c           | 82 +++++++++++++++++---------------------------------
 ipc/sem.c           | 73 ++++++++++++++-------------------------------
 ipc/shm.c           | 86 +++++++++++++++++------------------------------------
 5 files changed, 80 insertions(+), 163 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msg.h b/include/linux/msg.h
index 2c4c6aa643ff..903e0ab8101f 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -77,6 +77,7 @@ struct msg_msg {
 /* one msq_queue structure for each present queue on the system */
 struct msg_queue {
 	struct kern_ipc_perm q_perm;
+	int q_id;
 	time_t q_stime;			/* last msgsnd time */
 	time_t q_rtime;			/* last msgrcv time */
 	time_t q_ctime;			/* last change time */
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 2d8516be9fd7..106f9757339a 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -88,6 +88,7 @@ struct sem {
 /* One sem_array data structure for each set of semaphores in the system. */
 struct sem_array {
 	struct kern_ipc_perm	sem_perm;	/* permissions .. see ipc.h */
+	int			sem_id;
 	time_t			sem_otime;	/* last semop time */
 	time_t			sem_ctime;	/* last change time */
 	struct sem		*sem_base;	/* ptr to first semaphore in array */
diff --git a/ipc/msg.c b/ipc/msg.c
index 27e516f96cdc..d035bd2aba96 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -26,6 +26,7 @@
 #include <linux/sched.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
+#include <linux/seq_file.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
 #include "util.h"
@@ -74,16 +75,16 @@ static struct ipc_ids msg_ids;
 static void freeque (struct msg_queue *msq, int id);
 static int newque (key_t key, int msgflg);
 #ifdef CONFIG_PROC_FS
-static int sysvipc_msg_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
+static int sysvipc_msg_proc_show(struct seq_file *s, void *it);
 #endif
 
 void __init msg_init (void)
 {
 	ipc_init_ids(&msg_ids,msg_ctlmni);
-
-#ifdef CONFIG_PROC_FS
-	create_proc_read_entry("sysvipc/msg", 0, NULL, sysvipc_msg_read_proc, NULL);
-#endif
+	ipc_init_proc_interface("sysvipc/msg",
+				"       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n",
+				&msg_ids,
+				sysvipc_msg_proc_show);
 }
 
 static int newque (key_t key, int msgflg)
@@ -113,6 +114,7 @@ static int newque (key_t key, int msgflg)
 		return -ENOSPC;
 	}
 
+	msq->q_id = msg_buildid(id,msq->q_perm.seq);
 	msq->q_stime = msq->q_rtime = 0;
 	msq->q_ctime = get_seconds();
 	msq->q_cbytes = msq->q_qnum = 0;
@@ -123,7 +125,7 @@ static int newque (key_t key, int msgflg)
 	INIT_LIST_HEAD(&msq->q_senders);
 	msg_unlock(msq);
 
-	return msg_buildid(id,msq->q_perm.seq);
+	return msq->q_id;
 }
 
 static inline void ss_add(struct msg_queue* msq, struct msg_sender* mss)
@@ -808,55 +810,25 @@ out_unlock:
 }
 
 #ifdef CONFIG_PROC_FS
-static int sysvipc_msg_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
 {
-	off_t pos = 0;
-	off_t begin = 0;
-	int i, len = 0;
-
-	down(&msg_ids.sem);
-	len += sprintf(buffer, "       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n");
-
-	for(i = 0; i <= msg_ids.max_id; i++) {
-		struct msg_queue * msq;
-		msq = msg_lock(i);
-		if(msq != NULL) {
-			len += sprintf(buffer + len, "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
-				msq->q_perm.key,
-				msg_buildid(i,msq->q_perm.seq),
-				msq->q_perm.mode,
-				msq->q_cbytes,
-				msq->q_qnum,
-				msq->q_lspid,
-				msq->q_lrpid,
-				msq->q_perm.uid,
-				msq->q_perm.gid,
-				msq->q_perm.cuid,
-				msq->q_perm.cgid,
-				msq->q_stime,
-				msq->q_rtime,
-				msq->q_ctime);
-			msg_unlock(msq);
-
-			pos += len;
-			if(pos < offset) {
-				len = 0;
-				begin = pos;
-			}
-			if(pos > offset + length)
-				goto done;
-		}
-
-	}
-	*eof = 1;
-done:
-	up(&msg_ids.sem);
-	*start = buffer + (offset - begin);
-	len -= (offset - begin);
-	if(len > length)
-		len = length;
-	if(len < 0)
-		len = 0;
-	return len;
+	struct msg_queue *msq = it;
+
+	return seq_printf(s,
+			  "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
+			  msq->q_perm.key,
+			  msq->q_id,
+			  msq->q_perm.mode,
+			  msq->q_cbytes,
+			  msq->q_qnum,
+			  msq->q_lspid,
+			  msq->q_lrpid,
+			  msq->q_perm.uid,
+			  msq->q_perm.gid,
+			  msq->q_perm.cuid,
+			  msq->q_perm.cgid,
+			  msq->q_stime,
+			  msq->q_rtime,
+			  msq->q_ctime);
 }
 #endif
diff --git a/ipc/sem.c b/ipc/sem.c
index 70975ce0784a..19af028a3e38 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -73,6 +73,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include "util.h"
 
@@ -89,7 +90,7 @@ static struct ipc_ids sem_ids;
 static int newary (key_t, int, int);
 static void freeary (struct sem_array *sma, int id);
 #ifdef CONFIG_PROC_FS
-static int sysvipc_sem_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
+static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
 #endif
 
 #define SEMMSL_FAST	256 /* 512 bytes on stack */
@@ -116,10 +117,10 @@ void __init sem_init (void)
 {
 	used_sems = 0;
 	ipc_init_ids(&sem_ids,sc_semmni);
-
-#ifdef CONFIG_PROC_FS
-	create_proc_read_entry("sysvipc/sem", 0, NULL, sysvipc_sem_read_proc, NULL);
-#endif
+	ipc_init_proc_interface("sysvipc/sem",
+				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
+				&sem_ids,
+				sysvipc_sem_proc_show);
 }
 
 /*
@@ -193,6 +194,7 @@ static int newary (key_t key, int nsems, int semflg)
 	}
 	used_sems += nsems;
 
+	sma->sem_id = sem_buildid(id, sma->sem_perm.seq);
 	sma->sem_base = (struct sem *) &sma[1];
 	/* sma->sem_pending = NULL; */
 	sma->sem_pending_last = &sma->sem_pending;
@@ -201,7 +203,7 @@ static int newary (key_t key, int nsems, int semflg)
 	sma->sem_ctime = get_seconds();
 	sem_unlock(sma);
 
-	return sem_buildid(id, sma->sem_perm.seq);
+	return sma->sem_id;
 }
 
 asmlinkage long sys_semget (key_t key, int nsems, int semflg)
@@ -1328,50 +1330,21 @@ next_entry:
 }
 
 #ifdef CONFIG_PROC_FS
-static int sysvipc_sem_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 {
-	off_t pos = 0;
-	off_t begin = 0;
-	int i, len = 0;
-
-	len += sprintf(buffer, "       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n");
-	down(&sem_ids.sem);
-
-	for(i = 0; i <= sem_ids.max_id; i++) {
-		struct sem_array *sma;
-		sma = sem_lock(i);
-		if(sma) {
-			len += sprintf(buffer + len, "%10d %10d  %4o %10lu %5u %5u %5u %5u %10lu %10lu\n",
-				sma->sem_perm.key,
-				sem_buildid(i,sma->sem_perm.seq),
-				sma->sem_perm.mode,
-				sma->sem_nsems,
-				sma->sem_perm.uid,
-				sma->sem_perm.gid,
-				sma->sem_perm.cuid,
-				sma->sem_perm.cgid,
-				sma->sem_otime,
-				sma->sem_ctime);
-			sem_unlock(sma);
-
-			pos += len;
-			if(pos < offset) {
-				len = 0;
-	    			begin = pos;
-			}
-			if(pos > offset + length)
-				goto done;
-		}
-	}
-	*eof = 1;
-done:
-	up(&sem_ids.sem);
-	*start = buffer + (offset - begin);
-	len -= (offset - begin);
-	if(len > length)
-		len = length;
-	if(len < 0)
-		len = 0;
-	return len;
+	struct sem_array *sma = it;
+
+	return seq_printf(s,
+			  "%10d %10d  %4o %10lu %5u %5u %5u %5u %10lu %10lu\n",
+			  sma->sem_perm.key,
+			  sma->sem_id,
+			  sma->sem_perm.mode,
+			  sma->sem_nsems,
+			  sma->sem_perm.uid,
+			  sma->sem_perm.gid,
+			  sma->sem_perm.cuid,
+			  sma->sem_perm.cgid,
+			  sma->sem_otime,
+			  sma->sem_ctime);
 }
 #endif
diff --git a/ipc/shm.c b/ipc/shm.c
index 1d6cf08d950b..dca90489e3b0 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -23,12 +23,12 @@
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/mman.h>
-#include <linux/proc_fs.h>
 #include <linux/shmem_fs.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
 #include <linux/ptrace.h>
+#include <linux/seq_file.h>
 
 #include <asm/uaccess.h>
 
@@ -51,7 +51,7 @@ static int newseg (key_t key, int shmflg, size_t size);
 static void shm_open (struct vm_area_struct *shmd);
 static void shm_close (struct vm_area_struct *shmd);
 #ifdef CONFIG_PROC_FS
-static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
+static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
 #endif
 
 size_t	shm_ctlmax = SHMMAX;
@@ -63,9 +63,10 @@ static int shm_tot; /* total number of shared memory pages */
 void __init shm_init (void)
 {
 	ipc_init_ids(&shm_ids, 1);
-#ifdef CONFIG_PROC_FS
-	create_proc_read_entry("sysvipc/shm", 0, NULL, sysvipc_shm_read_proc, NULL);
-#endif
+	ipc_init_proc_interface("sysvipc/shm",
+				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
+				&shm_ids,
+				sysvipc_shm_proc_show);
 }
 
 static inline int shm_checkid(struct shmid_kernel *s, int id)
@@ -869,63 +870,32 @@ asmlinkage long sys_shmdt(char __user *shmaddr)
 }
 
 #ifdef CONFIG_PROC_FS
-static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
-	off_t pos = 0;
-	off_t begin = 0;
-	int i, len = 0;
-
-	down(&shm_ids.sem);
-	len += sprintf(buffer, "       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n");
+	struct shmid_kernel *shp = it;
+	char *format;
 
-	for(i = 0; i <= shm_ids.max_id; i++) {
-		struct shmid_kernel* shp;
-
-		shp = shm_lock(i);
-		if(shp!=NULL) {
 #define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
 #define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
-			char *format;
 
-			if (sizeof(size_t) <= sizeof(int))
-				format = SMALL_STRING;
-			else
-				format = BIG_STRING;
-			len += sprintf(buffer + len, format,
-				shp->shm_perm.key,
-				shm_buildid(i, shp->shm_perm.seq),
-				shp->shm_flags,
-				shp->shm_segsz,
-				shp->shm_cprid,
-				shp->shm_lprid,
-				is_file_hugepages(shp->shm_file) ? (file_count(shp->shm_file) - 1) : shp->shm_nattch,
-				shp->shm_perm.uid,
-				shp->shm_perm.gid,
-				shp->shm_perm.cuid,
-				shp->shm_perm.cgid,
-				shp->shm_atim,
-				shp->shm_dtim,
-				shp->shm_ctim);
-			shm_unlock(shp);
-
-			pos += len;
-			if(pos < offset) {
-				len = 0;
-				begin = pos;
-			}
-			if(pos > offset + length)
-				goto done;
-		}
-	}
-	*eof = 1;
-done:
-	up(&shm_ids.sem);
-	*start = buffer + (offset - begin);
-	len -= (offset - begin);
-	if(len > length)
-		len = length;
-	if(len < 0)
-		len = 0;
-	return len;
+	if (sizeof(size_t) <= sizeof(int))
+		format = SMALL_STRING;
+	else
+		format = BIG_STRING;
+	return seq_printf(s, format,
+			  shp->shm_perm.key,
+			  shp->id,
+			  shp->shm_flags,
+			  shp->shm_segsz,
+			  shp->shm_cprid,
+			  shp->shm_lprid,
+			  is_file_hugepages(shp->shm_file) ? (file_count(shp->shm_file) - 1) : shp->shm_nattch,
+			  shp->shm_perm.uid,
+			  shp->shm_perm.gid,
+			  shp->shm_perm.cuid,
+			  shp->shm_perm.cgid,
+			  shp->shm_atim,
+			  shp->shm_dtim,
+			  shp->shm_ctim);
 }
 #endif
-- 
cgit v1.2.3


From 6e3eaab02028c4087a92711b20abb9e72cc803a7 Mon Sep 17 00:00:00 2001
From: Abhay Salunke <Abhay_Salunke@dell.com>
Date: Tue, 6 Sep 2005 15:17:13 -0700
Subject: [PATCH] modified firmware_class.c to support no hotplug

Upgrade the request_firmware_nowait function to not start the hotplug
action on a firmware update.

This patch is tested along with dell_rbu driver on i386 and x86-64 systems.

Signed-off-by: Abhay Salunke <Abhay_Salunke@dell.com>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/firmware_class.c | 79 +++++++++++++++++++++++++++----------------
 include/linux/firmware.h      |  5 ++-
 2 files changed, 54 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 652281402c92..5bfa2e9a7c26 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -28,6 +28,7 @@ enum {
 	FW_STATUS_DONE,
 	FW_STATUS_ABORT,
 	FW_STATUS_READY,
+	FW_STATUS_READY_NOHOTPLUG,
 };
 
 static int loading_timeout = 10;	/* In seconds */
@@ -344,7 +345,7 @@ error_kfree:
 
 static int
 fw_setup_class_device(struct firmware *fw, struct class_device **class_dev_p,
-		      const char *fw_name, struct device *device)
+		      const char *fw_name, struct device *device, int hotplug)
 {
 	struct class_device *class_dev;
 	struct firmware_priv *fw_priv;
@@ -376,7 +377,10 @@ fw_setup_class_device(struct firmware *fw, struct class_device **class_dev_p,
 		goto error_unreg;
 	}
 
-	set_bit(FW_STATUS_READY, &fw_priv->status);
+	if (hotplug)
+                set_bit(FW_STATUS_READY, &fw_priv->status);
+        else
+                set_bit(FW_STATUS_READY_NOHOTPLUG, &fw_priv->status);
 	*class_dev_p = class_dev;
 	goto out;
 
@@ -386,21 +390,9 @@ out:
 	return retval;
 }
 
-/**
- * request_firmware: - request firmware to hotplug and wait for it
- * Description:
- *	@firmware will be used to return a firmware image by the name
- *	of @name for device @device.
- *
- *	Should be called from user context where sleeping is allowed.
- *
- *	@name will be use as $FIRMWARE in the hotplug environment and
- *	should be distinctive enough not to be confused with any other
- *	firmware image for this or any other device.
- **/
-int
-request_firmware(const struct firmware **firmware_p, const char *name,
-		 struct device *device)
+static int
+_request_firmware(const struct firmware **firmware_p, const char *name,
+		 struct device *device, int hotplug)
 {
 	struct class_device *class_dev;
 	struct firmware_priv *fw_priv;
@@ -419,22 +411,25 @@ request_firmware(const struct firmware **firmware_p, const char *name,
 	}
 	memset(firmware, 0, sizeof (*firmware));
 
-	retval = fw_setup_class_device(firmware, &class_dev, name, device);
+	retval = fw_setup_class_device(firmware, &class_dev, name, device,
+		hotplug);
 	if (retval)
 		goto error_kfree_fw;
 
 	fw_priv = class_get_devdata(class_dev);
 
-	if (loading_timeout > 0) {
-		fw_priv->timeout.expires = jiffies + loading_timeout * HZ;
-		add_timer(&fw_priv->timeout);
-	}
-
-	kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
-	wait_for_completion(&fw_priv->completion);
-	set_bit(FW_STATUS_DONE, &fw_priv->status);
+	if (hotplug) {
+		if (loading_timeout > 0) {
+			fw_priv->timeout.expires = jiffies + loading_timeout * HZ;
+			add_timer(&fw_priv->timeout);
+		}
 
-	del_timer_sync(&fw_priv->timeout);
+		kobject_hotplug(&class_dev->kobj, KOBJ_ADD);
+		wait_for_completion(&fw_priv->completion);
+		set_bit(FW_STATUS_DONE, &fw_priv->status);
+		del_timer_sync(&fw_priv->timeout);
+	} else
+		wait_for_completion(&fw_priv->completion);
 
 	down(&fw_lock);
 	if (!fw_priv->fw->size || test_bit(FW_STATUS_ABORT, &fw_priv->status)) {
@@ -454,6 +449,26 @@ out:
 	return retval;
 }
 
+/**
+ * request_firmware: - request firmware to hotplug and wait for it
+ * Description:
+ *      @firmware will be used to return a firmware image by the name
+ *      of @name for device @device.
+ *
+ *      Should be called from user context where sleeping is allowed.
+ *
+ *      @name will be use as $FIRMWARE in the hotplug environment and
+ *      should be distinctive enough not to be confused with any other
+ *      firmware image for this or any other device.
+ **/
+int
+request_firmware(const struct firmware **firmware_p, const char *name,
+                 struct device *device)
+{
+        int hotplug = 1;
+        return _request_firmware(firmware_p, name, device, hotplug);
+}
+
 /**
  * release_firmware: - release the resource associated with a firmware image
  **/
@@ -491,6 +506,7 @@ struct firmware_work {
 	struct device *device;
 	void *context;
 	void (*cont)(const struct firmware *fw, void *context);
+	int hotplug;
 };
 
 static int
@@ -503,7 +519,8 @@ request_firmware_work_func(void *arg)
 		return 0;
 	}
 	daemonize("%s/%s", "firmware", fw_work->name);
-	request_firmware(&fw, fw_work->name, fw_work->device);
+	_request_firmware(&fw, fw_work->name, fw_work->device,
+		fw_work->hotplug);
 	fw_work->cont(fw, fw_work->context);
 	release_firmware(fw);
 	module_put(fw_work->module);
@@ -518,6 +535,9 @@ request_firmware_work_func(void *arg)
  *	Asynchronous variant of request_firmware() for contexts where
  *	it is not possible to sleep.
  *
+ *      @hotplug invokes hotplug event to copy the firmware image if this flag
+ *      is non-zero else the firmware copy must be done manually.
+ *
  *	@cont will be called asynchronously when the firmware request is over.
  *
  *	@context will be passed over to @cont.
@@ -527,7 +547,7 @@ request_firmware_work_func(void *arg)
  **/
 int
 request_firmware_nowait(
-	struct module *module,
+	struct module *module, int hotplug,
 	const char *name, struct device *device, void *context,
 	void (*cont)(const struct firmware *fw, void *context))
 {
@@ -548,6 +568,7 @@ request_firmware_nowait(
 		.device = device,
 		.context = context,
 		.cont = cont,
+		.hotplug = hotplug,
 	};
 
 	ret = kernel_thread(request_firmware_work_func, fw_work,
diff --git a/include/linux/firmware.h b/include/linux/firmware.h
index 886255b69bb9..2063c0839d4f 100644
--- a/include/linux/firmware.h
+++ b/include/linux/firmware.h
@@ -3,6 +3,9 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #define FIRMWARE_NAME_MAX 30 
+#define FW_ACTION_NOHOTPLUG 0
+#define FW_ACTION_HOTPLUG 1
+
 struct firmware {
 	size_t size;
 	u8 *data;
@@ -11,7 +14,7 @@ struct device;
 int request_firmware(const struct firmware **fw, const char *name,
 		     struct device *device);
 int request_firmware_nowait(
-	struct module *module,
+	struct module *module, int hotplug,
 	const char *name, struct device *device, void *context,
 	void (*cont)(const struct firmware *fw, void *context));
 
-- 
cgit v1.2.3


From f26fdd59929e1144c6caf72adcaf4561d6e682a4 Mon Sep 17 00:00:00 2001
From: Karsten Wiese <annabellesgarden@yahoo.de>
Date: Tue, 6 Sep 2005 15:17:25 -0700
Subject: [PATCH] CHECK_IRQ_PER_CPU() to avoid dead code in __do_IRQ()

IRQ_PER_CPU is not used by all architectures.  This patch introduces the
macros ARCH_HAS_IRQ_PER_CPU and CHECK_IRQ_PER_CPU() to avoid the generation
of dead code in __do_IRQ().

ARCH_HAS_IRQ_PER_CPU is defined by architectures using IRQ_PER_CPU in their
include/asm_ARCH/irq.h file.

Through grepping the tree I found the following architectures currently use
IRQ_PER_CPU:

        cris, ia64, ppc, ppc64 and parisc.

Signed-off-by: Karsten Wiese <annabellesgarden@yahoo.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-cris/irq.h   | 5 +++++
 include/asm-ia64/irq.h   | 5 +++++
 include/asm-parisc/irq.h | 5 +++++
 include/asm-ppc/irq.h    | 5 +++++
 include/asm-ppc64/irq.h  | 5 +++++
 include/linux/irq.h      | 7 ++++++-
 kernel/irq/handle.c      | 2 +-
 7 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-cris/irq.h b/include/asm-cris/irq.h
index 8e787fdaedd4..4fab5c3b2e15 100644
--- a/include/asm-cris/irq.h
+++ b/include/asm-cris/irq.h
@@ -1,6 +1,11 @@
 #ifndef _ASM_IRQ_H
 #define _ASM_IRQ_H
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 #include <asm/arch/irq.h>
 
 extern __inline__ int irq_canonicalize(int irq)
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index 5d930fdc0bea..cd984d08fd15 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -14,6 +14,11 @@
 #define NR_IRQS		256
 #define NR_IRQ_VECTORS	NR_IRQS
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 static __inline__ int
 irq_canonicalize (int irq)
 {
diff --git a/include/asm-parisc/irq.h b/include/asm-parisc/irq.h
index 75654ba93353..f876bdf22056 100644
--- a/include/asm-parisc/irq.h
+++ b/include/asm-parisc/irq.h
@@ -26,6 +26,11 @@
 
 #define NR_IRQS		(CPU_IRQ_MAX + 1)
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 static __inline__ int irq_canonicalize(int irq)
 {
 	return (irq == 2) ? 9 : irq;
diff --git a/include/asm-ppc/irq.h b/include/asm-ppc/irq.h
index a244d93ca953..b4b270457edd 100644
--- a/include/asm-ppc/irq.h
+++ b/include/asm-ppc/irq.h
@@ -19,6 +19,11 @@
 #define IRQ_POLARITY_POSITIVE	0x2	/* high level or low->high edge */
 #define IRQ_POLARITY_NEGATIVE	0x0	/* low level or high->low edge */
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 #if defined(CONFIG_40x)
 #include <asm/ibm4xx.h>
 
diff --git a/include/asm-ppc64/irq.h b/include/asm-ppc64/irq.h
index 570678b1da95..99782afb4cde 100644
--- a/include/asm-ppc64/irq.h
+++ b/include/asm-ppc64/irq.h
@@ -33,6 +33,11 @@
 #define IRQ_POLARITY_POSITIVE	0x2	/* high level or low->high edge */
 #define IRQ_POLARITY_NEGATIVE	0x0	/* low level or high->low edge */
 
+/*
+ * IRQ line status macro IRQ_PER_CPU is used
+ */
+#define ARCH_HAS_IRQ_PER_CPU
+
 #define get_irq_desc(irq) (&irq_desc[(irq)])
 
 /* Define a way to iterate across irqs. */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4a362b9ec966..69681c3b1f05 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -32,7 +32,12 @@
 #define IRQ_WAITING	32	/* IRQ not yet seen - for autodetection */
 #define IRQ_LEVEL	64	/* IRQ level triggered */
 #define IRQ_MASKED	128	/* IRQ masked - shouldn't be seen again */
-#define IRQ_PER_CPU	256	/* IRQ is per CPU */
+#if defined(ARCH_HAS_IRQ_PER_CPU)
+# define IRQ_PER_CPU	256	/* IRQ is per CPU */
+# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
+#else
+# define CHECK_IRQ_PER_CPU(var) 0
+#endif
 
 /*
  * Interrupt controller descriptor. This is all we need
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 	unsigned int status;
 
 	kstat_this_cpu.irqs[irq]++;
-	if (desc->status & IRQ_PER_CPU) {
+	if (CHECK_IRQ_PER_CPU(desc->status)) {
 		irqreturn_t action_ret;
 
 		/*
-- 
cgit v1.2.3


From f23ef184b486ac021b6a471b4e94cfa04860d3b0 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Tue, 6 Sep 2005 15:17:29 -0700
Subject: [PATCH] Delete unused do_nanosleep declaration

There is no do_nanosleep function so kill it's declaration in <linux/time.h>.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 5634497ff5df..c10d4c21c183 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -97,7 +97,6 @@ extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
 extern void clock_was_set(void); // call when ever the clock is set
 extern int do_posix_clock_monotonic_gettime(struct timespec *tp);
-extern long do_nanosleep(struct timespec *t);
 extern long do_utimes(char __user * filename, struct timeval * times);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue);
-- 
cgit v1.2.3


From 2832e9366a1fcd6f76957a42157be041240f994e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 6 Sep 2005 15:17:38 -0700
Subject: [PATCH] remove file.f_maxcount

struct file cleanup: f_maxcount has an unique value (INT_MAX).  Just use
the hard-wired value.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/file_table.c    | 1 -
 fs/read_write.c    | 2 +-
 include/linux/fs.h | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 1d3de78e6bc9..43e9e1737de2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -89,7 +89,6 @@ struct file *get_empty_filp(void)
 	rwlock_init(&f->f_owner.lock);
 	/* f->f_version: 0 */
 	INIT_LIST_HEAD(&f->f_list);
-	f->f_maxcount = INT_MAX;
 	return f;
 
 over:
diff --git a/fs/read_write.c b/fs/read_write.c
index 563abd09b5c8..b60324aaa2b6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -188,7 +188,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 	struct inode *inode;
 	loff_t pos;
 
-	if (unlikely(count > file->f_maxcount))
+	if (unlikely(count > INT_MAX))
 		goto Einval;
 	pos = *ppos;
 	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 67e6732d4fdc..2036747c7d1f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -594,7 +594,6 @@ struct file {
 	unsigned int		f_uid, f_gid;
 	struct file_ra_state	f_ra;
 
-	size_t			f_maxcount;
 	unsigned long		f_version;
 	void			*f_security;
 
-- 
cgit v1.2.3


From b149ee2233edf08fb59b11e879a2c5941929bcb8 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Tue, 6 Sep 2005 15:17:46 -0700
Subject: [PATCH] NTP: ntp-helper functions

This patch cleans up a commonly repeated set of changes to the NTP state
variables by adding two helper inline functions:

ntp_clear(): Clears the ntp state variables

ntp_synced(): Returns 1 if the system is synced with a time server.

This was compile tested for alpha, arm, i386, x86-64, ppc64, s390, sparc,
sparc64.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/time.c         |  7 ++-----
 arch/arm/kernel/time.c           |  7 ++-----
 arch/arm26/kernel/time.c         |  7 ++-----
 arch/cris/arch-v10/kernel/time.c |  2 +-
 arch/cris/kernel/time.c          |  5 +----
 arch/frv/kernel/time.c           |  7 ++-----
 arch/h8300/kernel/time.c         |  5 +----
 arch/i386/kernel/time.c          |  7 ++-----
 arch/m32r/kernel/time.c          |  7 ++-----
 arch/m68k/kernel/time.c          |  5 +----
 arch/m68knommu/kernel/time.c     |  7 ++-----
 arch/mips/kernel/sysirix.c       |  5 +----
 arch/mips/kernel/time.c          |  7 ++-----
 arch/mips/sgi-ip27/ip27-timer.c  |  2 +-
 arch/parisc/kernel/time.c        |  5 +----
 arch/ppc/kernel/time.c           |  7 ++-----
 arch/ppc64/kernel/time.c         |  7 ++-----
 arch/s390/kernel/time.c          |  5 +----
 arch/sh/kernel/time.c            |  7 ++-----
 arch/sh64/kernel/time.c          |  7 ++-----
 arch/sparc/kernel/pcic.c         |  5 +----
 arch/sparc/kernel/time.c         |  7 ++-----
 arch/sparc64/kernel/time.c       |  2 +-
 arch/v850/kernel/time.c          |  7 ++-----
 arch/x86_64/kernel/time.c        |  7 ++-----
 arch/xtensa/kernel/time.c        |  7 ++-----
 include/linux/timex.h            | 23 +++++++++++++++++++++++
 27 files changed, 65 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 8226c5cd788c..67be50b7d80a 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -149,7 +149,7 @@ irqreturn_t timer_interrupt(int irq, void *dev, struct pt_regs * regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0
+	if (ntp_synced()
 	    && xtime.tv_sec > state.last_rtc_update + 660
 	    && xtime.tv_nsec >= 500000 - ((unsigned) TICK_SIZE) / 2
 	    && xtime.tv_nsec <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -502,10 +502,7 @@ do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 8880482dcbff..69449a818dcc 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -102,7 +102,7 @@ static unsigned long next_rtc_update;
  */
 static inline void do_set_rtc(void)
 {
-	if (time_status & STA_UNSYNC || set_rtc == NULL)
+	if (!ntp_synced() || set_rtc == NULL)
 		return;
 
 	if (next_rtc_update &&
@@ -292,10 +292,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/arm26/kernel/time.c b/arch/arm26/kernel/time.c
index 549a6b2e177e..e66aedd02fad 100644
--- a/arch/arm26/kernel/time.c
+++ b/arch/arm26/kernel/time.c
@@ -114,7 +114,7 @@ static unsigned long next_rtc_update;
  */
 static inline void do_set_rtc(void)
 {
-	if (time_status & STA_UNSYNC || set_rtc == NULL)
+	if (!ntp_synced() || set_rtc == NULL)
 		return;
 
 //FIXME - timespec.tv_sec is a time_t not unsigned long
@@ -189,10 +189,7 @@ int do_settimeofday(struct timespec *tv)
 
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c
index 6b7b4e0802e3..dc3dfe9b4a1a 100644
--- a/arch/cris/arch-v10/kernel/time.c
+++ b/arch/cris/arch-v10/kernel/time.c
@@ -240,7 +240,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * The division here is not time critical since it will run once in 
 	 * 11 minutes
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - (tick_nsec / 1000) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + (tick_nsec / 1000) / 2) {
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index fa2d4323da25..a2d99b4aedcd 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -114,10 +114,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 075db6644694..8d6558b00e44 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -85,7 +85,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2
@@ -216,10 +216,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c
index 8a600218334d..af8c5d2057dd 100644
--- a/arch/h8300/kernel/time.c
+++ b/arch/h8300/kernel/time.c
@@ -116,10 +116,7 @@ int do_settimeofday(struct timespec *tv)
 
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 9b94d84a6c3b..eefea7c55008 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -194,10 +194,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
@@ -347,7 +344,7 @@ static void sync_cmos_clock(unsigned long dummy)
 	 * This code is run on a timer.  If the clock is set, that timer
 	 * may not expire at the correct time.  Thus, we adjust...
 	 */
-	if ((time_status & STA_UNSYNC) != 0)
+	if (!ntp_synced())
 		/*
 		 * Not synced, exit, do not restart a timer (if one is
 		 * running, let it run out).
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 8a2b77bc5749..539c562cd54d 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -171,10 +171,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 
@@ -221,7 +218,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
 	write_seqlock(&xtime_lock);
-	if ((time_status & STA_UNSYNC) == 0
+	if (ntp_synced()
 		&& xtime.tv_sec > last_rtc_update + 660
 		&& (xtime.tv_nsec / 1000) >= 500000 - ((unsigned)TICK_SIZE) / 2
 		&& (xtime.tv_nsec / 1000) <= 500000 + ((unsigned)TICK_SIZE) / 2)
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index e47e19588525..4ec95e3cb874 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -166,10 +166,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index 5c3ca671627c..b17c1ecba966 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -68,7 +68,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec  / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -178,10 +178,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index f3bf0e43b8bb..b46595462717 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -632,10 +632,7 @@ asmlinkage int irix_stime(int value)
 	write_seqlock_irq(&xtime_lock);
 	xtime.tv_sec = value;
 	xtime.tv_nsec = 0;
-	time_adjust = 0;			/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 
 	return 0;
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index 648c82292ed6..0dd0df7a3b04 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -223,10 +223,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;			/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
@@ -442,7 +439,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
 	write_seqlock(&xtime_lock);
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index 8c1b96fffa76..cddf1cedf007 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -118,7 +118,7 @@ again:
 	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to when a second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 6cf7407344ba..7ff67f8e9f8c 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -188,10 +188,7 @@ do_settimeofday (struct timespec *tv)
 		set_normalized_timespec(&xtime, sec, nsec);
 		set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-		time_adjust = 0;		/* stop active adjtime() */
-		time_status |= STA_UNSYNC;
-		time_maxerror = NTP_PHASE_LIMIT;
-		time_esterror = NTP_PHASE_LIMIT;
+		ntp_clear();
 	}
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c
index bf4ddca5e853..a3c5281a5d2d 100644
--- a/arch/ppc/kernel/time.c
+++ b/arch/ppc/kernel/time.c
@@ -169,7 +169,7 @@ void timer_interrupt(struct pt_regs * regs)
 		 * We should have an rtc call that only sets the minutes and
 		 * seconds like on Intel to avoid problems with non UTC clocks.
 		 */
-		if ( ppc_md.set_rtc_time && (time_status & STA_UNSYNC) == 0 &&
+		if ( ppc_md.set_rtc_time && ntp_synced() &&
 		     xtime.tv_sec - last_rtc_update >= 659 &&
 		     abs((xtime.tv_nsec / 1000) - (1000000-1000000/HZ)) < 500000/HZ &&
 		     jiffies - wall_jiffies == 1) {
@@ -271,10 +271,7 @@ int do_settimeofday(struct timespec *tv)
 	 */
 	last_rtc_update = new_sec - 658;
 
-	time_adjust = 0;                /* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	clock_was_set();
 	return 0;
diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
index 91ef95ccda4f..9939c206afa4 100644
--- a/arch/ppc64/kernel/time.c
+++ b/arch/ppc64/kernel/time.c
@@ -128,7 +128,7 @@ static __inline__ void timer_check_rtc(void)
          * We should have an rtc call that only sets the minutes and
          * seconds like on Intel to avoid problems with non UTC clocks.
          */
-        if ( (time_status & STA_UNSYNC) == 0 &&
+        if (ntp_synced() &&
              xtime.tv_sec - last_rtc_update >= 659 &&
              abs((xtime.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ &&
              jiffies - wall_jiffies == 1) {
@@ -435,10 +435,7 @@ int do_settimeofday(struct timespec *tv)
 	 */
 	last_rtc_update = new_sec - 658;
 
-	time_adjust = 0;                /* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	delta_xsec = mulhdu( (tb_last_stamp-do_gtod.varp->tb_orig_stamp),
 			     do_gtod.varp->tb_to_xs );
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 8ca485676780..2fd75da15495 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -139,10 +139,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index d5f5aedde0a3..02ca69918d7c 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -215,10 +215,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 
@@ -252,7 +249,7 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index 926c6fc0619c..f4a62a10053c 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -247,10 +247,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 
@@ -328,7 +325,7 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index 597d3ff6ad68..36a40697b8d6 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -840,10 +840,7 @@ static int pci_do_settimeofday(struct timespec *tv)
 
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	return 0;
 }
 
diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c
index 3b759aefc170..bc015e980341 100644
--- a/arch/sparc/kernel/time.c
+++ b/arch/sparc/kernel/time.c
@@ -139,7 +139,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 
 
 	/* Determine when to update the Mostek clock. */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -554,10 +554,7 @@ static int sbus_do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	return 0;
 }
 
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index 362b9c26871b..3f08a32f51a1 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -449,7 +449,7 @@ static inline void timer_check_rtc(void)
 	static long last_rtc_update;
 
 	/* Determine when to update the Mostek clock. */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/v850/kernel/time.c b/arch/v850/kernel/time.c
index f722a268238a..ea3fd8844ff0 100644
--- a/arch/v850/kernel/time.c
+++ b/arch/v850/kernel/time.c
@@ -66,7 +66,7 @@ static irqreturn_t timer_interrupt (int irq, void *dummy, struct pt_regs *regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -169,10 +169,7 @@ int do_settimeofday(struct timespec *tv)
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
 
-	time_adjust = 0;		/* stop active adjtime () */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq (&xtime_lock);
 	clock_was_set();
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 2b5d9da912a2..7b6abe058253 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -176,10 +176,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
@@ -471,7 +468,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  * off) isn't likely to go away much sooner anyway.
  */
 
-	if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
+	if (ntp_synced() && xtime.tv_sec > rtc_update &&
 		abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
 		set_rtc_mmss(xtime.tv_sec);
 		rtc_update = xtime.tv_sec + 660;
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index e07287db5a40..1ac7d5ce7456 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -122,10 +122,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;                /* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	return 0;
 }
@@ -184,7 +181,7 @@ again:
 		next += CCOUNT_PER_JIFFY;
 		do_timer (regs); /* Linux handler in kernel/timer.c */
 
-		if ((time_status & STA_UNSYNC) == 0 &&
+		if (ntp_synced() &&
 		    xtime.tv_sec - last_rtc_update >= 659 &&
 		    abs((xtime.tv_nsec/1000)-(1000000-1000000/HZ))<5000000/HZ &&
 		    jiffies - wall_jiffies == 1) {
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 74fdd07d3792..7e050a2cc35b 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -260,6 +260,29 @@ extern long pps_calcnt;		/* calibration intervals */
 extern long pps_errcnt;		/* calibration errors */
 extern long pps_stbcnt;		/* stability limit exceeded */
 
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void ntp_clear(void)
+{
+	time_adjust = 0;		/* stop active adjtime() */
+	time_status |= STA_UNSYNC;
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_esterror = NTP_PHASE_LIMIT;
+}
+
+/**
+ * ntp_synced - Returns 1 if the NTP status is not UNSYNC
+ *
+ */
+static inline int ntp_synced(void)
+{
+	return !(time_status & STA_UNSYNC);
+}
+
+
 #ifdef CONFIG_TIME_INTERPOLATION
 
 #define TIME_SOURCE_CPU 0
-- 
cgit v1.2.3


From 9c45817f41af987277353e463c78a1c6beb37da2 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@freescale.com>
Date: Tue, 6 Sep 2005 15:17:47 -0700
Subject: [PATCH] Remove non-arch consumers of asm/segment.h

asm/segment.h varies greatly on different architectures but is clearly
deprecated.  Removing all non-architecture consumers will make it easier
for us to get ride of asm/segment.h all together.

Signed-off-by: Kumar Gala <kumar.gala@freescale.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/isdn/hisax/hisax.h    | 1 -
 drivers/media/video/adv7170.c | 1 -
 drivers/media/video/adv7175.c | 1 -
 drivers/media/video/bt819.c   | 1 -
 drivers/media/video/bt856.c   | 1 -
 drivers/media/video/saa7111.c | 1 -
 drivers/media/video/saa7114.c | 1 -
 drivers/media/video/saa7185.c | 1 -
 drivers/serial/68328serial.c  | 1 -
 drivers/serial/crisv10.c      | 1 -
 drivers/serial/icom.c         | 1 -
 drivers/serial/mcfserial.c    | 1 -
 drivers/video/q40fb.c         | 1 -
 include/linux/isdn.h          | 1 -
 sound/oss/os.h                | 3 ---
 15 files changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index 17cf7663c582..6eb96cba4d29 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -10,7 +10,6 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/major.h>
-#include <asm/segment.h>
 #include <asm/io.h>
 #include <linux/delay.h>
 #include <linux/kernel.h>
diff --git a/drivers/media/video/adv7170.c b/drivers/media/video/adv7170.c
index 52e32f05d625..1ca2b67aedfb 100644
--- a/drivers/media/video/adv7170.c
+++ b/drivers/media/video/adv7170.c
@@ -43,7 +43,6 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <linux/sched.h>
-#include <asm/segment.h>
 #include <linux/types.h>
 
 #include <linux/videodev.h>
diff --git a/drivers/media/video/adv7175.c b/drivers/media/video/adv7175.c
index b5ed9544bdea..173bca1e0295 100644
--- a/drivers/media/video/adv7175.c
+++ b/drivers/media/video/adv7175.c
@@ -39,7 +39,6 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <linux/sched.h>
-#include <asm/segment.h>
 #include <linux/types.h>
 
 #include <linux/videodev.h>
diff --git a/drivers/media/video/bt819.c b/drivers/media/video/bt819.c
index c6cfa7c48b04..3ee0afca76a7 100644
--- a/drivers/media/video/bt819.c
+++ b/drivers/media/video/bt819.c
@@ -43,7 +43,6 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <linux/sched.h>
-#include <asm/segment.h>
 #include <linux/types.h>
 
 #include <linux/videodev.h>
diff --git a/drivers/media/video/bt856.c b/drivers/media/video/bt856.c
index c13d28658868..8eb871d0e85b 100644
--- a/drivers/media/video/bt856.c
+++ b/drivers/media/video/bt856.c
@@ -43,7 +43,6 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <linux/sched.h>
-#include <asm/segment.h>
 #include <linux/types.h>
 
 #include <linux/videodev.h>
diff --git a/drivers/media/video/saa7111.c b/drivers/media/video/saa7111.c
index f18df53d98ff..fe8a5e453969 100644
--- a/drivers/media/video/saa7111.c
+++ b/drivers/media/video/saa7111.c
@@ -42,7 +42,6 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <linux/sched.h>
-#include <asm/segment.h>
 #include <linux/types.h>
 
 #include <linux/videodev.h>
diff --git a/drivers/media/video/saa7114.c b/drivers/media/video/saa7114.c
index e0c70f54f073..d9f50e2f7b92 100644
--- a/drivers/media/video/saa7114.c
+++ b/drivers/media/video/saa7114.c
@@ -45,7 +45,6 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <linux/sched.h>
-#include <asm/segment.h>
 #include <linux/types.h>
 
 #include <linux/videodev.h>
diff --git a/drivers/media/video/saa7185.c b/drivers/media/video/saa7185.c
index e93412f4407c..132aa7943c16 100644
--- a/drivers/media/video/saa7185.c
+++ b/drivers/media/video/saa7185.c
@@ -39,7 +39,6 @@
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <linux/sched.h>
-#include <asm/segment.h>
 #include <linux/types.h>
 
 #include <linux/videodev.h>
diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c
index 9097f2f7b12a..2efb317153ce 100644
--- a/drivers/serial/68328serial.c
+++ b/drivers/serial/68328serial.c
@@ -40,7 +40,6 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/system.h>
-#include <asm/segment.h>
 #include <asm/delay.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c
index 5690594b257b..40d3e7139cfe 100644
--- a/drivers/serial/crisv10.c
+++ b/drivers/serial/crisv10.c
@@ -446,7 +446,6 @@ static char *serial_version = "$Revision: 1.25 $";
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/system.h>
-#include <asm/segment.h>
 #include <asm/bitops.h>
 #include <linux/delay.h>
 
diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index 79f8df4d66b7..eb31125c6a30 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -56,7 +56,6 @@
 #include <linux/bitops.h>
 
 #include <asm/system.h>
-#include <asm/segment.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/uaccess.h>
diff --git a/drivers/serial/mcfserial.c b/drivers/serial/mcfserial.c
index 8c40167778de..43b03c55f453 100644
--- a/drivers/serial/mcfserial.c
+++ b/drivers/serial/mcfserial.c
@@ -40,7 +40,6 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/system.h>
-#include <asm/segment.h>
 #include <asm/semaphore.h>
 #include <asm/delay.h>
 #include <asm/coldfire.h>
diff --git a/drivers/video/q40fb.c b/drivers/video/q40fb.c
index 71b69da0c40d..162012bb9264 100644
--- a/drivers/video/q40fb.c
+++ b/drivers/video/q40fb.c
@@ -21,7 +21,6 @@
 
 #include <asm/uaccess.h>
 #include <asm/setup.h>
-#include <asm/segment.h>
 #include <asm/system.h>
 #include <asm/q40_master.h>
 #include <linux/fb.h>
diff --git a/include/linux/isdn.h b/include/linux/isdn.h
index 862083eb58ab..53eaee96065b 100644
--- a/include/linux/isdn.h
+++ b/include/linux/isdn.h
@@ -150,7 +150,6 @@ typedef struct {
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/major.h>
-#include <asm/segment.h>
 #include <asm/io.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
diff --git a/sound/oss/os.h b/sound/oss/os.h
index d6b96297835c..80dce329cc3a 100644
--- a/sound/oss/os.h
+++ b/sound/oss/os.h
@@ -19,9 +19,6 @@
 #include <linux/ioport.h>
 #include <asm/page.h>
 #include <asm/system.h>
-#ifdef __alpha__
-#include <asm/segment.h>
-#endif
 #include <linux/vmalloc.h>
 #include <asm/uaccess.h>
 #include <linux/poll.h>
-- 
cgit v1.2.3


From 7ea6040b0eff07d3a9a4e2d248ac137c6ad02d42 Mon Sep 17 00:00:00 2001
From: John McCutchan <ttb@tentacle.dhs.org>
Date: Tue, 6 Sep 2005 15:18:02 -0700
Subject: [PATCH] inotify: fix event loss on hardlinked files

People have run into a problem when they do this:

watch (file1, all_events);
watch (file2, some_events);

if file2 is a hard link to file1, some events will be missed because by
default we replace the mask.  The patch below adds a flag IN_MASK_ADD which
will cause inotify to add to the existing mask if present.

Signed-off-by: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Robert Love <rml@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inotify.c            | 9 ++++++++-
 include/linux/inotify.h | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/inotify.c b/fs/inotify.c
index 2fd97ef547ff..a37e9fb1da58 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -931,6 +931,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	struct nameidata nd;
 	struct file *filp;
 	int ret, fput_needed;
+	int mask_add = 0;
 
 	filp = fget_light(fd, &fput_needed);
 	if (unlikely(!filp))
@@ -953,6 +954,9 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	down(&inode->inotify_sem);
 	down(&dev->sem);
 
+	if (mask & IN_MASK_ADD)
+		mask_add = 1;
+
 	/* don't let user-space set invalid bits: we don't want flags set */
 	mask &= IN_ALL_EVENTS;
 	if (unlikely(!mask)) {
@@ -966,7 +970,10 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	 */
 	old = inode_find_dev(inode, dev);
 	if (unlikely(old)) {
-		old->mask = mask;
+		if (mask_add)
+			old->mask |= mask;
+		else
+			old->mask = mask;
 		ret = old->wd;
 		goto out;
 	}
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 93bb3afe646b..ee5b239092ed 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -47,6 +47,7 @@ struct inotify_event {
 #define IN_MOVE			(IN_MOVED_FROM | IN_MOVED_TO) /* moves */
 
 /* special flags */
+#define IN_MASK_ADD		0x20000000	/* add to the mask of an already existing watch */
 #define IN_ISDIR		0x40000000	/* event occurred against dir */
 #define IN_ONESHOT		0x80000000	/* only send event once */
 
-- 
cgit v1.2.3


From f90b1d2f1aaaa40c6519a32e69615edc25bb97d5 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Tue, 6 Sep 2005 15:18:10 -0700
Subject: [PATCH] cpusets: new __GFP_HARDWALL flag

Add another GFP flag: __GFP_HARDWALL.

A subsequent "cpuset_zone_allowed" patch will use this flag to mark GFP_USER
allocations, and distinguish them from GFP_KERNEL allocations.

Allocations (such as GFP_USER) marked GFP_HARDWALL are constrainted to the
current tasks cpuset.  Other allocations (such as GFP_KERNEL) can steal from
the possibly larger nearest mem_exclusive cpuset ancestor, if memory is tight
on every node in the current cpuset.

This patch collides with Mel Gorman's patch to reduce fragmentation in the
standard buddy allocator, which adds two GFP flags.  This was discussed on
linux-mm in July.  Most likely, one of his flags for user reclaimable memory
can be the same as my __GFP_HARDWALL flag, under some generic name meaning its
user address space memory.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7c7400137e97..4dc990f3b5cc 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -40,6 +40,7 @@ struct vm_area_struct;
 #define __GFP_ZERO	0x8000u	/* Return zeroed page on success */
 #define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */
 #define __GFP_NORECLAIM  0x20000u /* No realy zone reclaim during allocation */
+#define __GFP_HARDWALL   0x40000u /* Enforce hardwall cpuset memory allocs */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
@@ -48,14 +49,15 @@ struct vm_area_struct;
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
-			__GFP_NOMEMALLOC|__GFP_NORECLAIM)
+			__GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL)
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
 #define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
+			 __GFP_HIGHMEM)
 
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
-- 
cgit v1.2.3


From 9bf2229f8817677127a60c177aefce1badd22d7b Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Tue, 6 Sep 2005 15:18:12 -0700
Subject: [PATCH] cpusets: formalize intermediate GFP_KERNEL containment

This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution.  With this patch, there are now the following four layers of
memory placement available:

 1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
 2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
 3) The current tasks cpuset (GFP_USER allocations constrained to here), and
 4) Specific node placement, using mbind and set_mempolicy.

These nest - each layer is a subset (same or within) of the previous.

Layer (2) above is new, with this patch.  The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed.  The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.

GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.

The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such.  Swapper and oom_kill activity is also constrained to Layer (2).  A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset.  Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.

This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.

Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 12 +++++++
 include/linux/cpuset.h    |  5 +--
 kernel/cpuset.c           | 80 ++++++++++++++++++++++++++++++++++++++++++-----
 mm/page_alloc.c           | 16 ++++++----
 mm/vmscan.c               |  8 ++---
 5 files changed, 101 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ad944c060312..47f4114fbf54 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -60,6 +60,18 @@ all of the cpus in the system. This removes any overhead due to
 load balancing code trying to pull tasks outside of the cpu exclusive
 cpuset only to be prevented by the tasks' cpus_allowed mask.
 
+A cpuset that is mem_exclusive restricts kernel allocations for
+page, buffer and other data commonly shared by the kernel across
+multiple users.  All cpusets, whether mem_exclusive or not, restrict
+allocations of memory for user space.  This enables configuring a
+system so that several independent jobs can share common kernel
+data, such as file system pages, while isolating each jobs user
+allocation in its own cpuset.  To do this, construct a large
+mem_exclusive cpuset to hold all the jobs, and construct child,
+non-mem_exclusive cpusets for each individual job.  Only a small
+amount of typical kernel memory, such as requests from interrupt
+handlers, is allowed to be taken outside even a mem_exclusive cpuset.
+
 User level code may create and destroy cpusets by name in the cpuset
 virtual file system, manage the attributes and permissions of these
 cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3438233305a3..1fe1c3ebad30 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,7 +23,7 @@ void cpuset_init_current_mems_allowed(void);
 void cpuset_update_current_mems_allowed(void);
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-int cpuset_zone_allowed(struct zone *z);
+extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -48,7 +48,8 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 1;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z)
+static inline int cpuset_zone_allowed(struct zone *z,
+					unsigned int __nocast gfp_mask)
 {
 	return 1;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..214806deca99 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1611,17 +1611,81 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 0;
 }
 
+/*
+ * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * ancestor to the specified cpuset.  Call while holding cpuset_sem.
+ * If no ancestor is mem_exclusive (an unusual configuration), then
+ * returns the root cpuset.
+ */
+static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+{
+	while (!is_mem_exclusive(cs) && cs->parent)
+		cs = cs->parent;
+	return cs;
+}
+
 /**
- * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
- * @z: zone in question
+ * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
  *
- * Is zone z allowed in current->mems_allowed, or is
- * the CPU in interrupt context? (zone is always allowed in this case)
- */
-int cpuset_zone_allowed(struct zone *z)
+ * If we're in interrupt, yes, we can always allocate.  If zone
+ * z's node is in our tasks mems_allowed, yes.  If it's not a
+ * __GFP_HARDWALL request and this zone's nodes is in the nearest
+ * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current tasks cpuset.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest mem_exclusive ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires cpuset_sem.  The __alloc_pages()
+ * routine only calls here with __GFP_HARDWALL bit _not_ set if
+ * it's a GFP_KERNEL allocation, and all nodes in the current tasks
+ * mems_allowed came up empty on the first pass over the zonelist.
+ * So only GFP_KERNEL allocations, if all nodes in the cpuset are
+ * short of memory, might require taking the cpuset_sem semaphore.
+ *
+ * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
+ * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
+ * hardwall cpusets - no allocation on a node outside the cpuset is
+ * allowed (unless in interrupt, of course).
+ *
+ * The second loop doesn't even call here for GFP_ATOMIC requests
+ * (if the __alloc_pages() local variable 'wait' is set).  That check
+ * and the checks below have the combined affect in the second loop of
+ * the __alloc_pages() routine that:
+ *	in_interrupt - any node ok (current task context irrelevant)
+ *	GFP_ATOMIC   - any node ok
+ *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ *	GFP_USER     - only nodes in current tasks mems allowed ok.
+ **/
+
+int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
 {
-	return in_interrupt() ||
-		node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+	int node;			/* node that zone z is on */
+	const struct cpuset *cs;	/* current cpuset ancestors */
+	int allowed = 1;		/* is allocation in zone z allowed? */
+
+	if (in_interrupt())
+		return 1;
+	node = z->zone_pgdat->node_id;
+	if (node_isset(node, current->mems_allowed))
+		return 1;
+	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
+		return 0;
+
+	/* Not hardwall and node outside mems_allowed: scan up cpusets */
+	down(&cpuset_sem);
+	cs = current->cpuset;
+	if (!cs)
+		goto done;		/* current task exiting */
+	cs = nearest_exclusive_ancestor(cs);
+	allowed = node_isset(node, cs->mems_allowed);
+done:
+	up(&cpuset_sem);
+	return allowed;
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d7032c1d12..3974fd81d27c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
-	/* Go through the zonelist once, looking for a zone with enough free */
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		/*
@@ -845,6 +848,7 @@ zone_reclaim_retry:
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
@@ -852,7 +856,7 @@ zone_reclaim_retry:
 				       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (wait && !cpuset_zone_allowed(z))
+		if (wait && !cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -867,7 +871,7 @@ zone_reclaim_retry:
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
 			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z))
+				if (!cpuset_zone_allowed(z, gfp_mask))
 					continue;
 				page = buffered_rmqueue(z, order, gfp_mask);
 				if (page)
@@ -903,7 +907,7 @@ rebalance:
 					       gfp_mask & __GFP_HIGH))
 				continue;
 
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, gfp_mask))
 				continue;
 
 			page = buffered_rmqueue(z, order, gfp_mask);
@@ -922,7 +926,7 @@ rebalance:
 					       classzone_idx, 0, 0))
 				continue;
 
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 				continue;
 
 			page = buffered_rmqueue(z, order, gfp_mask);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0095533cdde9..a740778f688d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (zone->present_pages == 0)
 			continue;
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone))
+	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 		return;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;
-- 
cgit v1.2.3


From ef08e3b4981aebf2ba9bd7025ef7210e8eec07ce Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Tue, 6 Sep 2005 15:18:13 -0700
Subject: [PATCH] cpusets: confine oom_killer to mem_exclusive cpuset

Now the real motivation for this cpuset mem_exclusive patch series seems
trivial.

This patch keeps a task in or under one mem_exclusive cpuset from provoking an
oom kill of a task under a non-overlapping mem_exclusive cpuset.  Since only
interrupt and GFP_ATOMIC allocations are allowed to escape mem_exclusive
containment, there is little to gain from oom killing a task under a
non-overlapping mem_exclusive cpuset, as almost all kernel and user memory
allocation must come from disjoint memory nodes.

This patch enables configuring a system so that a runaway job under one
mem_exclusive cpuset cannot cause the killing of a job in another such cpuset
that might be using very high compute and memory resources for a prolonged
time.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpuset.h |  6 ++++++
 kernel/cpuset.c        | 33 +++++++++++++++++++++++++++++++++
 mm/oom_kill.c          |  5 +++++
 3 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 1fe1c3ebad30..24062a1dbf61 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -24,6 +24,7 @@ void cpuset_update_current_mems_allowed(void);
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
+extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -54,6 +55,11 @@ static inline int cpuset_zone_allowed(struct zone *z,
 	return 1;
 }
 
+static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
+{
+	return 1;
+}
+
 static inline char *cpuset_task_status_allowed(struct task_struct *task,
 							char *buffer)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 214806deca99..40c6d801dd66 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1688,6 +1688,39 @@ done:
 	return allowed;
 }
 
+/**
+ * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
+ * @p: pointer to task_struct of some other task.
+ *
+ * Description: Return true if the nearest mem_exclusive ancestor
+ * cpusets of tasks @p and current overlap.  Used by oom killer to
+ * determine if task @p's memory usage might impact the memory
+ * available to the current task.
+ *
+ * Acquires cpuset_sem - not suitable for calling from a fast path.
+ **/
+
+int cpuset_excl_nodes_overlap(const struct task_struct *p)
+{
+	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
+	int overlap = 0;		/* do cpusets overlap? */
+
+	down(&cpuset_sem);
+	cs1 = current->cpuset;
+	if (!cs1)
+		goto done;		/* current task exiting */
+	cs2 = p->cpuset;
+	if (!cs2)
+		goto done;		/* task p is exiting */
+	cs1 = nearest_exclusive_ancestor(cs1);
+	cs2 = nearest_exclusive_ancestor(cs2);
+	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+done:
+	up(&cpuset_sem);
+
+	return overlap;
+}
+
 /*
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a1d46502938..5ec8da12cfd9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -20,6 +20,7 @@
 #include <linux/swap.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
+#include <linux/cpuset.h>
 
 /* #define DEBUG */
 
@@ -152,6 +153,10 @@ static struct task_struct * select_bad_process(void)
 			continue;
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
+		/* If p's nodes don't overlap ours, it won't help to kill p. */
+		if (!cpuset_excl_nodes_overlap(p))
+			continue;
+
 		/*
 		 * This is in the process of releasing memory so for wait it
 		 * to finish before killing some other task by mistake.
-- 
cgit v1.2.3


From 9c1cfda20a508b181bdda8c0045f7c0c333880a5 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Tue, 6 Sep 2005 15:18:14 -0700
Subject: [PATCH] cpusets: Move the ia64 domain setup code to the generic code

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/Makefile    |   2 +-
 arch/ia64/kernel/domain.c    | 444 -------------------------------------------
 include/asm-ia64/processor.h |   3 -
 include/asm-ia64/topology.h  |  23 ---
 include/linux/sched.h        |   7 -
 include/linux/topology.h     |  23 +++
 kernel/sched.c               | 290 ++++++++++++++++++++++------
 7 files changed, 260 insertions(+), 532 deletions(-)
 delete mode 100644 arch/ia64/kernel/domain.c

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index b242594be55b..307514f7a282 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
 obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
 obj-$(CONFIG_IOSAPIC)		+= iosapic.o
 obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_SMP)		+= smp.o smpboot.o domain.o
+obj-$(CONFIG_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
deleted file mode 100644
index e907109983f1..000000000000
--- a/arch/ia64/kernel/domain.c
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * arch/ia64/kernel/domain.c
- * Architecture specific sched-domains builder.
- *
- * Copyright (C) 2004 Jesse Barnes
- * Copyright (C) 2004 Silicon Graphics, Inc.
- */
-
-#include <linux/sched.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-#define SD_NODES_PER_DOMAIN 16
-
-#ifdef CONFIG_NUMA
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain.  Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, unsigned long *used_nodes)
-{
-	int i, n, val, min_val, best_node = 0;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (test_bit(n, used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	set_bit(best_node, used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static cpumask_t sched_domain_node_span(int node)
-{
-	int i;
-	cpumask_t span, nodemask;
-	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
-	cpus_clear(span);
-	bitmap_zero(used_nodes, MAX_NUMNODES);
-
-	nodemask = node_to_cpumask(node);
-	cpus_or(span, span, nodemask);
-	set_bit(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, used_nodes);
-		nodemask = node_to_cpumask(next_node);
-		cpus_or(span, span, nodemask);
-	}
-
-	return span;
-}
-#endif
-
-/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-static int cpu_to_cpu_group(int cpu)
-{
-	return cpu;
-}
-#endif
-
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-	return first_cpu(cpu_sibling_map[cpu]);
-#else
-	return cpu;
-#endif
-}
-
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
-
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
-
-static int cpu_to_allnodes_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-void build_sched_domains(const cpumask_t *cpu_map)
-{
-	int i;
-#ifdef CONFIG_NUMA
-	struct sched_group **sched_group_nodes = NULL;
-	struct sched_group *sched_group_allnodes = NULL;
-
-	/*
-	 * Allocate the per-node list of sched groups
-	 */
-	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-					   GFP_ATOMIC);
-	if (!sched_group_nodes) {
-		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return;
-	}
-	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
-#endif
-
-	/*
-	 * Set up domains for cpus specified by the cpu_map.
-	 */
-	for_each_cpu_mask(i, *cpu_map) {
-		int group;
-		struct sched_domain *sd = NULL, *p;
-		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-
-#ifdef CONFIG_NUMA
-		if (cpus_weight(*cpu_map)
-				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
-			if (!sched_group_allnodes) {
-				sched_group_allnodes
-					= kmalloc(sizeof(struct sched_group)
-							* MAX_NUMNODES,
-						  GFP_KERNEL);
-				if (!sched_group_allnodes) {
-					printk(KERN_WARNING
-					"Can not alloc allnodes sched group\n");
-					break;
-				}
-				sched_group_allnodes_bycpu[i]
-						= sched_group_allnodes;
-			}
-			sd = &per_cpu(allnodes_domains, i);
-			*sd = SD_ALLNODES_INIT;
-			sd->span = *cpu_map;
-			group = cpu_to_allnodes_group(i);
-			sd->groups = &sched_group_allnodes[group];
-			p = sd;
-		} else
-			p = NULL;
-
-		sd = &per_cpu(node_domains, i);
-		*sd = SD_NODE_INIT;
-		sd->span = sched_domain_node_span(cpu_to_node(i));
-		sd->parent = p;
-		cpus_and(sd->span, sd->span, *cpu_map);
-#endif
-
-		p = sd;
-		sd = &per_cpu(phys_domains, i);
-		group = cpu_to_phys_group(i);
-		*sd = SD_CPU_INIT;
-		sd->span = nodemask;
-		sd->parent = p;
-		sd->groups = &sched_group_phys[group];
-
-#ifdef CONFIG_SCHED_SMT
-		p = sd;
-		sd = &per_cpu(cpu_domains, i);
-		group = cpu_to_cpu_group(i);
-		*sd = SD_SIBLING_INIT;
-		sd->span = cpu_sibling_map[i];
-		cpus_and(sd->span, sd->span, *cpu_map);
-		sd->parent = p;
-		sd->groups = &sched_group_cpus[group];
-#endif
-	}
-
-#ifdef CONFIG_SCHED_SMT
-	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask(i, *cpu_map) {
-		cpumask_t this_sibling_map = cpu_sibling_map[i];
-		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
-		if (i != first_cpu(this_sibling_map))
-			continue;
-
-		init_sched_build_groups(sched_group_cpus, this_sibling_map,
-						&cpu_to_cpu_group);
-	}
-#endif
-
-	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t nodemask = node_to_cpumask(i);
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
-			continue;
-
-		init_sched_build_groups(sched_group_phys, nodemask,
-						&cpu_to_phys_group);
-	}
-
-#ifdef CONFIG_NUMA
-	if (sched_group_allnodes)
-		init_sched_build_groups(sched_group_allnodes, *cpu_map,
-					&cpu_to_allnodes_group);
-
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		/* Set up node groups */
-		struct sched_group *sg, *prev;
-		cpumask_t nodemask = node_to_cpumask(i);
-		cpumask_t domainspan;
-		cpumask_t covered = CPU_MASK_NONE;
-		int j;
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask)) {
-			sched_group_nodes[i] = NULL;
-			continue;
-		}
-
-		domainspan = sched_domain_node_span(i);
-		cpus_and(domainspan, domainspan, *cpu_map);
-
-		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-		sched_group_nodes[i] = sg;
-		for_each_cpu_mask(j, nodemask) {
-			struct sched_domain *sd;
-			sd = &per_cpu(node_domains, j);
-			sd->groups = sg;
-			if (sd->groups == NULL) {
-				/* Turn off balancing if we have no groups */
-				sd->flags = 0;
-			}
-		}
-		if (!sg) {
-			printk(KERN_WARNING
-			"Can not alloc domain group for node %d\n", i);
-			continue;
-		}
-		sg->cpu_power = 0;
-		sg->cpumask = nodemask;
-		cpus_or(covered, covered, nodemask);
-		prev = sg;
-
-		for (j = 0; j < MAX_NUMNODES; j++) {
-			cpumask_t tmp, notcovered;
-			int n = (i + j) % MAX_NUMNODES;
-
-			cpus_complement(notcovered, covered);
-			cpus_and(tmp, notcovered, *cpu_map);
-			cpus_and(tmp, tmp, domainspan);
-			if (cpus_empty(tmp))
-				break;
-
-			nodemask = node_to_cpumask(n);
-			cpus_and(tmp, tmp, nodemask);
-			if (cpus_empty(tmp))
-				continue;
-
-			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-			if (!sg) {
-				printk(KERN_WARNING
-				"Can not alloc domain group for node %d\n", j);
-				break;
-			}
-			sg->cpu_power = 0;
-			sg->cpumask = tmp;
-			cpus_or(covered, covered, tmp);
-			prev->next = sg;
-			prev = sg;
-		}
-		prev->next = sched_group_nodes[i];
-	}
-#endif
-
-	/* Calculate CPU power for physical packages and nodes */
-	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
-
-		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-		sd->groups->cpu_power = power;
-
-#ifdef CONFIG_NUMA
-		sd = &per_cpu(allnodes_domains, i);
-		if (sd->groups) {
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-			sd->groups->cpu_power = power;
-		}
-#endif
-	}
-
-#ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		struct sched_group *sg = sched_group_nodes[i];
-		int j;
-
-		if (sg == NULL)
-			continue;
-next_sg:
-		for_each_cpu_mask(j, sg->cpumask) {
-			struct sched_domain *sd;
-			int power;
-
-			sd = &per_cpu(phys_domains, j);
-			if (j != first_cpu(sd->groups->cpumask)) {
-				/*
-				 * Only add "power" once for each
-				 * physical package.
-				 */
-				continue;
-			}
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-
-			sg->cpu_power += power;
-		}
-		sg = sg->next;
-		if (sg != sched_group_nodes[i])
-			goto next_sg;
-	}
-#endif
-
-	/* Attach the domains */
-	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
-#else
-		sd = &per_cpu(phys_domains, i);
-#endif
-		cpu_attach_domain(sd, i);
-	}
-}
-/*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
- */
-void arch_init_sched_domains(const cpumask_t *cpu_map)
-{
-	cpumask_t cpu_default_map;
-
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
-	build_sched_domains(&cpu_default_map);
-}
-
-void arch_destroy_sched_domains(const cpumask_t *cpu_map)
-{
-#ifdef CONFIG_NUMA
-	int i;
-	int cpu;
-
-	for_each_cpu_mask(cpu, *cpu_map) {
-		struct sched_group *sched_group_allnodes
-			= sched_group_allnodes_bycpu[cpu];
-		struct sched_group **sched_group_nodes
-			= sched_group_nodes_bycpu[cpu];
-
-		if (sched_group_allnodes) {
-			kfree(sched_group_allnodes);
-			sched_group_allnodes_bycpu[cpu] = NULL;
-		}
-
-		if (!sched_group_nodes)
-			continue;
-
-		for (i = 0; i < MAX_NUMNODES; i++) {
-			cpumask_t nodemask = node_to_cpumask(i);
-			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-			cpus_and(nodemask, nodemask, *cpu_map);
-			if (cpus_empty(nodemask))
-				continue;
-
-			if (sg == NULL)
-				continue;
-			sg = sg->next;
-next_sg:
-			oldsg = sg;
-			sg = sg->next;
-			kfree(oldsg);
-			if (oldsg != sched_group_nodes[i])
-				goto next_sg;
-		}
-		kfree(sched_group_nodes);
-		sched_group_nodes_bycpu[cpu] = NULL;
-	}
-#endif
-}
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 91bbd1f22461..94e07e727395 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -20,9 +20,6 @@
 #include <asm/ptrace.h>
 #include <asm/ustack.h>
 
-/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */
-#define ARCH_HAS_SCHED_DOMAIN
-
 #define IA64_NUM_DBG_REGS	8
 /*
  * Limits for PMC and PMD are set to less than maximum architected values
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 399bc29729fd..a9f738bf18a7 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -98,29 +98,6 @@ void build_cpu_to_node_map(void);
 	.nr_balance_failed	= 0,			\
 }
 
-/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) {	\
-	.span			= CPU_MASK_NONE,	\
-	.parent			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 64,			\
-	.max_interval		= 64*num_online_cpus(),	\
-	.busy_factor		= 128,			\
-	.imbalance_pct		= 133,			\
-	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 3,			\
-	.newidle_idx		= 0, /* unused */	\
-	.wake_idx		= 0, /* unused */	\
-	.forkexec_idx		= 0, /* unused */	\
-	.per_cpu_gain		= 100,			\
-	.flags			= SD_LOAD_BALANCE,	\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #endif /* CONFIG_NUMA */
 
 #include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5a22ea80045..ea1b5f32ec5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -564,13 +564,6 @@ struct sched_domain {
 
 extern void partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
-#ifdef ARCH_HAS_SCHED_DOMAIN
-/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
-extern cpumask_t cpu_isolated_map;
-extern void init_sched_build_groups(struct sched_group groups[],
-	                        cpumask_t span, int (*group_fn)(int cpu));
-extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-#endif /* ARCH_HAS_SCHED_DOMAIN */
 #endif /* CONFIG_SMP */
 
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0320225e96da..3df1d474e5c5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -135,6 +135,29 @@
 }
 #endif
 
+/* sched_domains SD_ALLNODES_INIT for NUMA machines */
+#define SD_ALLNODES_INIT (struct sched_domain) {	\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 64,			\
+	.max_interval		= 64*num_online_cpus(),	\
+	.busy_factor		= 128,			\
+	.imbalance_pct		= 133,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 3,			\
+	.newidle_idx		= 0, /* unused */	\
+	.wake_idx		= 0, /* unused */	\
+	.forkexec_idx		= 0, /* unused */	\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_LOAD_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 64,			\
+	.nr_balance_failed	= 0,			\
+}
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..50860ad5b624 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-			cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+				    int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[],
 	last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
  */
-static void check_sibling_maps(void)
-{
-	int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group *sched_group_nodes[MAX_NUMNODES];
 
-	for_each_online_cpu(i) {
-		for_each_cpu_mask(j, cpu_sibling_map[i]) {
-			if (cpu_to_node(i) != cpu_to_node(j)) {
-				printk(KERN_INFO "warning: CPU %d siblings map "
-					"to different node - isolating "
-					"them.\n", i);
-				cpu_sibling_map[i] = cpumask_of_cpu(i);
-				break;
-			}
-		}
-	}
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
 }
 #endif
 
@@ -4928,7 +4985,7 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
 
@@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+		if (num_online_cpus()
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
-		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = *cpu_map;
-		sd->groups = &sched_group_nodes[group];
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
 		p = sd;
@@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
 		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, *cpu_map,
-					&cpu_to_node_group);
+	init_sched_build_groups(sched_group_allnodes, *cpu_map,
+				&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-		if (i == first_cpu(sd->groups->cpumask)) {
-			/* Only add "power" once for each physical package. */
-			sd = &per_cpu(node_domains, i);
-			sd->groups->cpu_power += power;
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
 		}
 #endif
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg = sched_group_nodes[i];
+		int j;
+
+		if (sg == NULL)
+			continue;
+next_sg:
+		for_each_cpu_mask(j, sg->cpumask) {
+			struct sched_domain *sd;
+			int power;
+
+			sd = &per_cpu(phys_domains, j);
+			if (j != first_cpu(sd->groups->cpumask)) {
+				/*
+				 * Only add "power" once for each
+				 * physical package.
+				 */
+				continue;
+			}
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+
+			sg->cpu_power += power;
+		}
+		sg = sg->next;
+		if (sg != sched_group_nodes[i])
+			goto next_sg;
+	}
+#endif
+
 	/* Attach the domains */
 	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
@@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
 	 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	/* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+	int i;
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t nodemask = node_to_cpumask(i);
+		struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		if (sg == NULL)
+			continue;
+		sg = sg->next;
+next_sg:
+		oldsg = sg;
+		sg = sg->next;
+		kfree(oldsg);
+		if (oldsg != sched_group_nodes[i])
+			goto next_sg;
+		sched_group_nodes[i] = NULL;
+	}
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
-- 
cgit v1.2.3


From 3f4bb1f4199b7dc0c958447b1e4898980013b884 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 6 Sep 2005 15:18:16 -0700
Subject: [PATCH] struct dentry: place d_hash close to d_parent and d_name to
 speedup lookups

dentry cache uses sophisticated RCU technology (and prefetching if
available) but touches 2 cache lines per dentry during hlist lookup.

This patch moves d_hash in the same cache line than d_parent and d_name
fields so that :

1) One cache line is needed instead of two.

2) the hlist_for_each_rcu() prefetching has a chance to bring all the
   needed data in advance, not only the part that includes d_hash.next.

I also changed one old comment that was wrong for 64bits.

A further optimisation would be to separate dentry in two parts, one that
is mostly read, and one writen (d_count/d_lock) to avoid false sharing on
SMP/NUMA but this would need different field placement depending on 32bits
or 64bits platform.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/dcache.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 50be290d24d2..ab04b4f9b0db 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -88,8 +88,9 @@ struct dentry {
 					 * negative */
 	/*
 	 * The next three fields are touched by __d_lookup.  Place them here
-	 * so they all fit in a 16-byte range, with 16-byte alignment.
+	 * so they all fit in a cache line.
 	 */
+	struct hlist_node d_hash;	/* lookup hash list */
 	struct dentry *d_parent;	/* parent directory */
 	struct qstr d_name;
 
@@ -103,7 +104,6 @@ struct dentry {
 	void *d_fsdata;			/* fs-specific data */
  	struct rcu_head d_rcu;
 	struct dcookie_struct *d_cookie; /* cookie, if any */
-	struct hlist_node d_hash;	/* lookup hash list */	
 	int d_mounted;
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
 };
-- 
cgit v1.2.3


From e89bbd3a0b3c054d9a94feb0db7bbae1cdb99e54 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 6 Sep 2005 15:18:21 -0700
Subject: [PATCH] remove iattr.ia_attr_flags

Remove unused ia_attr_flags from struct iattr, and related defines.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hostfs/hostfs.h |  1 -
 include/linux/fs.h | 10 ----------
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 67bca0d4a33b..cca3fb693f99 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -49,7 +49,6 @@ struct hostfs_iattr {
 	struct timespec	ia_atime;
 	struct timespec	ia_mtime;
 	struct timespec	ia_ctime;
-	unsigned int	ia_attr_flags;
 };
 
 extern int stat_file(const char *path, unsigned long long *inode_out,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2036747c7d1f..57e294bf83f4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -281,18 +281,8 @@ struct iattr {
 	struct timespec	ia_atime;
 	struct timespec	ia_mtime;
 	struct timespec	ia_ctime;
-	unsigned int	ia_attr_flags;
 };
 
-/*
- * This is the inode attributes flag definitions
- */
-#define ATTR_FLAG_SYNCRONOUS	1 	/* Syncronous write */
-#define ATTR_FLAG_NOATIME	2 	/* Don't update atime */
-#define ATTR_FLAG_APPEND	4 	/* Append-only file */
-#define ATTR_FLAG_IMMUTABLE	8 	/* Immutable file */
-#define ATTR_FLAG_NODIRATIME	16 	/* Don't update atime for directory */
-
 /*
  * Includes for diskquotas.
  */
-- 
cgit v1.2.3


From ab8d11beb46f0bd0617e04205c01f5c1fe845b61 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 6 Sep 2005 15:18:24 -0700
Subject: [PATCH] remove duplicated code from proc and ptrace

Extract common code used by ptrace_attach() and may_ptrace_attach()
into a separate function.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c         | 35 ++++-------------------------------
 include/linux/ptrace.h |  1 +
 kernel/ptrace.c        | 41 ++++++++++++++++++++++++++++-------------
 3 files changed, 33 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 24eed139e54e..84751f3f52d5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -346,33 +346,6 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
 	 (task->state == TASK_STOPPED || task->state == TASK_TRACED) && \
 	 security_ptrace(current,task) == 0))
 
-static int may_ptrace_attach(struct task_struct *task)
-{
-	int retval = 0;
-
-	task_lock(task);
-
-	if (!task->mm)
-		goto out;
-	if (((current->uid != task->euid) ||
-	     (current->uid != task->suid) ||
-	     (current->uid != task->uid) ||
-	     (current->gid != task->egid) ||
-	     (current->gid != task->sgid) ||
-	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-		goto out;
-	rmb();
-	if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE))
-		goto out;
-	if (security_ptrace(current, task))
-		goto out;
-
-	retval = 1;
-out:
-	task_unlock(task);
-	return retval;
-}
-
 static int proc_pid_environ(struct task_struct *task, char * buffer)
 {
 	int res = 0;
@@ -382,7 +355,7 @@ static int proc_pid_environ(struct task_struct *task, char * buffer)
 		if (len > PAGE_SIZE)
 			len = PAGE_SIZE;
 		res = access_process_vm(task, mm->env_start, buffer, len, 0);
-		if (!may_ptrace_attach(task))
+		if (!ptrace_may_attach(task))
 			res = -ESRCH;
 		mmput(mm);
 	}
@@ -685,7 +658,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 	int ret = -ESRCH;
 	struct mm_struct *mm;
 
-	if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		goto out;
 
 	ret = -ENOMEM;
@@ -711,7 +684,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 
 		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 		retval = access_process_vm(task, src, page, this_len, 0);
-		if (!retval || !MAY_PTRACE(task) || !may_ptrace_attach(task)) {
+		if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) {
 			if (!ret)
 				ret = -EIO;
 			break;
@@ -749,7 +722,7 @@ static ssize_t mem_write(struct file * file, const char * buf,
 	struct task_struct *task = proc_task(file->f_dentry->d_inode);
 	unsigned long dst = *ppos;
 
-	if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		return -ESRCH;
 
 	page = (char *)__get_free_page(GFP_USER);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 2afdafb62123..dc6f3647bfbc 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -90,6 +90,7 @@ extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void ptrace_untrace(struct task_struct *child);
+extern int ptrace_may_attach(struct task_struct *task);
 
 static inline void ptrace_link(struct task_struct *child,
 			       struct task_struct *new_parent)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	return ret;
 }
 
+static int may_attach(struct task_struct *task)
+{
+	if (!task->mm)
+		return -EPERM;
+	if (((current->uid != task->euid) ||
+	     (current->uid != task->suid) ||
+	     (current->uid != task->uid) ||
+	     (current->gid != task->egid) ||
+	     (current->gid != task->sgid) ||
+	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+	smp_rmb();
+	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+		return -EPERM;
+
+	return security_ptrace(current, task);
+}
+
+int ptrace_may_attach(struct task_struct *task)
+{
+	int err;
+	task_lock(task);
+	err = may_attach(task);
+	task_unlock(task);
+	return !err;
+}
+
 int ptrace_attach(struct task_struct *task)
 {
 	int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
 		goto bad;
 	if (task == current)
 		goto bad;
-	if (!task->mm)
-		goto bad;
-	if(((current->uid != task->euid) ||
-	    (current->uid != task->suid) ||
-	    (current->uid != task->uid) ||
- 	    (current->gid != task->egid) ||
- 	    (current->gid != task->sgid) ||
- 	    (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-		goto bad;
-	smp_rmb();
-	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
-		goto bad;
 	/* the same process cannot be attached many times */
 	if (task->ptrace & PT_PTRACED)
 		goto bad;
-	retval = security_ptrace(current, task);
+	retval = may_attach(task);
 	if (retval)
 		goto bad;
 
-- 
cgit v1.2.3


From e922efc342d565a38eed3af377ff403f52148864 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 6 Sep 2005 15:18:25 -0700
Subject: [PATCH] remove duplicated sys_open32() code from 64bit archs

64 bit architectures all implement their own compatibility sys_open(),
when in fact the difference is simply not forcing the O_LARGEFILE
flag.  So use the a common function instead.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/ia32/ia32_entry.S       |  2 +-
 arch/ia64/ia32/sys_ia32.c         | 31 -------------------------------
 arch/ppc64/kernel/misc.S          |  2 +-
 arch/ppc64/kernel/sys_ppc32.c     | 31 -------------------------------
 arch/sparc64/kernel/sys_sparc32.c | 24 +-----------------------
 arch/x86_64/ia32/ia32entry.S      |  2 +-
 arch/x86_64/ia32/sys_ia32.c       | 26 --------------------------
 fs/compat.c                       | 10 ++++++++++
 fs/open.c                         | 19 +++++++++++--------
 include/linux/fs.h                |  1 +
 10 files changed, 26 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/ia32_entry.S b/arch/ia64/ia32/ia32_entry.S
index 829a6d80711c..0708edb06cc4 100644
--- a/arch/ia64/ia32/ia32_entry.S
+++ b/arch/ia64/ia32/ia32_entry.S
@@ -215,7 +215,7 @@ ia32_syscall_table:
 	data8 sys32_fork
 	data8 sys_read
 	data8 sys_write
-	data8 sys32_open	  /* 5 */
+	data8 compat_sys_open	  /* 5 */
 	data8 sys_close
 	data8 sys32_waitpid
 	data8 sys_creat
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index c1e20d65dd6c..e29a8a55486a 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -2359,37 +2359,6 @@ sys32_brk (unsigned int brk)
 	return ret;
 }
 
-/*
- * Exactly like fs/open.c:sys_open(), except that it doesn't set the O_LARGEFILE flag.
- */
-asmlinkage long
-sys32_open (const char __user * filename, int flags, int mode)
-{
-	char * tmp;
-	int fd, error;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file *f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f))
-				goto out_error;
-			fd_install(fd, f);
-		}
-out:
-		putname(tmp);
-	}
-	return fd;
-
-out_error:
-	put_unused_fd(fd);
-	fd = error;
-	goto out;
-}
-
 /* Structure for ia32 emulation on ia64 */
 struct epoll_event32
 {
diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S
index 474df0a862bf..2164bd7b4ef6 100644
--- a/arch/ppc64/kernel/misc.S
+++ b/arch/ppc64/kernel/misc.S
@@ -957,7 +957,7 @@ _GLOBAL(sys_call_table32)
 	.llong .ppc_fork
 	.llong .sys_read
 	.llong .sys_write
-	.llong .sys32_open		/* 5 */
+	.llong .compat_sys_open		/* 5 */
 	.llong .sys_close
 	.llong .sys32_waitpid
 	.llong .sys32_creat
diff --git a/arch/ppc64/kernel/sys_ppc32.c b/arch/ppc64/kernel/sys_ppc32.c
index 206619080e66..214914a95a50 100644
--- a/arch/ppc64/kernel/sys_ppc32.c
+++ b/arch/ppc64/kernel/sys_ppc32.c
@@ -867,37 +867,6 @@ off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin)
 	return sys_lseek(fd, (int)offset, origin);
 }
 
-/*
- * This is just a version for 32-bit applications which does
- * not force O_LARGEFILE on.
- */
-asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
-{
-	char * tmp;
-	int fd, error;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file * f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f))
-				goto out_error;
-			fd_install(fd, f);
-		}
-out:
-		putname(tmp);
-	}
-	return fd;
-
-out_error:
-	put_unused_fd(fd);
-	fd = error;
-	goto out;
-}
-
 /* Note: it is necessary to treat bufsiz as an unsigned int,
  * with the corresponding cast to a signed int to insure that the 
  * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 1d3aa588df8a..7f6239ed2521 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -1002,29 +1002,7 @@ asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
 asmlinkage long sparc32_open(const char __user *filename,
 			     int flags, int mode)
 {
-	char * tmp;
-	int fd, error;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file * f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f))
-				goto out_error;
-			fd_install(fd, f);
-		}
-out:
-		putname(tmp);
-	}
-	return fd;
-
-out_error:
-	put_unused_fd(fd);
-	fd = error;
-	goto out;
+	return do_sys_open(filename, flags, mode);
 }
 
 extern unsigned long do_mremap(unsigned long addr,
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index c45d6a05b984..f174083d5567 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -307,7 +307,7 @@ ia32_sys_call_table:
 	.quad stub32_fork
 	.quad sys_read
 	.quad sys_write
-	.quad sys32_open		/* 5 */
+	.quad compat_sys_open		/* 5 */
 	.quad sys_close
 	.quad sys32_waitpid
 	.quad sys_creat
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index be996d1b691e..04d80406ce4f 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -969,32 +969,6 @@ long sys32_kill(int pid, int sig)
 	return sys_kill(pid, sig);
 }
  
-asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
-{
-	char * tmp;
-	int fd, error;
-
-	/* don't force O_LARGEFILE */
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file *f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f)) {
-				put_unused_fd(fd); 
-				fd = error;
-			} else {
-				fsnotify_open(f->f_dentry);
-				fd_install(fd, f);
-			}
-		}
-		putname(tmp);
-	}
-	return fd;
-}
-
 extern asmlinkage long
 sys_timer_create(clockid_t which_clock,
 		 struct sigevent __user *timer_event_spec,
diff --git a/fs/compat.c b/fs/compat.c
index 2eb03c49b07c..8c665705c6a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1274,6 +1274,16 @@ out:
 	return ret;
 }
 
+/*
+ * Exactly like fs/open.c:sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open(const char __user *filename, int flags, int mode)
+{
+	return do_sys_open(filename, flags, mode);
+}
+
 /*
  * compat_count() counts the number of arguments/envelopes. It is basically
  * a copy of count() from fs/exec.c, except that it works with 32 bit argv
diff --git a/fs/open.c b/fs/open.c
index 32bf05e2996d..4ee2dcc31c28 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -933,16 +933,11 @@ void fastcall fd_install(unsigned int fd, struct file * file)
 
 EXPORT_SYMBOL(fd_install);
 
-asmlinkage long sys_open(const char __user * filename, int flags, int mode)
+long do_sys_open(const char __user *filename, int flags, int mode)
 {
-	char * tmp;
-	int fd;
+	char *tmp = getname(filename);
+	int fd = PTR_ERR(tmp);
 
-	if (force_o_largefile())
-		flags |= O_LARGEFILE;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd();
 		if (fd >= 0) {
@@ -959,6 +954,14 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode)
 	}
 	return fd;
 }
+
+asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+{
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	return do_sys_open(filename, flags, mode);
+}
 EXPORT_SYMBOL_GPL(sys_open);
 
 #ifndef __alpha__
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 57e294bf83f4..7e1b589842af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1280,6 +1280,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 /* fs/open.c */
 
 extern int do_truncate(struct dentry *, loff_t start);
+extern long do_sys_open(const char __user *filename, int flags, int mode);
 extern struct file *filp_open(const char *, int, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
 extern int filp_close(struct file *, fl_owner_t id);
-- 
cgit v1.2.3


From ebad6a4230bdb5927495e28bc7837f515bf667a7 Mon Sep 17 00:00:00 2001
From: Andrey Panin <pazke@donpac.ru>
Date: Tue, 6 Sep 2005 15:18:29 -0700
Subject: [PATCH] dmi: add onboard devices discovery

This patch adds onboard devices and IPMI BMC discovery into DMI scan code.
Drivers can use dmi_find_device() function to search for devices by type and
name.

Signed-off-by: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/dmi_scan.c | 102 ++++++++++++++++++++++++++++++++++++++------
 include/linux/dmi.h         |  36 ++++++++++++++--
 2 files changed, 123 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index ae1a1aed2fc0..c4a73855e38c 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -6,13 +6,6 @@
 #include <linux/bootmem.h>
 
 
-struct dmi_header {
-	u8 type;
-	u8 length;
-	u16 handle;
-};
-
-
 static char * __init dmi_string(struct dmi_header *dm, u8 s)
 {
 	u8 *bp = ((u8 *) dm) + dm->length;
@@ -88,6 +81,7 @@ static int __init dmi_checksum(u8 *buf)
 }
 
 static char *dmi_ident[DMI_STRING_MAX];
+static LIST_HEAD(dmi_devices);
 
 /*
  *	Save a DMI string
@@ -106,6 +100,58 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string)
 	dmi_ident[slot] = p;
 }
 
+static void __init dmi_save_devices(struct dmi_header *dm)
+{
+	int i, count = (dm->length - sizeof(struct dmi_header)) / 2;
+	struct dmi_device *dev;
+
+	for (i = 0; i < count; i++) {
+		char *d = ((char *) dm) + (i * 2);
+
+		/* Skip disabled device */
+		if ((*d & 0x80) == 0)
+			continue;
+
+		dev = alloc_bootmem(sizeof(*dev));
+		if (!dev) {
+			printk(KERN_ERR "dmi_save_devices: out of memory.\n");
+			break;
+		}
+
+		dev->type = *d++ & 0x7f;
+		dev->name = dmi_string(dm, *d);
+		dev->device_data = NULL;
+
+		list_add(&dev->list, &dmi_devices);
+	}
+}
+
+static void __init dmi_save_ipmi_device(struct dmi_header *dm)
+{
+	struct dmi_device *dev;
+	void * data;
+
+	data = alloc_bootmem(dm->length);
+	if (data == NULL) {
+		printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
+		return;
+	}
+
+	memcpy(data, dm, dm->length);
+
+	dev = alloc_bootmem(sizeof(*dev));
+	if (!dev) {
+		printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
+		return;
+	}
+
+	dev->type = DMI_DEV_TYPE_IPMI;
+	dev->name = "IPMI controller";
+	dev->device_data = data;
+
+	list_add(&dev->list, &dmi_devices);
+}
+
 /*
  *	Process a DMI table entry. Right now all we care about are the BIOS
  *	and machine entries. For 2.5 we should pull the smbus controller info
@@ -113,25 +159,28 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string)
  */
 static void __init dmi_decode(struct dmi_header *dm)
 {
-	u8 *data __attribute__((__unused__)) = (u8 *)dm;
-	
 	switch(dm->type) {
-	case  0:
+	case 0:		/* BIOS Information */
 		dmi_save_ident(dm, DMI_BIOS_VENDOR, 4);
 		dmi_save_ident(dm, DMI_BIOS_VERSION, 5);
 		dmi_save_ident(dm, DMI_BIOS_DATE, 8);
 		break;
-	case 1:
+	case 1:		/* System Information */
 		dmi_save_ident(dm, DMI_SYS_VENDOR, 4);
 		dmi_save_ident(dm, DMI_PRODUCT_NAME, 5);
 		dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6);
 		dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7);
 		break;
-	case 2:
+	case 2:		/* Base Board Information */
 		dmi_save_ident(dm, DMI_BOARD_VENDOR, 4);
 		dmi_save_ident(dm, DMI_BOARD_NAME, 5);
 		dmi_save_ident(dm, DMI_BOARD_VERSION, 6);
 		break;
+	case 10:	/* Onboard Devices Information */
+		dmi_save_devices(dm);
+		break;
+	case 38:	/* IPMI Device Information */
+		dmi_save_ipmi_device(dm);
 	}
 }
 
@@ -221,3 +270,32 @@ char *dmi_get_system_info(int field)
 	return dmi_ident[field];
 }
 EXPORT_SYMBOL(dmi_get_system_info);
+
+/**
+ *	dmi_find_device - find onboard device by type/name
+ *	@type: device type or %DMI_DEV_TYPE_ANY to match all device types
+ *	@desc: device name string or %NULL to match all
+ *	@from: previous device found in search, or %NULL for new search.
+ *
+ *	Iterates through the list of known onboard devices. If a device is
+ *	found with a matching @vendor and @device, a pointer to its device
+ *	structure is returned.  Otherwise, %NULL is returned.
+ *	A new search is initiated by passing %NULL to the @from argument.
+ *	If @from is not %NULL, searches continue from next device.
+ */
+struct dmi_device * dmi_find_device(int type, const char *name,
+				    struct dmi_device *from)
+{
+	struct list_head *d, *head = from ? &from->list : &dmi_devices;
+
+	for(d = head->next; d != &dmi_devices; d = d->next) {
+		struct dmi_device *dev = list_entry(d, struct dmi_device, list);
+
+		if (((type == DMI_DEV_TYPE_ANY) || (dev->type == type)) &&
+		    ((name == NULL) || (strcmp(dev->name, name) == 0)))
+			return dev;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(dmi_find_device);
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 5e93e6dce9a4..c30175e8dec6 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -1,6 +1,8 @@
 #ifndef __DMI_H__
 #define __DMI_H__
 
+#include <linux/list.h>
+
 enum dmi_field {
 	DMI_NONE,
 	DMI_BIOS_VENDOR,
@@ -16,6 +18,24 @@ enum dmi_field {
 	DMI_STRING_MAX,
 };
 
+enum dmi_device_type {
+	DMI_DEV_TYPE_ANY = 0,
+	DMI_DEV_TYPE_OTHER,
+	DMI_DEV_TYPE_UNKNOWN,
+	DMI_DEV_TYPE_VIDEO,
+	DMI_DEV_TYPE_SCSI,
+	DMI_DEV_TYPE_ETHERNET,
+	DMI_DEV_TYPE_TOKENRING,
+	DMI_DEV_TYPE_SOUND,
+	DMI_DEV_TYPE_IPMI = -1
+};
+
+struct dmi_header {
+	u8 type;
+	u8 length;
+	u16 handle;
+};
+
 /*
  *	DMI callbacks for problem boards
  */
@@ -26,22 +46,32 @@ struct dmi_strmatch {
 
 struct dmi_system_id {
 	int (*callback)(struct dmi_system_id *);
-	char *ident;
+	const char *ident;
 	struct dmi_strmatch matches[4];
 	void *driver_data;
 };
 
-#define DMI_MATCH(a,b)	{ a, b }
+#define DMI_MATCH(a, b)	{ a, b }
+
+struct dmi_device {
+	struct list_head list;
+	int type;
+	const char *name;
+	void *device_data;	/* Type specific data */
+};
 
 #if defined(CONFIG_X86) && !defined(CONFIG_X86_64)
 
 extern int dmi_check_system(struct dmi_system_id *list);
 extern char * dmi_get_system_info(int field);
-
+extern struct dmi_device * dmi_find_device(int type, const char *name,
+	struct dmi_device *from);
 #else
 
 static inline int dmi_check_system(struct dmi_system_id *list) { return 0; }
 static inline char * dmi_get_system_info(int field) { return NULL; }
+static struct dmi_device * dmi_find_device(int type, const char *name,
+	struct dmi_device *from) { return NULL; }
 
 #endif
 
-- 
cgit v1.2.3


From dd3927105b6f65afb7dac17682172cdfb86d3f00 Mon Sep 17 00:00:00 2001
From: Pekka J Enberg <penberg@cs.Helsinki.FI>
Date: Tue, 6 Sep 2005 15:18:31 -0700
Subject: [PATCH] introduce and use kzalloc

This patch introduces a kzalloc wrapper and converts kernel/ to use it.  It
saves a little program text.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h | 16 +++++++++++++++-
 kernel/intermodule.c |  3 +--
 kernel/params.c      |  4 ++--
 kernel/power/pm.c    |  3 +--
 kernel/resource.c    |  3 +--
 kernel/workqueue.c   |  3 +--
 mm/slab.c            | 18 ++++++------------
 7 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 80b2dfde2e80..42a6bea58af3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -99,7 +99,21 @@ found:
 	return __kmalloc(size, flags);
 }
 
-extern void *kcalloc(size_t, size_t, unsigned int __nocast);
+extern void *kzalloc(size_t, unsigned int __nocast);
+
+/**
+ * kcalloc - allocate memory for an array. The memory is set to zero.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate.
+ */
+static inline void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+{
+	if (n != 0 && size > INT_MAX / n)
+		return NULL;
+	return kzalloc(n * size, flags);
+}
+
 extern void kfree(const void *);
 extern unsigned int ksize(const void *);
 
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
 	struct list_head *tmp;
 	struct inter_module_entry *ime, *ime_new;
 
-	if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
+	if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
 		/* Overloaded kernel, not fatal */
 		printk(KERN_ERR
 			"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
 		kmalloc_failed = 1;
 		return;
 	}
-	memset(ime_new, 0, sizeof(*ime_new));
 	ime_new->im_name = im_name;
 	ime_new->owner = owner;
 	ime_new->userdata = userdata;
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..fbf173215fd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 {
 	struct module_kobject *mk;
 
-	mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
-	memset(mk, 0, sizeof(struct module_kobject));
+	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+	BUG_ON(!mk);
 
 	mk->mod = THIS_MODULE;
 	kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
 			   unsigned long id,
 			   pm_callback callback)
 {
-	struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+	struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
 	if (dev) {
-		memset(dev, 0, sizeof(*dev));
 		dev->type = type;
 		dev->id = id;
 		dev->callback = callback;
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
  */
 struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
 {
-	struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
 
 	if (res) {
-		memset(res, 0, sizeof(*res));
 		res->name = name;
 		res->start = start;
 		res->end = start + n - 1;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a3de837a8ddd..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	struct workqueue_struct *wq;
 	struct task_struct *p;
 
-	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
 		return NULL;
-	memset(wq, 0, sizeof(*wq));
 
 	wq->name = name;
 	/* We don't need the distraction of CPUs appearing and vanishing. */
diff --git a/mm/slab.c b/mm/slab.c
index a9ff4f7f9860..d7c4443991fe 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2558,24 +2558,18 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
 EXPORT_SYMBOL(kmem_cache_free);
 
 /**
- * kcalloc - allocate memory for an array. The memory is set to zero.
- * @n: number of elements.
- * @size: element size.
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+void *kzalloc(size_t size, unsigned int __nocast flags)
 {
-	void *ret = NULL;
-
-	if (n != 0 && size > INT_MAX / n)
-		return ret;
-
-	ret = kmalloc(n * size, flags);
+	void *ret = kmalloc(size, flags);
 	if (ret)
-		memset(ret, 0, n * size);
+		memset(ret, 0, size);
 	return ret;
 }
-EXPORT_SYMBOL(kcalloc);
+EXPORT_SYMBOL(kzalloc);
 
 /**
  * kfree - free previously allocated memory
-- 
cgit v1.2.3


From c14979b993021377228958498937bcdd9539cbce Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Tue, 6 Sep 2005 15:18:38 -0700
Subject: [PATCH] ipmi: add per-channel IPMB addresses

IPMI allows multiple IPMB channels on a single interface, and each channel
might have a different IPMB address.  However, the driver has only one IPMB
address that it uses for everything.  This patch adds new IOCTLS and a new
internal interface for setting per-channel IPMB addresses and LUNs.  New
systems are coming out with support for multiple IPMB channels, and they are
broken without this patch.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/ipmi/ipmi_devintf.c    |  94 ++++++++++++++++++++++---
 drivers/char/ipmi/ipmi_msghandler.c | 137 +++++++++++++++++++++++++-----------
 include/linux/ipmi.h                |  30 ++++++--
 3 files changed, 201 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index e0a53570fea1..5571e92c520f 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -411,6 +411,7 @@ static int ipmi_ioctl(struct inode  *inode,
 		break;
 	}
 
+	/* The next four are legacy, not per-channel. */
 	case IPMICTL_SET_MY_ADDRESS_CMD:
 	{
 		unsigned int val;
@@ -420,22 +421,25 @@ static int ipmi_ioctl(struct inode  *inode,
 			break;
 		}
 
-		ipmi_set_my_address(priv->user, val);
-		rv = 0;
+		rv = ipmi_set_my_address(priv->user, 0, val);
 		break;
 	}
 
 	case IPMICTL_GET_MY_ADDRESS_CMD:
 	{
-		unsigned int val;
+		unsigned int  val;
+		unsigned char rval;
+
+		rv = ipmi_get_my_address(priv->user, 0, &rval);
+		if (rv)
+			break;
 
-		val = ipmi_get_my_address(priv->user);
+		val = rval;
 
 		if (copy_to_user(arg, &val, sizeof(val))) {
 			rv = -EFAULT;
 			break;
 		}
-		rv = 0;
 		break;
 	}
 
@@ -448,24 +452,94 @@ static int ipmi_ioctl(struct inode  *inode,
 			break;
 		}
 
-		ipmi_set_my_LUN(priv->user, val);
-		rv = 0;
+		rv = ipmi_set_my_LUN(priv->user, 0, val);
 		break;
 	}
 
 	case IPMICTL_GET_MY_LUN_CMD:
 	{
-		unsigned int val;
+		unsigned int  val;
+		unsigned char rval;
+
+		rv = ipmi_get_my_LUN(priv->user, 0, &rval);
+		if (rv)
+			break;
 
-		val = ipmi_get_my_LUN(priv->user);
+		val = rval;
 
 		if (copy_to_user(arg, &val, sizeof(val))) {
 			rv = -EFAULT;
 			break;
 		}
-		rv = 0;
 		break;
 	}
+
+	case IPMICTL_SET_MY_CHANNEL_ADDRESS_CMD:
+	{
+		struct ipmi_channel_lun_address_set val;
+
+		if (copy_from_user(&val, arg, sizeof(val))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		return ipmi_set_my_address(priv->user, val.channel, val.value);
+		break;
+	}
+
+	case IPMICTL_GET_MY_CHANNEL_ADDRESS_CMD:
+	{
+		struct ipmi_channel_lun_address_set val;
+
+		if (copy_from_user(&val, arg, sizeof(val))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		rv = ipmi_get_my_address(priv->user, val.channel, &val.value);
+		if (rv)
+			break;
+
+		if (copy_to_user(arg, &val, sizeof(val))) {
+			rv = -EFAULT;
+			break;
+		}
+		break;
+	}
+
+	case IPMICTL_SET_MY_CHANNEL_LUN_CMD:
+	{
+		struct ipmi_channel_lun_address_set val;
+
+		if (copy_from_user(&val, arg, sizeof(val))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		rv = ipmi_set_my_LUN(priv->user, val.channel, val.value);
+		break;
+	}
+
+	case IPMICTL_GET_MY_CHANNEL_LUN_CMD:
+	{
+		struct ipmi_channel_lun_address_set val;
+
+		if (copy_from_user(&val, arg, sizeof(val))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		rv = ipmi_get_my_LUN(priv->user, val.channel, &val.value);
+		if (rv)
+			break;
+
+		if (copy_to_user(arg, &val, sizeof(val))) {
+			rv = -EFAULT;
+			break;
+		}
+		break;
+	}
+
 	case IPMICTL_SET_TIMING_PARMS_CMD:
 	{
 		struct ipmi_timing_parms parms;
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index e16c13fe698d..84d477c6f925 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -124,6 +124,14 @@ struct ipmi_channel
 {
 	unsigned char medium;
 	unsigned char protocol;
+
+	/* My slave address.  This is initialized to IPMI_BMC_SLAVE_ADDR,
+	   but may be changed by the user. */
+	unsigned char address;
+
+	/* My LUN.  This should generally stay the SMS LUN, but just in
+	   case... */
+	unsigned char lun;
 };
 
 #ifdef CONFIG_PROC_FS
@@ -135,7 +143,7 @@ struct ipmi_proc_entry
 #endif
 
 #define IPMI_IPMB_NUM_SEQ	64
-#define IPMI_MAX_CHANNELS       8
+#define IPMI_MAX_CHANNELS       16
 struct ipmi_smi
 {
 	/* What interface number are we? */
@@ -199,14 +207,6 @@ struct ipmi_smi
 	   this is registered. */
 	ipmi_user_t all_cmd_rcvr;
 
-	/* My slave address.  This is initialized to IPMI_BMC_SLAVE_ADDR,
-	   but may be changed by the user. */
-	unsigned char my_address;
-
-	/* My LUN.  This should generally stay the SMS LUN, but just in
-	   case... */
-	unsigned char my_lun;
-
 	/* The event receiver for my BMC, only really used at panic
 	   shutdown as a place to store this. */
 	unsigned char event_receiver;
@@ -766,26 +766,44 @@ void ipmi_get_version(ipmi_user_t   user,
 	*minor = user->intf->version_minor;
 }
 
-void ipmi_set_my_address(ipmi_user_t   user,
-			 unsigned char address)
+int ipmi_set_my_address(ipmi_user_t   user,
+			unsigned int  channel,
+			unsigned char address)
 {
-	user->intf->my_address = address;
+	if (channel >= IPMI_MAX_CHANNELS)
+		return -EINVAL;
+	user->intf->channels[channel].address = address;
+	return 0;
 }
 
-unsigned char ipmi_get_my_address(ipmi_user_t user)
+int ipmi_get_my_address(ipmi_user_t   user,
+			unsigned int  channel,
+			unsigned char *address)
 {
-	return user->intf->my_address;
+	if (channel >= IPMI_MAX_CHANNELS)
+		return -EINVAL;
+	*address = user->intf->channels[channel].address;
+	return 0;
 }
 
-void ipmi_set_my_LUN(ipmi_user_t   user,
-		     unsigned char LUN)
+int ipmi_set_my_LUN(ipmi_user_t   user,
+		    unsigned int  channel,
+		    unsigned char LUN)
 {
-	user->intf->my_lun = LUN & 0x3;
+	if (channel >= IPMI_MAX_CHANNELS)
+		return -EINVAL;
+	user->intf->channels[channel].lun = LUN & 0x3;
+	return 0;
 }
 
-unsigned char ipmi_get_my_LUN(ipmi_user_t user)
+int ipmi_get_my_LUN(ipmi_user_t   user,
+		    unsigned int  channel,
+		    unsigned char *address)
 {
-	return user->intf->my_lun;
+	if (channel >= IPMI_MAX_CHANNELS)
+		return -EINVAL;
+	*address = user->intf->channels[channel].lun;
+	return 0;
 }
 
 int ipmi_set_gets_events(ipmi_user_t user, int val)
@@ -1213,7 +1231,7 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		unsigned char         ipmb_seq;
 		long                  seqid;
 
-		if (addr->channel > IPMI_NUM_CHANNELS) {
+		if (addr->channel >= IPMI_NUM_CHANNELS) {
 			spin_lock_irqsave(&intf->counter_lock, flags);
 			intf->sent_invalid_commands++;
 			spin_unlock_irqrestore(&intf->counter_lock, flags);
@@ -1346,6 +1364,18 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 	return rv;
 }
 
+static int check_addr(ipmi_smi_t       intf,
+		      struct ipmi_addr *addr,
+		      unsigned char    *saddr,
+		      unsigned char    *lun)
+{
+	if (addr->channel >= IPMI_MAX_CHANNELS)
+		return -EINVAL;
+	*lun = intf->channels[addr->channel].lun;
+	*saddr = intf->channels[addr->channel].address;
+	return 0;
+}
+
 int ipmi_request_settime(ipmi_user_t      user,
 			 struct ipmi_addr *addr,
 			 long             msgid,
@@ -1355,6 +1385,12 @@ int ipmi_request_settime(ipmi_user_t      user,
 			 int              retries,
 			 unsigned int     retry_time_ms)
 {
+	unsigned char saddr, lun;
+	int           rv;
+
+	rv = check_addr(user->intf, addr, &saddr, &lun);
+	if (rv)
+		return rv;
 	return i_ipmi_request(user,
 			      user->intf,
 			      addr,
@@ -1363,8 +1399,8 @@ int ipmi_request_settime(ipmi_user_t      user,
 			      user_msg_data,
 			      NULL, NULL,
 			      priority,
-			      user->intf->my_address,
-			      user->intf->my_lun,
+			      saddr,
+			      lun,
 			      retries,
 			      retry_time_ms);
 }
@@ -1378,6 +1414,12 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 			     struct ipmi_recv_msg *supplied_recv,
 			     int                  priority)
 {
+	unsigned char saddr, lun;
+	int           rv;
+
+	rv = check_addr(user->intf, addr, &saddr, &lun);
+	if (rv)
+		return rv;
 	return i_ipmi_request(user,
 			      user->intf,
 			      addr,
@@ -1387,8 +1429,8 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 			      supplied_smi,
 			      supplied_recv,
 			      priority,
-			      user->intf->my_address,
-			      user->intf->my_lun,
+			      saddr,
+			      lun,
 			      -1, 0);
 }
 
@@ -1397,8 +1439,15 @@ static int ipmb_file_read_proc(char *page, char **start, off_t off,
 {
 	char       *out = (char *) page;
 	ipmi_smi_t intf = data;
+	int        i;
+	int        rv= 0;
 
-	return sprintf(out, "%x\n", intf->my_address);
+	for (i=0; i<IPMI_MAX_CHANNELS; i++)
+		rv += sprintf(out+rv, "%x ", intf->channels[i].address);
+	out[rv-1] = '\n'; /* Replace the final space with a newline */
+	out[rv] = '\0';
+	rv++;
+	return rv;
 }
 
 static int version_file_read_proc(char *page, char **start, off_t off,
@@ -1592,8 +1641,8 @@ send_channel_info_cmd(ipmi_smi_t intf, int chan)
 			      NULL,
 			      NULL,
 			      0,
-			      intf->my_address,
-			      intf->my_lun,
+			      intf->channels[0].address,
+			      intf->channels[0].lun,
 			      -1, 0);
 }
 
@@ -1696,11 +1745,13 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 			new_intf->intf_num = i;
 			new_intf->version_major = version_major;
 			new_intf->version_minor = version_minor;
-			if (slave_addr == 0)
-				new_intf->my_address = IPMI_BMC_SLAVE_ADDR;
-			else
-				new_intf->my_address = slave_addr;
-			new_intf->my_lun = 2;  /* the SMS LUN. */
+			for (j=0; j<IPMI_MAX_CHANNELS; j++) {
+				new_intf->channels[j].address
+					= IPMI_BMC_SLAVE_ADDR;
+				new_intf->channels[j].lun = 2;
+			}
+			if (slave_addr != 0)
+				new_intf->channels[0].address = slave_addr;
 			rwlock_init(&(new_intf->users_lock));
 			INIT_LIST_HEAD(&(new_intf->users));
 			new_intf->handlers = handlers;
@@ -1985,7 +2036,7 @@ static int handle_ipmb_get_msg_cmd(ipmi_smi_t          intf,
 		msg->data[3] = msg->rsp[6];
                 msg->data[4] = ((netfn + 1) << 2) | (msg->rsp[7] & 0x3);
 		msg->data[5] = ipmb_checksum(&(msg->data[3]), 2);
-		msg->data[6] = intf->my_address;
+		msg->data[6] = intf->channels[msg->rsp[3] & 0xf].address;
                 /* rqseq/lun */
                 msg->data[7] = (msg->rsp[7] & 0xfc) | (msg->rsp[4] & 0x3);
 		msg->data[8] = msg->rsp[8]; /* cmd */
@@ -2919,8 +2970,8 @@ static void send_panic_events(char *str)
 			       &smi_msg,
 			       &recv_msg,
 			       0,
-			       intf->my_address,
-			       intf->my_lun,
+			       intf->channels[0].address,
+			       intf->channels[0].lun,
 			       0, 1); /* Don't retry, and don't wait. */
 	}
 
@@ -2965,8 +3016,8 @@ static void send_panic_events(char *str)
 			       &smi_msg,
 			       &recv_msg,
 			       0,
-			       intf->my_address,
-			       intf->my_lun,
+			       intf->channels[0].address,
+			       intf->channels[0].lun,
 			       0, 1); /* Don't retry, and don't wait. */
 
 		if (intf->local_event_generator) {
@@ -2985,8 +3036,8 @@ static void send_panic_events(char *str)
 				       &smi_msg,
 				       &recv_msg,
 				       0,
-				       intf->my_address,
-				       intf->my_lun,
+				       intf->channels[0].address,
+				       intf->channels[0].lun,
 				       0, 1); /* no retry, and no wait. */
 		}
 		intf->null_user_handler = NULL;
@@ -2996,7 +3047,7 @@ static void send_panic_events(char *str)
 		   be zero, and it must not be my address. */
                 if (((intf->event_receiver & 1) == 0)
 		    && (intf->event_receiver != 0)
-		    && (intf->event_receiver != intf->my_address))
+		    && (intf->event_receiver != intf->channels[0].address))
 		{
 			/* The event receiver is valid, send an IPMB
 			   message. */
@@ -3031,7 +3082,7 @@ static void send_panic_events(char *str)
 			data[0] = 0;
 			data[1] = 0;
 			data[2] = 0xf0; /* OEM event without timestamp. */
-			data[3] = intf->my_address;
+			data[3] = intf->channels[0].address;
 			data[4] = j++; /* sequence # */
 			/* Always give 11 bytes, so strncpy will fill
 			   it with zeroes for me. */
@@ -3047,8 +3098,8 @@ static void send_panic_events(char *str)
 				       &smi_msg,
 				       &recv_msg,
 				       0,
-				       intf->my_address,
-				       intf->my_lun,
+				       intf->channels[0].address,
+				       intf->channels[0].lun,
 				       0, 1); /* no retry, and no wait. */
 		}
 	}	
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 596ca6130159..846b69899776 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -298,13 +298,19 @@ void ipmi_get_version(ipmi_user_t   user,
    this user, so it will affect all users of this interface.  This is
    so some initialization code can come in and do the OEM-specific
    things it takes to determine your address (if not the BMC) and set
-   it for everyone else. */
-void ipmi_set_my_address(ipmi_user_t   user,
-			 unsigned char address);
-unsigned char ipmi_get_my_address(ipmi_user_t user);
-void ipmi_set_my_LUN(ipmi_user_t   user,
-		     unsigned char LUN);
-unsigned char ipmi_get_my_LUN(ipmi_user_t user);
+   it for everyone else.  Note that each channel can have its own address. */
+int ipmi_set_my_address(ipmi_user_t   user,
+			unsigned int  channel,
+			unsigned char address);
+int ipmi_get_my_address(ipmi_user_t   user,
+			unsigned int  channel,
+			unsigned char *address);
+int ipmi_set_my_LUN(ipmi_user_t   user,
+		    unsigned int  channel,
+		    unsigned char LUN);
+int ipmi_get_my_LUN(ipmi_user_t   user,
+		    unsigned int  channel,
+		    unsigned char *LUN);
 
 /*
  * Like ipmi_request, but lets you specify the number of retries and
@@ -585,6 +591,16 @@ struct ipmi_cmdspec
  * things it takes to determine your address (if not the BMC) and set
  * it for everyone else.  You should probably leave the LUN alone.
  */
+struct ipmi_channel_lun_address_set
+{
+	unsigned short channel;
+	unsigned char  value;
+};
+#define IPMICTL_SET_MY_CHANNEL_ADDRESS_CMD _IOR(IPMI_IOC_MAGIC, 24, struct ipmi_channel_lun_address_set)
+#define IPMICTL_GET_MY_CHANNEL_ADDRESS_CMD _IOR(IPMI_IOC_MAGIC, 25, struct ipmi_channel_lun_address_set)
+#define IPMICTL_SET_MY_CHANNEL_LUN_CMD	   _IOR(IPMI_IOC_MAGIC, 26, struct ipmi_channel_lun_address_set)
+#define IPMICTL_GET_MY_CHANNEL_LUN_CMD	   _IOR(IPMI_IOC_MAGIC, 27, struct ipmi_channel_lun_address_set)
+/* Legacy interfaces, these only set IPMB 0. */
 #define IPMICTL_SET_MY_ADDRESS_CMD	_IOR(IPMI_IOC_MAGIC, 17, unsigned int)
 #define IPMICTL_GET_MY_ADDRESS_CMD	_IOR(IPMI_IOC_MAGIC, 18, unsigned int)
 #define IPMICTL_SET_MY_LUN_CMD		_IOR(IPMI_IOC_MAGIC, 19, unsigned int)
-- 
cgit v1.2.3


From 07766f241b54d67999907d529b99ffaa61d8b7d9 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Tue, 6 Sep 2005 15:18:40 -0700
Subject: [PATCH] ipmi: allow userland to include ipmi.h

The IPMI driver include file needs to include compiler.h so it has definitions
for __user and such.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/ipmi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 846b69899776..dd30adedd07d 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -35,6 +35,7 @@
 #define __LINUX_IPMI_H
 
 #include <linux/ipmi_msgdefs.h>
+#include <linux/compiler.h>
 
 /*
  * This file describes an interface to an IPMI driver.  You have to
-- 
cgit v1.2.3


From 56a55ec64806fb56e0cd43b0f726020b74c6689b Mon Sep 17 00:00:00 2001
From: Corey Minyard <cminyard@mvista.com>
Date: Tue, 6 Sep 2005 15:18:42 -0700
Subject: [PATCH] ipmi: fix panic ipmb response

The "null message handler" in the IPMI driver is used in startup and panic
situations to handle messages.  It was only designed to work with messages
from the local management controller, but in some cases it was used to get
messages from remote managmenet controllers, and the system would then
panic.  This patch makes the "null message handler" in the IPMI driver more
general so it works with any kind of message.

Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/ipmi/ipmi_msghandler.c | 107 ++++++++++++++++++++++--------------
 include/linux/ipmi.h                |   3 +-
 2 files changed, 69 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 05293d0e6692..d0ed25278cbb 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -219,7 +219,7 @@ struct ipmi_smi
 	   interface comes in with a NULL user, call this routine with
 	   it.  Note that the message will still be freed by the
 	   caller.  This only works on the system interface. */
-	void (*null_user_handler)(ipmi_smi_t intf, struct ipmi_smi_msg *msg);
+	void (*null_user_handler)(ipmi_smi_t intf, struct ipmi_recv_msg *msg);
 
 	/* When we are scanning the channels for an SMI, this will
 	   tell which channel we are scanning. */
@@ -459,7 +459,27 @@ unsigned int ipmi_addr_length(int addr_type)
 
 static void deliver_response(struct ipmi_recv_msg *msg)
 {
-	msg->user->handler->ipmi_recv_hndl(msg, msg->user->handler_data);
+	if (! msg->user) {
+		ipmi_smi_t    intf = msg->user_msg_data;
+		unsigned long flags;
+
+		/* Special handling for NULL users. */
+		if (intf->null_user_handler) {
+			intf->null_user_handler(intf, msg);
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->handled_local_responses++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+		} else {
+			/* No handler, so give up. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->unhandled_local_responses++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+		}
+		ipmi_free_recv_msg(msg);
+	} else {
+		msg->user->handler->ipmi_recv_hndl(msg,
+						   msg->user->handler_data);
+	}
 }
 
 /* Find the next sequence number not being used and add the given
@@ -1389,6 +1409,8 @@ int ipmi_request_settime(ipmi_user_t      user,
 	unsigned char saddr, lun;
 	int           rv;
 
+	if (! user)
+		return -EINVAL;
 	rv = check_addr(user->intf, addr, &saddr, &lun);
 	if (rv)
 		return rv;
@@ -1418,6 +1440,8 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 	unsigned char saddr, lun;
 	int           rv;
 
+	if (! user)
+		return -EINVAL;
 	rv = check_addr(user->intf, addr, &saddr, &lun);
 	if (rv)
 		return rv;
@@ -1638,7 +1662,7 @@ send_channel_info_cmd(ipmi_smi_t intf, int chan)
 			      (struct ipmi_addr *) &si,
 			      0,
 			      &msg,
-			      NULL,
+			      intf,
 			      NULL,
 			      NULL,
 			      0,
@@ -1648,19 +1672,20 @@ send_channel_info_cmd(ipmi_smi_t intf, int chan)
 }
 
 static void
-channel_handler(ipmi_smi_t intf, struct ipmi_smi_msg *msg)
+channel_handler(ipmi_smi_t intf, struct ipmi_recv_msg *msg)
 {
 	int rv = 0;
 	int chan;
 
-	if ((msg->rsp[0] == (IPMI_NETFN_APP_RESPONSE << 2))
-	    && (msg->rsp[1] == IPMI_GET_CHANNEL_INFO_CMD))
+	if ((msg->addr.addr_type == IPMI_SYSTEM_INTERFACE_ADDR_TYPE)
+	    && (msg->msg.netfn == IPMI_NETFN_APP_RESPONSE)
+	    && (msg->msg.cmd == IPMI_GET_CHANNEL_INFO_CMD))
 	{
 		/* It's the one we want */
-		if (msg->rsp[2] != 0) {
+		if (msg->msg.data[0] != 0) {
 			/* Got an error from the channel, just go on. */
 
-			if (msg->rsp[2] == IPMI_INVALID_COMMAND_ERR) {
+			if (msg->msg.data[0] == IPMI_INVALID_COMMAND_ERR) {
 				/* If the MC does not support this
 				   command, that is legal.  We just
 				   assume it has one IPMB at channel
@@ -1677,13 +1702,13 @@ channel_handler(ipmi_smi_t intf, struct ipmi_smi_msg *msg)
 			}
 			goto next_channel;
 		}
-		if (msg->rsp_size < 6) {
+		if (msg->msg.data_len < 4) {
 			/* Message not big enough, just go on. */
 			goto next_channel;
 		}
 		chan = intf->curr_channel;
-		intf->channels[chan].medium = msg->rsp[4] & 0x7f;
-		intf->channels[chan].protocol = msg->rsp[5] & 0x1f;
+		intf->channels[chan].medium = msg->msg.data[2] & 0x7f;
+		intf->channels[chan].protocol = msg->msg.data[3] & 0x1f;
 
 	next_channel:
 		intf->curr_channel++;
@@ -2382,6 +2407,14 @@ static int handle_bmc_rsp(ipmi_smi_t          intf,
 	unsigned long        flags;
 
 	recv_msg = (struct ipmi_recv_msg *) msg->user_data;
+	if (recv_msg == NULL)
+	{
+		printk(KERN_WARNING"IPMI message received with no owner. This\n"
+			"could be because of a malformed message, or\n"
+			"because of a hardware error.  Contact your\n"
+			"hardware vender for assistance\n");
+		return 0;
+	}
 
 	/* Make sure the user still exists. */
 	list_for_each_entry(user, &(intf->users), link) {
@@ -2392,19 +2425,11 @@ static int handle_bmc_rsp(ipmi_smi_t          intf,
 		}
 	}
 
-	if (!found) {
-		/* Special handling for NULL users. */
-		if (!recv_msg->user && intf->null_user_handler){
-			intf->null_user_handler(intf, msg);
-			spin_lock_irqsave(&intf->counter_lock, flags);
-			intf->handled_local_responses++;
-			spin_unlock_irqrestore(&intf->counter_lock, flags);
-		}else{
-			/* The user for the message went away, so give up. */
-			spin_lock_irqsave(&intf->counter_lock, flags);
-			intf->unhandled_local_responses++;
-			spin_unlock_irqrestore(&intf->counter_lock, flags);
-		}
+	if ((! found) && recv_msg->user) {
+		/* The user for the message went away, so give up. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_local_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		ipmi_free_recv_msg(recv_msg);
 	} else {
 		struct ipmi_system_interface_addr *smi_addr;
@@ -2890,28 +2915,30 @@ static void dummy_recv_done_handler(struct ipmi_recv_msg *msg)
 }
 
 #ifdef CONFIG_IPMI_PANIC_STRING
-static void event_receiver_fetcher(ipmi_smi_t intf, struct ipmi_smi_msg *msg)
+static void event_receiver_fetcher(ipmi_smi_t intf, struct ipmi_recv_msg *msg)
 {
-	if ((msg->rsp[0] == (IPMI_NETFN_SENSOR_EVENT_RESPONSE << 2))
-	    && (msg->rsp[1] == IPMI_GET_EVENT_RECEIVER_CMD)
-	    && (msg->rsp[2] == IPMI_CC_NO_ERROR))
+	if ((msg->addr.addr_type == IPMI_SYSTEM_INTERFACE_ADDR_TYPE)
+	    && (msg->msg.netfn == IPMI_NETFN_SENSOR_EVENT_RESPONSE)
+	    && (msg->msg.cmd == IPMI_GET_EVENT_RECEIVER_CMD)
+	    && (msg->msg.data[0] == IPMI_CC_NO_ERROR))
 	{
 		/* A get event receiver command, save it. */
-		intf->event_receiver = msg->rsp[3];
-		intf->event_receiver_lun = msg->rsp[4] & 0x3;
+		intf->event_receiver = msg->msg.data[1];
+		intf->event_receiver_lun = msg->msg.data[2] & 0x3;
 	}
 }
 
-static void device_id_fetcher(ipmi_smi_t intf, struct ipmi_smi_msg *msg)
+static void device_id_fetcher(ipmi_smi_t intf, struct ipmi_recv_msg *msg)
 {
-	if ((msg->rsp[0] == (IPMI_NETFN_APP_RESPONSE << 2))
-	    && (msg->rsp[1] == IPMI_GET_DEVICE_ID_CMD)
-	    && (msg->rsp[2] == IPMI_CC_NO_ERROR))
+	if ((msg->addr.addr_type == IPMI_SYSTEM_INTERFACE_ADDR_TYPE)
+	    && (msg->msg.netfn == IPMI_NETFN_APP_RESPONSE)
+	    && (msg->msg.cmd == IPMI_GET_DEVICE_ID_CMD)
+	    && (msg->msg.data[0] == IPMI_CC_NO_ERROR))
 	{
 		/* A get device id command, save if we are an event
 		   receiver or generator. */
-		intf->local_sel_device = (msg->rsp[8] >> 2) & 1;
-		intf->local_event_generator = (msg->rsp[8] >> 5) & 1;
+		intf->local_sel_device = (msg->msg.data[6] >> 2) & 1;
+		intf->local_event_generator = (msg->msg.data[6] >> 5) & 1;
 	}
 }
 #endif
@@ -2967,7 +2994,7 @@ static void send_panic_events(char *str)
 			       &addr,
 			       0,
 			       &msg,
-			       NULL,
+			       intf,
 			       &smi_msg,
 			       &recv_msg,
 			       0,
@@ -3013,7 +3040,7 @@ static void send_panic_events(char *str)
 			       &addr,
 			       0,
 			       &msg,
-			       NULL,
+			       intf,
 			       &smi_msg,
 			       &recv_msg,
 			       0,
@@ -3033,7 +3060,7 @@ static void send_panic_events(char *str)
 				       &addr,
 				       0,
 				       &msg,
-				       NULL,
+				       intf,
 				       &smi_msg,
 				       &recv_msg,
 				       0,
@@ -3095,7 +3122,7 @@ static void send_panic_events(char *str)
 				       &addr,
 				       0,
 				       &msg,
-				       NULL,
+				       intf,
 				       &smi_msg,
 				       &recv_msg,
 				       0,
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index dd30adedd07d..938d55b813a5 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -242,7 +242,8 @@ struct ipmi_recv_msg
 	/* The user_msg_data is the data supplied when a message was
 	   sent, if this is a response to a sent message.  If this is
 	   not a response to a sent message, then user_msg_data will
-	   be NULL. */
+	   be NULL.  If the user above is NULL, then this will be the
+	   intf. */
 	void             *user_msg_data;
 
 	/* Call this when done with the message.  It will presumably free
-- 
cgit v1.2.3


From 8c702e16207c70119d03df924de35f8c3629a5c4 Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Tue, 6 Sep 2005 15:18:46 -0700
Subject: [PATCH] ipmi poweroff: fix chassis control

The IPMI power control function proc_write_chassctrl was badly written, it
directly used userspace pointers, it assumed that strings were NULL
terminated, and it used the evil sscanf function.  This converts over to
using the sysctl interface for this data and changes the semantics to be a
little more logical.

Signed-off-by: Corey Minyard <minyard@acm.org>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/IPMI.txt            |  13 ++--
 drivers/char/ipmi/ipmi_poweroff.c | 132 +++++++++++++++++---------------------
 include/linux/sysctl.h            |   6 ++
 3 files changed, 71 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 84d3d4d10c17..bf1cf98d2a27 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -605,12 +605,13 @@ is in the ipmi_poweroff module.  When the system requests a powerdown,
 it will send the proper IPMI commands to do this.  This is supported on
 several platforms.
 
-There is a module parameter named "poweroff_control" that may either be zero
-(do a power down) or 2 (do a power cycle, power the system off, then power
-it on in a few seconds).  Setting ipmi_poweroff.poweroff_control=x will do
-the same thing on the kernel command line.  The parameter is also available
-via the proc filesystem in /proc/ipmi/poweroff_control.  Note that if the
-system does not support power cycling, it will always to the power off.
+There is a module parameter named "poweroff_powercycle" that may
+either be zero (do a power down) or non-zero (do a power cycle, power
+the system off, then power it on in a few seconds).  Setting
+ipmi_poweroff.poweroff_control=x will do the same thing on the kernel
+command line.  The parameter is also available via the proc filesystem
+in /proc/sys/dev/ipmi/poweroff_powercycle.  Note that if the system
+does not support power cycling, it will always do the power off.
 
 Note that if you have ACPI enabled, the system will prefer using ACPI to
 power off.
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index 42ea9843b394..e82a96ba396b 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -52,11 +52,11 @@ extern void (*pm_power_off)(void);
 #define IPMI_CHASSIS_POWER_CYCLE	0x02	/* power cycle */
 
 /* the IPMI data command */
-static int poweroff_control = IPMI_CHASSIS_POWER_DOWN;
+static int poweroff_powercycle;
 
 /* parameter definition to allow user to flag power cycle */
-module_param(poweroff_control, int, IPMI_CHASSIS_POWER_DOWN);
-MODULE_PARM_DESC(poweroff_control, " Set to 2 to enable power cycle instead of power down. Power cycle is contingent on hardware support, otherwise it defaults back to power down.");
+module_param(poweroff_powercycle, int, 0);
+MODULE_PARM_DESC(poweroff_powercycles, " Set to non-zero to enable power cycle instead of power down. Power cycle is contingent on hardware support, otherwise it defaults back to power down.");
 
 /* Stuff from the get device id command. */
 static unsigned int mfg_id;
@@ -385,37 +385,34 @@ static void ipmi_poweroff_chassis (ipmi_user_t user)
 
  powercyclefailed:
 	printk(KERN_INFO PFX "Powering %s via IPMI chassis control command\n",
-		((poweroff_control != IPMI_CHASSIS_POWER_CYCLE) ? "down" : "cycle"));
+		(poweroff_powercycle ? "cycle" : "down"));
 
 	/*
 	 * Power down
 	 */
 	send_msg.netfn = IPMI_NETFN_CHASSIS_REQUEST;
 	send_msg.cmd = IPMI_CHASSIS_CONTROL_CMD;
-	data[0] = poweroff_control;
+	if (poweroff_powercycle)
+		data[0] = IPMI_CHASSIS_POWER_CYCLE;
+	else
+		data[0] = IPMI_CHASSIS_POWER_DOWN;
 	send_msg.data = data;
 	send_msg.data_len = sizeof(data);
 	rv = ipmi_request_in_rc_mode(user,
 				     (struct ipmi_addr *) &smi_addr,
 				     &send_msg);
 	if (rv) {
-		switch (poweroff_control) {
-			case IPMI_CHASSIS_POWER_CYCLE:
-				/* power cycle failed, default to power down */
-				printk(KERN_ERR PFX "Unable to send chassis power " \
-					"cycle message, IPMI error 0x%x\n", rv);
-				poweroff_control = IPMI_CHASSIS_POWER_DOWN;
-				goto powercyclefailed;
-
-			case IPMI_CHASSIS_POWER_DOWN:
-			default:
-				printk(KERN_ERR PFX "Unable to send chassis power " \
-					"down message, IPMI error 0x%x\n", rv);
-				break;
+		if (poweroff_powercycle) {
+			/* power cycle failed, default to power down */
+			printk(KERN_ERR PFX "Unable to send chassis power " \
+			       "cycle message, IPMI error 0x%x\n", rv);
+			poweroff_powercycle = 0;
+			goto powercyclefailed;
 		}
-	}
 
-	return;
+		printk(KERN_ERR PFX "Unable to send chassis power " \
+		       "down message, IPMI error 0x%x\n", rv);
+	}
 }
 
 
@@ -561,39 +558,35 @@ static struct ipmi_smi_watcher smi_watcher =
 
 
 #ifdef CONFIG_PROC_FS
-/* displays properties to proc */
-static int proc_read_chassctrl(char *page, char **start, off_t off, int count,
-			       int *eof, void *data)
-{
-	return sprintf(page, "%d\t[ 0=powerdown 2=powercycle ]\n",
-			poweroff_control);
-}
+#include <linux/sysctl.h>
+
+static ctl_table ipmi_table[] = {
+	{ .ctl_name	= DEV_IPMI_POWEROFF_POWERCYCLE,
+	  .procname	= "poweroff_powercycle",
+	  .data		= &poweroff_powercycle,
+	  .maxlen	= sizeof(poweroff_powercycle),
+	  .mode		= 0644,
+	  .proc_handler	= &proc_dointvec },
+	{ }
+};
 
-/* process property writes from proc */
-static int proc_write_chassctrl(struct file *file, const char *buffer,
-			        unsigned long count, void *data)
-{
-	int          rv = count;
-	unsigned int newval = 0;
-
-	sscanf(buffer, "%d", &newval);
-	switch (newval) {
-		case IPMI_CHASSIS_POWER_CYCLE:
-			printk(KERN_INFO PFX "power cycle is now enabled\n");
-			poweroff_control = newval;
-			break;
-
-		case IPMI_CHASSIS_POWER_DOWN:
-			poweroff_control = IPMI_CHASSIS_POWER_DOWN;
-			break;
-
-		default:
-			rv = -EINVAL;
-			break;
-	}
+static ctl_table ipmi_dir_table[] = {
+	{ .ctl_name	= DEV_IPMI,
+	  .procname	= "ipmi",
+	  .mode		= 0555,
+	  .child	= ipmi_table },
+	{ }
+};
 
-	return rv;
-}
+static ctl_table ipmi_root_table[] = {
+	{ .ctl_name	= CTL_DEV,
+	  .procname	= "dev",
+	  .mode		= 0555,
+	  .child	= ipmi_dir_table },
+	{ }
+};
+
+static struct ctl_table_header *ipmi_table_header;
 #endif /* CONFIG_PROC_FS */
 
 /*
@@ -601,41 +594,32 @@ static int proc_write_chassctrl(struct file *file, const char *buffer,
  */
 static int ipmi_poweroff_init (void)
 {
-	int                   rv;
-	struct proc_dir_entry *file;
+	int rv;
 
 	printk ("Copyright (C) 2004 MontaVista Software -"
 		" IPMI Powerdown via sys_reboot.\n");
 
-	switch (poweroff_control) {
-		case IPMI_CHASSIS_POWER_CYCLE:
-			printk(KERN_INFO PFX "Power cycle is enabled.\n");
-			break;
+	if (poweroff_powercycle)
+		printk(KERN_INFO PFX "Power cycle is enabled.\n");
 
-		case IPMI_CHASSIS_POWER_DOWN:
-		default:
-			poweroff_control = IPMI_CHASSIS_POWER_DOWN;
-			break;
+#ifdef CONFIG_PROC_FS
+	ipmi_table_header = register_sysctl_table(ipmi_root_table, 1);
+	if (!ipmi_table_header) {
+		printk(KERN_ERR PFX "Unable to register powercycle sysctl\n");
+		rv = -ENOMEM;
+		goto out_err;
 	}
+#endif
 
+#ifdef CONFIG_PROC_FS
 	rv = ipmi_smi_watcher_register(&smi_watcher);
+#endif
 	if (rv) {
+		unregister_sysctl_table(ipmi_table_header);
 		printk(KERN_ERR PFX "Unable to register SMI watcher: %d\n", rv);
 		goto out_err;
 	}
 
-#ifdef CONFIG_PROC_FS
-	file = create_proc_entry("poweroff_control", 0, proc_ipmi_root);
-	if (!file) {
-		printk(KERN_ERR PFX "Unable to create proc power control\n");
-	} else {
-		file->nlink = 1;
-		file->read_proc = proc_read_chassctrl;
-		file->write_proc = proc_write_chassctrl;
-		file->owner = THIS_MODULE;
-	}
-#endif
-
  out_err:
 	return rv;
 }
@@ -646,7 +630,7 @@ static __exit void ipmi_poweroff_cleanup(void)
 	int rv;
 
 #ifdef CONFIG_PROC_FS
-	remove_proc_entry("poweroff_control", proc_ipmi_root);
+	unregister_sysctl_table(ipmi_table_header);
 #endif
 
 	ipmi_smi_watcher_unregister(&smi_watcher);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index e82be96d4906..532a6c5c24e9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -711,6 +711,7 @@ enum {
 	DEV_RAID=4,
 	DEV_MAC_HID=5,
 	DEV_SCSI=6,
+	DEV_IPMI=7,
 };
 
 /* /proc/sys/dev/cdrom */
@@ -776,6 +777,11 @@ enum {
 	DEV_SCSI_LOGGING_LEVEL=1,
 };
 
+/* /proc/sys/dev/ipmi */
+enum {
+	DEV_IPMI_POWEROFF_POWERCYCLE=1,
+};
+
 /* /proc/sys/abi */
 enum
 {
-- 
cgit v1.2.3


From 335eadf2ef6a1122a720aea98e758e5d431da87d Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus-list@drzeus.cx>
Date: Tue, 6 Sep 2005 15:18:50 -0700
Subject: [PATCH] sd: initialize SD cards

Support for the Secure Digital protocol in the MMC layer.

A summary of the legal issues surrounding SD cards, as understood by yours
truly:

Members of the Secure Digital Association, hereafter SDA, are required to sign
a NDA[1] before given access to any specifications.  It has been speculated
that including an SD implementation would forbid these members to redistribute
Linux.  This is the basic problem with SD support so it is unclear if it even
is a problem since it has no effect on those of us that aren't members.

The SDA doesn't seem to enforce these rules though since the patches included
here are based on documentation made public by some of the members.  The most
complete specs[2] are actually released by Sandisk, one of the founding
companies of the SDA.

Because of this the NDA is considered a non-issue by most involved in the
discussions concerning these patches.  It might be that the SDA is only
interested in protecting the so called "secure" bits of SD, which so far
hasn't been found in any public spec.  (The card is split into two sections,
one "normal" and one "secure" which has an access scheme similar to TPM:s).

(As a side note, Microsoft is working to make things easier for us since they
want to be able to include the source code for a SD driver in one of their
development kits.  HP is making sure that the new NDA will allow a Linux
implementation.  So far only the SDIO specs have been opened up[3].  More will
hopefully follow.)

 [1] http://www.sdcard.org/membership/images/ippolicy.pdf
 [2] http://www.sandisk.com/pdf/oem/ProdManualSDCardv1.9.pdf
 [3] http://www.sdcard.org/sdio/Simplified%20SDIO%20Card%20Specification.pdf

This patch contains the central parts of the SD support.  If no MMC cards are
found on a bus then the MMC layer proceeds looking for SD cards.  Helper
functions are extended to handle the special needs of SD cards.

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/mmc/mmc.c        | 326 ++++++++++++++++++++++++++++++++++++-----------
 include/linux/mmc/card.h |   3 +
 include/linux/mmc/host.h |   4 +
 include/linux/mmc/mmc.h  |   2 +
 4 files changed, 262 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c
index 0a8165974ba7..294961a102ca 100644
--- a/drivers/mmc/mmc.c
+++ b/drivers/mmc/mmc.c
@@ -172,7 +172,79 @@ int mmc_wait_for_cmd(struct mmc_host *host, struct mmc_command *cmd, int retries
 
 EXPORT_SYMBOL(mmc_wait_for_cmd);
 
+/**
+ *	mmc_wait_for_app_cmd - start an application command and wait for
+ 			       completion
+ *	@host: MMC host to start command
+ *	@rca: RCA to send MMC_APP_CMD to
+ *	@cmd: MMC command to start
+ *	@retries: maximum number of retries
+ *
+ *	Sends a MMC_APP_CMD, checks the card response, sends the command
+ *	in the parameter and waits for it to complete. Return any error
+ *	that occurred while the command was executing.  Do not attempt to
+ *	parse the response.
+ */
+int mmc_wait_for_app_cmd(struct mmc_host *host, unsigned int rca,
+	struct mmc_command *cmd, int retries)
+{
+	struct mmc_request mrq;
+	struct mmc_command appcmd;
+
+	int i, err;
+
+	BUG_ON(host->card_busy == NULL);
+	BUG_ON(retries < 0);
+
+	err = MMC_ERR_INVALID;
+
+	/*
+	 * We have to resend MMC_APP_CMD for each attempt so
+	 * we cannot use the retries field in mmc_command.
+	 */
+	for (i = 0;i <= retries;i++) {
+		memset(&mrq, 0, sizeof(struct mmc_request));
+
+		appcmd.opcode = MMC_APP_CMD;
+		appcmd.arg = rca << 16;
+		appcmd.flags = MMC_RSP_R1;
+		appcmd.retries = 0;
+		memset(appcmd.resp, 0, sizeof(appcmd.resp));
+		appcmd.data = NULL;
+
+		mrq.cmd = &appcmd;
+		appcmd.data = NULL;
+
+		mmc_wait_for_req(host, &mrq);
+
+		if (appcmd.error) {
+			err = appcmd.error;
+			continue;
+		}
+
+		/* Check that card supported application commands */
+		if (!(appcmd.resp[0] & R1_APP_CMD))
+			return MMC_ERR_FAILED;
+
+		memset(&mrq, 0, sizeof(struct mmc_request));
+
+		memset(cmd->resp, 0, sizeof(cmd->resp));
+		cmd->retries = 0;
+
+		mrq.cmd = cmd;
+		cmd->data = NULL;
+
+		mmc_wait_for_req(host, &mrq);
 
+		err = cmd->error;
+		if (cmd->error == MMC_ERR_NONE)
+			break;
+	}
+
+	return err;
+}
+
+EXPORT_SYMBOL(mmc_wait_for_app_cmd);
 
 /**
  *	__mmc_claim_host - exclusively claim a host
@@ -322,48 +394,70 @@ static void mmc_decode_cid(struct mmc_card *card)
 
 	memset(&card->cid, 0, sizeof(struct mmc_cid));
 
-	/*
-	 * The selection of the format here is guesswork based upon
-	 * information people have sent to date.
-	 */
-	switch (card->csd.mmca_vsn) {
-	case 0: /* MMC v1.? */
-	case 1: /* MMC v1.4 */
-		card->cid.manfid	= UNSTUFF_BITS(resp, 104, 24);
-		card->cid.prod_name[0]	= UNSTUFF_BITS(resp, 96, 8);
-		card->cid.prod_name[1]	= UNSTUFF_BITS(resp, 88, 8);
-		card->cid.prod_name[2]	= UNSTUFF_BITS(resp, 80, 8);
-		card->cid.prod_name[3]	= UNSTUFF_BITS(resp, 72, 8);
-		card->cid.prod_name[4]	= UNSTUFF_BITS(resp, 64, 8);
-		card->cid.prod_name[5]	= UNSTUFF_BITS(resp, 56, 8);
-		card->cid.prod_name[6]	= UNSTUFF_BITS(resp, 48, 8);
-		card->cid.hwrev		= UNSTUFF_BITS(resp, 44, 4);
-		card->cid.fwrev		= UNSTUFF_BITS(resp, 40, 4);
-		card->cid.serial	= UNSTUFF_BITS(resp, 16, 24);
-		card->cid.month		= UNSTUFF_BITS(resp, 12, 4);
-		card->cid.year		= UNSTUFF_BITS(resp, 8, 4) + 1997;
-		break;
-
-	case 2: /* MMC v2.x ? */
-	case 3: /* MMC v3.x ? */
-		card->cid.manfid	= UNSTUFF_BITS(resp, 120, 8);
-		card->cid.oemid		= UNSTUFF_BITS(resp, 104, 16);
-		card->cid.prod_name[0]	= UNSTUFF_BITS(resp, 96, 8);
-		card->cid.prod_name[1]	= UNSTUFF_BITS(resp, 88, 8);
-		card->cid.prod_name[2]	= UNSTUFF_BITS(resp, 80, 8);
-		card->cid.prod_name[3]	= UNSTUFF_BITS(resp, 72, 8);
-		card->cid.prod_name[4]	= UNSTUFF_BITS(resp, 64, 8);
-		card->cid.prod_name[5]	= UNSTUFF_BITS(resp, 56, 8);
-		card->cid.serial	= UNSTUFF_BITS(resp, 16, 32);
-		card->cid.month		= UNSTUFF_BITS(resp, 12, 4);
-		card->cid.year		= UNSTUFF_BITS(resp, 8, 4) + 1997;
-		break;
-
-	default:
-		printk("%s: card has unknown MMCA version %d\n",
-			mmc_hostname(card->host), card->csd.mmca_vsn);
-		mmc_card_set_bad(card);
-		break;
+	if (mmc_card_sd(card)) {
+		/*
+		 * SD doesn't currently have a version field so we will
+		 * have to assume we can parse this.
+		 */
+		card->cid.manfid		= UNSTUFF_BITS(resp, 120, 8);
+		card->cid.oemid			= UNSTUFF_BITS(resp, 104, 16);
+		card->cid.prod_name[0]		= UNSTUFF_BITS(resp, 96, 8);
+		card->cid.prod_name[1]		= UNSTUFF_BITS(resp, 88, 8);
+		card->cid.prod_name[2]		= UNSTUFF_BITS(resp, 80, 8);
+		card->cid.prod_name[3]		= UNSTUFF_BITS(resp, 72, 8);
+		card->cid.prod_name[4]		= UNSTUFF_BITS(resp, 64, 8);
+		card->cid.hwrev			= UNSTUFF_BITS(resp, 60, 4);
+		card->cid.fwrev			= UNSTUFF_BITS(resp, 56, 4);
+		card->cid.serial		= UNSTUFF_BITS(resp, 24, 32);
+		card->cid.year			= UNSTUFF_BITS(resp, 12, 8);
+		card->cid.month			= UNSTUFF_BITS(resp, 8, 4);
+
+		card->cid.year += 2000; /* SD cards year offset */
+	}
+	else {
+		/*
+		 * The selection of the format here is based upon published
+		 * specs from sandisk and from what people have reported.
+		 */
+		switch (card->csd.mmca_vsn) {
+		case 0: /* MMC v1.0 - v1.2 */
+		case 1: /* MMC v1.4 */
+			card->cid.manfid	= UNSTUFF_BITS(resp, 104, 24);
+			card->cid.prod_name[0]	= UNSTUFF_BITS(resp, 96, 8);
+			card->cid.prod_name[1]	= UNSTUFF_BITS(resp, 88, 8);
+			card->cid.prod_name[2]	= UNSTUFF_BITS(resp, 80, 8);
+			card->cid.prod_name[3]	= UNSTUFF_BITS(resp, 72, 8);
+			card->cid.prod_name[4]	= UNSTUFF_BITS(resp, 64, 8);
+			card->cid.prod_name[5]	= UNSTUFF_BITS(resp, 56, 8);
+			card->cid.prod_name[6]	= UNSTUFF_BITS(resp, 48, 8);
+			card->cid.hwrev		= UNSTUFF_BITS(resp, 44, 4);
+			card->cid.fwrev		= UNSTUFF_BITS(resp, 40, 4);
+			card->cid.serial	= UNSTUFF_BITS(resp, 16, 24);
+			card->cid.month		= UNSTUFF_BITS(resp, 12, 4);
+			card->cid.year		= UNSTUFF_BITS(resp, 8, 4) + 1997;
+			break;
+
+		case 2: /* MMC v2.0 - v2.2 */
+		case 3: /* MMC v3.1 - v3.3 */
+			card->cid.manfid	= UNSTUFF_BITS(resp, 120, 8);
+			card->cid.oemid		= UNSTUFF_BITS(resp, 104, 16);
+			card->cid.prod_name[0]	= UNSTUFF_BITS(resp, 96, 8);
+			card->cid.prod_name[1]	= UNSTUFF_BITS(resp, 88, 8);
+			card->cid.prod_name[2]	= UNSTUFF_BITS(resp, 80, 8);
+			card->cid.prod_name[3]	= UNSTUFF_BITS(resp, 72, 8);
+			card->cid.prod_name[4]	= UNSTUFF_BITS(resp, 64, 8);
+			card->cid.prod_name[5]	= UNSTUFF_BITS(resp, 56, 8);
+			card->cid.serial	= UNSTUFF_BITS(resp, 16, 32);
+			card->cid.month		= UNSTUFF_BITS(resp, 12, 4);
+			card->cid.year		= UNSTUFF_BITS(resp, 8, 4) + 1997;
+			break;
+
+		default:
+			printk("%s: card has unknown MMCA version %d\n",
+				mmc_hostname(card->host), card->csd.mmca_vsn);
+			mmc_card_set_bad(card);
+			break;
+		}
 	}
 }
 
@@ -376,34 +470,61 @@ static void mmc_decode_csd(struct mmc_card *card)
 	unsigned int e, m, csd_struct;
 	u32 *resp = card->raw_csd;
 
-	/*
-	 * We only understand CSD structure v1.1 and v2.
-	 * v2 has extra information in bits 15, 11 and 10.
-	 */
-	csd_struct = UNSTUFF_BITS(resp, 126, 2);
-	if (csd_struct != 1 && csd_struct != 2) {
-		printk("%s: unrecognised CSD structure version %d\n",
-			mmc_hostname(card->host), csd_struct);
-		mmc_card_set_bad(card);
-		return;
+	if (mmc_card_sd(card)) {
+		csd_struct = UNSTUFF_BITS(resp, 126, 2);
+		if (csd_struct != 0) {
+			printk("%s: unrecognised CSD structure version %d\n",
+				mmc_hostname(card->host), csd_struct);
+			mmc_card_set_bad(card);
+			return;
+		}
+
+		m = UNSTUFF_BITS(resp, 115, 4);
+		e = UNSTUFF_BITS(resp, 112, 3);
+		csd->tacc_ns	 = (tacc_exp[e] * tacc_mant[m] + 9) / 10;
+		csd->tacc_clks	 = UNSTUFF_BITS(resp, 104, 8) * 100;
+
+		m = UNSTUFF_BITS(resp, 99, 4);
+		e = UNSTUFF_BITS(resp, 96, 3);
+		csd->max_dtr	  = tran_exp[e] * tran_mant[m];
+		csd->cmdclass	  = UNSTUFF_BITS(resp, 84, 12);
+
+		e = UNSTUFF_BITS(resp, 47, 3);
+		m = UNSTUFF_BITS(resp, 62, 12);
+		csd->capacity	  = (1 + m) << (e + 2);
+
+		csd->read_blkbits = UNSTUFF_BITS(resp, 80, 4);
 	}
+	else {
+		/*
+		 * We only understand CSD structure v1.1 and v1.2.
+		 * v1.2 has extra information in bits 15, 11 and 10.
+		 */
+		csd_struct = UNSTUFF_BITS(resp, 126, 2);
+		if (csd_struct != 1 && csd_struct != 2) {
+			printk("%s: unrecognised CSD structure version %d\n",
+				mmc_hostname(card->host), csd_struct);
+			mmc_card_set_bad(card);
+			return;
+		}
 
-	csd->mmca_vsn	 = UNSTUFF_BITS(resp, 122, 4);
-	m = UNSTUFF_BITS(resp, 115, 4);
-	e = UNSTUFF_BITS(resp, 112, 3);
-	csd->tacc_ns	 = (tacc_exp[e] * tacc_mant[m] + 9) / 10;
-	csd->tacc_clks	 = UNSTUFF_BITS(resp, 104, 8) * 100;
+		csd->mmca_vsn	 = UNSTUFF_BITS(resp, 122, 4);
+		m = UNSTUFF_BITS(resp, 115, 4);
+		e = UNSTUFF_BITS(resp, 112, 3);
+		csd->tacc_ns	 = (tacc_exp[e] * tacc_mant[m] + 9) / 10;
+		csd->tacc_clks	 = UNSTUFF_BITS(resp, 104, 8) * 100;
 
-	m = UNSTUFF_BITS(resp, 99, 4);
-	e = UNSTUFF_BITS(resp, 96, 3);
-	csd->max_dtr	  = tran_exp[e] * tran_mant[m];
-	csd->cmdclass	  = UNSTUFF_BITS(resp, 84, 12);
+		m = UNSTUFF_BITS(resp, 99, 4);
+		e = UNSTUFF_BITS(resp, 96, 3);
+		csd->max_dtr	  = tran_exp[e] * tran_mant[m];
+		csd->cmdclass	  = UNSTUFF_BITS(resp, 84, 12);
 
-	e = UNSTUFF_BITS(resp, 47, 3);
-	m = UNSTUFF_BITS(resp, 62, 12);
-	csd->capacity	  = (1 + m) << (e + 2);
+		e = UNSTUFF_BITS(resp, 47, 3);
+		m = UNSTUFF_BITS(resp, 62, 12);
+		csd->capacity	  = (1 + m) << (e + 2);
 
-	csd->read_blkbits = UNSTUFF_BITS(resp, 80, 4);
+		csd->read_blkbits = UNSTUFF_BITS(resp, 80, 4);
+	}
 }
 
 /*
@@ -536,6 +657,34 @@ static int mmc_send_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr)
 	return err;
 }
 
+static int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr)
+{
+	struct mmc_command cmd;
+	int i, err = 0;
+
+	cmd.opcode = SD_APP_OP_COND;
+	cmd.arg = ocr;
+	cmd.flags = MMC_RSP_R3;
+
+	for (i = 100; i; i--) {
+		err = mmc_wait_for_app_cmd(host, 0, &cmd, CMD_RETRIES);
+		if (err != MMC_ERR_NONE)
+			break;
+
+		if (cmd.resp[0] & MMC_CARD_BUSY || ocr == 0)
+			break;
+
+		err = MMC_ERR_TIMEOUT;
+
+		mmc_delay(10);
+	}
+
+	if (rocr)
+		*rocr = cmd.resp[0];
+
+	return err;
+}
+
 /*
  * Discover cards by requesting their CID.  If this command
  * times out, it is not an error; there are no further cards
@@ -579,13 +728,28 @@ static void mmc_discover_cards(struct mmc_host *host)
 
 		card->state &= ~MMC_STATE_DEAD;
 
-		cmd.opcode = MMC_SET_RELATIVE_ADDR;
-		cmd.arg = card->rca << 16;
-		cmd.flags = MMC_RSP_R1;
+		if (host->mode == MMC_MODE_SD) {
+			mmc_card_set_sd(card);
 
-		err = mmc_wait_for_cmd(host, &cmd, CMD_RETRIES);
-		if (err != MMC_ERR_NONE)
-			mmc_card_set_dead(card);
+			cmd.opcode = SD_SEND_RELATIVE_ADDR;
+			cmd.arg = 0;
+			cmd.flags = MMC_RSP_R1;
+
+			err = mmc_wait_for_cmd(host, &cmd, CMD_RETRIES);
+			if (err != MMC_ERR_NONE)
+				mmc_card_set_dead(card);
+			else
+				card->rca = cmd.resp[0] >> 16;
+		}
+		else {
+			cmd.opcode = MMC_SET_RELATIVE_ADDR;
+			cmd.arg = card->rca << 16;
+			cmd.flags = MMC_RSP_R1;
+
+			err = mmc_wait_for_cmd(host, &cmd, CMD_RETRIES);
+			if (err != MMC_ERR_NONE)
+				mmc_card_set_dead(card);
+		}
 	}
 }
 
@@ -669,12 +833,25 @@ static void mmc_setup(struct mmc_host *host)
 		int err;
 		u32 ocr;
 
+		host->mode = MMC_MODE_MMC;
+
 		mmc_power_up(host);
 		mmc_idle_cards(host);
 
 		err = mmc_send_op_cond(host, 0, &ocr);
+
+		/*
+		 * If we fail to detect any cards then try
+		 * searching for SD cards.
+		 */
 		if (err != MMC_ERR_NONE)
-			return;
+		{
+			err = mmc_send_app_op_cond(host, 0, &ocr);
+			if (err != MMC_ERR_NONE)
+				return;
+
+			host->mode = MMC_MODE_SD;
+		}
 
 		host->ocr = mmc_select_voltage(host, ocr);
 
@@ -714,7 +891,10 @@ static void mmc_setup(struct mmc_host *host)
 	 * all get the idea that they should be ready for CMD2.
 	 * (My SanDisk card seems to need this.)
 	 */
-	mmc_send_op_cond(host, host->ocr, NULL);
+	if (host->mode == MMC_MODE_SD)
+		mmc_send_app_op_cond(host, host->ocr, NULL);
+	else
+		mmc_send_op_cond(host, host->ocr, NULL);
 
 	mmc_discover_cards(host);
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index aefedf04b9bb..538e8c86336c 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -47,6 +47,7 @@ struct mmc_card {
 #define MMC_STATE_PRESENT	(1<<0)		/* present in sysfs */
 #define MMC_STATE_DEAD		(1<<1)		/* device no longer in stack */
 #define MMC_STATE_BAD		(1<<2)		/* unrecognised device */
+#define MMC_STATE_SDCARD	(1<<3)		/* is an SD card */
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
 	struct mmc_cid		cid;		/* card identification */
@@ -56,10 +57,12 @@ struct mmc_card {
 #define mmc_card_present(c)	((c)->state & MMC_STATE_PRESENT)
 #define mmc_card_dead(c)	((c)->state & MMC_STATE_DEAD)
 #define mmc_card_bad(c)		((c)->state & MMC_STATE_BAD)
+#define mmc_card_sd(c)		((c)->state & MMC_STATE_SDCARD)
 
 #define mmc_card_set_present(c)	((c)->state |= MMC_STATE_PRESENT)
 #define mmc_card_set_dead(c)	((c)->state |= MMC_STATE_DEAD)
 #define mmc_card_set_bad(c)	((c)->state |= MMC_STATE_BAD)
+#define mmc_card_set_sd(c)	((c)->state |= MMC_STATE_SDCARD)
 
 #define mmc_card_name(c)	((c)->cid.prod_name)
 #define mmc_card_id(c)		((c)->dev.bus_id)
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 30f68c0c8c6e..845020d90c60 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -87,6 +87,10 @@ struct mmc_host {
 	struct mmc_ios		ios;		/* current io bus settings */
 	u32			ocr;		/* the current OCR setting */
 
+	unsigned int		mode;		/* current card mode of host */
+#define MMC_MODE_MMC		0
+#define MMC_MODE_SD		1
+
 	struct list_head	cards;		/* devices attached to this host */
 
 	wait_queue_head_t	wq;
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 0d35d4ffb360..1ab78e8d6c53 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -88,6 +88,8 @@ struct mmc_card;
 
 extern int mmc_wait_for_req(struct mmc_host *, struct mmc_request *);
 extern int mmc_wait_for_cmd(struct mmc_host *, struct mmc_command *, int);
+extern int mmc_wait_for_app_cmd(struct mmc_host *, unsigned int,
+	struct mmc_command *, int);
 
 extern int __mmc_claim_host(struct mmc_host *host, struct mmc_card *card);
 
-- 
cgit v1.2.3


From a00fc09029f02ca833cf90e5d5625f08c4ac4f51 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus-list@drzeus.cx>
Date: Tue, 6 Sep 2005 15:18:52 -0700
Subject: [PATCH] sd: read-only switch

Support for the read-only switch on SD cards which must be enforced by the
host.

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/mmc/mmc.c        | 39 +++++++++++++++++++++++----------------
 drivers/mmc/mmc_block.c  |  9 +++++++--
 include/linux/mmc/card.h |  3 +++
 include/linux/mmc/host.h |  1 +
 4 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c
index 294961a102ca..725c6ad3eb64 100644
--- a/drivers/mmc/mmc.c
+++ b/drivers/mmc/mmc.c
@@ -413,8 +413,7 @@ static void mmc_decode_cid(struct mmc_card *card)
 		card->cid.month			= UNSTUFF_BITS(resp, 8, 4);
 
 		card->cid.year += 2000; /* SD cards year offset */
-	}
-	else {
+	} else {
 		/*
 		 * The selection of the format here is based upon published
 		 * specs from sandisk and from what people have reported.
@@ -494,8 +493,7 @@ static void mmc_decode_csd(struct mmc_card *card)
 		csd->capacity	  = (1 + m) << (e + 2);
 
 		csd->read_blkbits = UNSTUFF_BITS(resp, 80, 4);
-	}
-	else {
+	} else {
 		/*
 		 * We only understand CSD structure v1.1 and v1.2.
 		 * v1.2 has extra information in bits 15, 11 and 10.
@@ -738,10 +736,20 @@ static void mmc_discover_cards(struct mmc_host *host)
 			err = mmc_wait_for_cmd(host, &cmd, CMD_RETRIES);
 			if (err != MMC_ERR_NONE)
 				mmc_card_set_dead(card);
-			else
+			else {
 				card->rca = cmd.resp[0] >> 16;
-		}
-		else {
+
+				if (!host->ops->get_ro) {
+					printk(KERN_WARNING "%s: host does not "
+						"support reading read-only "
+						"switch. assuming write-enable.\n",
+						mmc_hostname(host));
+				} else {
+					if (host->ops->get_ro(host))
+						mmc_card_set_readonly(card);
+				}
+			}
+		} else {
 			cmd.opcode = MMC_SET_RELATIVE_ADDR;
 			cmd.arg = card->rca << 16;
 			cmd.flags = MMC_RSP_R1;
@@ -833,24 +841,23 @@ static void mmc_setup(struct mmc_host *host)
 		int err;
 		u32 ocr;
 
-		host->mode = MMC_MODE_MMC;
+		host->mode = MMC_MODE_SD;
 
 		mmc_power_up(host);
 		mmc_idle_cards(host);
 
-		err = mmc_send_op_cond(host, 0, &ocr);
+		err = mmc_send_app_op_cond(host, 0, &ocr);
 
 		/*
-		 * If we fail to detect any cards then try
-		 * searching for SD cards.
+		 * If we fail to detect any SD cards then try
+		 * searching for MMC cards.
 		 */
-		if (err != MMC_ERR_NONE)
-		{
-			err = mmc_send_app_op_cond(host, 0, &ocr);
+		if (err != MMC_ERR_NONE) {
+			host->mode = MMC_MODE_MMC;
+
+			err = mmc_send_op_cond(host, 0, &ocr);
 			if (err != MMC_ERR_NONE)
 				return;
-
-			host->mode = MMC_MODE_SD;
 		}
 
 		host->ocr = mmc_select_voltage(host, ocr);
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index d4eee99c2bf6..fa83f15fdf16 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -95,6 +95,10 @@ static int mmc_blk_open(struct inode *inode, struct file *filp)
 		if (md->usage == 2)
 			check_disk_change(inode->i_bdev);
 		ret = 0;
+
+		if ((filp->f_mode & FMODE_WRITE) &&
+			mmc_card_readonly(md->queue.card))
+			ret = -EROFS;
 	}
 
 	return ret;
@@ -403,9 +407,10 @@ static int mmc_blk_probe(struct mmc_card *card)
 	if (err)
 		goto out;
 
-	printk(KERN_INFO "%s: %s %s %dKiB\n",
+	printk(KERN_INFO "%s: %s %s %dKiB %s\n",
 		md->disk->disk_name, mmc_card_id(card), mmc_card_name(card),
-		(card->csd.capacity << card->csd.read_blkbits) / 1024);
+		(card->csd.capacity << card->csd.read_blkbits) / 1024,
+		mmc_card_readonly(card)?"(ro)":"");
 
 	mmc_set_drvdata(card, md);
 	add_disk(md->disk);
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 538e8c86336c..0e9ec01b9c5b 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -48,6 +48,7 @@ struct mmc_card {
 #define MMC_STATE_DEAD		(1<<1)		/* device no longer in stack */
 #define MMC_STATE_BAD		(1<<2)		/* unrecognised device */
 #define MMC_STATE_SDCARD	(1<<3)		/* is an SD card */
+#define MMC_STATE_READONLY	(1<<4)		/* card is read-only */
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
 	struct mmc_cid		cid;		/* card identification */
@@ -58,11 +59,13 @@ struct mmc_card {
 #define mmc_card_dead(c)	((c)->state & MMC_STATE_DEAD)
 #define mmc_card_bad(c)		((c)->state & MMC_STATE_BAD)
 #define mmc_card_sd(c)		((c)->state & MMC_STATE_SDCARD)
+#define mmc_card_readonly(c)	((c)->state & MMC_STATE_READONLY)
 
 #define mmc_card_set_present(c)	((c)->state |= MMC_STATE_PRESENT)
 #define mmc_card_set_dead(c)	((c)->state |= MMC_STATE_DEAD)
 #define mmc_card_set_bad(c)	((c)->state |= MMC_STATE_BAD)
 #define mmc_card_set_sd(c)	((c)->state |= MMC_STATE_SDCARD)
+#define mmc_card_set_readonly(c) ((c)->state |= MMC_STATE_READONLY)
 
 #define mmc_card_name(c)	((c)->cid.prod_name)
 #define mmc_card_id(c)		((c)->dev.bus_id)
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 845020d90c60..8c5f71376e41 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -62,6 +62,7 @@ struct mmc_ios {
 struct mmc_host_ops {
 	void	(*request)(struct mmc_host *host, struct mmc_request *req);
 	void	(*set_ios)(struct mmc_host *host, struct mmc_ios *ios);
+	int	(*get_ro)(struct mmc_host *host);
 };
 
 struct mmc_card;
-- 
cgit v1.2.3


From b57c43ad81602589afca3948a5a7121e40026e17 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus-list@drzeus.cx>
Date: Tue, 6 Sep 2005 15:18:53 -0700
Subject: [PATCH] sd: SCR register

Read the SD specific SCR register from the card.

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/mmc/mmc.c        | 143 +++++++++++++++++++++++++++++++++++++++++++----
 include/linux/mmc/card.h |   9 +++
 2 files changed, 142 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c
index 725c6ad3eb64..21d4fb3314f8 100644
--- a/drivers/mmc/mmc.c
+++ b/drivers/mmc/mmc.c
@@ -16,6 +16,8 @@
 #include <linux/delay.h>
 #include <linux/pagemap.h>
 #include <linux/err.h>
+#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
@@ -246,6 +248,8 @@ int mmc_wait_for_app_cmd(struct mmc_host *host, unsigned int rca,
 
 EXPORT_SYMBOL(mmc_wait_for_app_cmd);
 
+static int mmc_select_card(struct mmc_host *host, struct mmc_card *card);
+
 /**
  *	__mmc_claim_host - exclusively claim a host
  *	@host: mmc host to claim
@@ -278,16 +282,10 @@ int __mmc_claim_host(struct mmc_host *host, struct mmc_card *card)
 	spin_unlock_irqrestore(&host->lock, flags);
 	remove_wait_queue(&host->wq, &wait);
 
-	if (card != (void *)-1 && host->card_selected != card) {
-		struct mmc_command cmd;
-
-		host->card_selected = card;
-
-		cmd.opcode = MMC_SELECT_CARD;
-		cmd.arg = card->rca << 16;
-		cmd.flags = MMC_RSP_R1;
-
-		err = mmc_wait_for_cmd(host, &cmd, CMD_RETRIES);
+	if (card != (void *)-1) {
+		err = mmc_select_card(host, card);
+		if (err != MMC_ERR_NONE)
+			return err;
 	}
 
 	return err;
@@ -317,6 +315,29 @@ void mmc_release_host(struct mmc_host *host)
 
 EXPORT_SYMBOL(mmc_release_host);
 
+static int mmc_select_card(struct mmc_host *host, struct mmc_card *card)
+{
+	int err;
+	struct mmc_command cmd;
+
+	BUG_ON(host->card_busy == NULL);
+
+	if (host->card_selected == card)
+		return MMC_ERR_NONE;
+
+	host->card_selected = card;
+
+	cmd.opcode = MMC_SELECT_CARD;
+	cmd.arg = card->rca << 16;
+	cmd.flags = MMC_RSP_R1;
+
+	err = mmc_wait_for_cmd(host, &cmd, CMD_RETRIES);
+	if (err != MMC_ERR_NONE)
+		return err;
+
+	return MMC_ERR_NONE;
+}
+
 /*
  * Ensure that no card is selected.
  */
@@ -525,6 +546,32 @@ static void mmc_decode_csd(struct mmc_card *card)
 	}
 }
 
+/*
+ * Given a 64-bit response, decode to our card SCR structure.
+ */
+static void mmc_decode_scr(struct mmc_card *card)
+{
+	struct sd_scr *scr = &card->scr;
+	unsigned int scr_struct;
+	u32 resp[4];
+
+	BUG_ON(!mmc_card_sd(card));
+
+	resp[3] = card->raw_scr[1];
+	resp[2] = card->raw_scr[0];
+
+	scr_struct = UNSTUFF_BITS(resp, 60, 4);
+	if (scr_struct != 0) {
+		printk("%s: unrecognised SCR structure version %d\n",
+			mmc_hostname(card->host), scr_struct);
+		mmc_card_set_bad(card);
+		return;
+	}
+
+	scr->sda_vsn = UNSTUFF_BITS(resp, 56, 4);
+	scr->bus_widths = UNSTUFF_BITS(resp, 48, 4);
+}
+
 /*
  * Locate a MMC card on this MMC host given a raw CID.
  */
@@ -789,6 +836,79 @@ static void mmc_read_csds(struct mmc_host *host)
 	}
 }
 
+static void mmc_read_scrs(struct mmc_host *host)
+{
+	int err;
+	struct mmc_card *card;
+
+	struct mmc_request mrq;
+	struct mmc_command cmd;
+	struct mmc_data data;
+
+	struct scatterlist sg;
+
+	list_for_each_entry(card, &host->cards, node) {
+		if (card->state & (MMC_STATE_DEAD|MMC_STATE_PRESENT))
+			continue;
+		if (!mmc_card_sd(card))
+			continue;
+
+		err = mmc_select_card(host, card);
+		if (err != MMC_ERR_NONE) {
+			mmc_card_set_dead(card);
+			continue;
+		}
+
+		memset(&cmd, 0, sizeof(struct mmc_command));
+
+		cmd.opcode = MMC_APP_CMD;
+		cmd.arg = card->rca << 16;
+		cmd.flags = MMC_RSP_R1;
+
+		err = mmc_wait_for_cmd(host, &cmd, 0);
+		if ((err != MMC_ERR_NONE) || !(cmd.resp[0] & R1_APP_CMD)) {
+			mmc_card_set_dead(card);
+			continue;
+		}
+
+		memset(&cmd, 0, sizeof(struct mmc_command));
+
+		cmd.opcode = SD_APP_SEND_SCR;
+		cmd.arg = 0;
+		cmd.flags = MMC_RSP_R1;
+
+		memset(&data, 0, sizeof(struct mmc_data));
+
+		data.timeout_ns = card->csd.tacc_ns * 10;
+		data.timeout_clks = card->csd.tacc_clks * 10;
+		data.blksz_bits = 3;
+		data.blocks = 1;
+		data.flags = MMC_DATA_READ;
+		data.sg = &sg;
+		data.sg_len = 1;
+
+		memset(&mrq, 0, sizeof(struct mmc_request));
+
+		mrq.cmd = &cmd;
+		mrq.data = &data;
+
+		sg_init_one(&sg, (u8*)card->raw_scr, 8);
+
+		err = mmc_wait_for_req(host, &mrq);
+		if (err != MMC_ERR_NONE) {
+			mmc_card_set_dead(card);
+			continue;
+		}
+
+		card->raw_scr[0] = ntohl(card->raw_scr[0]);
+		card->raw_scr[1] = ntohl(card->raw_scr[1]);
+
+		mmc_decode_scr(card);
+	}
+
+	mmc_deselect_cards(host);
+}
+
 static unsigned int mmc_calculate_clock(struct mmc_host *host)
 {
 	struct mmc_card *card;
@@ -912,6 +1032,9 @@ static void mmc_setup(struct mmc_host *host)
 	host->ops->set_ios(host, &host->ios);
 
 	mmc_read_csds(host);
+
+	if (host->mode == MMC_MODE_SD)
+		mmc_read_scrs(host);
 }
 
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 0e9ec01b9c5b..18fc77f682de 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -33,6 +33,13 @@ struct mmc_csd {
 	unsigned int		capacity;
 };
 
+struct sd_scr {
+	unsigned char		sda_vsn;
+	unsigned char		bus_widths;
+#define SD_SCR_BUS_WIDTH_1	(1<<0)
+#define SD_SCR_BUS_WIDTH_4	(1<<2)
+};
+
 struct mmc_host;
 
 /*
@@ -51,8 +58,10 @@ struct mmc_card {
 #define MMC_STATE_READONLY	(1<<4)		/* card is read-only */
 	u32			raw_cid[4];	/* raw card CID */
 	u32			raw_csd[4];	/* raw card CSD */
+	u32			raw_scr[2];	/* raw card SCR */
 	struct mmc_cid		cid;		/* card identification */
 	struct mmc_csd		csd;		/* card specific */
+	struct sd_scr		scr;		/* extra SD information */
 };
 
 #define mmc_card_present(c)	((c)->state & MMC_STATE_PRESENT)
-- 
cgit v1.2.3


From f218278a456b3c272b480443c89004c3d2a49f18 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus-list@drzeus.cx>
Date: Tue, 6 Sep 2005 15:18:55 -0700
Subject: [PATCH] sd: SD 4-bit bus

Infrastructure for 4-bit bus transfers with SD cards.

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/mmc/mmc.c            | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/mmc/host.h     |  9 +++++++++
 include/linux/mmc/protocol.h |  7 +++++++
 3 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/mmc.c b/drivers/mmc/mmc.c
index 21d4fb3314f8..6414f071a2a4 100644
--- a/drivers/mmc/mmc.c
+++ b/drivers/mmc/mmc.c
@@ -335,6 +335,40 @@ static int mmc_select_card(struct mmc_host *host, struct mmc_card *card)
 	if (err != MMC_ERR_NONE)
 		return err;
 
+	/*
+	 * Default bus width is 1 bit.
+	 */
+	host->ios.bus_width = MMC_BUS_WIDTH_1;
+
+	/*
+	 * We can only change the bus width of the selected
+	 * card so therefore we have to put the handling
+	 * here.
+	 */
+	if (host->caps & MMC_CAP_4_BIT_DATA) {
+		/*
+		 * The card is in 1 bit mode by default so
+		 * we only need to change if it supports the
+		 * wider version.
+		 */
+		if (mmc_card_sd(card) &&
+			(card->scr.bus_widths & SD_SCR_BUS_WIDTH_4)) {
+			struct mmc_command cmd;
+			cmd.opcode = SD_APP_SET_BUS_WIDTH;
+			cmd.arg = SD_BUS_WIDTH_4;
+			cmd.flags = MMC_RSP_R1;
+
+			err = mmc_wait_for_app_cmd(host, card->rca, &cmd,
+				CMD_RETRIES);
+			if (err != MMC_ERR_NONE)
+				return err;
+
+			host->ios.bus_width = MMC_BUS_WIDTH_4;
+		}
+	}
+
+	host->ops->set_ios(host, &host->ios);
+
 	return MMC_ERR_NONE;
 }
 
@@ -653,6 +687,7 @@ static void mmc_power_up(struct mmc_host *host)
 	host->ios.bus_mode = MMC_BUSMODE_OPENDRAIN;
 	host->ios.chip_select = MMC_CS_DONTCARE;
 	host->ios.power_mode = MMC_POWER_UP;
+	host->ios.bus_width = MMC_BUS_WIDTH_1;
 	host->ops->set_ios(host, &host->ios);
 
 	mmc_delay(1);
@@ -671,6 +706,7 @@ static void mmc_power_off(struct mmc_host *host)
 	host->ios.bus_mode = MMC_BUSMODE_OPENDRAIN;
 	host->ios.chip_select = MMC_CS_DONTCARE;
 	host->ios.power_mode = MMC_POWER_OFF;
+	host->ios.bus_width = MMC_BUS_WIDTH_1;
 	host->ops->set_ios(host, &host->ios);
 }
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 8c5f71376e41..6014160d9c06 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -57,6 +57,11 @@ struct mmc_ios {
 #define MMC_POWER_OFF		0
 #define MMC_POWER_UP		1
 #define MMC_POWER_ON		2
+
+	unsigned char	bus_width;		/* data bus width */
+
+#define MMC_BUS_WIDTH_1		0
+#define MMC_BUS_WIDTH_4		2
 };
 
 struct mmc_host_ops {
@@ -77,6 +82,10 @@ struct mmc_host {
 	unsigned int		f_max;
 	u32			ocr_avail;
 
+	unsigned long		caps;		/* Host capabilities */
+
+#define MMC_CAP_4_BIT_DATA	(1 << 0)	/* Can the host do 4 bit transfers */
+
 	/* host specific block data */
 	unsigned int		max_seg_size;	/* see blk_queue_max_segment_size */
 	unsigned short		max_hw_segs;	/* see blk_queue_max_hw_segments */
diff --git a/include/linux/mmc/protocol.h b/include/linux/mmc/protocol.h
index 896342817b97..f819cae92266 100644
--- a/include/linux/mmc/protocol.h
+++ b/include/linux/mmc/protocol.h
@@ -236,5 +236,12 @@ struct _mmc_csd {
 #define CSD_SPEC_VER_2      2           /* Implements system specification 2.0 - 2.2 */
 #define CSD_SPEC_VER_3      3           /* Implements system specification 3.1 */
 
+
+/*
+ * SD bus widths
+ */
+#define SD_BUS_WIDTH_1      0
+#define SD_BUS_WIDTH_4      2
+
 #endif  /* MMC_MMC_PROTOCOL_H */
 
-- 
cgit v1.2.3


From 3158106685acac8f8d4e74a17b974f160fe77c0b Mon Sep 17 00:00:00 2001
From: Richard Purdie <rpurdie@rpsys.net>
Date: Tue, 6 Sep 2005 15:19:06 -0700
Subject: [PATCH] Input: Add a new switch event type

The corgi keyboard has need of a switch event type with slightly type to the
input system as recommended by the input maintainer.

Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Cc: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/input/evdev.c             |  8 ++++++++
 drivers/input/input.c             | 11 +++++++++++
 drivers/input/keyboard/corgikbd.c |  9 +++++----
 include/linux/input.h             | 25 +++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index f8b278d3559b..19c14c4beb44 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -393,6 +393,7 @@ static long evdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 						case EV_LED: bits = dev->ledbit; len = LED_MAX; break;
 						case EV_SND: bits = dev->sndbit; len = SND_MAX; break;
 						case EV_FF:  bits = dev->ffbit;  len = FF_MAX;  break;
+						case EV_SW:  bits = dev->swbit;  len = SW_MAX;  break;
 						default: return -EINVAL;
 					}
 					len = NBITS(len) * sizeof(long);
@@ -421,6 +422,13 @@ static long evdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 					return copy_to_user(p, dev->snd, len) ? -EFAULT : len;
 				}
 
+				if (_IOC_NR(cmd) == _IOC_NR(EVIOCGSW(0))) {
+					int len;
+					len = NBITS(SW_MAX) * sizeof(long);
+					if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd);
+					return copy_to_user(p, dev->sw, len) ? -EFAULT : len;
+				}
+
 				if (_IOC_NR(cmd) == _IOC_NR(EVIOCGNAME(0))) {
 					int len;
 					if (!dev->name) return -ENOENT;
diff --git a/drivers/input/input.c b/drivers/input/input.c
index a275211c8e1e..88636a204525 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -89,6 +89,15 @@ void input_event(struct input_dev *dev, unsigned int type, unsigned int code, in
 
 			break;
 
+		case EV_SW:
+
+			if (code > SW_MAX || !test_bit(code, dev->swbit) || !!test_bit(code, dev->sw) == value)
+				return;
+
+			change_bit(code, dev->sw);
+
+			break;
+
 		case EV_ABS:
 
 			if (code > ABS_MAX || !test_bit(code, dev->absbit))
@@ -402,6 +411,7 @@ static void input_call_hotplug(char *verb, struct input_dev *dev)
 	SPRINTF_BIT_A2(ledbit, "LED=", LED_MAX, EV_LED);
 	SPRINTF_BIT_A2(sndbit, "SND=", SND_MAX, EV_SND);
 	SPRINTF_BIT_A2(ffbit,  "FF=",  FF_MAX, EV_FF);
+	SPRINTF_BIT_A2(swbit,  "SW=",  SW_MAX, EV_SW);
 
 	envp[i++] = NULL;
 
@@ -490,6 +500,7 @@ static int input_devices_read(char *buf, char **start, off_t pos, int count, int
 		SPRINTF_BIT_B2(ledbit, "LED=", LED_MAX, EV_LED);
 		SPRINTF_BIT_B2(sndbit, "SND=", SND_MAX, EV_SND);
 		SPRINTF_BIT_B2(ffbit,  "FF=",  FF_MAX, EV_FF);
+		SPRINTF_BIT_B2(swbit,  "SW=",  SW_MAX, EV_SW);
 
 		len += sprintf(buf + len, "\n");
 
diff --git a/drivers/input/keyboard/corgikbd.c b/drivers/input/keyboard/corgikbd.c
index 767e853dd766..cd4b6e795013 100644
--- a/drivers/input/keyboard/corgikbd.c
+++ b/drivers/input/keyboard/corgikbd.c
@@ -249,9 +249,8 @@ static void corgikbd_hinge_timer(unsigned long data)
 		if (hinge_count >= HINGE_STABLE_COUNT) {
 			spin_lock_irqsave(&corgikbd_data->lock, flags);
 
-			input_report_key(&corgikbd_data->input, corgikbd_data->keycode[125], (sharpsl_hinge_state == 0x00));
-			input_report_key(&corgikbd_data->input, corgikbd_data->keycode[126], (sharpsl_hinge_state == 0x08));
-			input_report_key(&corgikbd_data->input, corgikbd_data->keycode[127], (sharpsl_hinge_state == 0x0c));
+			input_report_switch(&corgikbd_data->input, SW_0, ((sharpsl_hinge_state & CORGI_SCP_SWA) != 0));
+			input_report_switch(&corgikbd_data->input, SW_1, ((sharpsl_hinge_state & CORGI_SCP_SWB) != 0));
 			input_sync(&corgikbd_data->input);
 
 			spin_unlock_irqrestore(&corgikbd_data->lock, flags);
@@ -321,7 +320,7 @@ static int __init corgikbd_probe(struct device *dev)
 	corgikbd->input.id.vendor = 0x0001;
 	corgikbd->input.id.product = 0x0001;
 	corgikbd->input.id.version = 0x0100;
-	corgikbd->input.evbit[0] = BIT(EV_KEY) | BIT(EV_REP) | BIT(EV_PWR);
+	corgikbd->input.evbit[0] = BIT(EV_KEY) | BIT(EV_REP) | BIT(EV_PWR) | BIT(EV_SW);
 	corgikbd->input.keycode = corgikbd->keycode;
 	corgikbd->input.keycodesize = sizeof(unsigned char);
 	corgikbd->input.keycodemax = ARRAY_SIZE(corgikbd_keycode);
@@ -330,6 +329,8 @@ static int __init corgikbd_probe(struct device *dev)
 	for (i = 0; i < ARRAY_SIZE(corgikbd_keycode); i++)
 		set_bit(corgikbd->keycode[i], corgikbd->input.keybit);
 	clear_bit(0, corgikbd->input.keybit);
+	set_bit(SW_0, corgikbd->input.swbit);
+	set_bit(SW_1, corgikbd->input.swbit);
 
 	input_register_device(&corgikbd->input);
 	mod_timer(&corgikbd->htimer, jiffies + HINGE_SCAN_INTERVAL);
diff --git a/include/linux/input.h b/include/linux/input.h
index bdc53c6cc962..4767e5429534 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -66,6 +66,7 @@ struct input_absinfo {
 #define EVIOCGKEY(len)		_IOC(_IOC_READ, 'E', 0x18, len)		/* get global keystate */
 #define EVIOCGLED(len)		_IOC(_IOC_READ, 'E', 0x19, len)		/* get all LEDs */
 #define EVIOCGSND(len)		_IOC(_IOC_READ, 'E', 0x1a, len)		/* get all sounds status */
+#define EVIOCGSW(len)		_IOC(_IOC_READ, 'E', 0x1b, len)		/* get all switch states */
 
 #define EVIOCGBIT(ev,len)	_IOC(_IOC_READ, 'E', 0x20 + ev, len)	/* get event bits */
 #define EVIOCGABS(abs)		_IOR('E', 0x40 + abs, struct input_absinfo)		/* get abs value/limits */
@@ -86,6 +87,7 @@ struct input_absinfo {
 #define EV_REL			0x02
 #define EV_ABS			0x03
 #define EV_MSC			0x04
+#define EV_SW			0x05
 #define EV_LED			0x11
 #define EV_SND			0x12
 #define EV_REP			0x14
@@ -550,6 +552,20 @@ struct input_absinfo {
 #define ABS_MISC		0x28
 #define ABS_MAX			0x3f
 
+/*
+ * Switch events
+ */
+
+#define SW_0		0x00
+#define SW_1		0x01
+#define SW_2		0x02
+#define SW_3		0x03
+#define SW_4		0x04
+#define SW_5		0x05
+#define SW_6		0x06
+#define SW_7		0x07
+#define SW_MAX		0x0f
+
 /*
  * Misc events
  */
@@ -824,6 +840,7 @@ struct input_dev {
 	unsigned long ledbit[NBITS(LED_MAX)];
 	unsigned long sndbit[NBITS(SND_MAX)];
 	unsigned long ffbit[NBITS(FF_MAX)];
+	unsigned long swbit[NBITS(SW_MAX)];
 	int ff_effects_max;
 
 	unsigned int keycodemax;
@@ -844,6 +861,7 @@ struct input_dev {
 	unsigned long key[NBITS(KEY_MAX)];
 	unsigned long led[NBITS(LED_MAX)];
 	unsigned long snd[NBITS(SND_MAX)];
+	unsigned long sw[NBITS(SW_MAX)];
 
 	int absmax[ABS_MAX + 1];
 	int absmin[ABS_MAX + 1];
@@ -886,6 +904,7 @@ struct input_dev {
 #define INPUT_DEVICE_ID_MATCH_LEDBIT	0x200
 #define INPUT_DEVICE_ID_MATCH_SNDBIT	0x400
 #define INPUT_DEVICE_ID_MATCH_FFBIT	0x800
+#define INPUT_DEVICE_ID_MATCH_SWBIT	0x1000
 
 #define INPUT_DEVICE_ID_MATCH_DEVICE\
 	(INPUT_DEVICE_ID_MATCH_BUS | INPUT_DEVICE_ID_MATCH_VENDOR | INPUT_DEVICE_ID_MATCH_PRODUCT)
@@ -906,6 +925,7 @@ struct input_device_id {
 	unsigned long ledbit[NBITS(LED_MAX)];
 	unsigned long sndbit[NBITS(SND_MAX)];
 	unsigned long ffbit[NBITS(FF_MAX)];
+	unsigned long swbit[NBITS(SW_MAX)];
 
 	unsigned long driver_info;
 };
@@ -998,6 +1018,11 @@ static inline void input_report_ff_status(struct input_dev *dev, unsigned int co
 	input_event(dev, EV_FF_STATUS, code, value);
 }
 
+static inline void input_report_switch(struct input_dev *dev, unsigned int code, int value)
+{
+	input_event(dev, EV_SW, code, !!value);
+}
+
 static inline void input_regs(struct input_dev *dev, struct pt_regs *regs)
 {
 	dev->regs = regs;
-- 
cgit v1.2.3


From a7662236253374012d364106b6dc9161bd929e2e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 6 Sep 2005 15:19:10 -0700
Subject: [PATCH] Make ll_rw_block() wait for buffer lock

Introduce new ll_rw_block() operation SWRITE meaning that block layer should
wait for the buffer lock and write-out afterwards.  Hence data in buffers at
the time of call are guaranteed to be submitted to the disk.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c        | 30 ++++++++++++++++--------------
 include/linux/fs.h |  1 +
 2 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index a92b81403552..1c62203a4906 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -917,8 +917,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * contents - it is a noop if I/O is still in
 				 * flight on potentially older contents.
 				 */
-				wait_on_buffer(bh);
-				ll_rw_block(WRITE, 1, &bh);
+				ll_rw_block(SWRITE, 1, &bh);
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -2793,21 +2792,22 @@ int submit_bh(int rw, struct buffer_head * bh)
 
 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
  * @nr: number of &struct buffer_heads in the array
  * @bhs: array of pointers to &struct buffer_head
  *
- * ll_rw_block() takes an array of pointers to &struct buffer_heads,
- * and requests an I/O operation on them, either a %READ or a %WRITE.
- * The third %READA option is described in the documentation for
- * generic_make_request() which ll_rw_block() calls.
+ * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
+ * requests an I/O operation on them, either a %READ or a %WRITE.  The third
+ * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
+ * are sent to disk. The fourth %READA option is described in the documentation
+ * for generic_make_request() which ll_rw_block() calls.
  *
  * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit), any buffer that appears to be clean when doing a
- * write request, and any buffer that appears to be up-to-date when doing
- * read request.  Further it marks as clean buffers that are processed for
- * writing (the buffer cache won't assume that they are actually clean until
- * the buffer gets unlocked).
+ * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
+ * clean when doing a write request, and any buffer that appears to be
+ * up-to-date when doing read request.  Further it marks as clean buffers that
+ * are processed for writing (the buffer cache won't assume that they are
+ * actually clean until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -2823,11 +2823,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (test_set_buffer_locked(bh))
+		if (rw == SWRITE)
+			lock_buffer(bh);
+		else if (test_set_buffer_locked(bh))
 			continue;
 
 		get_bh(bh);
-		if (rw == WRITE) {
+		if (rw == WRITE || rw == SWRITE) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				submit_bh(WRITE, bh);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7e1b589842af..fd93ab7da905 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -69,6 +69,7 @@ extern int dir_notify_enable;
 #define READ 0
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
+#define SWRITE 3	/* for ll_rw_block() - wait for buffer lock */
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
-- 
cgit v1.2.3


From d0aaff9796c3310326d10da44fc0faed352a1d29 Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Tue, 6 Sep 2005 15:19:26 -0700
Subject: [PATCH] Kprobes: prevent possible race conditions generic

There are possible race conditions if probes are placed on routines within the
kprobes files and routines used by the kprobes.  For example if you put probe
on get_kprobe() routines, the system can hang while inserting probes on any
routine such as do_fork().  Because while inserting probes on do_fork(),
register_kprobes() routine grabs the kprobes spin lock and executes
get_kprobe() routine and to handle probe of get_kprobe(), kprobes_handler()
gets executed and tries to grab kprobes spin lock, and spins forever.  This
patch avoids such possible race conditions by preventing probes on routines
within the kprobes file and routines used by kprobes.

I have modified the patches as per Andi Kleen's suggestion to move kprobes
routines and other routines used by kprobes to a seperate section
.kprobes.text.

Also moved page fault and exception handlers, general protection fault to
.kprobes.text section.

These patches have been tested on i386, x86_64 and ppc64 architectures, also
compiled on ia64 and sparc64 architectures.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/sections.h    |  1 +
 include/asm-generic/vmlinux.lds.h |  6 ++++
 include/linux/kprobes.h           |  3 ++
 include/linux/linkage.h           |  7 ++++
 kernel/kprobes.c                  | 72 +++++++++++++++++++++++----------------
 5 files changed, 60 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 450eae22c39a..886dbd116899 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -12,5 +12,6 @@ extern char _sextratext[] __attribute__((weak));
 extern char _eextratext[] __attribute__((weak));
 extern char _end[];
 extern char __per_cpu_start[], __per_cpu_end[];
+extern char __kprobes_text_start[], __kprobes_text_end[];
 
 #endif /* _ASM_GENERIC_SECTIONS_H_ */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 3fa94288aa93..6f857be2b644 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -97,3 +97,9 @@
 		VMLINUX_SYMBOL(__lock_text_start) = .;			\
 		*(.spinlock.text)					\
 		VMLINUX_SYMBOL(__lock_text_end) = .;
+
+#define KPROBES_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__kprobes_text_start) = .;		\
+		*(.kprobes.text)					\
+		VMLINUX_SYMBOL(__kprobes_text_end) = .;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e050fc2d4c26..e30afdca7917 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -42,6 +42,9 @@
 #define KPROBE_REENTER		0x00000004
 #define KPROBE_HIT_SSDONE	0x00000008
 
+/* Attach to insert probes on any functions which should be ignored*/
+#define __kprobes	__attribute__((__section__(".kprobes.text")))
+
 struct kprobe;
 struct pt_regs;
 struct kretprobe;
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 338f7795d8a0..147eb01e0d4b 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -33,6 +33,13 @@
   ALIGN; \
   name:
 
+#define KPROBE_ENTRY(name) \
+  .section .kprobes.text, "ax"; \
+  .globl name; \
+  ALIGN; \
+  name:
+
+
 #endif
 
 #define NORET_TYPE    /**/
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..3b7653f2e7ae 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
 #include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
  * get_insn_slot() - Find a slot on an executable page for an instruction.
  * We allocate an executable page if there's no room on existing ones.
  */
-kprobe_opcode_t *get_insn_slot(void)
+kprobe_opcode_t __kprobes *get_insn_slot(void)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
 	return kip->insns;
 }
 
-void free_insn_slot(kprobe_opcode_t *slot)
+void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -152,20 +153,20 @@ void free_insn_slot(kprobe_opcode_t *slot)
 }
 
 /* Locks kprobe: irqs must be disabled */
-void lock_kprobes(void)
+void __kprobes lock_kprobes(void)
 {
 	spin_lock(&kprobe_lock);
 	kprobe_cpu = smp_processor_id();
 }
 
-void unlock_kprobes(void)
+void __kprobes unlock_kprobes(void)
 {
 	kprobe_cpu = NR_CPUS;
 	spin_unlock(&kprobe_lock);
 }
 
 /* You have to be holding the kprobe_lock */
-struct kprobe *get_kprobe(void *addr)
+struct kprobe __kprobes *get_kprobe(void *addr)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -183,7 +184,7 @@ struct kprobe *get_kprobe(void *addr)
  * Aggregate handlers for multiple kprobes support - these handlers
  * take care of invoking the individual kprobe handlers on p->list
  */
-static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp;
 
@@ -198,8 +199,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
-			      unsigned long flags)
+static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+					unsigned long flags)
 {
 	struct kprobe *kp;
 
@@ -213,8 +214,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	return;
 }
 
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
-			      int trapnr)
+static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+					int trapnr)
 {
 	/*
 	 * if we faulted "during" the execution of a user specified
@@ -227,7 +228,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 	return 0;
 }
 
-static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp = curr_kprobe;
 	if (curr_kprobe && kp->break_handler) {
@@ -240,7 +241,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 {
 	struct hlist_node *node;
 	struct kretprobe_instance *ri;
@@ -249,7 +250,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
-static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
+							      *rp)
 {
 	struct hlist_node *node;
 	struct kretprobe_instance *ri;
@@ -258,7 +260,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
-void add_rp_inst(struct kretprobe_instance *ri)
+void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 {
 	/*
 	 * Remove rp inst off the free list -
@@ -276,7 +278,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
 	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
 }
 
-void recycle_rp_inst(struct kretprobe_instance *ri)
+void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
 {
 	/* remove rp inst off the rprobe_inst_table */
 	hlist_del(&ri->hlist);
@@ -291,7 +293,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
 		kfree(ri);
 }
 
-struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
 {
 	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
 }
@@ -302,7 +304,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
  * instances associated with this task. These left over instances represent
  * probed functions that have been called but will never return.
  */
-void kprobe_flush_task(struct task_struct *tk)
+void __kprobes kprobe_flush_task(struct task_struct *tk)
 {
         struct kretprobe_instance *ri;
         struct hlist_head *head;
@@ -322,7 +324,8 @@ void kprobe_flush_task(struct task_struct *tk)
  * This kprobe pre_handler is registered with every kretprobe. When probe
  * hits it will set up the return probe.
  */
-static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+					   struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
 
@@ -353,7 +356,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 * Add the new probe to old_p->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
-static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
 {
         struct kprobe *kp;
 
@@ -395,7 +398,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
  * the intricacies
  * TODO: Move kcalloc outside the spinlock
  */
-static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+					  struct kprobe *p)
 {
 	int ret = 0;
 	struct kprobe *ap;
@@ -434,15 +438,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
 		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
-int register_kprobe(struct kprobe *p)
+static int __kprobes in_kprobes_functions(unsigned long addr)
+{
+	if (addr >= (unsigned long)__kprobes_text_start
+		&& addr < (unsigned long)__kprobes_text_end)
+		return -EINVAL;
+	return 0;
+}
+
+int __kprobes register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	unsigned long flags = 0;
 	struct kprobe *old_p;
 
-	if ((ret = arch_prepare_kprobe(p)) != 0) {
+	if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
+		return ret;
+	if ((ret = arch_prepare_kprobe(p)) != 0)
 		goto rm_kprobe;
-	}
+
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
 	p->nmissed = 0;
@@ -466,7 +480,7 @@ rm_kprobe:
 	return ret;
 }
 
-void unregister_kprobe(struct kprobe *p)
+void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	unsigned long flags;
 	struct kprobe *old_p;
@@ -487,7 +501,7 @@ static struct notifier_block kprobe_exceptions_nb = {
 	.priority = 0x7fffffff /* we need to notified first */
 };
 
-int register_jprobe(struct jprobe *jp)
+int __kprobes register_jprobe(struct jprobe *jp)
 {
 	/* Todo: Verify probepoint is a function entry point */
 	jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +510,14 @@ int register_jprobe(struct jprobe *jp)
 	return register_kprobe(&jp->kp);
 }
 
-void unregister_jprobe(struct jprobe *jp)
+void __kprobes unregister_jprobe(struct jprobe *jp)
 {
 	unregister_kprobe(&jp->kp);
 }
 
 #ifdef ARCH_SUPPORTS_KRETPROBES
 
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	int ret = 0;
 	struct kretprobe_instance *inst;
@@ -540,14 +554,14 @@ int register_kretprobe(struct kretprobe *rp)
 
 #else /* ARCH_SUPPORTS_KRETPROBES */
 
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	return -ENOSYS;
 }
 
 #endif /* ARCH_SUPPORTS_KRETPROBES */
 
-void unregister_kretprobe(struct kretprobe *rp)
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 	unsigned long flags;
 	struct kretprobe_instance *ri;
-- 
cgit v1.2.3


From 34bb61f9ddabd7a7f909cbfb05592eb775f6662a Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Tue, 6 Sep 2005 16:56:51 -0700
Subject: [PATCH] fix klist semantics for lists which have elements removed on
 traversal

The problem is that klists claim to provide semantics for safe traversal of
lists which are being modified.  The failure case is when traversal of a
list causes element removal (a fairly common case).  The issue is that
although the list node is refcounted, if it is embedded in an object (which
is universally the case), then the object will be freed regardless of the
klist refcount leading to slab corruption because the klist iterator refers
to the prior element to get the next.

The solution is to make the klist take and release references to the
embedding object meaning that the embedding object won't be released until
the list relinquishes the reference to it.

(akpm: fast-track this because it's needed for the 2.6.13 scsi merge)

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/bus.c    | 34 ++++++++++++++++++++++++++++++++--
 drivers/base/core.c   | 17 ++++++++++++++++-
 drivers/base/driver.c | 15 ++++++++++++++-
 include/linux/klist.h |  8 +++++---
 lib/klist.c           | 18 +++++++++++++++++-
 5 files changed, 84 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 17e96698410e..03204bfd17af 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -568,6 +568,36 @@ static void bus_remove_attrs(struct bus_type * bus)
 	}
 }
 
+static void klist_devices_get(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_bus);
+
+	get_device(dev);
+}
+
+static void klist_devices_put(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_bus);
+
+	put_device(dev);
+}
+
+static void klist_drivers_get(struct klist_node *n)
+{
+	struct device_driver *drv = container_of(n, struct device_driver,
+						 knode_bus);
+
+	get_driver(drv);
+}
+
+static void klist_drivers_put(struct klist_node *n)
+{
+	struct device_driver *drv = container_of(n, struct device_driver,
+						 knode_bus);
+
+	put_driver(drv);
+}
+
 /**
  *	bus_register - register a bus with the system.
  *	@bus:	bus.
@@ -602,8 +632,8 @@ int bus_register(struct bus_type * bus)
 	if (retval)
 		goto bus_drivers_fail;
 
-	klist_init(&bus->klist_devices);
-	klist_init(&bus->klist_drivers);
+	klist_init(&bus->klist_devices, klist_devices_get, klist_devices_put);
+	klist_init(&bus->klist_drivers, klist_drivers_get, klist_drivers_put);
 	bus_add_attrs(bus);
 
 	pr_debug("bus type '%s' registered\n", bus->name);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index c8a33df00761..6ab73f5c799a 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -191,6 +191,20 @@ void device_remove_file(struct device * dev, struct device_attribute * attr)
 	}
 }
 
+static void klist_children_get(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_parent);
+
+	get_device(dev);
+}
+
+static void klist_children_put(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_parent);
+
+	put_device(dev);
+}
+
 
 /**
  *	device_initialize - init device structure.
@@ -207,7 +221,8 @@ void device_initialize(struct device *dev)
 {
 	kobj_set_kset_s(dev, devices_subsys);
 	kobject_init(&dev->kobj);
-	klist_init(&dev->klist_children);
+	klist_init(&dev->klist_children, klist_children_get,
+		   klist_children_put);
 	INIT_LIST_HEAD(&dev->dma_pools);
 	init_MUTEX(&dev->sem);
 }
diff --git a/drivers/base/driver.c b/drivers/base/driver.c
index 291c5954a3af..ef3fe513e398 100644
--- a/drivers/base/driver.c
+++ b/drivers/base/driver.c
@@ -142,6 +142,19 @@ void put_driver(struct device_driver * drv)
 	kobject_put(&drv->kobj);
 }
 
+static void klist_devices_get(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_driver);
+
+	get_device(dev);
+}
+
+static void klist_devices_put(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_driver);
+
+	put_device(dev);
+}
 
 /**
  *	driver_register - register driver with bus
@@ -157,7 +170,7 @@ void put_driver(struct device_driver * drv)
  */
 int driver_register(struct device_driver * drv)
 {
-	klist_init(&drv->klist_devices);
+	klist_init(&drv->klist_devices, klist_devices_get, klist_devices_put);
 	init_completion(&drv->unloaded);
 	return bus_add_driver(drv);
 }
diff --git a/include/linux/klist.h b/include/linux/klist.h
index c4d1fae4dd89..74071254c9d3 100644
--- a/include/linux/klist.h
+++ b/include/linux/klist.h
@@ -17,15 +17,17 @@
 #include <linux/kref.h>
 #include <linux/list.h>
 
-
+struct klist_node;
 struct klist {
 	spinlock_t		k_lock;
 	struct list_head	k_list;
+	void			(*get)(struct klist_node *);
+	void			(*put)(struct klist_node *);
 };
 
 
-extern void klist_init(struct klist * k);
-
+extern void klist_init(struct klist * k, void (*get)(struct klist_node *),
+		       void (*put)(struct klist_node *));
 
 struct klist_node {
 	struct klist		* n_klist;
diff --git a/lib/klist.c b/lib/klist.c
index a70c836c5c4c..bb2f3551d50a 100644
--- a/lib/klist.c
+++ b/lib/klist.c
@@ -42,12 +42,23 @@
 /**
  *	klist_init - Initialize a klist structure. 
  *	@k:	The klist we're initializing.
+ *	@get:	The get function for the embedding object (NULL if none)
+ *	@put:	The put function for the embedding object (NULL if none)
+ *
+ * Initialises the klist structure.  If the klist_node structures are
+ * going to be embedded in refcounted objects (necessary for safe
+ * deletion) then the get/put arguments are used to initialise
+ * functions that take and release references on the embedding
+ * objects.
  */
 
-void klist_init(struct klist * k)
+void klist_init(struct klist * k, void (*get)(struct klist_node *),
+		void (*put)(struct klist_node *))
 {
 	INIT_LIST_HEAD(&k->k_list);
 	spin_lock_init(&k->k_lock);
+	k->get = get;
+	k->put = put;
 }
 
 EXPORT_SYMBOL_GPL(klist_init);
@@ -74,6 +85,8 @@ static void klist_node_init(struct klist * k, struct klist_node * n)
 	init_completion(&n->n_removed);
 	kref_init(&n->n_ref);
 	n->n_klist = k;
+	if (k->get)
+		k->get(n);
 }
 
 
@@ -110,9 +123,12 @@ EXPORT_SYMBOL_GPL(klist_add_tail);
 static void klist_release(struct kref * kref)
 {
 	struct klist_node * n = container_of(kref, struct klist_node, n_ref);
+	void (*put)(struct klist_node *) = n->n_klist->put;
 	list_del(&n->n_node);
 	complete(&n->n_removed);
 	n->n_klist = NULL;
+	if (put)
+		put(n);
 }
 
 static int klist_dec_and_del(struct klist_node * n)
-- 
cgit v1.2.3