From e9720acd728a46cb40daa52c99a979f7c4ff195c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 7 Mar 2008 11:08:40 -0800
Subject: [NET]: Make /proc/net a symlink on /proc/self/net (v3)

Current /proc/net is done with so called "shadows", but current
implementation is broken and has little chances to get fixed.

The problem is that dentries subtree of /proc/net directory has
fancy revalidation rules to make processes living in different
net namespaces see different entries in /proc/net subtree, but
currently, tasks see in the /proc/net subdir the contents of any
other namespace, depending on who opened the file first.

The proposed fix is to turn /proc/net into a symlink, which points
to /proc/self/net, which in turn shows what previously was in
/proc/net - the network-related info, from the net namespace the
appropriate task lives in.

# ls -l /proc/net
lrwxrwxrwx  1 root root 8 Mar  5 15:17 /proc/net -> self/net

In other words - this behaves like /proc/mounts, but unlike
"mounts", "net" is not a file, but a directory.

Changes from v2:
* Fixed discrepancy of /proc/net nlink count and selinux labeling
  screwup pointed out by Stephen.

  To get the correct nlink count the ->getattr callback for /proc/net
  is overridden to read one from the net->proc_net entry.

  To make selinux still work the net->proc_net entry is initialized
  properly, i.e. with the "net" name and the proc_net parent.

Selinux fixes are
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>

Changes from v1:
* Fixed a task_struct leak in get_proc_task_net, pointed out by Paul.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/proc_fs.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d9a9e718ad19..9b6c935f69cf 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -50,8 +50,6 @@ typedef	int (read_proc_t)(char *page, char **start, off_t off,
 typedef	int (write_proc_t)(struct file *file, const char __user *buffer,
 			   unsigned long count, void *data);
 typedef int (get_info_t)(char *, char **, off_t, int);
-typedef struct proc_dir_entry *(shadow_proc_t)(struct task_struct *task,
-						struct proc_dir_entry *pde);
 
 struct proc_dir_entry {
 	unsigned int low_ino;
@@ -82,7 +80,6 @@ struct proc_dir_entry {
 	int pde_users;	/* number of callers into module in progress */
 	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
 	struct completion *pde_unload_completion;
-	shadow_proc_t *shadow_proc;
 };
 
 struct kcore_list {
-- 
cgit v1.2.3


From e621e69137b24fdbbe7ad28214e8d81e614c25b7 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <k.shutemov@gmail.com>
Date: Fri, 7 Mar 2008 11:11:13 -0800
Subject: [NET]: include <linux/types.h> into linux/ethtool.h for __u* typedef

Signed-off-by: Kirill A. Shutemov <k.shutemov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index fcbe8b640ffb..c8d216357865 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/types.h>
 
 /* This should work for both 32 and 64 bit userland. */
 struct ethtool_cmd {
-- 
cgit v1.2.3


From b5e85dee2a5433246d5b7488918a1a0ad22c046a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 10 Mar 2008 16:41:06 -0700
Subject: [NETFILTER]: nfnetlink: fix ifdef in nfnetlink_compat.h

Use __KERNEL__ instead of __KERNEL to make sure the headers are not
usable by the kernel.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_compat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_compat.h b/include/linux/netfilter/nfnetlink_compat.h
index 02a42d875cf7..e1451760c9cd 100644
--- a/include/linux/netfilter/nfnetlink_compat.h
+++ b/include/linux/netfilter/nfnetlink_compat.h
@@ -1,6 +1,6 @@
 #ifndef _NFNETLINK_COMPAT_H
 #define _NFNETLINK_COMPAT_H
-#ifndef __KERNEL
+#ifndef __KERNEL__
 /* Old nfnetlink macros for userspace */
 
 /* nfnetlink groups: Up to 32 maximum */
-- 
cgit v1.2.3


From 0738c4bb8f2a8bf15178f852494643b0981f578b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 12 Mar 2008 16:51:31 +0900
Subject: nommu: Provide is_vmalloc_addr() stub.

Introduced in commit-id 9e2779fa281cfda13ac060753d674bbcaa23367e and
ifdef'ed out for nommu in 8ca3ed87db062201e1fa15b64a9214e193fc3a8a, both
approaches end up breaking the nommu build in different ways. An
impressive feat for a 2-liner.

Current is_vmalloc_addr() users fall in to two camps:

	- Determining whether to use vfree()/kfree()
	- Whether to do vmlist traversal (only /proc/kcore).

Since we don't support /proc/kcore on nommu, that leaves the
vfree()/kfree() determination use cases. nommu vfree() happens to be a
wrapper to kfree() anyways, so is_vmalloc_addr() can always return 0
and end up with the right behaviour.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f3ccfe42de0..b695875d63e3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -235,15 +235,22 @@ static inline int get_page_unless_zero(struct page *page)
 struct page *vmalloc_to_page(const void *addr);
 unsigned long vmalloc_to_pfn(const void *addr);
 
-#ifdef CONFIG_MMU
-/* Determine if an address is within the vmalloc range */
+/*
+ * Determine if an address is within the vmalloc range
+ *
+ * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
+ * is no special casing required.
+ */
 static inline int is_vmalloc_addr(const void *x)
 {
+#ifdef CONFIG_MMU
 	unsigned long addr = (unsigned long)x;
 
 	return addr >= VMALLOC_START && addr < VMALLOC_END;
-}
+#else
+	return 0;
 #endif
+}
 
 static inline struct page *compound_head(struct page *page)
 {
-- 
cgit v1.2.3


From a8ae50ba9336ff77d0df0943ac27b79ba0a5a521 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 12 Mar 2008 17:52:56 +0100
Subject: Remove <linux/genhd.h> from user-visible headers.

It was all wrapped in '#ifdef CONFIG_BLOCK' anyway, so userspace was
getting nothing useful out of it. And the special #ifndef __KERNEL__
version of 'struct partition' makes me inclined to promote an attitude
of violence...

Stick some comments on some of the #endifs too, while we're at it.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/Kbuild  |  1 -
 include/linux/genhd.h | 30 +++---------------------------
 2 files changed, 3 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 994df3780007..0fac822c1157 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -205,7 +205,6 @@ unifdef-y += futex.h
 unifdef-y += fs.h
 unifdef-y += gameport.h
 unifdef-y += generic_serial.h
-unifdef-y += genhd.h
 unifdef-y += gfs2_ondisk.h
 unifdef-y += hayesesp.h
 unifdef-y += hdlcdrv.h
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 32c2ac49a070..ecd2bf63fc84 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -55,24 +55,6 @@ enum {
 	UNIXWARE_PARTITION = 0x63,	/* Same as GNU_HURD and SCO Unix */
 };
 
-#ifndef __KERNEL__
-
-struct partition {
-	unsigned char boot_ind;		/* 0x80 - active */
-	unsigned char head;		/* starting head */
-	unsigned char sector;		/* starting sector */
-	unsigned char cyl;		/* starting cylinder */
-	unsigned char sys_ind;		/* What partition type */
-	unsigned char end_head;		/* end head */
-	unsigned char end_sector;	/* end sector */
-	unsigned char end_cyl;		/* end cylinder */
-	unsigned int start_sect;	/* starting sector counting from 0 */
-	unsigned int nr_sects;		/* nr of sectors in partition */
-} __attribute__((packed));
-
-#endif
-
-#ifdef __KERNEL__
 #include <linux/major.h>
 #include <linux/device.h>
 #include <linux/smp.h>
@@ -228,7 +210,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value)	{
 		       sizeof(struct disk_stats));
 }
 				
-#else
+#else /* !CONFIG_SMP */
 #define __disk_stat_add(gendiskp, field, addnd) \
 				(gendiskp->dkstats.field += addnd)
 #define disk_stat_read(gendiskp, field)	(gendiskp->dkstats.field)
@@ -256,7 +238,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value)
 	memset(&part->dkstats, value, sizeof(struct disk_stats));
 }
 
-#endif
+#endif /* CONFIG_SMP */
 
 #define disk_stat_add(gendiskp, field, addnd)			\
 	do {							\
@@ -395,8 +377,6 @@ static inline void set_capacity(struct gendisk *disk, sector_t size)
 	disk->capacity = size;
 }
 
-#endif  /*  __KERNEL__  */
-
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 
 #define SOLARIS_X86_NUMSLICE	16
@@ -540,8 +520,6 @@ struct unixware_disklabel {
 #   define MINIX_NR_SUBPARTITIONS  4
 #endif /* CONFIG_MINIX_SUBPARTITION */
 
-#ifdef __KERNEL__
-
 #define ADDPART_FLAG_NONE	0
 #define ADDPART_FLAG_RAID	1
 #define ADDPART_FLAG_WHOLEDISK	2
@@ -570,8 +548,6 @@ static inline struct block_device *bdget_disk(struct gendisk *disk, int index)
 	return bdget(MKDEV(disk->major, disk->first_minor) + index);
 }
 
-#endif
-
 #else /* CONFIG_BLOCK */
 
 static inline void printk_all_partitions(void) { }
@@ -584,4 +560,4 @@ static inline dev_t blk_lookup_devt(const char *name)
 
 #endif /* CONFIG_BLOCK */
 
-#endif
+#endif /* _LINUX_GENHD_H */
-- 
cgit v1.2.3


From 07c941d00087581c9553661c2c4fb593da37f525 Mon Sep 17 00:00:00 2001
From: Tony Breeds <tony@bakeyournoodle.com>
Date: Wed, 12 Mar 2008 10:48:48 +1100
Subject: [POWERPC] Fix undefined pmu_sys_suspended compilation error

pmu_sys_suspended is declared extern when:
	defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32)
but only defined when:
	defined(CONFIG_SUSPEND) && defined(CONFIG_PPC32)
which is wrong.  Let's fix that.

Signed-off-by: Tony Breeds <tony@bakeyournoodle.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/pmu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pmu.h b/include/linux/pmu.h
index 4c5f65392d36..b02b57c0fba0 100644
--- a/include/linux/pmu.h
+++ b/include/linux/pmu.h
@@ -192,7 +192,7 @@ extern unsigned int pmu_power_flags;
 extern void pmu_backlight_init(void);
 
 /* some code needs to know if the PMU was suspended for hibernation */
-#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32)
+#if defined(CONFIG_SUSPEND) && defined(CONFIG_PPC32)
 extern int pmu_sys_suspended;
 #else
 /* if power management is not configured it can't be suspended */
-- 
cgit v1.2.3


From a99d9a6ebdf8328d5c61ca9f1038f4815e25720e Mon Sep 17 00:00:00 2001
From: Tony Breeds <tony@bakeyournoodle.com>
Date: Wed, 12 Mar 2008 10:48:48 +1100
Subject: [POWERPC] Fix drivers/macintosh/mediabay.c when !CONFIG_ADB_PMU

When building drivers/macintosh/mediabay.c if CONFIG_ADB_PMU isn't
defined we get:

drivers/built-in.o: In function `media_bay_step':
mediabay.c:(.text+0x92b84): undefined reference to `pmu_suspend'
mediabay.c:(.text+0x92c08): undefined reference to `pmu_resume'

Create empty place holders in that scenario.

Signed-off-by: Tony Breeds <tony@bakeyournoodle.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/pmu.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pmu.h b/include/linux/pmu.h
index b02b57c0fba0..cafe98d96948 100644
--- a/include/linux/pmu.h
+++ b/include/linux/pmu.h
@@ -147,8 +147,15 @@ extern void pmu_wait_complete(struct adb_request *req);
 /* For use before switching interrupts off for a long time;
  * warning: not stackable
  */
+#if defined(CONFIG_ADB_PMU)
 extern void pmu_suspend(void);
 extern void pmu_resume(void);
+#else
+static inline void pmu_suspend(void)
+{}
+static inline void pmu_resume(void)
+{}
+#endif
 
 extern void pmu_enable_irled(int on);
 
-- 
cgit v1.2.3


From cc74d96f47b0d916840f92092595e3be9731e047 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 12 Mar 2008 21:48:03 -0700
Subject: PCI: fix issue with busses registering multiple times in sysfs

PCI busses can be registered multiple times, so we need to detect if we
have registered our bus structure in sysfs already.  If so, don't do it
again.

Thanks to Guennadi Liakhovetski <g.liakhovetski@gmx.de> for reporting
the problem, and to Linus for poking me to get me to believe that it was
a real problem.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/bus.c   | 6 +++++-
 include/linux/pci.h | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 6a9403d79e0c..d708358326e5 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -143,14 +143,18 @@ void pci_bus_add_devices(struct pci_bus *bus)
 			/* register the bus with sysfs as the parent is now
 			 * properly registered. */
 			child_bus = dev->subordinate;
+			if (child_bus->is_added)
+				continue;
 			child_bus->dev.parent = child_bus->bridge;
 			retval = device_register(&child_bus->dev);
 			if (retval)
 				dev_err(&dev->dev, "Error registering pci_bus,"
 					" continuing...\n");
-			else
+			else {
+				child_bus->is_added = 1;
 				retval = device_create_file(&child_bus->dev,
 							&dev_attr_cpuaffinity);
+			}
 			if (retval)
 				dev_err(&dev->dev, "Error creating cpuaffinity"
 					" file, continuing...\n");
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 38eff1947750..9010f5458767 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -278,6 +278,7 @@ struct pci_bus {
 	struct device		dev;
 	struct bin_attribute	*legacy_io; /* legacy I/O for this bus */
 	struct bin_attribute	*legacy_mem; /* legacy mem */
+	unsigned int		is_added:1;
 };
 
 #define pci_bus_b(n)	list_entry(n, struct pci_bus, node)
-- 
cgit v1.2.3


From 9b89ca7a3847c0d5b1e86e83f4860a866f28a89b Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.c.dionne@gmail.com>
Date: Fri, 14 Mar 2008 13:11:29 +0000
Subject: struct export_operations: adjust comments to match current members

The comments in the definition of struct export_operations don't match the
current members.

Add a comment for the 2 new functions and remove 2 comments for unused ones.

Signed-off-by: Marc Dionne <marc.c.dionne@gmail.com>
Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/exportfs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 51d214138814..adcbb05b120b 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -49,11 +49,11 @@ struct fid {
 
 /**
  * struct export_operations - for nfsd to communicate with file systems
- * @decode_fh:      decode a file handle fragment and return a &struct dentry
  * @encode_fh:      encode a file handle fragment from a dentry
+ * @fh_to_dentry:   find the implied object and get a dentry for it
+ * @fh_to_parent:   find the implied object's parent and get a dentry for it
  * @get_name:       find the name for a given inode in a given directory
  * @get_parent:     find the parent of a given directory
- * @get_dentry:     find a dentry for the inode given a file handle sub-fragment
  *
  * See Documentation/filesystems/Exporting for details on how to use
  * this interface correctly.
-- 
cgit v1.2.3


From 2af3e6017e53065ddf40bb19190a29199b7ffee3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Thu, 6 Mar 2008 16:02:42 +0100
Subject: The ps2esdi driver was marked as BROKEN more than two years ago due
 to being no longer working for some time.

A driver that had been marked as BROKEN for such a long time seems to be
unlikely to be revived in the forseeable future.

But if anyone wants to ever revive this driver, the code is still present in
the older kernel releases.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Alan Cox <alan@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/mca.txt   |   17 +-
 drivers/block/Kconfig   |   10 -
 drivers/block/Makefile  |    1 -
 drivers/block/ps2esdi.c | 1079 -----------------------------------------------
 include/linux/Kbuild    |    1 -
 include/linux/ps2esdi.h |   98 -----
 6 files changed, 5 insertions(+), 1201 deletions(-)
 delete mode 100644 drivers/block/ps2esdi.c
 delete mode 100644 include/linux/ps2esdi.h

(limited to 'include/linux')

diff --git a/Documentation/mca.txt b/Documentation/mca.txt
index aabce4ad90f9..510375d4209a 100644
--- a/Documentation/mca.txt
+++ b/Documentation/mca.txt
@@ -143,14 +143,7 @@ MCA Device Drivers
 
 Currently, there are a number of MCA-specific device drivers.
 
-1) PS/2 ESDI
-	drivers/block/ps2esdi.c
-	include/linux/ps2esdi.h
-   Uses major number 36, and should use /dev files /dev/eda, /dev/edb.
-   Supports two drives, but only one controller.  May use the
-   command-line args "ed=cyl,head,sec" and "tp720".
-
-2) PS/2 SCSI
+1) PS/2 SCSI
 	drivers/scsi/ibmmca.c
 	drivers/scsi/ibmmca.h
    The driver for the IBM SCSI subsystem.  Includes both integrated
@@ -159,25 +152,25 @@ Currently, there are a number of MCA-specific device drivers.
    machine with a front-panel display (i.e. model 95), you can use
    "ibmmcascsi=display" to enable a drive activity indicator.
 
-3) 3c523
+2) 3c523
 	drivers/net/3c523.c
 	drivers/net/3c523.h
    3Com 3c523 Etherlink/MC ethernet driver.
 
-4) SMC Ultra/MCA and IBM Adapter/A
+3) SMC Ultra/MCA and IBM Adapter/A
 	drivers/net/smc-mca.c
 	drivers/net/smc-mca.h
 	Driver for the MCA version of the SMC Ultra and various other
 	OEM'ed and work-alike cards (Elite, Adapter/A, etc).
 
-5) NE/2
+4) NE/2
 	driver/net/ne2.c
 	driver/net/ne2.h
 	The NE/2 is the MCA version of the NE2000.  This may not work
 	with clones that have a different adapter id than the original
 	NE/2.
 
-6) Future Domain MCS-600/700, OEM'd IBM Fast SCSI Adapter/A and
+5) Future Domain MCS-600/700, OEM'd IBM Fast SCSI Adapter/A and
    Reply Sound Blaster/SCSI (SCSI part)
 	Better support for these cards than the driver for ISA.
    Supports multiple cards with IRQ sharing.
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index b6d230b3209f..0d1d2133d9bc 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -44,16 +44,6 @@ config MAC_FLOPPY
 	  If you have a SWIM-3 (Super Woz Integrated Machine 3; from Apple)
 	  floppy controller, say Y here. Most commonly found in PowerMacs.
 
-config BLK_DEV_PS2
-	tristate "PS/2 ESDI hard disk support"
-	depends on MCA && MCA_LEGACY && BROKEN
-	help
-	  Say Y here if you have a PS/2 machine with a MCA bus and an ESDI
-	  hard disk.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called ps2esdi.
-
 config AMIGA_Z2RAM
 	tristate "Amiga Zorro II ramdisk support"
 	depends on ZORRO
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 01c972415cb2..5e584306be99 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -13,7 +13,6 @@ obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
-obj-$(CONFIG_BLK_DEV_PS2)	+= ps2esdi.o
 obj-$(CONFIG_BLK_DEV_XD)	+= xd.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
diff --git a/drivers/block/ps2esdi.c b/drivers/block/ps2esdi.c
deleted file mode 100644
index 3c796e236253..000000000000
--- a/drivers/block/ps2esdi.c
+++ /dev/null
@@ -1,1079 +0,0 @@
-/* ps2esdi driver based on assembler code by Arindam Banerji,
-   written by Peter De Schrijver */
-/* Reassuring note to IBM : This driver was NOT developed by vice-versa
-   engineering the PS/2's BIOS */
-/* Dedicated to Wannes, Tofke, Ykke, Godot, Killroy and all those 
-   other lovely fish out there... */
-/* This code was written during the long and boring WINA 
-   elections 1994 */
-/* Thanks to Arindam Banerij for giving me the source of his driver */
-/* This code may be freely distributed and modified in any way, 
-   as long as these notes remain intact */
-
-/*  Revised: 05/07/94 by Arindam Banerji (axb@cse.nd.edu) */
-/*  Revised: 09/08/94 by Peter De Schrijver (stud11@cc4.kuleuven.ac.be)
-   Thanks to Arindam Banerij for sending me the docs of the adapter */
-
-/* BA Modified for ThinkPad 720 by Boris Ashkinazi */
-/*                    (bash@vnet.ibm.com) 08/08/95 */
-
-/* Modified further for ThinkPad-720C by Uri Blumenthal */
-/*                    (uri@watson.ibm.com) Sep 11, 1995 */
-
-/* TODO : 
-   + Timeouts
-   + Get disk parameters
-   + DMA above 16MB
-   + reset after read/write error
- */
-
-#define DEVICE_NAME "PS/2 ESDI"
-
-#include <linux/major.h>
-#include <linux/errno.h>
-#include <linux/wait.h>
-#include <linux/interrupt.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/genhd.h>
-#include <linux/ps2esdi.h>
-#include <linux/blkdev.h>
-#include <linux/mca-legacy.h>
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/hdreg.h>
-
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/dma.h>
-#include <asm/mca_dma.h>
-#include <asm/uaccess.h>
-
-#define PS2ESDI_IRQ 14
-#define MAX_HD 2
-#define MAX_RETRIES 5
-#define MAX_16BIT 65536
-#define ESDI_TIMEOUT   0xf000
-#define ESDI_STAT_TIMEOUT 4
-
-#define TYPE_0_CMD_BLK_LENGTH 2
-#define TYPE_1_CMD_BLK_LENGTH 4
-
-static void reset_ctrl(void);
-
-static int ps2esdi_geninit(void);
-
-static void do_ps2esdi_request(struct request_queue * q);
-
-static void ps2esdi_readwrite(int cmd, struct request *req);
-
-static void ps2esdi_fill_cmd_block(u_short * cmd_blk, u_short cmd,
-u_short cyl, u_short head, u_short sector, u_short length, u_char drive);
-
-static int ps2esdi_out_cmd_blk(u_short * cmd_blk);
-
-static void ps2esdi_prep_dma(char *buffer, u_short length, u_char dma_xmode);
-
-static irqreturn_t ps2esdi_interrupt_handler(int irq, void *dev_id);
-static void (*current_int_handler) (u_int) = NULL;
-static void ps2esdi_normal_interrupt_handler(u_int);
-static void ps2esdi_initial_reset_int_handler(u_int);
-static void ps2esdi_geometry_int_handler(u_int);
-static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo);
-
-static int ps2esdi_read_status_words(int num_words, int max_words, u_short * buffer);
-
-static void dump_cmd_complete_status(u_int int_ret_code);
-
-static void ps2esdi_get_device_cfg(void);
-
-static void ps2esdi_reset_timer(unsigned long unused);
-
-static u_int dma_arb_level;		/* DMA arbitration level */
-
-static DECLARE_WAIT_QUEUE_HEAD(ps2esdi_int);
-
-static int no_int_yet;
-static int ps2esdi_drives;
-static u_short io_base;
-static DEFINE_TIMER(esdi_timer, ps2esdi_reset_timer, 0, 0);
-static int reset_status;
-static int ps2esdi_slot = -1;
-static int tp720esdi = 0;	/* Is it Integrated ESDI of ThinkPad-720? */
-static int intg_esdi = 0;       /* If integrated adapter */
-struct ps2esdi_i_struct {
-	unsigned int head, sect, cyl, wpcom, lzone, ctl;
-};
-static DEFINE_SPINLOCK(ps2esdi_lock);
-static struct request_queue *ps2esdi_queue;
-static struct request *current_req;
-
-#if 0
-#if 0				/* try both - I don't know which one is better... UB */
-static struct ps2esdi_i_struct ps2esdi_info[MAX_HD] =
-{
-	{4, 48, 1553, 0, 0, 0},
-	{0, 0, 0, 0, 0, 0}};
-#else
-static struct ps2esdi_i_struct ps2esdi_info[MAX_HD] =
-{
-	{64, 32, 161, 0, 0, 0},
-	{0, 0, 0, 0, 0, 0}};
-#endif
-#endif
-static struct ps2esdi_i_struct ps2esdi_info[MAX_HD] =
-{
-	{0, 0, 0, 0, 0, 0},
-	{0, 0, 0, 0, 0, 0}};
-
-static struct block_device_operations ps2esdi_fops =
-{
-	.owner		= THIS_MODULE,
-	.getgeo		= ps2esdi_getgeo,
-};
-
-static struct gendisk *ps2esdi_gendisk[2];
-
-/* initialization routine called by ll_rw_blk.c   */
-static int __init ps2esdi_init(void)
-{
-
-	int error = 0;
-
-	/* register the device - pass the name and major number */
-	if (register_blkdev(PS2ESDI_MAJOR, "ed"))
-		return -EBUSY;
-
-	/* set up some global information - indicating device specific info */
-	ps2esdi_queue = blk_init_queue(do_ps2esdi_request, &ps2esdi_lock);
-	if (!ps2esdi_queue) {
-		unregister_blkdev(PS2ESDI_MAJOR, "ed");
-		return -ENOMEM;
-	}
-
-	/* some minor housekeeping - setup the global gendisk structure */
-	error = ps2esdi_geninit();
-	if (error) {
-		printk(KERN_WARNING "PS2ESDI: error initialising"
-			" device, releasing resources\n");
-		unregister_blkdev(PS2ESDI_MAJOR, "ed");
-		blk_cleanup_queue(ps2esdi_queue);
-		return error;
-	}
-	return 0;
-}				/* ps2esdi_init */
-
-#ifndef MODULE
-
-module_init(ps2esdi_init);
-
-#else
-
-static int cyl[MAX_HD] = {-1,-1};
-static int head[MAX_HD] = {-1, -1};
-static int sect[MAX_HD] = {-1, -1};
-
-module_param(tp720esdi, bool, 0);
-module_param_array(cyl, int, NULL, 0);
-module_param_array(head, int, NULL, 0);
-module_param_array(sect, int, NULL, 0);
-MODULE_LICENSE("GPL");
-
-int init_module(void) {
-	int drive;
-
-	for(drive = 0; drive < MAX_HD; drive++) {
-	        struct ps2esdi_i_struct *info = &ps2esdi_info[drive];
-
-        	if (cyl[drive] != -1) {
-		  	info->cyl = info->lzone = cyl[drive];
-			info->wpcom = 0;
-		}
-        	if (head[drive] != -1) {
-			info->head = head[drive];
-			info->ctl = (head[drive] > 8 ? 8 : 0);
-		}
-        	if (sect[drive] != -1) info->sect = sect[drive];
-	}
-	return ps2esdi_init();
-}
-
-void
-cleanup_module(void) {
-	int i;
-	if(ps2esdi_slot) {
-		mca_mark_as_unused(ps2esdi_slot);
-		mca_set_adapter_procfn(ps2esdi_slot, NULL, NULL);
-	}
-	release_region(io_base, 4);
-	free_dma(dma_arb_level);
-	free_irq(PS2ESDI_IRQ, &ps2esdi_gendisk);
-	unregister_blkdev(PS2ESDI_MAJOR, "ed");
-	blk_cleanup_queue(ps2esdi_queue);
-	for (i = 0; i < ps2esdi_drives; i++) {
-		del_gendisk(ps2esdi_gendisk[i]);
-		put_disk(ps2esdi_gendisk[i]);
-	}
-}
-#endif /* MODULE */
-
-/* handles boot time command line parameters */
-void __init tp720_setup(char *str, int *ints)
-{
-	/* no params, just sets the tp720esdi flag if it exists */
-
-	printk("%s: TP 720 ESDI flag set\n", DEVICE_NAME);
-	tp720esdi = 1;
-}
-
-void __init ed_setup(char *str, int *ints)
-{
-	int hdind = 0;
-
-	/* handles 3 parameters only - corresponding to
-	   1. Number of cylinders
-	   2. Number of heads
-	   3. Sectors/track
-	 */
-
-	if (ints[0] != 3)
-		return;
-
-	/* print out the information - seen at boot time */
-	printk("%s: ints[0]=%d ints[1]=%d ints[2]=%d ints[3]=%d\n",
-	       DEVICE_NAME, ints[0], ints[1], ints[2], ints[3]);
-
-	/* set the index into device specific information table */
-	if (ps2esdi_info[0].head != 0)
-		hdind = 1;
-
-	/* set up all the device information */
-	ps2esdi_info[hdind].head = ints[2];
-	ps2esdi_info[hdind].sect = ints[3];
-	ps2esdi_info[hdind].cyl = ints[1];
-	ps2esdi_info[hdind].wpcom = 0;
-	ps2esdi_info[hdind].lzone = ints[1];
-	ps2esdi_info[hdind].ctl = (ints[2] > 8 ? 8 : 0);
-#if 0				/* this may be needed for PS2/Mod.80, but it hurts ThinkPad! */
-	ps2esdi_drives = hdind + 1;	/* increment index for the next time */
-#endif
-}				/* ed_setup */
-
-static int ps2esdi_getinfo(char *buf, int slot, void *d)
-{
-	int len = 0;
-
-	len += sprintf(buf + len, "DMA Arbitration Level: %d\n",
-		       dma_arb_level);
-	len += sprintf(buf + len, "IO Port: %x\n", io_base);
-	len += sprintf(buf + len, "IRQ: 14\n");
-	len += sprintf(buf + len, "Drives: %d\n", ps2esdi_drives);
-
-	return len;
-}
-
-/* ps2 esdi specific initialization - called thru the gendisk chain */
-static int __init ps2esdi_geninit(void)
-{
-	/*
-	   The first part contains the initialization code
-	   for the ESDI disk subsystem.  All we really do
-	   is search for the POS registers of the controller
-	   to do some simple setup operations.  First, we
-	   must ensure that the controller is installed,
-	   enabled, and configured as PRIMARY.  Then we must
-	   determine the DMA arbitration level being used by
-	   the controller so we can handle data transfer
-	   operations properly.  If all of this works, then
-	   we will set the INIT_FLAG to a non-zero value.
-	 */
-
-	int slot = 0, i, reset_start, reset_end;
-	u_char status;
-	unsigned short adapterID;
-	int error = 0;
-
-	if ((slot = mca_find_adapter(INTG_ESDI_ID, 0)) != MCA_NOTFOUND) {
-		adapterID = INTG_ESDI_ID;
-		printk("%s: integrated ESDI adapter found in slot %d\n",
-		       DEVICE_NAME, slot+1);
-#ifndef MODULE
-		mca_set_adapter_name(slot, "PS/2 Integrated ESDI");
-#endif
-	} else if ((slot = mca_find_adapter(NRML_ESDI_ID, 0)) != -1) {
-		adapterID = NRML_ESDI_ID;
-		printk("%s: normal ESDI adapter found in slot %d\n",
-		       DEVICE_NAME, slot+1);
-		mca_set_adapter_name(slot, "PS/2 ESDI");
-	} else {
-		return -ENODEV;
-	}
-
-	ps2esdi_slot = slot;
-	mca_mark_as_used(slot);
-	mca_set_adapter_procfn(slot, (MCA_ProcFn) ps2esdi_getinfo, NULL);
-
-	/* Found the slot - read the POS register 2 to get the necessary
-	   configuration and status information.  POS register 2 has the
-	   following information :
-	   Bit           Function
-	   7             reserved = 0
-	   6             arbitration method
-	   0 - fairness enabled
-	   1 - fairness disabled, linear priority assignment
-	   5-2           arbitration level
-	   1             alternate address
-	   1              alternate address
-	   0 - use addresses 0x3510 - 0x3517
-	   0             adapter enable
-	 */
-
-	status = mca_read_stored_pos(slot, 2);
-	/* is it enabled ? */
-	if (!(status & STATUS_ENABLED)) {
-		printk("%s: ESDI adapter disabled\n", DEVICE_NAME);
-		error = -ENODEV;
-		goto err_out1;
-	}
-	/* try to grab IRQ, and try to grab a slow IRQ if it fails, so we can
-	   share with the SCSI driver */
-	if (request_irq(PS2ESDI_IRQ, ps2esdi_interrupt_handler,
-		  IRQF_DISABLED | IRQF_SHARED, "PS/2 ESDI", &ps2esdi_gendisk)
-	    && request_irq(PS2ESDI_IRQ, ps2esdi_interrupt_handler,
-			   IRQF_SHARED, "PS/2 ESDI", &ps2esdi_gendisk)
-	    ) {
-		printk("%s: Unable to get IRQ %d\n", DEVICE_NAME, PS2ESDI_IRQ);
-		error = -EBUSY;
-		goto err_out1;
-	}
-	if (status & STATUS_ALTERNATE)
-		io_base = ALT_IO_BASE;
-	else
-		io_base = PRIMARY_IO_BASE;
-
-	if (!request_region(io_base, 4, "ed")) {
-		printk(KERN_WARNING"Unable to request region 0x%x\n", io_base);
-		error = -EBUSY;
-		goto err_out2;
-	}
-	/* get the dma arbitration level */
-	dma_arb_level = (status >> 2) & 0xf;
-
-	/* BA */
-	printk("%s: DMA arbitration level : %d\n",
-	       DEVICE_NAME, dma_arb_level);
-
-	LITE_ON;
-	current_int_handler = ps2esdi_initial_reset_int_handler;
-	reset_ctrl();
-	reset_status = 0;
-	reset_start = jiffies;
-	while (!reset_status) {
-		init_timer(&esdi_timer);
-		esdi_timer.expires = jiffies + HZ;
-		esdi_timer.data = 0;
-		add_timer(&esdi_timer);
-		sleep_on(&ps2esdi_int);
-	}
-	reset_end = jiffies;
-	LITE_OFF;
-	printk("%s: reset interrupt after %d jiffies,  %u.%02u secs\n",
-	       DEVICE_NAME, reset_end - reset_start, (reset_end - reset_start) / HZ,
-	       (reset_end - reset_start) % HZ);
-
-
-	/* Integrated ESDI Disk and Controller has only one drive! */
-	if (adapterID == INTG_ESDI_ID) {/* if not "normal" PS2 ESDI adapter */
-		ps2esdi_drives = 1;	/* then we have only one physical disk! */		intg_esdi = 1;
-	}
-
-
-
-	/* finally this part sets up some global data structures etc. */
-
-	ps2esdi_get_device_cfg();
-
-	/* some annoyance in the above routine returns TWO drives?
-	 Is something else happining in the background?
-	 Regaurdless we fix the # of drives again. AJK */
-	/* Integrated ESDI Disk and Controller has only one drive! */
-	if (adapterID == INTG_ESDI_ID)	/* if not "normal" PS2 ESDI adapter */
-		ps2esdi_drives = 1;	/* Not three or two, ONE DAMNIT! */
-
-	current_int_handler = ps2esdi_normal_interrupt_handler;
-
-	if (request_dma(dma_arb_level, "ed") !=0) {
-		printk(KERN_WARNING "PS2ESDI: Can't request dma-channel %d\n"
-			,(int) dma_arb_level);
-		error = -EBUSY;
-		goto err_out3;
-	}
-	blk_queue_max_sectors(ps2esdi_queue, 128);
-
-	error = -ENOMEM;
-	for (i = 0; i < ps2esdi_drives; i++) {
-		struct gendisk *disk = alloc_disk(64);
-		if (!disk)
-			goto err_out4;
-		disk->major = PS2ESDI_MAJOR;
-		disk->first_minor = i<<6;
-		sprintf(disk->disk_name, "ed%c", 'a'+i);
-		disk->fops = &ps2esdi_fops;
-		ps2esdi_gendisk[i] = disk;
-	}
-
-	for (i = 0; i < ps2esdi_drives; i++) {
-		struct gendisk *disk = ps2esdi_gendisk[i];
-		set_capacity(disk, ps2esdi_info[i].head * ps2esdi_info[i].sect *
-				ps2esdi_info[i].cyl);
-		disk->queue = ps2esdi_queue;
-		disk->private_data = &ps2esdi_info[i];
-		add_disk(disk);
-	}
-	return 0;
-err_out4:
-	while (i--)
-		put_disk(ps2esdi_gendisk[i]);
-err_out3:
-	release_region(io_base, 4);
-err_out2:
-	free_irq(PS2ESDI_IRQ, &ps2esdi_gendisk);
-err_out1:
-	if(ps2esdi_slot) {
-		mca_mark_as_unused(ps2esdi_slot);
-		mca_set_adapter_procfn(ps2esdi_slot, NULL, NULL);
-	}
-	return error;
-}
-
-static void __init ps2esdi_get_device_cfg(void)
-{
-	u_short cmd_blk[TYPE_0_CMD_BLK_LENGTH];
-
-	/*BA */ printk("%s: Drive 0\n", DEVICE_NAME);
-	current_int_handler = ps2esdi_geometry_int_handler;
-	cmd_blk[0] = CMD_GET_DEV_CONFIG | 0x600;
-	cmd_blk[1] = 0;
-	no_int_yet = TRUE;
-	ps2esdi_out_cmd_blk(cmd_blk);
-	if (no_int_yet)
-		sleep_on(&ps2esdi_int);
-
-	if (ps2esdi_drives > 1) {
-		printk("%s: Drive 1\n", DEVICE_NAME);	/*BA */
-		cmd_blk[0] = CMD_GET_DEV_CONFIG | (1 << 5) | 0x600;
-		cmd_blk[1] = 0;
-		no_int_yet = TRUE;
-		ps2esdi_out_cmd_blk(cmd_blk);
-		if (no_int_yet)
-			sleep_on(&ps2esdi_int);
-	}			/* if second physical drive is present */
-	return;
-}
-
-/* strategy routine that handles most of the IO requests */
-static void do_ps2esdi_request(struct request_queue * q)
-{
-	struct request *req;
-	/* since, this routine is called with interrupts cleared - they 
-	   must be before it finishes  */
-
-	req = elv_next_request(q);
-	if (!req)
-		return;
-
-#if 0
-	printk("%s:got request. device : %s command : %d  sector : %ld count : %ld, buffer: %p\n",
-	       DEVICE_NAME,
-	       req->rq_disk->disk_name,
-	       req->cmd, req->sector,
-	       req->current_nr_sectors, req->buffer);
-#endif
-
-	/* check for above 16Mb dmas */
-	if (isa_virt_to_bus(req->buffer + req->current_nr_sectors * 512) > 16 * MB) {
-		printk("%s: DMA above 16MB not supported\n", DEVICE_NAME);
-		end_request(req, FAIL);
-		return;
-	}
-
-	if (req->sector+req->current_nr_sectors > get_capacity(req->rq_disk)) {
-		printk("Grrr. error. ps2esdi_drives: %d, %llu %llu\n",
-		    ps2esdi_drives, req->sector,
-		    (unsigned long long)get_capacity(req->rq_disk));
-		end_request(req, FAIL);
-		return;
-	}
-
-	switch (rq_data_dir(req)) {
-	case READ:
-		ps2esdi_readwrite(READ, req);
-		break;
-	case WRITE:
-		ps2esdi_readwrite(WRITE, req);
-		break;
-	default:
-		printk("%s: Unknown command\n", req->rq_disk->disk_name);
-		end_request(req, FAIL);
-		break;
-	}		/* handle different commands */
-}				/* main strategy routine */
-
-/* resets the ESDI adapter */
-static void reset_ctrl(void)
-{
-
-	u_long expire;
-	u_short status;
-
-	/* enable interrupts on the controller */
-	status = inb(ESDI_INTRPT);
-	outb((status & 0xe0) | ATT_EOI, ESDI_ATTN);	/* to be sure we don't have
-							   any interrupt pending... */
-	outb_p(CTRL_ENABLE_INTR, ESDI_CONTROL);
-
-	/* read the ESDI status port - if the controller is not busy,
-	   simply do a soft reset (fast) - otherwise we'll have to do a
-	   hard (slow) reset.  */
-	if (!(inb_p(ESDI_STATUS) & STATUS_BUSY)) {
-		/*BA */ printk("%s: soft reset...\n", DEVICE_NAME);
-		outb_p(CTRL_SOFT_RESET, ESDI_ATTN);
-	}
-	/* soft reset */ 
-	else {
-		/*BA */
-		printk("%s: hard reset...\n", DEVICE_NAME);
-		outb_p(CTRL_HARD_RESET, ESDI_CONTROL);
-		expire = jiffies + 2*HZ;
-		while (time_before(jiffies, expire));
-		outb_p(1, ESDI_CONTROL);
-	}			/* hard reset */
-
-
-}				/* reset the controller */
-
-/* called by the strategy routine to handle read and write requests */
-static void ps2esdi_readwrite(int cmd, struct request *req)
-{
-	struct ps2esdi_i_struct *p = req->rq_disk->private_data;
-	unsigned block = req->sector;
-	unsigned count = req->current_nr_sectors;
-	int drive = p - ps2esdi_info;
-	u_short track, head, cylinder, sector;
-	u_short cmd_blk[TYPE_1_CMD_BLK_LENGTH];
-
-	/* do some relevant arithmatic */
-	track = block / p->sect;
-	head = track % p->head;
-	cylinder = track / p->head;
-	sector = block % p->sect;
-
-#if 0
-	printk("%s: cyl=%d head=%d sect=%d\n", DEVICE_NAME, cylinder, head, sector);
-#endif
-	/* call the routine that actually fills out a command block */
-	ps2esdi_fill_cmd_block
-	    (cmd_blk,
-	     (cmd == READ) ? CMD_READ : CMD_WRITE,
-	     cylinder, head, sector, count, drive);
-
-	/* send the command block to the controller */
-	current_req = req;
-	spin_unlock_irq(&ps2esdi_lock);
-	if (ps2esdi_out_cmd_blk(cmd_blk)) {
-		spin_lock_irq(&ps2esdi_lock);
-		printk("%s: Controller failed\n", DEVICE_NAME);
-		if ((++req->errors) >= MAX_RETRIES)
-			end_request(req, FAIL);
-	}
-	/* check for failure to put out the command block */ 
-	else {
-		spin_lock_irq(&ps2esdi_lock);
-#if 0
-		printk("%s: waiting for xfer\n", DEVICE_NAME);
-#endif
-		/* turn disk lights on */
-		LITE_ON;
-	}
-
-}				/* ps2esdi_readwrite */
-
-/* fill out the command block */
-static void ps2esdi_fill_cmd_block(u_short * cmd_blk, u_short cmd,
- u_short cyl, u_short head, u_short sector, u_short length, u_char drive)
-{
-
-	cmd_blk[0] = (drive << 5) | cmd;
-	cmd_blk[1] = length;
-	cmd_blk[2] = ((cyl & 0x1f) << 11) | (head << 5) | sector;
-	cmd_blk[3] = (cyl & 0x3E0) >> 5;
-
-}				/* fill out the command block */
-
-/* write a command block to the controller */
-static int ps2esdi_out_cmd_blk(u_short * cmd_blk)
-{
-
-	int i;
-	unsigned long jif;
-	u_char status;
-
-	/* enable interrupts */
-	outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-
-	/* do not write to the controller, if it is busy */
-	for (jif = jiffies + ESDI_STAT_TIMEOUT;
-		time_after(jif, jiffies) &&
-			(inb(ESDI_STATUS) & STATUS_BUSY); )
-		;
-
-#if 0
-	printk("%s: i(1)=%ld\n", DEVICE_NAME, jif);
-#endif
-
-	/* if device is still busy - then just time out */
-	if (inb(ESDI_STATUS) & STATUS_BUSY) {
-		printk("%s: ps2esdi_out_cmd timed out (1)\n", DEVICE_NAME);
-		return ERROR;
-	}			/* timeout ??? */
-	/* Set up the attention register in the controller */
-	outb(((*cmd_blk) & 0xE0) | 1, ESDI_ATTN);
-
-#if 0
-	printk("%s: sending %d words to controller\n", DEVICE_NAME, (((*cmd_blk) >> 14) + 1) << 1);
-#endif
-
-	/* one by one send each word out */
-	for (i = (((*cmd_blk) >> 14) + 1) << 1; i; i--) {
-		status = inb(ESDI_STATUS);
-		for (jif = jiffies + ESDI_STAT_TIMEOUT;
-		     time_after(jif, jiffies) && (status & STATUS_BUSY) &&
-		   (status & STATUS_CMD_INF); status = inb(ESDI_STATUS));
-		if ((status & (STATUS_BUSY | STATUS_CMD_INF)) == STATUS_BUSY) {
-#if 0
-			printk("%s: sending %04X\n", DEVICE_NAME, *cmd_blk);
-#endif
-			outw(*cmd_blk++, ESDI_CMD_INT);
-		} else {
-			printk("%s: ps2esdi_out_cmd timed out while sending command (status=%02X)\n",
-			       DEVICE_NAME, status);
-			return ERROR;
-		}
-	}			/* send all words out */
-	return OK;
-}				/* send out the commands */
-
-
-/* prepare for dma - do all the necessary setup */
-static void ps2esdi_prep_dma(char *buffer, u_short length, u_char dma_xmode)
-{
-	unsigned long flags = claim_dma_lock();
-
-	mca_disable_dma(dma_arb_level);
-
-	mca_set_dma_addr(dma_arb_level, isa_virt_to_bus(buffer));
-
-	mca_set_dma_count(dma_arb_level, length * 512 / 2);
-
-	mca_set_dma_mode(dma_arb_level, dma_xmode);
-
-	mca_enable_dma(dma_arb_level);
-
-	release_dma_lock(flags);
-
-}				/* prepare for dma */
-
-
-
-static irqreturn_t ps2esdi_interrupt_handler(int irq, void *dev_id)
-{
-	u_int int_ret_code;
-
-	if (inb(ESDI_STATUS) & STATUS_INTR) {
-		int_ret_code = inb(ESDI_INTRPT);
-		if (current_int_handler) {
-			/* Disable adapter interrupts till processing is finished */
-			outb(CTRL_DISABLE_INTR, ESDI_CONTROL);
-			current_int_handler(int_ret_code);
-		} else
-			printk("%s: help ! No interrupt handler.\n", DEVICE_NAME);
-	} else {
-		return IRQ_NONE;
-	}
-	return IRQ_HANDLED;
-}
-
-static void ps2esdi_initial_reset_int_handler(u_int int_ret_code)
-{
-
-	switch (int_ret_code & 0xf) {
-	case INT_RESET:
-		/*BA */
-		printk("%s: initial reset completed.\n", DEVICE_NAME);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		wake_up(&ps2esdi_int);
-		break;
-	case INT_ATTN_ERROR:
-		printk("%s: Attention error. interrupt status : %02X\n", DEVICE_NAME,
-		       int_ret_code);
-		printk("%s: status: %02x\n", DEVICE_NAME, inb(ESDI_STATUS));
-		break;
-	default:
-		printk("%s: initial reset handler received interrupt: %02X\n",
-		       DEVICE_NAME, int_ret_code);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		break;
-	}
-	outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-}
-
-
-static void ps2esdi_geometry_int_handler(u_int int_ret_code)
-{
-	u_int status, drive_num;
-	unsigned long rba;
-	int i;
-
-	drive_num = int_ret_code >> 5;
-	switch (int_ret_code & 0xf) {
-	case INT_CMD_COMPLETE:
-		for (i = ESDI_TIMEOUT; i && !(inb(ESDI_STATUS) & STATUS_STAT_AVAIL); i--);
-		if (!(inb(ESDI_STATUS) & STATUS_STAT_AVAIL)) {
-			printk("%s: timeout reading status word\n", DEVICE_NAME);
-			outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-			break;
-		}
-		status = inw(ESDI_STT_INT);
-		if ((status & 0x1F) == CMD_GET_DEV_CONFIG) {
-#define REPLY_WORDS 5		/* we already read word 0 */
-			u_short reply[REPLY_WORDS];
-
-			if (ps2esdi_read_status_words((status >> 8) - 1, REPLY_WORDS, reply)) {
-				/*BA */
-				printk("%s: Device Configuration Status for drive %u\n",
-				       DEVICE_NAME, drive_num);
-
-				printk("%s: Spares/cyls: %u", DEVICE_NAME, reply[0] >> 8);
-
-				printk
-				    ("Config bits: %s%s%s%s%s\n",
-				     (reply[0] & CONFIG_IS) ? "Invalid Secondary, " : "",
-				     ((reply[0] & CONFIG_ZD) && !(reply[0] & CONFIG_IS))
-				 ? "Zero Defect, " : "Defects Present, ",
-				     (reply[0] & CONFIG_SF) ? "Skewed Format, " : "",
-				     (reply[0] & CONFIG_FR) ? "Removable, " : "Non-Removable, ",
-				     (reply[0] & CONFIG_RT) ? "No Retries" : "Retries");
-
-				rba = reply[1] | ((unsigned long) reply[2] << 16);
-				printk("%s: Number of RBA's: %lu\n", DEVICE_NAME, rba);
-
-				printk("%s: Physical number of cylinders: %u, Sectors/Track: %u, Heads: %u\n",
-				       DEVICE_NAME, reply[3], reply[4] >> 8, reply[4] & 0xff);
-
-				if (!ps2esdi_info[drive_num].head) {
-					ps2esdi_info[drive_num].head = 64;
-					ps2esdi_info[drive_num].sect = 32;
-					ps2esdi_info[drive_num].cyl = rba / (64 * 32);
-					ps2esdi_info[drive_num].wpcom = 0;
-					ps2esdi_info[drive_num].lzone = ps2esdi_info[drive_num].cyl;
-					ps2esdi_info[drive_num].ctl = 8;
-					if (tp720esdi) {	/* store the retrieved parameters */
-						ps2esdi_info[0].head = reply[4] & 0Xff;
-						ps2esdi_info[0].sect = reply[4] >> 8;
-						ps2esdi_info[0].cyl = reply[3];
-						ps2esdi_info[0].wpcom = 0;
-						ps2esdi_info[0].lzone = reply[3];
-					} else {
-						if (!intg_esdi)
-							ps2esdi_drives++;
-					}
-				}
-#ifdef OBSOLETE
-				if (!ps2esdi_info[drive_num].head) {
-					ps2esdi_info[drive_num].head = reply[4] & 0Xff;
-					ps2esdi_info[drive_num].sect = reply[4] >> 8;
-					ps2esdi_info[drive_num].cyl = reply[3];
-					ps2esdi_info[drive_num].wpcom = 0;
-					ps2esdi_info[drive_num].lzone = reply[3];
-					if (tp720esdi) {	/* store the retrieved parameters */
-						ps2esdi_info[0].head = reply[4] & 0Xff;
-						ps2esdi_info[0].sect = reply[4] >> 8;
-						ps2esdi_info[0].cyl = reply[3];
-						ps2esdi_info[0].wpcom = 0;
-						ps2esdi_info[0].lzone = reply[3];
-					} else {
-						ps2esdi_drives++;
-					}
-				}
-#endif
-
-			} else
-				printk("%s: failed while getting device config\n", DEVICE_NAME);
-#undef REPLY_WORDS
-		} else
-			printk("%s: command %02X unknown by geometry handler\n",
-			       DEVICE_NAME, status & 0x1f);
-
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		break;
-
-	case INT_ATTN_ERROR:
-		printk("%s: Attention error. interrupt status : %02X\n", DEVICE_NAME,
-		       int_ret_code);
-		printk("%s: Device not available\n", DEVICE_NAME);
-		break;
-	case INT_CMD_ECC:
-	case INT_CMD_RETRY:
-	case INT_CMD_ECC_RETRY:
-	case INT_CMD_WARNING:
-	case INT_CMD_ABORT:
-	case INT_CMD_FAILED:
-	case INT_DMA_ERR:
-	case INT_CMD_BLK_ERR:
-		/*BA */ printk("%s: Whaa. Error occurred...\n", DEVICE_NAME);
-		dump_cmd_complete_status(int_ret_code);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		break;
-	default:
-		printk("%s: Unknown interrupt reason: %02X\n",
-		       DEVICE_NAME, int_ret_code & 0xf);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		break;
-	}
-
-	wake_up(&ps2esdi_int);
-	no_int_yet = FALSE;
-	outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-
-}
-
-static void ps2esdi_normal_interrupt_handler(u_int int_ret_code)
-{
-	unsigned long flags;
-	u_int status;
-	u_int ending;
-	int i;
-
-	switch (int_ret_code & 0x0f) {
-	case INT_TRANSFER_REQ:
-		ps2esdi_prep_dma(current_req->buffer,
-				 current_req->current_nr_sectors,
-		    (rq_data_dir(current_req) == READ)
-		    ? MCA_DMA_MODE_16 | MCA_DMA_MODE_WRITE | MCA_DMA_MODE_XFER
-		    : MCA_DMA_MODE_16 | MCA_DMA_MODE_READ);
-		outb(CTRL_ENABLE_DMA | CTRL_ENABLE_INTR, ESDI_CONTROL);
-		ending = -1;
-		break;
-
-	case INT_ATTN_ERROR:
-		printk("%s: Attention error. interrupt status : %02X\n", DEVICE_NAME,
-		       int_ret_code);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		ending = FAIL;
-		break;
-
-	case INT_CMD_COMPLETE:
-		for (i = ESDI_TIMEOUT; i && !(inb(ESDI_STATUS) & STATUS_STAT_AVAIL); i--);
-		if (!(inb(ESDI_STATUS) & STATUS_STAT_AVAIL)) {
-			printk("%s: timeout reading status word\n", DEVICE_NAME);
-			outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-			outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-			if ((++current_req->errors) >= MAX_RETRIES)
-				ending = FAIL;
-			else
-				ending = -1;
-			break;
-		}
-		status = inw(ESDI_STT_INT);
-		switch (status & 0x1F) {
-		case (CMD_READ & 0xff):
-		case (CMD_WRITE & 0xff):
-			LITE_OFF;
-			outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-			outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-			ending = SUCCES;
-			break;
-		default:
-			printk("%s: interrupt for unknown command %02X\n",
-			       DEVICE_NAME, status & 0x1f);
-			outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-			outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-			ending = -1;
-			break;
-		}
-		break;
-	case INT_CMD_ECC:
-	case INT_CMD_RETRY:
-	case INT_CMD_ECC_RETRY:
-		LITE_OFF;
-		dump_cmd_complete_status(int_ret_code);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		ending = SUCCES;
-		break;
-	case INT_CMD_WARNING:
-	case INT_CMD_ABORT:
-	case INT_CMD_FAILED:
-	case INT_DMA_ERR:
-		LITE_OFF;
-		dump_cmd_complete_status(int_ret_code);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		if ((++current_req->errors) >= MAX_RETRIES)
-			ending = FAIL;
-		else
-			ending = -1;
-		break;
-
-	case INT_CMD_BLK_ERR:
-		dump_cmd_complete_status(int_ret_code);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		ending = FAIL;
-		break;
-
-	case INT_CMD_FORMAT:
-		printk("%s: huh ? Who issued this format command ?\n"
-		       ,DEVICE_NAME);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		ending = -1;
-		break;
-
-	case INT_RESET:
-		/* BA printk("%s: reset completed.\n", DEVICE_NAME) */ ;
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		ending = -1;
-		break;
-
-	default:
-		printk("%s: Unknown interrupt reason: %02X\n",
-		       DEVICE_NAME, int_ret_code & 0xf);
-		outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		ending = -1;
-		break;
-	}
-	if(ending != -1) {
-		spin_lock_irqsave(&ps2esdi_lock, flags);
-		end_request(current_req, ending);
-		current_req = NULL;
-		do_ps2esdi_request(ps2esdi_queue);
-		spin_unlock_irqrestore(&ps2esdi_lock, flags);
-	}
-}				/* handle interrupts */
-
-
-
-static int ps2esdi_read_status_words(int num_words,
-				     int max_words,
-				     u_short * buffer)
-{
-	int i;
-
-	for (; max_words && num_words; max_words--, num_words--, buffer++) {
-		for (i = ESDI_TIMEOUT; i && !(inb(ESDI_STATUS) & STATUS_STAT_AVAIL); i--);
-		if (!(inb(ESDI_STATUS) & STATUS_STAT_AVAIL)) {
-			printk("%s: timeout reading status word\n", DEVICE_NAME);
-			return FAIL;
-		}
-		*buffer = inw(ESDI_STT_INT);
-	}
-	return SUCCES;
-}
-
-
-
-
-static void dump_cmd_complete_status(u_int int_ret_code)
-{
-#define WAIT_FOR_STATUS \
-  for(i=ESDI_TIMEOUT;i && !(inb(ESDI_STATUS) & STATUS_STAT_AVAIL);i--); \
-    if(!(inb(ESDI_STATUS) & STATUS_STAT_AVAIL)) { \
-    printk("%s: timeout reading status word\n",DEVICE_NAME); \
-    return; \
-    }
-
-	int i, word_count;
-	u_short stat_word;
-	u_long rba;
-
-	printk("%s: Device: %u, interrupt ID: %02X\n",
-	       DEVICE_NAME, int_ret_code >> 5,
-	       int_ret_code & 0xf);
-
-	WAIT_FOR_STATUS;
-	stat_word = inw(ESDI_STT_INT);
-	word_count = (stat_word >> 8) - 1;
-	printk("%s: %u status words, command: %02X\n", DEVICE_NAME, word_count,
-	       stat_word & 0xff);
-
-	if (word_count--) {
-		WAIT_FOR_STATUS;
-		stat_word = inw(ESDI_STT_INT);
-		printk("%s: command status code: %02X, command error code: %02X\n",
-		       DEVICE_NAME, stat_word >> 8, stat_word & 0xff);
-	}
-	if (word_count--) {
-		WAIT_FOR_STATUS;
-		stat_word = inw(ESDI_STT_INT);
-		printk("%s: device error code: %s%s%s%s%s,%02X\n", DEVICE_NAME,
-		       (stat_word & 0x1000) ? "Ready, " : "Not Ready, ",
-		  (stat_word & 0x0800) ? "Selected, " : "Not Selected, ",
-		       (stat_word & 0x0400) ? "Write Fault, " : "",
-		       (stat_word & 0x0200) ? "Track 0, " : "",
-		(stat_word & 0x0100) ? "Seek or command complete, " : "",
-		       stat_word >> 8);
-	}
-	if (word_count--) {
-		WAIT_FOR_STATUS;
-		stat_word = inw(ESDI_STT_INT);
-		printk("%s: Blocks to do: %u", DEVICE_NAME, stat_word);
-	}
-	if (word_count -= 2) {
-		WAIT_FOR_STATUS;
-		rba = inw(ESDI_STT_INT);
-		WAIT_FOR_STATUS;
-		rba |= inw(ESDI_STT_INT) << 16;
-		printk(", Last Cyl: %u Head: %u Sector: %u\n",
-		       (u_short) ((rba & 0x1ff80000) >> 11),
-		 (u_short) ((rba & 0x7E0) >> 5), (u_short) (rba & 0x1f));
-	} else
-		printk("\n");
-
-	if (word_count--) {
-		WAIT_FOR_STATUS;
-		stat_word = inw(ESDI_STT_INT);
-		printk("%s: Blocks required ECC: %u", DEVICE_NAME, stat_word);
-	}
-	printk("\n");
-
-#undef WAIT_FOR_STATUS
-
-}
-
-static int ps2esdi_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct ps2esdi_i_struct *p = bdev->bd_disk->private_data;
-
-	geo->heads = p->head;
-	geo->sectors = p->sect;
-	geo->cylinders = p->cyl;
-	return 0;
-}
-
-static void ps2esdi_reset_timer(unsigned long unused)
-{
-
-	int status;
-
-	status = inb(ESDI_INTRPT);
-	if ((status & 0xf) == INT_RESET) {
-		outb((status & 0xe0) | ATT_EOI, ESDI_ATTN);
-		outb(CTRL_ENABLE_INTR, ESDI_CONTROL);
-		reset_status = 1;
-	}
-	wake_up(&ps2esdi_int);
-}
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 0fac822c1157..4108b38ebb16 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -127,7 +127,6 @@ header-y += pkt_sched.h
 header-y += posix_types.h
 header-y += ppdev.h
 header-y += prctl.h
-header-y += ps2esdi.h
 header-y += qnxtypes.h
 header-y += quotaio_v1.h
 header-y += quotaio_v2.h
diff --git a/include/linux/ps2esdi.h b/include/linux/ps2esdi.h
deleted file mode 100644
index c0e050b1dfe9..000000000000
--- a/include/linux/ps2esdi.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef _PS2ESDI_H_
-#define _PS2ESDI_H_
-
-#define NRML_ESDI_ID 0xddff
-#define INTG_ESDI_ID 0xdf9f
-
-#define PRIMARY_IO_BASE 0x3510
-#define ALT_IO_BASE 0x3518
-
-#define ESDI_CMD_INT (io_base+0)
-#define ESDI_STT_INT (io_base+0)
-#define ESDI_CONTROL (io_base+2)
-#define ESDI_STATUS  (io_base+2)
-#define ESDI_ATTN    (io_base+3)
-#define ESDI_INTRPT  (io_base+3)
-
-#define STATUS_ENABLED    0x01
-#define STATUS_ALTERNATE  0x02
-#define STATUS_BUSY       0x10
-#define STATUS_STAT_AVAIL 0x08
-#define STATUS_INTR       0x01
-#define STATUS_RESET_FAIL 0xea
-#define STATUS_CMD_INF	  0x04
-
-#define CTRL_SOFT_RESET   0xe4
-#define CTRL_HARD_RESET   0x80
-#define CTRL_EOI          0xe2
-#define CTRL_ENABLE_DMA   0x02
-#define CTRL_ENABLE_INTR  0x01
-#define CTRL_DISABLE_INTR  0x00
-
-#define ATT_EOI 0x02
-
-/* bits of word 0 of configuration status block. more info see p.38 of tech ref */
-#define CONFIG_IS 0x10 /* Invalid Secondary */
-#define CONFIG_ZD 0x08 /* Zero Defect */
-#define CONFIG_SF 0x04 /* Skewed Format */
-#define CONFIG_FR 0x02 /* Removable */
-#define CONFIG_RT 0x01 /* Retries */
-
-#define PORT_SYS_A   0x92
-#define PORT_DMA_FN  0x18
-#define PORT_DMA_EX  0x1a
-
-#define ON (unsigned char)0x40
-#define OFF (unsigned char)~ON
-#define LITE_ON outb(inb(PORT_SYS_A) | ON,PORT_SYS_A)
-#define LITE_OFF outb((inb(PORT_SYS_A) & OFF),PORT_SYS_A)
-
-#define FAIL 0
-#define SUCCES 1
-
-#define INT_CMD_COMPLETE 0x01
-#define INT_CMD_ECC      0x03
-#define INT_CMD_RETRY    0x05
-#define INT_CMD_FORMAT   0x06
-#define INT_CMD_ECC_RETRY 0x07
-#define INT_CMD_WARNING  0x08
-#define INT_CMD_ABORT    0x09
-#define INT_RESET        0x0A
-#define INT_TRANSFER_REQ 0x0B
-#define INT_CMD_FAILED   0x0C
-#define INT_DMA_ERR      0x0D
-#define INT_CMD_BLK_ERR  0x0E
-#define INT_ATTN_ERROR   0x0F
-
-#define DMA_MASK_CHAN 0x90
-#define DMA_UNMASK_CHAN 0xA0
-#define DMA_WRITE_ADDR 0x20
-#define DMA_WRITE_TC 0x40
-#define DMA_WRITE_MODE 0x70
-
-#define CMD_GET_DEV_CONFIG 0x09
-#define CMD_READ 0x4601
-#define CMD_WRITE 0x4602
-#define DMA_READ_16 0x4C
-#define DMA_WRITE_16 0x44
-
-
-#define MB 1024*1024
-#define SECT_SIZE 512   
-
-#define ERROR 1
-#define OK 0
-
-#define HDIO_GETGEO 0x0301
-
-#define FALSE 0
-#define TRUE !FALSE
-
-struct ps2esdi_geometry {
-	unsigned char heads;
-	unsigned char sectors;
-	unsigned short cylinders;
-	unsigned long start;
-};
-
-#endif /* _PS2ESDI_H_ */
-- 
cgit v1.2.3


From 4265f161b6bb7b31163671329b1142b9023bf4e3 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 14 Mar 2008 14:17:05 +0100
Subject: virtio: fix race in enable_cb

There is a race in virtio_net, dealing with disabling/enabling the callback.
I saw the following oops:

kernel BUG at /space/kvm/drivers/virtio/virtio_ring.c:218!
illegal operation: 0001 [#1] SMP
Modules linked in: sunrpc dm_mod
CPU: 2 Not tainted 2.6.25-rc1zlive-host-10623-gd358142-dirty #99
Process swapper (pid: 0, task: 000000000f85a610, ksp: 000000000f873c60)
Krnl PSW : 0404300180000000 00000000002b81a6 (vring_disable_cb+0x16/0x20)
           R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:0 CC:3 PM:0 EA:3
Krnl GPRS: 0000000000000001 0000000000000001 0000000010005800 0000000000000001
           000000000f3a0900 000000000f85a610 0000000000000000 0000000000000000
           0000000000000000 000000000f870000 0000000000000000 0000000000001237
           000000000f3a0920 000000000010ff74 00000000002846f6 000000000fa0bcd8
Krnl Code: 00000000002b819a: a7110001           tmll    %r1,1
           00000000002b819e: a7840004           brc     8,2b81a6
           00000000002b81a2: a7f40001           brc     15,2b81a4
          >00000000002b81a6: a51b0001           oill    %r1,1
           00000000002b81aa: 40102000           sth     %r1,0(%r2)
           00000000002b81ae: 07fe               bcr     15,%r14
           00000000002b81b0: eb7ff0380024       stmg    %r7,%r15,56(%r15)
           00000000002b81b6: a7f13e00           tmll    %r15,15872
Call Trace:
([<000000000fa0bcd0>] 0xfa0bcd0)
 [<00000000002b8350>] vring_interrupt+0x5c/0x6c
 [<000000000010ab08>] do_extint+0xb8/0xf0
 [<0000000000110716>] ext_no_vtime+0x16/0x1a
 [<0000000000107e72>] cpu_idle+0x1c2/0x1e0

The problem can be triggered with a high amount of host->guest traffic.
I think its the following race:

poll says netif_rx_complete
poll calls enable_cb
enable_cb opens the interrupt mask
a new packet comes, an interrupt is triggered----\
enable_cb sees that there is more work           |
enable_cb disables the interrupt                 |
       .                                         V
       .                            interrupt is delivered
       .                            skb_recv_done does atomic napi test, ok
 some waiting                       disable_cb is called->check fails->bang!
       .
poll would do napi check
poll would do disable_cb

The fix is to let enable_cb not disable the interrupt again, but expect the
caller to do the cleanup if it returns false. In that case, the interrupt is
only disabled, if the napi test_set_bit was successful.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (cleaned up doco)
---
 drivers/net/virtio_net.c     | 10 +++++++---
 drivers/virtio/virtio_ring.c |  1 -
 include/linux/virtio.h       |  5 +++--
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e575df83e5c2..b58472cf76f8 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -203,8 +203,11 @@ again:
 	if (received < budget) {
 		netif_rx_complete(vi->dev, napi);
 		if (unlikely(!vi->rvq->vq_ops->enable_cb(vi->rvq))
-		    && netif_rx_reschedule(vi->dev, napi))
+		    && napi_schedule_prep(napi)) {
+			vi->rvq->vq_ops->disable_cb(vi->rvq);
+			__netif_rx_schedule(vi->dev, napi);
 			goto again;
+		}
 	}
 
 	return received;
@@ -278,10 +281,11 @@ again:
 		pr_debug("%s: virtio not prepared to send\n", dev->name);
 		netif_stop_queue(dev);
 
-		/* Activate callback for using skbs: if this fails it
+		/* Activate callback for using skbs: if this returns false it
 		 * means some were used in the meantime. */
 		if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) {
-			printk("Unlikely: restart svq failed\n");
+			printk("Unlikely: restart svq race\n");
+			vi->svq->vq_ops->disable_cb(vi->svq);
 			netif_start_queue(dev);
 			goto again;
 		}
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 3a28c1382131..aa714028641e 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -232,7 +232,6 @@ static bool vring_enable_cb(struct virtqueue *_vq)
 	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
 	mb();
 	if (unlikely(more_used(vq))) {
-		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
 		END_USE(vq);
 		return false;
 	}
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 260d1fcf29a4..12c18ac1b973 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -43,8 +43,9 @@ struct virtqueue
  *	vq: the struct virtqueue we're talking about.
  * @enable_cb: restart callbacks after disable_cb.
  *	vq: the struct virtqueue we're talking about.
- *	This returns "false" (and doesn't re-enable) if there are pending
- *	buffers in the queue, to avoid a race.
+ *	This re-enables callbacks; it returns "false" if there are pending
+ *	buffers in the queue, to detect a possible race between the driver
+ *	checking for more work, and enabling callbacks.
  *
  * Locking rules are straightforward: the driver is responsible for
  * locking.  No two operations may be invoked simultaneously.
-- 
cgit v1.2.3


From 916fbfb7ae5f8c8f86399794d89e6d273df8826b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Wed, 12 Mar 2008 15:26:34 +0900
Subject: devres: implement pcim_iomap_regions_request_all()

Some drivers need to reserve all PCI BARs to prevent other drivers
misusing unoccupied BARs.  pcim_iomap_regions_request_all() requests
all BARs and iomap specified BARs.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 include/linux/pci.h |  2 ++
 lib/devres.c        | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 9010f5458767..b7e4b633c69b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1045,6 +1045,8 @@ void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen);
 void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr);
 void __iomem * const *pcim_iomap_table(struct pci_dev *pdev);
 int pcim_iomap_regions(struct pci_dev *pdev, u16 mask, const char *name);
+int pcim_iomap_regions_request_all(struct pci_dev *pdev, u16 mask,
+				   const char *name);
 void pcim_iounmap_regions(struct pci_dev *pdev, u16 mask);
 
 extern int pci_pci_problems;
diff --git a/lib/devres.c b/lib/devres.c
index b1d336ce7f3d..edc27a5d1b73 100644
--- a/lib/devres.c
+++ b/lib/devres.c
@@ -297,6 +297,31 @@ int pcim_iomap_regions(struct pci_dev *pdev, u16 mask, const char *name)
 }
 EXPORT_SYMBOL(pcim_iomap_regions);
 
+/**
+ * pcim_iomap_regions_request_all - Request all BARs and iomap specified ones
+ * @pdev: PCI device to map IO resources for
+ * @mask: Mask of BARs to iomap
+ * @name: Name used when requesting regions
+ *
+ * Request all PCI BARs and iomap regions specified by @mask.
+ */
+int pcim_iomap_regions_request_all(struct pci_dev *pdev, u16 mask,
+				   const char *name)
+{
+	int request_mask = ((1 << 6) - 1) & ~mask;
+	int rc;
+
+	rc = pci_request_selected_regions(pdev, request_mask, name);
+	if (rc)
+		return rc;
+
+	rc = pcim_iomap_regions(pdev, mask, name);
+	if (rc)
+		pci_release_selected_regions(pdev, request_mask);
+	return rc;
+}
+EXPORT_SYMBOL(pcim_iomap_regions_request_all);
+
 /**
  * pcim_iounmap_regions - Unmap and release PCI BARs
  * @pdev: PCI device to map IO resources for
-- 
cgit v1.2.3


From 0382b9c35469be273ed10fa374496a924055a3c8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 17 Mar 2008 22:46:46 -0700
Subject: [PKT_SCHED]: annotate cls_u32

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_cls.h | 8 ++++----
 net/sched/cls_u32.c     | 8 ++++----
 net/sched/em_u32.c      | 2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 28dfc61cf79e..99efbed81fa2 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -201,8 +201,8 @@ enum
 
 struct tc_u32_key
 {
-	__u32		mask;
-	__u32		val;
+	__be32		mask;
+	__be32		val;
 	int		off;
 	int		offmask;
 };
@@ -213,12 +213,12 @@ struct tc_u32_sel
 	unsigned char		offshift;
 	unsigned char		nkeys;
 
-	__u16			offmask;
+	__be16			offmask;
 	__u16			off;
 	short			offoff;
 
 	short			hoff;
-	__u32			hmask;
+	__be32			hmask;
 	struct tc_u32_key	keys[0];
 };
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b18fa95ef248..c5c16b4b6e98 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -89,7 +89,7 @@ static const struct tcf_ext_map u32_ext_map = {
 
 static struct tc_u_common *u32_list;
 
-static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift)
+static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift)
 {
 	unsigned h = ntohl(key & sel->hmask)>>fshift;
 
@@ -137,7 +137,7 @@ next_knode:
 
 		for (i = n->sel.nkeys; i>0; i--, key++) {
 
-			if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
+			if ((*(__be32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
 				n = n->next;
 				goto next_knode;
 			}
@@ -182,7 +182,7 @@ check_terminal:
 		ht = n->ht_down;
 		sel = 0;
 		if (ht->divisor)
-			sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift);
+			sel = ht->divisor&u32_hash_fold(*(__be32*)(ptr+n->sel.hoff), &n->sel,n->fshift);
 
 		if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
 			goto next_ht;
@@ -190,7 +190,7 @@ check_terminal:
 		if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
 			off2 = n->sel.off + 3;
 			if (n->sel.flags&TC_U32_VAROFFSET)
-				off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
+				off2 += ntohs(n->sel.offmask & *(__be16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
 			off2 &= ~3;
 		}
 		if (n->sel.flags&TC_U32_EAT) {
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 112796e4a7c4..953f1479f7da 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
 	if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
 		return 0;
 
-	return !(((*(u32*) ptr)  ^ key->val) & key->mask);
+	return !(((*(__be32*) ptr)  ^ key->val) & key->mask);
 }
 
 static struct tcf_ematch_ops em_u32_ops = {
-- 
cgit v1.2.3


From 0ff9663c88ac5efdb5c8ac21c0bd7f993a4e3849 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 17 Mar 2008 22:48:46 -0700
Subject: [IPV4]: ipv4_is_lbcast() misannotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/in.h b/include/linux/in.h
index 70c6df882694..4065313cd7ee 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -265,7 +265,7 @@ static inline bool ipv4_is_local_multicast(__be32 addr)
 static inline bool ipv4_is_lbcast(__be32 addr)
 {
 	/* limited broadcast */
-	return addr == INADDR_BROADCAST;
+	return addr == htonl(INADDR_BROADCAST);
 }
 
 static inline bool ipv4_is_zeronet(__be32 addr)
-- 
cgit v1.2.3


From 4ae7d5cefd4aa3560e359a3b0f03e12adc8b5c86 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 19 Mar 2008 01:42:00 +0100
Subject: sched: improve affine wakeups

improve affine wakeups. Maintain the 'overlap' metric based on CFS's
sum_exec_runtime - which means the amount of time a task executes
after it wakes up some other task.

Use the 'overlap' for the wakeup decisions: if the 'overlap' is short,
it means there's strong workload coupling between this task and the
woken up task. If the 'overlap' is large then the workload is decoupled
and the scheduler will move them to separate CPUs more easily.

( Also slightly move the preempt_check within try_to_wake_up() - this has
  no effect on functionality but allows 'early wakeups' (for still-on-rq
  tasks) to be correctly accounted as well.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  3 +++
 kernel/sched.c        |  5 ++++-
 kernel/sched_debug.c  |  1 +
 kernel/sched_fair.c   | 58 +++++++++++++++++++++++++++++++++++++--------------
 4 files changed, 50 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11d8e9a74eff..3625fcaf5d0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -929,6 +929,9 @@ struct sched_entity {
 	u64			vruntime;
 	u64			prev_sum_exec_runtime;
 
+	u64			last_wakeup;
+	u64			avg_overlap;
+
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_max;
diff --git a/kernel/sched.c b/kernel/sched.c
index d1ad69b270ca..adbd475cfd25 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1855,10 +1855,11 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
-	check_preempt_curr(rq, p);
 	success = 1;
 
 out_running:
+	check_preempt_curr(rq, p);
+
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -1892,6 +1893,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.last_wakeup		= 0;
+	p->se.avg_overlap		= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start		= 0;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_start);
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
+	PN(se.avg_overlap);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b5a357396b49..87c9d3a2aafa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -556,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	account_entity_enqueue(cfs_rq, se);
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!se->last_wakeup)
+		return;
+
+	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+	se->last_wakeup = 0;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -566,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
+		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -981,12 +997,15 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_SMP
 
+static const struct sched_class fair_sched_class;
+
 static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
-	    int prev_cpu, int this_cpu, int sync, int idx,
-	    unsigned long load, unsigned long this_load,
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+	    int idx, unsigned long load, unsigned long this_load,
 	    unsigned int imbalance)
 {
+	struct task_struct *curr = this_rq->curr;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
 
@@ -994,10 +1013,15 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p,
 		return 0;
 
 	/*
-	 * Attract cache-cold tasks on sync wakeups:
+	 * If the currently running task will sleep within
+	 * a reasonable amount of time then attract this newly
+	 * woken task:
 	 */
-	if (sync && !task_hot(p, rq->clock, this_sd))
-		return 1;
+	if (sync && curr->sched_class == &fair_sched_class) {
+		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+				p->se.avg_overlap < sysctl_sched_migration_cost)
+			return 1;
+	}
 
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
@@ -1030,18 +1054,16 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	struct sched_domain *sd, *this_sd = NULL;
 	int prev_cpu, this_cpu, new_cpu;
 	unsigned long load, this_load;
+	struct rq *rq, *this_rq;
 	unsigned int imbalance;
-	struct rq *rq;
 	int idx;
 
 	prev_cpu	= task_cpu(p);
 	rq		= task_rq(p);
 	this_cpu	= smp_processor_id();
+	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
-	if (prev_cpu == this_cpu)
-		goto out;
-
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
@@ -1069,11 +1091,12 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	load = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
-	if (wake_affine(rq, this_sd, p, prev_cpu, this_cpu, sync, idx,
-				     load, this_load, imbalance)) {
-		new_cpu = this_cpu;
+	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+				     load, this_load, imbalance))
+		return this_cpu;
+
+	if (prev_cpu == this_cpu)
 		goto out;
-	}
 
 	/*
 	 * Start passive balancing when half the imbalance_pct
@@ -1083,8 +1106,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 		if (imbalance*this_load <= 100*load) {
 			schedstat_inc(this_sd, ttwu_move_balance);
 			schedstat_inc(p, se.nr_wakeups_passive);
-			new_cpu = this_cpu;
-			goto out;
+			return this_cpu;
 		}
 	}
 
@@ -1111,6 +1133,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		return;
 	}
 
+	se->last_wakeup = se->sum_exec_runtime;
+	if (unlikely(se == pse))
+		return;
+
 	cfs_rq_of(pse)->next = pse;
 
 	/*
-- 
cgit v1.2.3


From 33b0c4217dcd67b788318c3192a2912b530e4eef Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 16 Mar 2008 11:14:30 +0100
Subject: sched: tune multi-core idle balancing

WAKE_IDLE is too agressive on multi-core CPUs with the new
wake-affine code, keep it on for SMT/HT balancing alone
(where there's no cache affinity at all between logical CPUs).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/topology.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 2352f46160d3..2d8dac8799cf 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -138,7 +138,6 @@
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| SD_SHARE_PKG_RESOURCES\
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
-- 
cgit v1.2.3


From ae66be9b71b12f16b84129860d06bbfe37fbec51 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 19 Mar 2008 17:00:57 -0700
Subject: rcu: fix misplaced mb() in rcu_enter/exit_nohz()

In the process of writing up the mechanical proof of correctness for the
dynticks/preemptable-RCU interface, I noticed misplaced memory barriers in
rcu_enter_nohz() and rcu_exit_nohz().

This patch puts them in the right place and adds a comment.  The key thing to
keep in mind is that rcu_enter_nohz() is -exiting- the mode that can legally
execute RCU read-side critical sections.

The memory barrier must be between any potential RCU read-side critical
sections and the increment of the per-CPU dynticks_progress_counter, and thus
must come -before- this increment.  And vice versa for rcu_exit_nohz().

The locking in the scheduler is probably saving us for the moment.

Also, switch to smp_mb() - we don't need a barrier for uniprocessor kernels.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rcupreempt.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 01152ed532c8..d038aa6e5ee1 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -87,15 +87,15 @@ DECLARE_PER_CPU(long, dynticks_progress_counter);
 
 static inline void rcu_enter_nohz(void)
 {
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 	__get_cpu_var(dynticks_progress_counter)++;
 	WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
-	mb();
 }
 
 static inline void rcu_exit_nohz(void)
 {
-	mb();
 	__get_cpu_var(dynticks_progress_counter)++;
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 	WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
 }
 
-- 
cgit v1.2.3


From a6b91919e0881a0d0a4ae5211d5c879a8c7ca92b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 19 Mar 2008 17:01:00 -0700
Subject: fs: fix kernel-doc notation warnings

Fix kernel-doc notation warnings in fs/.

Warning(mmotm-2008-0314-1449//fs/super.c:560): missing initial short description on line:
 *	mark_files_ro
Warning(mmotm-2008-0314-1449//fs/locks.c:1277): missing initial short description on line:
 *	lease_get_mtime
Warning(mmotm-2008-0314-1449//fs/locks.c:1277): missing initial short description on line:
 *	lease_get_mtime
Warning(mmotm-2008-0314-1449//fs/namei.c:1368): missing initial short description on line:
 * lookup_one_len:  filesystem helper to lookup single pathname component
Warning(mmotm-2008-0314-1449//fs/buffer.c:3221): missing initial short description on line:
 * bh_uptodate_or_lock: Test whether the buffer is uptodate
Warning(mmotm-2008-0314-1449//fs/buffer.c:3240): missing initial short description on line:
 * bh_submit_read: Submit a locked buffer for reading
Warning(mmotm-2008-0314-1449//fs/fs-writeback.c:30): missing initial short description on line:
 * writeback_acquire: attempt to get exclusive writeback access to a device
Warning(mmotm-2008-0314-1449//fs/fs-writeback.c:47): missing initial short description on line:
 * writeback_in_progress: determine whether there is writeback in progress
Warning(mmotm-2008-0314-1449//fs/fs-writeback.c:58): missing initial short description on line:
 * writeback_release: relinquish exclusive writeback access against a device.
Warning(mmotm-2008-0314-1449//include/linux/jbd.h:351): contents before sections
Warning(mmotm-2008-0314-1449//include/linux/jbd.h:561): contents before sections
Warning(mmotm-2008-0314-1449//fs/jbd/transaction.c:1935): missing initial short description on line:
 * void journal_invalidatepage()

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c          |  4 ++--
 fs/fs-writeback.c    |  6 +++---
 fs/jbd/transaction.c |  5 ++---
 fs/locks.c           |  4 ++--
 fs/namei.c           |  6 +++---
 fs/super.c           |  6 +++---
 include/linux/jbd.h  | 11 ++---------
 7 files changed, 17 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index ddfdd2c80bf9..7ba58386beee 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3213,7 +3213,7 @@ static int buffer_cpu_notify(struct notifier_block *self,
 }
 
 /**
- * bh_uptodate_or_lock: Test whether the buffer is uptodate
+ * bh_uptodate_or_lock - Test whether the buffer is uptodate
  * @bh: struct buffer_head
  *
  * Return true if the buffer is up-to-date and false,
@@ -3232,7 +3232,7 @@ int bh_uptodate_or_lock(struct buffer_head *bh)
 EXPORT_SYMBOL(bh_uptodate_or_lock);
 
 /**
- * bh_submit_read: Submit a locked buffer for reading
+ * bh_submit_read - Submit a locked buffer for reading
  * @bh: struct buffer_head
  *
  * Returns zero on success and -EIO on error.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c0076077d338..06557679ca41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -751,7 +751,7 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 EXPORT_SYMBOL(generic_osync_inode);
 
 /**
- * writeback_acquire: attempt to get exclusive writeback access to a device
+ * writeback_acquire - attempt to get exclusive writeback access to a device
  * @bdi: the device's backing_dev_info structure
  *
  * It is a waste of resources to have more than one pdflush thread blocked on
@@ -768,7 +768,7 @@ int writeback_acquire(struct backing_dev_info *bdi)
 }
 
 /**
- * writeback_in_progress: determine whether there is writeback in progress
+ * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
  *
  * Determine whether there is writeback in progress against a backing device.
@@ -779,7 +779,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 }
 
 /**
- * writeback_release: relinquish exclusive writeback access against a device.
+ * writeback_release - relinquish exclusive writeback access against a device.
  * @bdi: the device's backing_dev_info structure
  */
 void writeback_release(struct backing_dev_info *bdi)
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 4c895044c7d6..2c9e8f5d13aa 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1904,13 +1904,12 @@ zap_buffer_unlocked:
 }
 
 /**
- * void journal_invalidatepage()
- * @journal: journal to use for flush...
+ * void journal_invalidatepage() - invalidate a journal page
+ * @journal: journal to use for flush
  * @page:    page to flush
  * @offset:  length of page to invalidate.
  *
  * Reap page buffers containing data after offset in page.
- *
  */
 void journal_invalidatepage(journal_t *journal,
 		      struct page *page,
diff --git a/fs/locks.c b/fs/locks.c
index f36f0e61558d..d83fab1b77b5 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1275,13 +1275,13 @@ out:
 EXPORT_SYMBOL(__break_lease);
 
 /**
- *	lease_get_mtime
+ *	lease_get_mtime - get the last modified time of an inode
  *	@inode: the inode
  *      @time:  pointer to a timespec which will contain the last modified time
  *
  * This is to force NFS clients to flush their caches for files with
  * exclusive leases.  The justification is that if someone has an
- * exclusive lease, then they could be modifiying it.
+ * exclusive lease, then they could be modifying it.
  */
 void lease_get_mtime(struct inode *inode, struct timespec *time)
 {
diff --git a/fs/namei.c b/fs/namei.c
index 941c8e8228c0..6b7a0eef4090 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1364,13 +1364,13 @@ static int __lookup_one_len(const char *name, struct qstr *this,
 }
 
 /**
- * lookup_one_len:  filesystem helper to lookup single pathname component
+ * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:	pathname component to lookup
  * @base:	base directory to lookup from
  * @len:	maximum length @len should be interpreted to
  *
- * Note that this routine is purely a helper for filesystem useage and should
- * not be called by generic code.  Also note that by using this function to
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.  Also note that by using this function the
  * nameidata argument is passed to the filesystem methods and a filesystem
  * using this helper needs to be prepared for that.
  */
diff --git a/fs/super.c b/fs/super.c
index 010446d8c40a..d0a941a4e620 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -556,11 +556,11 @@ out:
 }
 
 /**
- *	mark_files_ro
+ *	mark_files_ro - mark all files read-only
  *	@sb: superblock in question
  *
- *	All files are marked read/only.  We don't care about pending
- *	delete files so this should be used in 'force' mode only
+ *	All files are marked read-only.  We don't care about pending
+ *	delete files so this should be used in 'force' mode only.
  */
 
 static void mark_files_ro(struct super_block *sb)
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index b18fd3b9b835..423f58272188 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -348,8 +348,7 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 struct jbd_revoke_table_s;
 
 /**
- * struct handle_s - The handle_s type is the concrete type associated with
- *     handle_t.
+ * struct handle_s - this is the concrete type associated with handle_t.
  * @h_transaction: Which compound transaction is this update a part of?
  * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
  * @h_ref: Reference count on this handle
@@ -358,12 +357,7 @@ struct jbd_revoke_table_s;
  * @h_jdata: flag to force data journaling
  * @h_aborted: flag indicating fatal error on handle
  * @h_lockdep_map: lockdep info for debugging lock problems
- **/
-
-/* Docbook can't yet cope with the bit fields, but will leave the documentation
- * in so it can be fixed later.
  */
-
 struct handle_s
 {
 	/* Which compound transaction is this update a part of? */
@@ -558,8 +552,7 @@ struct transaction_s
 };
 
 /**
- * struct journal_s - The journal_s type is the concrete type associated with
- *     journal_t.
+ * struct journal_s - this is the concrete type associated with journal_t.
  * @j_flags:  General journaling state flags
  * @j_errno:  Is there an outstanding uncleared error on the journal (from a
  *     prior abort)?
-- 
cgit v1.2.3


From ead70773608a5d97f81cb492f117d20b5e9f323e Mon Sep 17 00:00:00 2001
From: Alex Dubov <oakad@yahoo.com>
Date: Wed, 19 Mar 2008 17:01:06 -0700
Subject: memstick: automatically retrieve "INT" value from command response

MemoryStick storage cards, when in parallel mode, send several meaningful bits
of their "INT" register as part of command response.  This data is stored by
host and can be used to spare invocation of "GET_INT" TPC on each data page
transferred between host and card.

Signed-off-by: Alex Dubov <oakad@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/memstick/core/memstick.c    |  9 +++----
 drivers/memstick/core/mspro_block.c | 15 +++++++-----
 drivers/memstick/host/jmb38x_ms.c   | 48 +++++++++++++++++++++----------------
 drivers/memstick/host/tifm_ms.c     | 17 +++++++------
 include/linux/memstick.h            |  1 -
 5 files changed, 49 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index de80dba12f9b..946e3d3506ac 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -276,8 +276,6 @@ void memstick_init_req_sg(struct memstick_request *mrq, unsigned char tpc,
 		mrq->need_card_int = 1;
 	else
 		mrq->need_card_int = 0;
-
-	mrq->get_int_reg = 0;
 }
 EXPORT_SYMBOL(memstick_init_req_sg);
 
@@ -311,8 +309,6 @@ void memstick_init_req(struct memstick_request *mrq, unsigned char tpc,
 		mrq->need_card_int = 1;
 	else
 		mrq->need_card_int = 0;
-
-	mrq->get_int_reg = 0;
 }
 EXPORT_SYMBOL(memstick_init_req);
 
@@ -342,6 +338,7 @@ static int h_memstick_read_dev_id(struct memstick_dev *card,
 			card->id.class = id_reg.class;
 		}
 		complete(&card->mrq_complete);
+		dev_dbg(&card->dev, "if_mode = %02x\n", id_reg.if_mode);
 		return -EAGAIN;
 	}
 }
@@ -422,7 +419,6 @@ static void memstick_power_on(struct memstick_host *host)
 {
 	host->set_param(host, MEMSTICK_POWER, MEMSTICK_POWER_ON);
 	host->set_param(host, MEMSTICK_INTERFACE, MEMSTICK_SERIAL);
-	msleep(1);
 }
 
 static void memstick_check(struct work_struct *work)
@@ -579,7 +575,8 @@ EXPORT_SYMBOL(memstick_suspend_host);
 void memstick_resume_host(struct memstick_host *host)
 {
 	mutex_lock(&host->lock);
-	host->set_param(host, MEMSTICK_POWER, MEMSTICK_POWER_ON);
+	if (host->card)
+		memstick_power_on(host);
 	mutex_unlock(&host->lock);
 	memstick_detect_change(host);
 }
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 1d637e4561d3..e5356f97d076 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -133,6 +133,7 @@ struct mspro_devinfo {
 struct mspro_block_data {
 	struct memstick_dev   *card;
 	unsigned int          usage_count;
+	unsigned int          caps;
 	struct gendisk        *disk;
 	struct request_queue  *queue;
 	spinlock_t            q_lock;
@@ -577,7 +578,6 @@ static int h_mspro_block_wait_for_ced(struct memstick_dev *card,
 static int h_mspro_block_transfer_data(struct memstick_dev *card,
 				       struct memstick_request **mrq)
 {
-	struct memstick_host *host = card->host;
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
 	unsigned char t_val = 0;
 	struct scatterlist t_sg = { 0 };
@@ -591,12 +591,12 @@ static int h_mspro_block_transfer_data(struct memstick_dev *card,
 	switch ((*mrq)->tpc) {
 	case MS_TPC_WRITE_REG:
 		memstick_init_req(*mrq, MS_TPC_SET_CMD, &msb->transfer_cmd, 1);
-		(*mrq)->get_int_reg = 1;
+		(*mrq)->need_card_int = 1;
 		return 0;
 	case MS_TPC_SET_CMD:
 		t_val = (*mrq)->int_reg;
 		memstick_init_req(*mrq, MS_TPC_GET_INT, NULL, 1);
-		if (host->caps & MEMSTICK_CAP_AUTO_GET_INT)
+		if (msb->caps & MEMSTICK_CAP_AUTO_GET_INT)
 			goto has_int_reg;
 		return 0;
 	case MS_TPC_GET_INT:
@@ -646,12 +646,12 @@ has_int_reg:
 					   ? MS_TPC_READ_LONG_DATA
 					   : MS_TPC_WRITE_LONG_DATA,
 				     &t_sg);
-		(*mrq)->get_int_reg = 1;
+		(*mrq)->need_card_int = 1;
 		return 0;
 	case MS_TPC_READ_LONG_DATA:
 	case MS_TPC_WRITE_LONG_DATA:
 		msb->current_page++;
-		if (host->caps & MEMSTICK_CAP_AUTO_GET_INT) {
+		if (msb->caps & MEMSTICK_CAP_AUTO_GET_INT) {
 			t_val = (*mrq)->int_reg;
 			goto has_int_reg;
 		} else {
@@ -1052,7 +1052,8 @@ static int mspro_block_init_card(struct memstick_dev *card)
 	if (memstick_set_rw_addr(card))
 		return -EIO;
 
-	if (host->caps & MEMSTICK_CAP_PAR4) {
+	msb->caps = host->caps;
+	if (msb->caps & MEMSTICK_CAP_PAR4) {
 		if (mspro_block_switch_to_parallel(card))
 			printk(KERN_WARNING "%s: could not switch to "
 			       "parallel interface\n", card->dev.bus_id);
@@ -1062,6 +1063,8 @@ static int mspro_block_init_card(struct memstick_dev *card)
 	if (rc)
 		return rc;
 	dev_dbg(&card->dev, "card activated\n");
+	if (msb->system != MEMSTICK_SYS_SERIAL)
+		msb->caps |= MEMSTICK_CAP_AUTO_GET_INT;
 
 	card->next_request = h_mspro_block_req_init;
 	msb->mrq_handler = h_mspro_block_get_ro;
diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c
index 9a57504183c7..f91037d50422 100644
--- a/drivers/memstick/host/jmb38x_ms.c
+++ b/drivers/memstick/host/jmb38x_ms.c
@@ -57,8 +57,6 @@ struct jmb38x_ms_host {
 	unsigned long           timeout_jiffies;
 	struct timer_list       timer;
 	struct memstick_request *req;
-	unsigned char           eject:1,
-				use_dma:1;
 	unsigned char           cmd_flags;
 	unsigned char           io_pos;
 	unsigned int            io_word[2];
@@ -95,9 +93,22 @@ struct jmb38x_ms {
 #define HOST_CONTROL_IF_PAR4   0x1
 #define HOST_CONTROL_IF_PAR8   0x3
 
+#define STATUS_BUSY             0x00080000
+#define STATUS_MS_DAT7          0x00040000
+#define STATUS_MS_DAT6          0x00020000
+#define STATUS_MS_DAT5          0x00010000
+#define STATUS_MS_DAT4          0x00008000
+#define STATUS_MS_DAT3          0x00004000
+#define STATUS_MS_DAT2          0x00002000
+#define STATUS_MS_DAT1          0x00001000
+#define STATUS_MS_DAT0          0x00000800
 #define STATUS_HAS_MEDIA        0x00000400
 #define STATUS_FIFO_EMPTY       0x00000200
 #define STATUS_FIFO_FULL        0x00000100
+#define STATUS_MS_CED           0x00000080
+#define STATUS_MS_ERR           0x00000040
+#define STATUS_MS_BRQ           0x00000020
+#define STATUS_MS_CNK           0x00000001
 
 #define INT_STATUS_TPC_ERR      0x00080000
 #define INT_STATUS_CRC_ERR      0x00040000
@@ -124,7 +135,7 @@ enum {
 	CMD_READY    = 0x01,
 	FIFO_READY   = 0x02,
 	REG_DATA     = 0x04,
-	AUTO_GET_INT = 0x08
+	DMA_DATA     = 0x08
 };
 
 static unsigned int jmb38x_ms_read_data(struct jmb38x_ms_host *host,
@@ -367,28 +378,27 @@ static int jmb38x_ms_issue_cmd(struct memstick_host *msh)
 		cmd |= TPC_DIR;
 	if (host->req->need_card_int)
 		cmd |= TPC_WAIT_INT;
-	if (host->req->get_int_reg)
-		cmd |= TPC_GET_INT;
 
 	data = host->req->data;
 
-	host->use_dma = !no_dma;
+	if (!no_dma)
+		host->cmd_flags |= DMA_DATA;
 
 	if (host->req->long_data) {
 		data_len = host->req->sg.length;
 	} else {
 		data_len = host->req->data_len;
-		host->use_dma = 0;
+		host->cmd_flags &= ~DMA_DATA;
 	}
 
 	if (data_len <= 8) {
 		cmd &= ~(TPC_DATA_SEL | 0xf);
 		host->cmd_flags |= REG_DATA;
 		cmd |= data_len & 0xf;
-		host->use_dma = 0;
+		host->cmd_flags &= ~DMA_DATA;
 	}
 
-	if (host->use_dma) {
+	if (host->cmd_flags & DMA_DATA) {
 		if (1 != pci_map_sg(host->chip->pdev, &host->req->sg, 1,
 				    host->req->data_dir == READ
 				    ? PCI_DMA_FROMDEVICE
@@ -451,13 +461,12 @@ static void jmb38x_ms_complete_cmd(struct memstick_host *msh, int last)
 		readl(host->addr + INT_STATUS));
 	dev_dbg(msh->cdev.dev, "c hstatus %08x\n", readl(host->addr + STATUS));
 
-	if (host->req->get_int_reg) {
-		t_val = readl(host->addr + TPC_P0);
-		host->req->int_reg = (t_val & 0xff);
-	}
+	host->req->int_reg = readl(host->addr + STATUS) & 0xff;
+
+	writel(0, host->addr + BLOCK);
+	writel(0, host->addr + DMA_CONTROL);
 
-	if (host->use_dma) {
-		writel(0, host->addr + DMA_CONTROL);
+	if (host->cmd_flags & DMA_DATA) {
 		pci_unmap_sg(host->chip->pdev, &host->req->sg, 1,
 			     host->req->data_dir == READ
 			     ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
@@ -509,7 +518,7 @@ static irqreturn_t jmb38x_ms_isr(int irq, void *dev_id)
 			else
 				host->req->error = -ETIME;
 		} else {
-			if (host->use_dma) {
+			if (host->cmd_flags & DMA_DATA) {
 				if (irq_status & INT_STATUS_EOTRAN)
 					host->cmd_flags |= FIFO_READY;
 			} else {
@@ -775,13 +784,10 @@ static struct memstick_host *jmb38x_ms_alloc_host(struct jmb38x_ms *jm, int cnt)
 	snprintf(host->host_id, DEVICE_ID_SIZE, DRIVER_NAME ":slot%d",
 		 host->id);
 	host->irq = jm->pdev->irq;
-	host->timeout_jiffies = msecs_to_jiffies(4000);
+	host->timeout_jiffies = msecs_to_jiffies(1000);
 	msh->request = jmb38x_ms_request;
 	msh->set_param = jmb38x_ms_set_param;
-	/*
-	msh->caps = MEMSTICK_CAP_AUTO_GET_INT | MEMSTICK_CAP_PAR4
-		    | MEMSTICK_CAP_PAR8;
-	*/
+
 	msh->caps = MEMSTICK_CAP_PAR4 | MEMSTICK_CAP_PAR8;
 
 	setup_timer(&host->timer, jmb38x_ms_abort, (unsigned long)msh);
diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c
index 2b5bf52a8302..eb150dfb637f 100644
--- a/drivers/memstick/host/tifm_ms.c
+++ b/drivers/memstick/host/tifm_ms.c
@@ -340,11 +340,20 @@ static void tifm_ms_complete_cmd(struct tifm_ms *host)
 
 	del_timer(&host->timer);
 
-	if (host->use_dma)
+	host->req->int_reg = readl(sock->addr + SOCK_MS_STATUS) & 0xff;
+	host->req->int_reg = (host->req->int_reg & 1)
+			     | ((host->req->int_reg << 4) & 0xe0);
+
+	writel(TIFM_FIFO_INT_SETALL,
+	       sock->addr + SOCK_DMA_FIFO_INT_ENABLE_CLEAR);
+	writel(TIFM_DMA_RESET, sock->addr + SOCK_DMA_CONTROL);
+
+	if (host->use_dma) {
 		tifm_unmap_sg(sock, &host->req->sg, 1,
 			      host->req->data_dir == READ
 			      ? PCI_DMA_FROMDEVICE
 			      : PCI_DMA_TODEVICE);
+	}
 
 	writel((~TIFM_CTRL_LED) & readl(sock->addr + SOCK_CONTROL),
 	       sock->addr + SOCK_CONTROL);
@@ -424,12 +433,6 @@ static void tifm_ms_card_event(struct tifm_dev *sock)
 		else if (host_status & TIFM_MS_STAT_CRC)
 			host->req->error = -EILSEQ;
 
-		if (host->req->error) {
-			writel(TIFM_FIFO_INT_SETALL,
-			       sock->addr + SOCK_DMA_FIFO_INT_ENABLE_CLEAR);
-			writel(TIFM_DMA_RESET, sock->addr + SOCK_DMA_CONTROL);
-		}
-
 		if (host_status & TIFM_MS_STAT_RDY)
 			host->cmd_flags |= CMD_READY;
 
diff --git a/include/linux/memstick.h b/include/linux/memstick.h
index b7ee25888836..3e686ec6a967 100644
--- a/include/linux/memstick.h
+++ b/include/linux/memstick.h
@@ -239,7 +239,6 @@ struct memstick_request {
 	unsigned char tpc;
 	unsigned char data_dir:1,
 		      need_card_int:1,
-		      get_int_reg:1,
 		      long_data:1;
 	unsigned char int_reg;
 	int           error;
-- 
cgit v1.2.3


From aedb60a67c10a0861af179725d060765262ba0fb Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serge@hallyn.com>
Date: Fri, 29 Feb 2008 15:14:57 +0000
Subject: file capabilities: remove cap_task_kill()

The original justification for cap_task_kill() was as follows:

	check_kill_permission() does appropriate uid equivalence checks.
	However with file capabilities it becomes possible for an
	unprivileged user to execute a file with file capabilities
	resulting in a more privileged task with the same uid.

However now that cap_task_kill() always returns 0 (permission
granted) when p->uid==current->uid, the whole hook is worthless,
and only likely to create more subtle problems in the corner cases
where it might still be called but return -EPERM.  Those cases
are basically when uids are different but euid/suid is equivalent
as per the check in check_kill_permission().

One example of a still-broken application is 'at' for non-root users.

This patch removes cap_task_kill().

Signed-off-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Earlier-version-tested-by: Luiz Fernando N. Capitulino <lcapitulino@mandriva.com.br>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/security.h   |  3 +--
 security/capability.c      |  1 -
 security/commoncap.c       | 40 ----------------------------------------
 security/smack/smack_lsm.c |  5 -----
 4 files changed, 1 insertion(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index b07357ca2137..c673dfd4dffc 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -57,7 +57,6 @@ extern int cap_inode_need_killpriv(struct dentry *dentry);
 extern int cap_inode_killpriv(struct dentry *dentry);
 extern int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags);
 extern void cap_task_reparent_to_init (struct task_struct *p);
-extern int cap_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid);
 extern int cap_task_setscheduler (struct task_struct *p, int policy, struct sched_param *lp);
 extern int cap_task_setioprio (struct task_struct *p, int ioprio);
 extern int cap_task_setnice (struct task_struct *p, int nice);
@@ -2187,7 +2186,7 @@ static inline int security_task_kill (struct task_struct *p,
 				      struct siginfo *info, int sig,
 				      u32 secid)
 {
-	return cap_task_kill(p, info, sig, secid);
+	return 0;
 }
 
 static inline int security_task_wait (struct task_struct *p)
diff --git a/security/capability.c b/security/capability.c
index 9e99f36a8b5c..2c6e06d18fab 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -40,7 +40,6 @@ static struct security_operations capability_ops = {
 	.inode_need_killpriv =		cap_inode_need_killpriv,
 	.inode_killpriv =		cap_inode_killpriv,
 
-	.task_kill =			cap_task_kill,
 	.task_setscheduler =		cap_task_setscheduler,
 	.task_setioprio =		cap_task_setioprio,
 	.task_setnice =			cap_task_setnice,
diff --git a/security/commoncap.c b/security/commoncap.c
index bb0c095f5761..06d5c9469ba3 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -540,41 +540,6 @@ int cap_task_setnice (struct task_struct *p, int nice)
 	return cap_safe_nice(p);
 }
 
-int cap_task_kill(struct task_struct *p, struct siginfo *info,
-				int sig, u32 secid)
-{
-	if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
-		return 0;
-
-	/*
-	 * Running a setuid root program raises your capabilities.
-	 * Killing your own setuid root processes was previously
-	 * allowed.
-	 * We must preserve legacy signal behavior in this case.
-	 */
-	if (p->uid == current->uid)
-		return 0;
-
-	/* sigcont is permitted within same session */
-	if (sig == SIGCONT && (task_session_nr(current) == task_session_nr(p)))
-		return 0;
-
-	if (secid)
-		/*
-		 * Signal sent as a particular user.
-		 * Capabilities are ignored.  May be wrong, but it's the
-		 * only thing we can do at the moment.
-		 * Used only by usb drivers?
-		 */
-		return 0;
-	if (cap_issubset(p->cap_permitted, current->cap_permitted))
-		return 0;
-	if (capable(CAP_KILL))
-		return 0;
-
-	return -EPERM;
-}
-
 /*
  * called from kernel/sys.c for prctl(PR_CABSET_DROP)
  * done without task_capability_lock() because it introduces
@@ -605,11 +570,6 @@ int cap_task_setnice (struct task_struct *p, int nice)
 {
 	return 0;
 }
-int cap_task_kill(struct task_struct *p, struct siginfo *info,
-				int sig, u32 secid)
-{
-	return 0;
-}
 #endif
 
 void cap_task_reparent_to_init (struct task_struct *p)
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 38d707593b31..732ba27923c4 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1117,11 +1117,6 @@ static int smack_task_movememory(struct task_struct *p)
 static int smack_task_kill(struct task_struct *p, struct siginfo *info,
 			   int sig, u32 secid)
 {
-	int rc;
-
-	rc = cap_task_kill(p, info, sig, secid);
-	if (rc != 0)
-		return rc;
 	/*
 	 * Special cases where signals really ought to go through
 	 * in spite of policy. Stephen Smalley suggests it may
-- 
cgit v1.2.3


From 9aefd0abd8610e8f3bb097debf3afb73f8b7b210 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 12 Mar 2008 18:31:58 +0100
Subject: sched: add exported arch_reinit_sched_domains() to header file.

Needed so it can be called from outside of sched.c.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 +
 kernel/sched.c        | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3625fcaf5d0f..fed07d03364e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -790,6 +790,7 @@ struct sched_domain {
 };
 
 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
+extern int arch_reinit_sched_domains(void);
 
 #endif	/* CONFIG_SMP */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 573179eb553e..78482e51b583 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6920,7 +6920,7 @@ match2:
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static int arch_reinit_sched_domains(void)
+int arch_reinit_sched_domains(void)
 {
 	int err;
 
-- 
cgit v1.2.3


From 22e52b072dd87faa9b2559fe89d4e8f2370f81ca Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 12 Mar 2008 18:31:59 +0100
Subject: sched: add arch_update_cpu_topology hook.

Will be called each time the scheduling domains are rebuild.
Needed for architectures that don't have a static cpu topology.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/topology.h | 2 ++
 kernel/sched.c           | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 2d8dac8799cf..bd14f8b30f09 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -50,6 +50,8 @@
 	for_each_online_node(node)						\
 		if (nr_cpus_node(node))
 
+void arch_update_cpu_topology(void);
+
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
 #define REMOTE_DISTANCE		20
diff --git a/kernel/sched.c b/kernel/sched.c
index 78482e51b583..28c73f07efb2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6807,6 +6807,10 @@ static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
  */
 static cpumask_t fallback_doms;
 
+void __attribute__((weak)) arch_update_cpu_topology(void)
+{
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -6816,6 +6820,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	int err;
 
+	arch_update_cpu_topology();
 	ndoms_cur = 1;
 	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms_cur)
-- 
cgit v1.2.3


From 03c086a747d0b242878eb881971ec61c1555869d Mon Sep 17 00:00:00 2001
From: Darren Salt <linux@youmustbejoking.demon.co.uk>
Date: Thu, 13 Mar 2008 15:35:49 +0000
Subject: PNP: increase the number of PnP memory resources from 12 to 24

Increase the number of PnP memory resources from 12 to 24.

This removes an "exceeded the max num of mem resources" warning on boot. I
also noticed the reservation of two more iomem ranges on the computer on
which this was tested.

Signed-off-by: Darren Salt <linux@youmustbejoking.demon.co.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pnp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index cd6332b88829..29dd55838e84 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -14,7 +14,7 @@
 #include <linux/mod_devicetable.h>
 
 #define PNP_MAX_PORT		40
-#define PNP_MAX_MEM		12
+#define PNP_MAX_MEM		24
 #define PNP_MAX_IRQ		2
 #define PNP_MAX_DMA		2
 #define PNP_NAME_LEN		50
-- 
cgit v1.2.3


From aacda37538e7f9cf2148eedf3766239829e51ba4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 18 Mar 2008 17:47:43 +0900
Subject: libata: implement ata_qc_raw_nbytes()

Implement ata_qc_raw_nbytes() which determines the raw user-requested
size of a PC command.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-scsi.c | 14 +++++++++++---
 include/linux/libata.h    |  8 +++++++-
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 8f0e8f2bc628..15795394b0a8 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -527,6 +527,14 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
 	return qc;
 }
 
+static void ata_qc_set_pc_nbytes(struct ata_queued_cmd *qc)
+{
+	struct scsi_cmnd *scmd = qc->scsicmd;
+
+	qc->extrabytes = scmd->request->extra_len;
+	qc->nbytes = scsi_bufflen(scmd) + qc->extrabytes;
+}
+
 /**
  *	ata_dump_status - user friendly display of error info
  *	@id: id of the port in question
@@ -2539,7 +2547,7 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc)
 	}
 
 	qc->tf.command = ATA_CMD_PACKET;
-	qc->nbytes = scsi_bufflen(scmd) + scmd->request->extra_len;
+	ata_qc_set_pc_nbytes(qc);
 
 	/* check whether ATAPI DMA is safe */
 	if (!using_pio && ata_check_atapi_dma(qc))
@@ -2550,7 +2558,7 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc)
 	 * want to set it properly, and for DMA where it is
 	 * effectively meaningless.
 	 */
-	nbytes = min(scmd->request->data_len, (unsigned int)63 * 1024);
+	nbytes = min(ata_qc_raw_nbytes(qc), (unsigned int)63 * 1024);
 
 	/* Most ATAPI devices which honor transfer chunk size don't
 	 * behave according to the spec when odd chunk size which
@@ -2876,7 +2884,7 @@ static unsigned int ata_scsi_pass_thru(struct ata_queued_cmd *qc)
 	 * TODO: find out if we need to do more here to
 	 *       cover scatter/gather case.
 	 */
-	qc->nbytes = scsi_bufflen(scmd) + scmd->request->extra_len;
+	ata_qc_set_pc_nbytes(qc);
 
 	/* request result TF and be quiet about device error */
 	qc->flags |= ATA_QCFLAG_RESULT_TF | ATA_QCFLAG_QUIET;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index a05f60013642..269cdba09578 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -463,6 +463,7 @@ struct ata_queued_cmd {
 	unsigned int		sect_size;
 
 	unsigned int		nbytes;
+	unsigned int		extrabytes;
 	unsigned int		curbytes;
 
 	struct scatterlist	*cursg;
@@ -1336,6 +1337,11 @@ static inline struct ata_queued_cmd *ata_qc_from_tag(struct ata_port *ap,
 	return NULL;
 }
 
+static inline unsigned int ata_qc_raw_nbytes(struct ata_queued_cmd *qc)
+{
+	return qc->nbytes - min(qc->extrabytes, qc->nbytes);
+}
+
 static inline void ata_tf_init(struct ata_device *dev, struct ata_taskfile *tf)
 {
 	memset(tf, 0, sizeof(*tf));
@@ -1354,7 +1360,7 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
 	qc->flags = 0;
 	qc->cursg = NULL;
 	qc->cursg_ofs = 0;
-	qc->nbytes = qc->curbytes = 0;
+	qc->nbytes = qc->extrabytes = qc->curbytes = 0;
 	qc->n_elem = 0;
 	qc->err_mask = 0;
 	qc->sect_size = ATA_SECT_SIZE;
-- 
cgit v1.2.3


From 392e1d9817d0024c96aae237c3c4349e47c976fd Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Tue, 11 Mar 2008 10:20:12 -0400
Subject: USB: new quirk flag to avoid Set-Interface

This patch (as1057) fixes a problem with the X-Rite/Gretag-Macbeth
Eye-One Pro display colorimeter; the device crashes when it receives a
Set-Interface request.  A new quirk (USB_QUIRK_NO_SET_INTF) is
introduced and a quirks entry is created for this device.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/message.c | 5 ++++-
 drivers/usb/core/quirks.c  | 3 +++
 include/linux/usb/quirks.h | 3 +++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index fefb92296e8f..c311f67b7f08 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -1206,7 +1206,10 @@ int usb_set_interface(struct usb_device *dev, int interface, int alternate)
 		return -EINVAL;
 	}
 
-	ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
+	if (dev->quirks & USB_QUIRK_NO_SET_INTF)
+		ret = -EPIPE;
+	else
+		ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
 				   USB_REQ_SET_INTERFACE, USB_RECIP_INTERFACE,
 				   alternate, interface, NULL, 0, 5000);
 
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index d9d1eb19f2a1..dfc5418ea10c 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -50,6 +50,9 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* M-Systems Flash Disk Pioneers */
 	{ USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME },
 
+	/* X-Rite/Gretag-Macbeth Eye-One Pro display colorimeter */
+	{ USB_DEVICE(0x0971, 0x2000), .driver_info = USB_QUIRK_NO_SET_INTF },
+
 	/* Action Semiconductor flash disk */
 	{ USB_DEVICE(0x10d6, 0x2200), .driver_info =
 			USB_QUIRK_STRING_FETCH_255 },
diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index 2692ec9389ca..1f999ec8d08c 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -9,3 +9,6 @@
 
 /* device can't resume correctly so reset it instead */
 #define USB_QUIRK_RESET_RESUME		0x00000002
+
+/* device can't handle Set-Interface requests */
+#define USB_QUIRK_NO_SET_INTF		0x00000004
-- 
cgit v1.2.3


From cc36bdd47ae51b66780b317c1fa519221f894405 Mon Sep 17 00:00:00 2001
From: Constantin Baranov <const@tltsu.ru>
Date: Sun, 16 Mar 2008 20:04:23 +0000
Subject: USB: add support for Motorola ROKR Z6 cellphone in mass storage mode

Motorola ROKR Z6 cellphone has bugs in its USB, so it is impossible to use
it as mass storage. Patch describes new "unusual" USB device for it with
FIX_INQUIRY and FIX_CAPACITY flags and new BULK_IGNORE_TAG flag.
Last flag relaxes check for equality of bcs->Tag and us->tag in
usb_stor_Bulk_transport routine.

Signed-off-by: Constantin Baranov <const@tltsu.ru>
Signed-off-by: Matthew Dharm <mdharm-usb@one-eyed-alien.net>
Signed-off-by: Daniel Drake <dsd@gentoo.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/storage/transport.c    |  3 ++-
 drivers/usb/storage/unusual_devs.h | 11 +++++++++++
 include/linux/usb_usual.h          |  4 +++-
 3 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c
index 5780ed15f1ad..bdd4334bed5a 100644
--- a/drivers/usb/storage/transport.c
+++ b/drivers/usb/storage/transport.c
@@ -1009,7 +1009,8 @@ int usb_stor_Bulk_transport(struct scsi_cmnd *srb, struct us_data *us)
 	US_DEBUGP("Bulk Status S 0x%x T 0x%x R %u Stat 0x%x\n",
 			le32_to_cpu(bcs->Signature), bcs->Tag, 
 			residue, bcs->Status);
-	if (bcs->Tag != us->tag || bcs->Status > US_BULK_STAT_PHASE) {
+	if (!(bcs->Tag == us->tag || (us->flags & US_FL_BULK_IGNORE_TAG)) ||
+		bcs->Status > US_BULK_STAT_PHASE) {
 		US_DEBUGP("Bulk logical error\n");
 		return USB_STOR_TRANSPORT_ERROR;
 	}
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 99679a8cfa02..e5219a56947c 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -1589,6 +1589,17 @@ UNUSUAL_DEV(  0x22b8, 0x4810, 0x0001, 0x0001,
 		US_SC_DEVICE, US_PR_DEVICE, NULL,
 		US_FL_FIX_CAPACITY),
 
+/*
+ * Patch by Constantin Baranov <const@tltsu.ru>
+ * Report by Andreas Koenecke.
+ * Motorola ROKR Z6.
+ */
+UNUSUAL_DEV(  0x22b8, 0x6426, 0x0101, 0x0101,
+		"Motorola",
+		"MSnc.",
+		US_SC_DEVICE, US_PR_DEVICE, NULL,
+		US_FL_FIX_INQUIRY | US_FL_FIX_CAPACITY | US_FL_BULK_IGNORE_TAG),
+
 /* Reported by Radovan Garabik <garabik@kassiopeia.juls.savba.sk> */
 UNUSUAL_DEV(  0x2735, 0x100b, 0x0000, 0x9999,
 		"MPIO",
diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h
index cee0623b3c7b..0a40dfa44c9f 100644
--- a/include/linux/usb_usual.h
+++ b/include/linux/usb_usual.h
@@ -50,7 +50,9 @@
 	US_FLAG(CAPACITY_HEURISTICS,	0x00001000)		\
 		/* sometimes sizes is too big */		\
 	US_FLAG(MAX_SECTORS_MIN,0x00002000)			\
-		/* Sets max_sectors to arch min */
+		/* Sets max_sectors to arch min */		\
+	US_FLAG(BULK_IGNORE_TAG,0x00004000)			\
+		/* Ignore tag mismatch in bulk operations */
 
 
 #define US_FLAG(name, value)	US_FL_##name = value ,
-- 
cgit v1.2.3


From 49741c4d01554c2630cea02cfdf236b17062a912 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 17 Mar 2008 14:21:18 -0700
Subject: PCI: revert "pcie: utilize pcie transaction pending bit"

Revert as it is reported to cause problems for people.

commit 4348a2dc49f9baecd34a9b0904245488c6189398
Author: Shaohua Li <shaohua.li@intel.com>
Date:   Wed Oct 24 10:45:08 2007 +0800

    pcie: utilize pcie transaction pending bit

    PCIE has a mechanism to wait for Non-Posted request to complete. I think
    pci_disable_device is a good place to do this.

    Signed-off-by: Shaohua Li <shaohua.li@intel.com>
    Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Due to the regression reported at
http://bugzilla.kernel.org/show_bug.cgi?id=10065

Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Soeren Sonnenburg <kernel@nn7.de>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/pci.c   | 21 ---------------------
 include/linux/pci.h |  4 ----
 2 files changed, 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 183fddaa38b7..a4445b7210bf 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -314,24 +314,6 @@ int pci_find_ht_capability(struct pci_dev *dev, int ht_cap)
 }
 EXPORT_SYMBOL_GPL(pci_find_ht_capability);
 
-void pcie_wait_pending_transaction(struct pci_dev *dev)
-{
-	int pos;
-	u16 reg16;
-
-	pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
-	if (!pos)
-		return;
-	while (1) {
-		pci_read_config_word(dev, pos + PCI_EXP_DEVSTA, &reg16);
-		if (!(reg16 & PCI_EXP_DEVSTA_TRPND))
-			break;
-		cpu_relax();
-	}
-
-}
-EXPORT_SYMBOL_GPL(pcie_wait_pending_transaction);
-
 /**
  * pci_find_parent_resource - return resource region of parent bus of given region
  * @dev: PCI device structure contains resources to be searched
@@ -936,9 +918,6 @@ pci_disable_device(struct pci_dev *dev)
 	if (atomic_sub_return(1, &dev->enable_cnt) != 0)
 		return;
 
-	/* Wait for all transactions are finished before disabling the device */
-	pcie_wait_pending_transaction(dev);
-
 	pci_read_config_word(dev, PCI_COMMAND, &pci_command);
 	if (pci_command & PCI_COMMAND_MASTER) {
 		pci_command &= ~PCI_COMMAND_MASTER;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b7e4b633c69b..ea760e519c46 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -513,7 +513,6 @@ int pci_find_next_capability(struct pci_dev *dev, u8 pos, int cap);
 int pci_find_ext_capability(struct pci_dev *dev, int cap);
 int pci_find_ht_capability(struct pci_dev *dev, int ht_cap);
 int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap);
-void pcie_wait_pending_transaction(struct pci_dev *dev);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
 
 struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device,
@@ -884,9 +883,6 @@ static inline int pci_find_ext_capability(struct pci_dev *dev, int cap)
 	return 0;
 }
 
-static inline void pcie_wait_pending_transaction(struct pci_dev *dev)
-{ }
-
 /* Power management related routines */
 static inline int pci_save_state(struct pci_dev *dev)
 {
-- 
cgit v1.2.3


From 8b78cf602fd3bd97c0080edd22fe8fd5d0fa7832 Mon Sep 17 00:00:00 2001
From: Yi Yang <yi.y.yang@intel.com>
Date: Mon, 25 Feb 2008 08:46:12 +0800
Subject: cpuidle: fix cpuidle time and usage overflow

cpuidle C-state sysfs node time and usage are very easy to overflow because
they are all of unsigned int type, time will overflow within about two hours,
usage will take longer time to overflow, but they are increasing for ever.

This patch will convert them to unsigned long long.

Signed-off-by: Yi Yang <yi.y.yang@intel.com>
Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/cpuidle/cpuidle.c |  2 +-
 drivers/cpuidle/sysfs.c   | 10 ++++++++--
 include/linux/cpuidle.h   |  4 ++--
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d73663a52324..d42deb310ac7 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -67,7 +67,7 @@ static void cpuidle_idle_call(void)
 	/* enter the state and update stats */
 	dev->last_residency = target_state->enter(dev, target_state);
 	dev->last_state = target_state;
-	target_state->time += dev->last_residency;
+	target_state->time += (unsigned long long)dev->last_residency;
 	target_state->usage++;
 
 	/* give the governor an opportunity to reflect on the outcome */
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 69102ca05685..e949618b9be0 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -218,6 +218,12 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
 	return sprintf(buf, "%u\n", state->_name);\
 }
 
+#define define_show_state_ull_function(_name) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
+{ \
+	return sprintf(buf, "%llu\n", state->_name);\
+}
+
 #define define_show_state_str_function(_name) \
 static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
 { \
@@ -228,8 +234,8 @@ static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
 
 define_show_state_function(exit_latency)
 define_show_state_function(power_usage)
-define_show_state_function(usage)
-define_show_state_function(time)
+define_show_state_ull_function(usage)
+define_show_state_ull_function(time)
 define_show_state_str_function(name)
 define_show_state_str_function(desc)
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 6b72a4584086..51e6b1e520e6 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -38,8 +38,8 @@ struct cpuidle_state {
 	unsigned int	power_usage; /* in mW */
 	unsigned int	target_residency; /* in US */
 
-	unsigned int	usage;
-	unsigned int	time; /* in US */
+	unsigned long long	usage;
+	unsigned long long	time; /* in US */
 
 	int (*enter)	(struct cpuidle_device *dev,
 			 struct cpuidle_state *state);
-- 
cgit v1.2.3


From 06d8308c61e54346585b2691c13ee3f90cb6fb2f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 22 Mar 2008 09:20:24 +0100
Subject: NOHZ: reevaluate idle sleep length after add_timer_on()

add_timer_on() can add a timer on a CPU which is currently in a long
idle sleep, but the timer wheel is not reevaluated by the nohz code on
that CPU. So a timer can be delayed for quite a long time. This
triggered a false positive in the clocksource watchdog code.

To avoid this we need to wake up the idle CPU and enforce the
reevaluation of the timer wheel for the next timer event.

Add a function, which checks a given CPU for idle state, marks the
idle task with NEED_RESCHED and sends a reschedule IPI to notify the
other CPU of the change in the timer wheel.

Call this function from add_timer_on().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: stable@kernel.org

--
 include/linux/sched.h |    6 ++++++
 kernel/sched.c        |   43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c        |   10 +++++++++-
 3 files changed, 58 insertions(+), 1 deletion(-)
---
 include/linux/sched.h |  6 ++++++
 kernel/sched.c        | 43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c        | 10 +++++++++-
 3 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fed07d03364e..6a1e7afb099b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1541,6 +1541,12 @@ static inline void idle_task_exit(void) {}
 
 extern void sched_idle_next(void);
 
+#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+extern void wake_up_idle_cpu(int cpu);
+#else
+static inline void wake_up_idle_cpu(int cpu) { }
+#endif
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
diff --git a/kernel/sched.c b/kernel/sched.c
index 28c73f07efb2..8dcdec6fe0fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1052,6 +1052,49 @@ static void resched_cpu(int cpu)
 	resched_task(cpu_curr(cpu));
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (cpu == smp_processor_id())
+		return;
+
+	/*
+	 * This is safe, as this function is called with the timer
+	 * wheel base lock of (cpu) held. When the CPU is on the way
+	 * to idle and has not yet set rq->curr to idle then it will
+	 * be serialized on the timer wheel base lock and take the new
+	 * timer into account automatically.
+	 */
+	if (rq->curr != rq->idle)
+		return;
+
+	/*
+	 * We can set TIF_RESCHED on the idle task of the other CPU
+	 * lockless. The worst case is that the other CPU runs the
+	 * idle task through an additional NOOP schedule()
+	 */
+	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(rq->idle))
+		smp_send_reschedule(cpu);
+}
+#endif
+
 #else
 static void __resched_task(struct task_struct *p, int tif_bit)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88b..b024106daa70 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	spin_lock_irqsave(&base->lock, flags);
 	timer_set_base(timer, base);
 	internal_add_timer(base, timer);
+	/*
+	 * Check whether the other CPU is idle and needs to be
+	 * triggered to reevaluate the timer wheel when nohz is
+	 * active. We are protected against the other CPU fiddling
+	 * with the timer by holding the timer base lock. This also
+	 * makes sure that a CPU on the way to idle can not evaluate
+	 * the timer wheel.
+	 */
+	wake_up_idle_cpu(cpu);
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
-
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
-- 
cgit v1.2.3


From a6bd8e13034dd7d60b6f14217096efa192d0adc1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 28 Mar 2008 11:05:53 -0500
Subject: lguest: comment documentation update.

Took some cycles to re-read the Lguest Journey end-to-end, fix some
rot and tighten some phrases.

Only comments change.  No new jokes, but a couple of recycled old jokes.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c         |  69 ++++++++++++----------
 arch/x86/lguest/boot.c                | 108 +++++++++++++++++++---------------
 arch/x86/lguest/i386_head.S           |  15 +++--
 drivers/lguest/core.c                 |  18 +++---
 drivers/lguest/hypercalls.c           |  11 +++-
 drivers/lguest/interrupts_and_traps.c |   7 +--
 drivers/lguest/lguest_device.c        |  11 ++--
 drivers/lguest/lguest_user.c          |  30 +++++++---
 drivers/lguest/page_tables.c          |  32 +++++-----
 drivers/lguest/x86/core.c             |  33 +++++++----
 drivers/lguest/x86/switcher_32.S      |   8 +--
 include/asm-x86/lguest_hcall.h        |   2 +-
 include/linux/lguest_launcher.h       |   6 +-
 13 files changed, 208 insertions(+), 142 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index d45c7f682b1b..4c1fc65a8b3d 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,7 +1,7 @@
 /*P:100 This is the Launcher code, a simple program which lays out the
- * "physical" memory for the new Guest by mapping the kernel image and the
- * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
-:*/
+ * "physical" memory for the new Guest by mapping the kernel image and
+ * the virtual devices, then opens /dev/lguest to tell the kernel
+ * about the Guest and control it. :*/
 #define _LARGEFILE64_SOURCE
 #define _GNU_SOURCE
 #include <stdio.h>
@@ -43,7 +43,7 @@
 #include "linux/virtio_console.h"
 #include "linux/virtio_ring.h"
 #include "asm-x86/bootparam.h"
-/*L:110 We can ignore the 38 include files we need for this program, but I do
+/*L:110 We can ignore the 39 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
  *
  * As Linus said, "C is a Spartan language, and so should your naming be."  I
@@ -320,7 +320,7 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
 		err(1, "Reading program headers");
 
 	/* Try all the headers: there are usually only three.  A read-only one,
-	 * a read-write one, and a "note" section which isn't loadable. */
+	 * a read-write one, and a "note" section which we don't load. */
 	for (i = 0; i < ehdr->e_phnum; i++) {
 		/* If this isn't a loadable segment, we ignore it */
 		if (phdr[i].p_type != PT_LOAD)
@@ -387,7 +387,7 @@ static unsigned long load_kernel(int fd)
 	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
 		return map_elf(fd, &hdr);
 
-	/* Otherwise we assume it's a bzImage, and try to unpack it */
+	/* Otherwise we assume it's a bzImage, and try to load it. */
 	return load_bzimage(fd);
 }
 
@@ -433,12 +433,12 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
 	return len;
 }
 
-/* Once we know how much memory we have, we can construct simple linear page
+/* Once we know how much memory we have we can construct simple linear page
  * tables which set virtual == physical which will get the Guest far enough
  * into the boot to create its own.
  *
  * We lay them out of the way, just below the initrd (which is why we need to
- * know its size). */
+ * know its size here). */
 static unsigned long setup_pagetables(unsigned long mem,
 				      unsigned long initrd_size)
 {
@@ -850,7 +850,8 @@ static void handle_console_output(int fd, struct virtqueue *vq)
  *
  * Handling output for network is also simple: we get all the output buffers
  * and write them (ignoring the first element) to this device's file descriptor
- * (stdout). */
+ * (/dev/net/tun).
+ */
 static void handle_net_output(int fd, struct virtqueue *vq)
 {
 	unsigned int head, out, in;
@@ -924,7 +925,7 @@ static void enable_fd(int fd, struct virtqueue *vq)
 	write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
 }
 
-/* Resetting a device is fairly easy. */
+/* When the Guest asks us to reset a device, it's is fairly easy. */
 static void reset_device(struct device *dev)
 {
 	struct virtqueue *vq;
@@ -1003,8 +1004,8 @@ static void handle_input(int fd)
 		if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
 			break;
 
-		/* Otherwise, call the device(s) which have readable
-		 * file descriptors and a method of handling them.  */
+		/* Otherwise, call the device(s) which have readable file
+		 * descriptors and a method of handling them.  */
 		for (i = devices.dev; i; i = i->next) {
 			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
 				int dev_fd;
@@ -1015,8 +1016,7 @@ static void handle_input(int fd)
 				 * should no longer service it.  Networking and
 				 * console do this when there's no input
 				 * buffers to deliver into.  Console also uses
-				 * it when it discovers that stdin is
-				 * closed. */
+				 * it when it discovers that stdin is closed. */
 				FD_CLR(i->fd, &devices.infds);
 				/* Tell waker to ignore it too, by sending a
 				 * negative fd number (-1, since 0 is a valid
@@ -1033,7 +1033,8 @@ static void handle_input(int fd)
  *
  * All devices need a descriptor so the Guest knows it exists, and a "struct
  * device" so the Launcher can keep track of it.  We have common helper
- * routines to allocate and manage them. */
+ * routines to allocate and manage them.
+ */
 
 /* The layout of the device page is a "struct lguest_device_desc" followed by a
  * number of virtqueue descriptors, then two sets of feature bits, then an
@@ -1078,7 +1079,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	struct virtqueue **i, *vq = malloc(sizeof(*vq));
 	void *p;
 
-	/* First we need some pages for this virtqueue. */
+	/* First we need some memory for this virtqueue. */
 	pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1)
 		/ getpagesize();
 	p = get_pages(pages);
@@ -1122,7 +1123,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 }
 
 /* The first half of the feature bitmask is for us to advertise features.  The
- * second half if for the Guest to accept features. */
+ * second half is for the Guest to accept features. */
 static void add_feature(struct device *dev, unsigned bit)
 {
 	u8 *features = get_feature_bits(dev);
@@ -1151,7 +1152,9 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
 }
 
 /* This routine does all the creation and setup of a new device, including
- * calling new_dev_desc() to allocate the descriptor and device memory. */
+ * calling new_dev_desc() to allocate the descriptor and device memory.
+ *
+ * See what I mean about userspace being boring? */
 static struct device *new_device(const char *name, u16 type, int fd,
 				 bool (*handle_input)(int, struct device *))
 {
@@ -1492,7 +1495,10 @@ static int io_thread(void *_dev)
 	while (read(vblk->workpipe[0], &c, 1) == 1) {
 		/* We acknowledge each request immediately to reduce latency,
 		 * rather than waiting until we've done them all.  I haven't
-		 * measured to see if it makes any difference. */
+		 * measured to see if it makes any difference.
+		 *
+		 * That would be an interesting test, wouldn't it?  You could
+		 * also try having more than one I/O thread. */
 		while (service_io(dev))
 			write(vblk->done_fd, &c, 1);
 	}
@@ -1500,7 +1506,7 @@ static int io_thread(void *_dev)
 }
 
 /* Now we've seen the I/O thread, we return to the Launcher to see what happens
- * when the thread tells us it's completed some I/O. */
+ * when that thread tells us it's completed some I/O. */
 static bool handle_io_finish(int fd, struct device *dev)
 {
 	char c;
@@ -1572,11 +1578,12 @@ static void setup_block_file(const char *filename)
 	 * more work. */
 	pipe(vblk->workpipe);
 
-	/* Create stack for thread and run it */
+	/* Create stack for thread and run it.  Since stack grows upwards, we
+	 * point the stack pointer to the end of this region. */
 	stack = malloc(32768);
 	/* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
 	 * becoming a zombie. */
-	if (clone(io_thread, stack + 32768,  CLONE_VM | SIGCHLD, dev) == -1)
+	if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
 		err(1, "Creating clone");
 
 	/* We don't need to keep the I/O thread's end of the pipes open. */
@@ -1586,14 +1593,14 @@ static void setup_block_file(const char *filename)
 	verbose("device %u: virtblock %llu sectors\n",
 		devices.device_num, le64_to_cpu(conf.capacity));
 }
-/* That's the end of device setup. :*/
+/* That's the end of device setup. */
 
-/* Reboot */
+/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
 static void __attribute__((noreturn)) restart_guest(void)
 {
 	unsigned int i;
 
-	/* Closing pipes causes the waker thread and io_threads to die, and
+	/* Closing pipes causes the Waker thread and io_threads to die, and
 	 * closing /dev/lguest cleans up the Guest.  Since we don't track all
 	 * open fds, we simply close everything beyond stderr. */
 	for (i = 3; i < FD_SETSIZE; i++)
@@ -1602,7 +1609,7 @@ static void __attribute__((noreturn)) restart_guest(void)
 	err(1, "Could not exec %s", main_args[0]);
 }
 
-/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
+/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
  * its input and output, and finally, lays it to rest. */
 static void __attribute__((noreturn)) run_guest(int lguest_fd)
 {
@@ -1643,7 +1650,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
 			err(1, "Resetting break");
 	}
 }
-/*
+/*L:240
  * This is the end of the Launcher.  The good news: we are over halfway
  * through!  The bad news: the most fiendish part of the code still lies ahead
  * of us.
@@ -1690,8 +1697,8 @@ int main(int argc, char *argv[])
 	 * device receive input from a file descriptor, we keep an fdset
 	 * (infds) and the maximum fd number (max_infd) with the head of the
 	 * list.  We also keep a pointer to the last device.  Finally, we keep
-	 * the next interrupt number to hand out (1: remember that 0 is used by
-	 * the timer). */
+	 * the next interrupt number to use for devices (1: remember that 0 is
+	 * used by the timer). */
 	FD_ZERO(&devices.infds);
 	devices.max_infd = -1;
 	devices.lastdev = NULL;
@@ -1792,8 +1799,8 @@ int main(int argc, char *argv[])
 	lguest_fd = tell_kernel(pgdir, start);
 
 	/* We fork off a child process, which wakes the Launcher whenever one
-	 * of the input file descriptors needs attention.  Otherwise we would
-	 * run the Guest until it tries to output something. */
+	 * of the input file descriptors needs attention.  We call this the
+	 * Waker, and we'll cover it in a moment. */
 	waker_fd = setup_waker(lguest_fd);
 
 	/* Finally, run the Guest.  This doesn't return. */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a104c532ff70..3335b4595efd 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -10,21 +10,19 @@
  * (such as the example in Documentation/lguest/lguest.c) is called the
  * Launcher.
  *
- * Secondly, we only run specially modified Guests, not normal kernels.  When
- * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
- * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
- * how to be a Guest.  This means that you can use the same kernel you boot
- * normally (ie. as a Host) as a Guest.
+ * Secondly, we only run specially modified Guests, not normal kernels: setting
+ * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
+ * how to be a Guest at boot time.  This means that you can use the same kernel
+ * you boot normally (ie. as a Host) as a Guest.
  *
  * These Guests know that they cannot do privileged operations, such as disable
  * interrupts, and that they have to ask the Host to do such things explicitly.
  * This file consists of all the replacements for such low-level native
  * hardware operations: these special Guest versions call the Host.
  *
- * So how does the kernel know it's a Guest?  The Guest starts at a special
- * entry point marked with a magic string, which sets up a few things then
- * calls here.  We replace the native functions various "paravirt" structures
- * with our Guest versions, then boot like normal. :*/
+ * So how does the kernel know it's a Guest?  We'll see that later, but let's
+ * just say that we end up here where we replace the native functions various
+ * "paravirt" structures with our Guest versions, then boot like normal. :*/
 
 /*
  * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -134,7 +132,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
  * lguest_leave_lazy_mode().
  *
  * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing. */
+ * future processing: */
 static void lazy_hcall(unsigned long call,
 		       unsigned long arg1,
 		       unsigned long arg2,
@@ -147,7 +145,7 @@ static void lazy_hcall(unsigned long call,
 }
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue a hypercall to flush any stored calls. */
+ * issue the do-nothing hypercall to flush any stored calls. */
 static void lguest_leave_lazy_mode(void)
 {
 	paravirt_leave_lazy(paravirt_get_lazy_mode());
@@ -164,7 +162,7 @@ static void lguest_leave_lazy_mode(void)
  *
  * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
  * which the Guest can update with a single instruction.  The Host knows to
- * check there when it wants to deliver an interrupt.
+ * check there before it tries to deliver an interrupt.
  */
 
 /* save_flags() is expected to return the processor state (ie. "flags").  The
@@ -196,10 +194,15 @@ static void irq_enable(void)
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt).  This seems to work for the moment,
  * since interrupts are rare and we'll just get the interrupt on the next timer
- * tick, but when we turn on CONFIG_NO_HZ, we should revisit this.  One way
+ * tick, but now we can run with CONFIG_NO_HZ, we should revisit this.  One way
  * would be to put the "irq_enabled" field in a page by itself, and have the
  * Host write-protect it when an interrupt comes in when irqs are disabled.
- * There will then be a page fault as soon as interrupts are re-enabled. :*/
+ * There will then be a page fault as soon as interrupts are re-enabled.
+ *
+ * A better method is to implement soft interrupt disable generally for x86:
+ * instead of disabling interrupts, we set a flag.  If an interrupt does come
+ * in, we then disable them for real.  This is uncommon, so we could simply use
+ * a hypercall for interrupt control and not worry about efficiency. :*/
 
 /*G:034
  * The Interrupt Descriptor Table (IDT).
@@ -212,6 +215,10 @@ static void irq_enable(void)
 static void lguest_write_idt_entry(gate_desc *dt,
 				   int entrynum, const gate_desc *g)
 {
+	/* The gate_desc structure is 8 bytes long: we hand it to the Host in
+	 * two 32-bit chunks.  The whole 32-bit kernel used to hand descriptors
+	 * around like this; typesafety wasn't a big concern in Linux's early
+	 * years. */
 	u32 *desc = (u32 *)g;
 	/* Keep the local copy up to date. */
 	native_write_idt_entry(dt, entrynum, g);
@@ -243,7 +250,8 @@ static void lguest_load_idt(const struct desc_ptr *desc)
  *
  * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
  * hypercall and use that repeatedly to load a new IDT.  I don't think it
- * really matters, but wouldn't it be nice if they were the same?
+ * really matters, but wouldn't it be nice if they were the same?  Wouldn't
+ * it be even better if you were the one to send the patch to fix it?
  */
 static void lguest_load_gdt(const struct desc_ptr *desc)
 {
@@ -298,9 +306,9 @@ static void lguest_load_tr_desc(void)
 
 /* The "cpuid" instruction is a way of querying both the CPU identity
  * (manufacturer, model, etc) and its features.  It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel and AMD.  As you
- * might imagine, after a decade and a half this treatment, it is now a giant
- * ball of hair.  Its entry in the current Intel manual runs to 28 pages.
+ * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
+ * As you might imagine, after a decade and a half this treatment, it is now a
+ * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
  *
  * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
  * has been translated into 4 languages.  I am not making this up!
@@ -594,17 +602,17 @@ static unsigned long lguest_get_wallclock(void)
 	return lguest_data.time.tv_sec;
 }
 
-/* The TSC is a Time Stamp Counter.  The Host tells us what speed it runs at,
- * or 0 if it's unusable as a reliable clock source.  This matches what we want
- * here: if we return 0 from this function, the x86 TSC clock will not register
- * itself. */
+/* The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
+ * what speed it runs at, or 0 if it's unusable as a reliable clock source.
+ * This matches what we want here: if we return 0 from this function, the x86
+ * TSC clock will give up and not register itself. */
 static unsigned long lguest_cpu_khz(void)
 {
 	return lguest_data.tsc_khz;
 }
 
-/* If we can't use the TSC, the kernel falls back to our "lguest_clock", where
- * we read the time value given to us by the Host. */
+/* If we can't use the TSC, the kernel falls back to our lower-priority
+ * "lguest_clock", where we read the time value given to us by the Host. */
 static cycle_t lguest_clock_read(void)
 {
 	unsigned long sec, nsec;
@@ -648,12 +656,16 @@ static struct clocksource lguest_clock = {
 static int lguest_clockevent_set_next_event(unsigned long delta,
                                            struct clock_event_device *evt)
 {
+	/* FIXME: I don't think this can ever happen, but James tells me he had
+	 * to put this code in.  Maybe we should remove it now.  Anyone? */
 	if (delta < LG_CLOCK_MIN_DELTA) {
 		if (printk_ratelimit())
 			printk(KERN_DEBUG "%s: small delta %lu ns\n",
 			       __FUNCTION__, delta);
 		return -ETIME;
 	}
+
+	/* Please wake us this far in the future. */
 	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
 	return 0;
 }
@@ -738,7 +750,7 @@ static void lguest_time_init(void)
  * will not tolerate us trying to use that), the stack pointer, and the number
  * of pages in the stack. */
 static void lguest_load_sp0(struct tss_struct *tss,
-				     struct thread_struct *thread)
+			    struct thread_struct *thread)
 {
 	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
 		   THREAD_SIZE/PAGE_SIZE);
@@ -786,9 +798,8 @@ static void lguest_safe_halt(void)
 	hcall(LHCALL_HALT, 0, 0, 0);
 }
 
-/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a
- * message out when we're crashing as well as elegant termination like powering
- * off.
+/* The SHUTDOWN hypercall takes a string to describe what's happening, and
+ * an argument which says whether this to restart (reboot) the Guest or not.
  *
  * Note that the Host always prefers that the Guest speak in physical addresses
  * rather than virtual addresses, so we use __pa() here. */
@@ -816,8 +827,9 @@ static struct notifier_block paniced = {
 /* Setting up memory is fairly easy. */
 static __init char *lguest_memory_setup(void)
 {
-	/* We do this here and not earlier because lockcheck barfs if we do it
-	 * before start_kernel() */
+	/* We do this here and not earlier because lockcheck used to barf if we
+	 * did it before start_kernel().  I think we fixed that, so it'd be
+	 * nice to move it back to lguest_init.  Patch welcome... */
 	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
 
 	/* The Linux bootloader header contains an "e820" memory map: the
@@ -850,12 +862,19 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
 	return len;
 }
 
+/* Rebooting also tells the Host we're finished, but the RESTART flag tells the
+ * Launcher to reboot us. */
+static void lguest_restart(char *reason)
+{
+	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+}
+
 /*G:050
  * Patching (Powerfully Placating Performance Pedants)
  *
- * We have already seen that pv_ops structures let us replace simple
- * native instructions with calls to the appropriate back end all throughout
- * the kernel.  This allows the same kernel to run as a Guest and as a native
+ * We have already seen that pv_ops structures let us replace simple native
+ * instructions with calls to the appropriate back end all throughout the
+ * kernel.  This allows the same kernel to run as a Guest and as a native
  * kernel, but it's slow because of all the indirect branches.
  *
  * Remember that David Wheeler quote about "Any problem in computer science can
@@ -908,14 +927,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
 	return insn_len;
 }
 
-static void lguest_restart(char *reason)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
-}
-
-/*G:030 Once we get to lguest_init(), we know we're a Guest.  The pv_ops
- * structures in the kernel provide points for (almost) every routine we have
- * to override to avoid privileged instructions. */
+/*G:030 Once we get to lguest_init(), we know we're a Guest.  The various
+ * pv_ops structures in the kernel provide points for (almost) every routine we
+ * have to override to avoid privileged instructions. */
 __init void lguest_init(void)
 {
 	/* We're under lguest, paravirt is enabled, and we're running at
@@ -1003,9 +1017,9 @@ __init void lguest_init(void)
 	 * the normal data segment to get through booting. */
 	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
 
-	/* The Host uses the top of the Guest's virtual address space for the
-	 * Host<->Guest Switcher, and it tells us how big that is in
-	 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
+	/* The Host<->Guest Switcher lives at the top of our address space, and
+	 * the Host told us how big it is when we made LGUEST_INIT hypercall:
+	 * it put the answer in lguest_data.reserve_mem  */
 	reserve_top_address(lguest_data.reserve_mem);
 
 	/* If we don't initialize the lock dependency checker now, it crashes
@@ -1027,6 +1041,7 @@ __init void lguest_init(void)
 	/* Math is always hard! */
 	new_cpu_data.hard_math = 1;
 
+	/* We don't have features.  We have puppies!  Puppies! */
 #ifdef CONFIG_X86_MCE
 	mce_disabled = 1;
 #endif
@@ -1044,10 +1059,11 @@ __init void lguest_init(void)
 	virtio_cons_early_init(early_put_chars);
 
 	/* Last of all, we set the power management poweroff hook to point to
-	 * the Guest routine to power off. */
+	 * the Guest routine to power off, and the reboot hook to our restart
+	 * routine. */
 	pm_power_off = lguest_power_off;
-
 	machine_ops.restart = lguest_restart;
+
 	/* Now we're set up, call start_kernel() in init/main.c and we proceed
 	 * to boot as normal.  It never returns. */
 	start_kernel();
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 95b6fbcded63..5c7cef34c9e7 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,13 +5,20 @@
 #include <asm/thread_info.h>
 #include <asm/processor-flags.h>
 
-/*G:020 This is where we begin: head.S notes that the boot header's platform
- * type field is "1" (lguest), so calls us here.
+/*G:020 Our story starts with the kernel booting into startup_32 in
+ * arch/x86/kernel/head_32.S.  It expects a boot header, which is created by
+ * the bootloader (the Launcher in our case).
+ *
+ * The startup_32 function does very little: it clears the uninitialized global
+ * C variables which we expect to be zero (ie. BSS) and then copies the boot
+ * header and kernel command line somewhere safe.  Finally it checks the
+ * 'hardware_subarch' field.  This was introduced in 2.6.24 for lguest and Xen:
+ * if it's set to '1' (lguest's assigned number), then it calls us here.
  *
  * WARNING: be very careful here!  We're running at addresses equal to physical
  * addesses (around 0), not above PAGE_OFFSET as most code expectes
  * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
- * data.
+ * data without remembering to subtract __PAGE_OFFSET!
  *
  * The .section line puts this code in .init.text so it will be discarded after
  * boot. */
@@ -24,7 +31,7 @@ ENTRY(lguest_entry)
 	int $LGUEST_TRAP_ENTRY
 
 	/* The Host put the toplevel pagetable in lguest_data.pgdir.  The movsl
-	 * instruction uses %esi implicitly as the source for the copy we'
+	 * instruction uses %esi implicitly as the source for the copy we're
 	 * about to do. */
 	movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
 
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index c632c08cbbdc..5eea4356d703 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -1,8 +1,6 @@
 /*P:400 This contains run_guest() which actually calls into the Host<->Guest
  * Switcher and analyzes the return, such as determining if the Guest wants the
- * Host to do something.  This file also contains useful helper routines, and a
- * couple of non-obvious setup and teardown pieces which were implemented after
- * days of debugging pain. :*/
+ * Host to do something.  This file also contains useful helper routines. :*/
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include <linux/stddef.h>
@@ -49,8 +47,8 @@ static __init int map_switcher(void)
 	 * easy.
 	 */
 
-	/* We allocate an array of "struct page"s.  map_vm_area() wants the
-	 * pages in this form, rather than just an array of pointers. */
+	/* We allocate an array of struct page pointers.  map_vm_area() wants
+	 * this, rather than just an array of pages. */
 	switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
 				GFP_KERNEL);
 	if (!switcher_page) {
@@ -172,7 +170,7 @@ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
 	}
 }
 
-/* This is the write (copy into guest) version. */
+/* This is the write (copy into Guest) version. */
 void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
 	       unsigned bytes)
 {
@@ -209,9 +207,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		if (cpu->break_out)
 			return -EAGAIN;
 
-		/* Check if there are any interrupts which can be delivered
-		 * now: if so, this sets up the hander to be executed when we
-		 * next run the Guest. */
+		/* Check if there are any interrupts which can be delivered now:
+		 * if so, this sets up the hander to be executed when we next
+		 * run the Guest. */
 		maybe_do_interrupt(cpu);
 
 		/* All long-lived kernel loops need to check with this horrible
@@ -246,8 +244,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		lguest_arch_handle_trap(cpu);
 	}
 
+	/* Special case: Guest is 'dead' but wants a reboot. */
 	if (cpu->lg->dead == ERR_PTR(-ERESTART))
 		return -ERESTART;
+
 	/* The Guest is dead => "No such file or directory" */
 	return -ENOENT;
 }
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 0f2cb4fd7c69..54d66f05fefa 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -29,7 +29,7 @@
 #include "lg.h"
 
 /*H:120 This is the core hypercall routine: where the Guest gets what it wants.
- * Or gets killed.  Or, in the case of LHCALL_CRASH, both. */
+ * Or gets killed.  Or, in the case of LHCALL_SHUTDOWN, both. */
 static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 {
 	switch (args->arg0) {
@@ -190,6 +190,13 @@ static void initialize(struct lg_cpu *cpu)
 	 * pagetable. */
 	guest_pagetable_clear_all(cpu);
 }
+/*:*/
+
+/*M:013 If a Guest reads from a page (so creates a mapping) that it has never
+ * written to, and then the Launcher writes to it (ie. the output of a virtual
+ * device), the Guest will still see the old page.  In practice, this never
+ * happens: why would the Guest read a page which it has never written to?  But
+ * a similar scenario might one day bite us, so it's worth mentioning. :*/
 
 /*H:100
  * Hypercalls
@@ -227,7 +234,7 @@ void do_hypercalls(struct lg_cpu *cpu)
 		 * However, if we are signalled or the Guest sends I/O to the
 		 * Launcher, the run_guest() loop will exit without running the
 		 * Guest.  When it comes back it would try to re-run the
-		 * hypercall. */
+		 * hypercall.  Finding that bug sucked. */
 		cpu->hcall = NULL;
 	}
 }
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 32e97c1858e5..0414ddf87587 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -144,7 +144,6 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
 	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
 			   sizeof(blk)))
 		return;
-
 	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
 
 	/* Find the first interrupt. */
@@ -237,9 +236,9 @@ void free_interrupts(void)
 		clear_bit(syscall_vector, used_vectors);
 }
 
-/*H:220 Now we've got the routines to deliver interrupts, delivering traps
- * like page fault is easy.  The only trick is that Intel decided that some
- * traps should have error codes: */
+/*H:220 Now we've got the routines to deliver interrupts, delivering traps like
+ * page fault is easy.  The only trick is that Intel decided that some traps
+ * should have error codes: */
 static int has_err(unsigned int trap)
 {
 	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 1b2ec0bf5eb1..2bc9bf7e88e5 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -1,10 +1,10 @@
 /*P:050 Lguest guests use a very simple method to describe devices.  It's a
- * series of device descriptors contained just above the top of normal
+ * series of device descriptors contained just above the top of normal Guest
  * memory.
  *
  * We use the standard "virtio" device infrastructure, which provides us with a
  * console, a network and a block driver.  Each one expects some configuration
- * information and a "virtqueue" mechanism to send and receive data. :*/
+ * information and a "virtqueue" or two to send and receive data. :*/
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/lguest_launcher.h>
@@ -53,7 +53,7 @@ struct lguest_device {
  * Device configurations
  *
  * The configuration information for a device consists of one or more
- * virtqueues, a feature bitmaks, and some configuration bytes.  The
+ * virtqueues, a feature bitmap, and some configuration bytes.  The
  * configuration bytes don't really matter to us: the Launcher sets them up, and
  * the driver will look at them during setup.
  *
@@ -179,7 +179,7 @@ struct lguest_vq_info
 };
 
 /* When the virtio_ring code wants to prod the Host, it calls us here and we
- * make a hypercall.  We hand the page number of the virtqueue so the Host
+ * make a hypercall.  We hand the physical address of the virtqueue so the Host
  * knows which virtqueue we're talking about. */
 static void lg_notify(struct virtqueue *vq)
 {
@@ -199,7 +199,8 @@ static void lg_notify(struct virtqueue *vq)
  * allocate its own pages and tell the Host where they are, but for lguest it's
  * simpler for the Host to simply tell us where the pages are.
  *
- * So we provide devices with a "find virtqueue and set it up" function. */
+ * So we provide drivers with a "find the Nth virtqueue and set it up"
+ * function. */
 static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 				    unsigned index,
 				    void (*callback)(struct virtqueue *vq))
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 2221485b0773..564e425d71dd 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -73,7 +73,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 	if (current != cpu->tsk)
 		return -EPERM;
 
-	/* If the guest is already dead, we indicate why */
+	/* If the Guest is already dead, we indicate why */
 	if (lg->dead) {
 		size_t len;
 
@@ -88,7 +88,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 		return len;
 	}
 
-	/* If we returned from read() last time because the Guest notified,
+	/* If we returned from read() last time because the Guest sent I/O,
 	 * clear the flag. */
 	if (cpu->pending_notify)
 		cpu->pending_notify = 0;
@@ -97,14 +97,20 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 	return run_guest(cpu, (unsigned long __user *)user);
 }
 
+/*L:025 This actually initializes a CPU.  For the moment, a Guest is only
+ * uniprocessor, so "id" is always 0. */
 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 {
+	/* We have a limited number the number of CPUs in the lguest struct. */
 	if (id >= NR_CPUS)
 		return -EINVAL;
 
+	/* Set up this CPU's id, and pointer back to the lguest struct. */
 	cpu->id = id;
 	cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
 	cpu->lg->nr_cpus++;
+
+	/* Each CPU has a timer it can set. */
 	init_clockdev(cpu);
 
 	/* We need a complete page for the Guest registers: they are accessible
@@ -120,11 +126,11 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	 * address. */
 	lguest_arch_setup_regs(cpu, start_ip);
 
-	/* Initialize the queue for the waker to wait on */
+	/* Initialize the queue for the Waker to wait on */
 	init_waitqueue_head(&cpu->break_wq);
 
 	/* We keep a pointer to the Launcher task (ie. current task) for when
-	 * other Guests want to wake this one (inter-Guest I/O). */
+	 * other Guests want to wake this one (eg. console input). */
 	cpu->tsk = current;
 
 	/* We need to keep a pointer to the Launcher's memory map, because if
@@ -136,6 +142,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	 * when the same Guest runs on the same CPU twice. */
 	cpu->last_pages = NULL;
 
+	/* No error == success. */
 	return 0;
 }
 
@@ -185,14 +192,13 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	lg->mem_base = (void __user *)(long)args[0];
 	lg->pfn_limit = args[1];
 
-	/* This is the first cpu */
+	/* This is the first cpu (cpu 0) and it will start booting at args[3] */
 	err = lg_cpu_start(&lg->cpus[0], 0, args[3]);
 	if (err)
 		goto release_guest;
 
 	/* Initialize the Guest's shadow page tables, using the toplevel
-	 * address the Launcher gave us.  This allocates memory, so can
-	 * fail. */
+	 * address the Launcher gave us.  This allocates memory, so can fail. */
 	err = init_guest_pagetable(lg, args[2]);
 	if (err)
 		goto free_regs;
@@ -218,11 +224,16 @@ unlock:
 /*L:010 The first operation the Launcher does must be a write.  All writes
  * start with an unsigned long number: for the first write this must be
  * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
- * writes of other values to send interrupts. */
+ * writes of other values to send interrupts.
+ *
+ * Note that we overload the "offset" in the /dev/lguest file to indicate what
+ * CPU number we're dealing with.  Currently this is always 0, since we only
+ * support uniprocessor Guests, but you can see the beginnings of SMP support
+ * here. */
 static ssize_t write(struct file *file, const char __user *in,
 		     size_t size, loff_t *off)
 {
-	/* Once the guest is initialized, we hold the "struct lguest" in the
+	/* Once the Guest is initialized, we hold the "struct lguest" in the
 	 * file private data. */
 	struct lguest *lg = file->private_data;
 	const unsigned long __user *input = (const unsigned long __user *)in;
@@ -230,6 +241,7 @@ static ssize_t write(struct file *file, const char __user *in,
 	struct lg_cpu *uninitialized_var(cpu);
 	unsigned int cpu_id = *off;
 
+	/* The first value tells us what this request is. */
 	if (get_user(req, input) != 0)
 		return -EFAULT;
 	input++;
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a7f64a9d67e0..d93500f24fbb 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -2,8 +2,8 @@
  * previous encounters.  It's functional, and as neat as it can be in the
  * circumstances, but be wary, for these things are subtle and break easily.
  * The Guest provides a virtual to physical mapping, but we can neither trust
- * it nor use it: we verify and convert it here to point the hardware to the
- * actual Guest pages when running the Guest. :*/
+ * it nor use it: we verify and convert it here then point the CPU to the
+ * converted Guest pages when running the Guest. :*/
 
 /* Copyright (C) Rusty Russell IBM Corporation 2006.
  * GPL v2 and any later version */
@@ -106,6 +106,11 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
 	return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
 }
+/*:*/
+
+/*M:014 get_pfn is slow; it takes the mmap sem and calls get_user_pages.  We
+ * could probably try to grab batches of pages here as an optimization
+ * (ie. pre-faulting). :*/
 
 /*H:350 This routine takes a page number given by the Guest and converts it to
  * an actual, physical page number.  It can fail for several reasons: the
@@ -113,8 +118,8 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
  * and the page is read-only, or the write flag was set and the page was
  * shared so had to be copied, but we ran out of memory.
  *
- * This holds a reference to the page, so release_pte() is careful to
- * put that back. */
+ * This holds a reference to the page, so release_pte() is careful to put that
+ * back. */
 static unsigned long get_pfn(unsigned long virtpfn, int write)
 {
 	struct page *page;
@@ -532,13 +537,13 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
  * all processes.  So when the page table above that address changes, we update
  * all the page tables, not just the current one.  This is rare.
  *
- * The benefit is that when we have to track a new page table, we can copy keep
- * all the kernel mappings.  This speeds up context switch immensely. */
+ * The benefit is that when we have to track a new page table, we can keep all
+ * the kernel mappings.  This speeds up context switch immensely. */
 void guest_set_pte(struct lg_cpu *cpu,
 		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
 {
-	/* Kernel mappings must be changed on all top levels.  Slow, but
-	 * doesn't happen often. */
+	/* Kernel mappings must be changed on all top levels.  Slow, but doesn't
+	 * happen often. */
 	if (vaddr >= cpu->lg->kernel_address) {
 		unsigned int i;
 		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
@@ -704,12 +709,11 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
 /* We've made it through the page table code.  Perhaps our tired brains are
  * still processing the details, or perhaps we're simply glad it's over.
  *
- * If nothing else, note that all this complexity in juggling shadow page
- * tables in sync with the Guest's page tables is for one reason: for most
- * Guests this page table dance determines how bad performance will be.  This
- * is why Xen uses exotic direct Guest pagetable manipulation, and why both
- * Intel and AMD have implemented shadow page table support directly into
- * hardware.
+ * If nothing else, note that all this complexity in juggling shadow page tables
+ * in sync with the Guest's page tables is for one reason: for most Guests this
+ * page table dance determines how bad performance will be.  This is why Xen
+ * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
+ * have implemented shadow page table support directly into hardware.
  *
  * There is just one file remaining in the Host. */
 
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 635187812d52..5126d5d9ea0e 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -17,6 +17,13 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
+/*P:450 This file contains the x86-specific lguest code.  It used to be all
+ * mixed in with drivers/lguest/core.c but several foolhardy code slashers
+ * wrestled most of the dependencies out to here in preparation for porting
+ * lguest to other architectures (see what I mean by foolhardy?).
+ *
+ * This also contains a couple of non-obvious setup and teardown pieces which
+ * were implemented after days of debugging pain. :*/
 #include <linux/kernel.h>
 #include <linux/start_kernel.h>
 #include <linux/string.h>
@@ -157,6 +164,8 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
  * also simplify copy_in_guest_info().  Note that we'd still need to restore
  * things when we exit to Launcher userspace, but that's fairly easy.
  *
+ * We could also try using this hooks for PGE, but that might be too expensive.
+ *
  * The hooks were designed for KVM, but we can also put them to good use. :*/
 
 /*H:040 This is the i386-specific code to setup and run the Guest.  Interrupts
@@ -182,7 +191,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 	 * was doing. */
 	run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
 
-	/* Note that the "regs" pointer contains two extra entries which are
+	/* Note that the "regs" structure contains two extra entries which are
 	 * not really registers: a trap number which says what interrupt or
 	 * trap made the switcher code come back, and an error code which some
 	 * traps set.  */
@@ -293,11 +302,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 		break;
 	case 14: /* We've intercepted a Page Fault. */
 		/* The Guest accessed a virtual address that wasn't mapped.
-		 * This happens a lot: we don't actually set up most of the
-		 * page tables for the Guest at all when we start: as it runs
-		 * it asks for more and more, and we set them up as
-		 * required. In this case, we don't even tell the Guest that
-		 * the fault happened.
+		 * This happens a lot: we don't actually set up most of the page
+		 * tables for the Guest at all when we start: as it runs it asks
+		 * for more and more, and we set them up as required. In this
+		 * case, we don't even tell the Guest that the fault happened.
 		 *
 		 * The errcode tells whether this was a read or a write, and
 		 * whether kernel or userspace code. */
@@ -342,7 +350,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 	if (!deliver_trap(cpu, cpu->regs->trapnum))
 		/* If the Guest doesn't have a handler (either it hasn't
 		 * registered any yet, or it's one of the faults we don't let
-		 * it handle), it dies with a cryptic error message. */
+		 * it handle), it dies with this cryptic error message. */
 		kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
 			   cpu->regs->trapnum, cpu->regs->eip,
 			   cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
@@ -375,8 +383,8 @@ void __init lguest_arch_host_init(void)
 	 * The only exception is the interrupt handlers in switcher.S: their
 	 * addresses are placed in a table (default_idt_entries), so we need to
 	 * update the table with the new addresses.  switcher_offset() is a
-	 * convenience function which returns the distance between the builtin
-	 * switcher code and the high-mapped copy we just made. */
+	 * convenience function which returns the distance between the
+	 * compiled-in switcher code and the high-mapped copy we just made. */
 	for (i = 0; i < IDT_ENTRIES; i++)
 		default_idt_entries[i] += switcher_offset();
 
@@ -416,7 +424,7 @@ void __init lguest_arch_host_init(void)
 		state->guest_gdt_desc.address = (long)&state->guest_gdt;
 
 		/* We know where we want the stack to be when the Guest enters
-		 * the switcher: in pages->regs.  The stack grows upwards, so
+		 * the Switcher: in pages->regs.  The stack grows upwards, so
 		 * we start it at the end of that structure. */
 		state->guest_tss.sp0 = (long)(&pages->regs + 1);
 		/* And this is the GDT entry to use for the stack: we keep a
@@ -513,8 +521,8 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
 {
 	u32 tsc_speed;
 
-	/* The pointer to the Guest's "struct lguest_data" is the only
-	 * argument.  We check that address now. */
+	/* The pointer to the Guest's "struct lguest_data" is the only argument.
+	 * We check that address now. */
 	if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
 			       sizeof(*cpu->lg->lguest_data)))
 		return -EFAULT;
@@ -546,6 +554,7 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
 
 	return 0;
 }
+/*:*/
 
 /*L:030 lguest_arch_setup_regs()
  *
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 0af8baaa0d4a..3fc15318a80f 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -1,6 +1,6 @@
-/*P:900 This is the Switcher: code which sits at 0xFFC00000 to do the low-level
- * Guest<->Host switch.  It is as simple as it can be made, but it's naturally
- * very specific to x86.
+/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the
+ * Host and Guest to do the low-level Guest<->Host switch.  It is as simple as
+ * it can be made, but it's naturally very specific to x86.
  *
  * You have now completed Preparation.  If this has whet your appetite; if you
  * are feeling invigorated and refreshed then the next, more challenging stage
@@ -189,7 +189,7 @@ ENTRY(switch_to_guest)
 	// Interrupts are turned back on: we are Guest.
 	iret
 
-// We treat two paths to switch back to the Host
+// We tread two paths to switch back to the Host
 // Yet both must save Guest state and restore Host
 // So we put the routine in a macro.
 #define SWITCH_TO_HOST							\
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h
index 758b9a5d4539..f239e7069cab 100644
--- a/include/asm-x86/lguest_hcall.h
+++ b/include/asm-x86/lguest_hcall.h
@@ -27,7 +27,7 @@
 #ifndef __ASSEMBLY__
 #include <asm/hw_irq.h>
 
-/*G:031 First, how does our Guest contact the Host to ask for privileged
+/*G:031 But first, how does our Guest contact the Host to ask for privileged
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index 589be3e1f3ac..e7217dc58f39 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -16,6 +16,10 @@
  * a new device, we simply need to write a new virtio driver and create support
  * for it in the Launcher: this code won't need to change.
  *
+ * Virtio devices are also used by kvm, so we can simply reuse their optimized
+ * device drivers.  And one day when everyone uses virtio, my plan will be
+ * complete.  Bwahahahah!
+ *
  * Devices are described by a simplified ID, a status byte, and some "config"
  * bytes which describe this device's configuration.  This is placed by the
  * Launcher just above the top of physical memory:
@@ -26,7 +30,7 @@ struct lguest_device_desc {
 	/* The number of virtqueues (first in config array) */
 	__u8 num_vq;
 	/* The number of bytes of feature bits.  Multiply by 2: one for host
-	 * features and one for guest acknowledgements. */
+	 * features and one for Guest acknowledgements. */
 	__u8 feature_len;
 	/* The number of bytes of the config array after virtqueues. */
 	__u8 config_len;
-- 
cgit v1.2.3


From 7c4b93d8269b9d35971a8239426b1f6ddc3d5ef7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 21 Mar 2008 23:59:49 -0400
Subject: [PATCH] count ghost references to vfsmounts

make propagate_mount_busy() exclude references from the vfsmounts
that had been isolated by umount_tree() and are just waiting for
release_mounts() to dispose of their ->mnt_parent/->mnt_mountpoint.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 5 ++++-
 fs/pnode.c            | 2 +-
 include/linux/mount.h | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 6324dfc80dc6..c175218ebae1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -548,6 +548,7 @@ void release_mounts(struct list_head *head)
 			m = mnt->mnt_parent;
 			mnt->mnt_mountpoint = mnt->mnt_root;
 			mnt->mnt_parent = mnt;
+			m->mnt_ghosts--;
 			spin_unlock(&vfsmount_lock);
 			dput(dentry);
 			mntput(m);
@@ -572,8 +573,10 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
 		list_del_init(&p->mnt_child);
-		if (p->mnt_parent != p)
+		if (p->mnt_parent != p) {
+			p->mnt_parent->mnt_ghosts++;
 			p->mnt_mountpoint->d_mounted--;
+		}
 		change_mnt_propagation(p, MS_PRIVATE);
 	}
 }
diff --git a/fs/pnode.c b/fs/pnode.c
index 05ba692bc540..1d8f5447f3f7 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -225,7 +225,7 @@ out:
  */
 static inline int do_refcount_check(struct vfsmount *mnt, int count)
 {
-	int mycount = atomic_read(&mnt->mnt_count);
+	int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
 	return (mycount > count);
 }
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 6d3047d8c91c..dac5e67ff3ee 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -61,6 +61,7 @@ struct vfsmount {
 	atomic_t mnt_count;
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
+	int mnt_ghosts;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
-- 
cgit v1.2.3


From c35038becad0adb0e25261fff66d85b1a6ddd0c2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Mar 2008 00:46:23 -0400
Subject: [PATCH] do shrink_submounts() for all fs types

... and take it out of ->umount_begin() instances.  Call with all locks
already taken (by do_umount()) and leave calling release_mounts() to
caller (it will do release_mounts() anyway, so we can just put into
the same list).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/internal.h      |  1 -
 fs/afs/mntpt.c         |  8 --------
 fs/afs/super.c         |  1 -
 fs/cifs/cifs_dfs_ref.c |  1 -
 fs/namespace.c         | 23 ++++++++++-------------
 fs/nfs/super.c         |  2 --
 include/linux/mount.h  |  1 -
 7 files changed, 10 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5ca3625cd39e..9ba16edc0af2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -573,7 +573,6 @@ extern const struct file_operations afs_mntpt_file_operations;
 
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
 extern void afs_mntpt_kill_timer(void);
-extern void afs_umount_begin(struct vfsmount *, int);
 
 /*
  * proc.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a3510b8ba3e7..2f5503902c37 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -283,11 +283,3 @@ void afs_mntpt_kill_timer(void)
 	cancel_delayed_work(&afs_mntpt_expiry_timer);
 	flush_scheduled_work();
 }
-
-/*
- * begin unmount by attempting to remove all automounted mountpoints we added
- */
-void afs_umount_begin(struct vfsmount *vfsmnt, int flags)
-{
-	shrink_submounts(vfsmnt, &afs_vfsmounts);
-}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 36bbce45f44b..4b572b801d8d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -50,7 +50,6 @@ static const struct super_operations afs_super_ops = {
 	.write_inode	= afs_write_inode,
 	.destroy_inode	= afs_destroy_inode,
 	.clear_inode	= afs_clear_inode,
-	.umount_begin	= afs_umount_begin,
 	.put_super	= afs_put_super,
 	.show_options	= generic_show_options,
 };
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index a1a95b027136..56c924033b78 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -33,7 +33,6 @@ void dfs_shrink_umount_helper(struct vfsmount *vfsmnt)
 {
 	mark_mounts_for_expiry(&cifs_dfs_automount_list);
 	mark_mounts_for_expiry(&cifs_dfs_automount_list);
-	shrink_submounts(vfsmnt, &cifs_dfs_automount_list);
 }
 
 /**
diff --git a/fs/namespace.c b/fs/namespace.c
index 1c78917ec930..7bd74b25930c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -581,6 +581,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 	}
 }
 
+static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts);
+
 static int do_umount(struct vfsmount *mnt, int flags)
 {
 	struct super_block *sb = mnt->mnt_sb;
@@ -653,6 +655,9 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	spin_lock(&vfsmount_lock);
 	event++;
 
+	if (!(flags & MNT_DETACH))
+		shrink_submounts(mnt, &umount_list);
+
 	retval = -EBUSY;
 	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
 		if (!list_empty(&mnt->mnt_list))
@@ -1302,30 +1307,22 @@ resume:
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
  */
-void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
+static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
 	LIST_HEAD(graveyard);
-	LIST_HEAD(umounts);
-	struct vfsmount *mnt;
+	struct vfsmount *m;
 
-	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
 	/* extract submounts of 'mountpoint' from the expiration list */
-	while (select_submounts(mountpoint, &graveyard)) {
+	while (select_submounts(mnt, &graveyard)) {
 		while (!list_empty(&graveyard)) {
-			mnt = list_first_entry(&graveyard, struct vfsmount,
+			m = list_first_entry(&graveyard, struct vfsmount,
 						mnt_expire);
 			touch_mnt_namespace(mnt->mnt_ns);
-			umount_tree(mnt, 1, &umounts);
+			umount_tree(mnt, 1, umounts);
 		}
 	}
-	spin_unlock(&vfsmount_lock);
-	up_write(&namespace_sem);
-	release_mounts(&umounts);
 }
 
-EXPORT_SYMBOL_GPL(shrink_submounts);
-
 /*
  * Some copy_from_user() implementations do not return the exact number of
  * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index dd4dfcd632ec..f9219024f31a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -589,8 +589,6 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 	struct nfs_server *server = NFS_SB(vfsmnt->mnt_sb);
 	struct rpc_clnt *rpc;
 
-	shrink_submounts(vfsmnt, &nfs_automount_list);
-
 	if (!(flags & MNT_FORCE))
 		return;
 	/* -EIO all pending I/O */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index dac5e67ff3ee..5ee2df217cdf 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -99,7 +99,6 @@ extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 			int mnt_flags, struct list_head *fslist);
 
 extern void mark_mounts_for_expiry(struct list_head *mounts);
-extern void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts);
 
 extern spinlock_t vfsmount_lock;
 extern dev_t name_to_dev_t(char *name);
-- 
cgit v1.2.3


From 8c703d35fa91911dd92a18c31a718853f483ad80 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 28 Mar 2008 14:15:49 -0700
Subject: in_atomic(): document why it is unsuitable for general use

Discourage people from inappropriately using in_atomic()

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hardirq.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 49829988bfa0..897f723bd222 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -72,6 +72,13 @@
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 
+/*
+ * Are we running in atomic context?  WARNING: this macro cannot
+ * always detect atomic context; in particular, it cannot know about
+ * held spinlocks in non-preemptible kernels.  Thus it should not be
+ * used in the general case to determine whether sleeping is possible.
+ * Do not use in_atomic() in driver code.
+ */
 #define in_atomic()		((preempt_count() & ~PREEMPT_ACTIVE) != 0)
 
 #ifdef CONFIG_PREEMPT
-- 
cgit v1.2.3


From 3afe3925987adc3fc052abe404e44520c2072fc8 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 28 Mar 2008 14:16:01 -0700
Subject: kernel: add bit rotation helpers for 16 and 8 bit

Will replace open-coded variants elsewhere.  Done in the same
style as the 32-bit versions.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John W. Linville <linville@tuxdriver.com>
Cc: Joe Perches <joe@perches.com>
Cc: Jiri Benc <jbenc@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitops.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 69c1edb9fe54..40d54731de7e 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -65,6 +65,46 @@ static inline __u32 ror32(__u32 word, unsigned int shift)
 	return (word >> shift) | (word << (32 - shift));
 }
 
+/**
+ * rol16 - rotate a 16-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u16 rol16(__u16 word, unsigned int shift)
+{
+	return (word << shift) | (word >> (16 - shift));
+}
+
+/**
+ * ror16 - rotate a 16-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u16 ror16(__u16 word, unsigned int shift)
+{
+	return (word >> shift) | (word << (16 - shift));
+}
+
+/**
+ * rol8 - rotate an 8-bit value left
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u8 rol8(__u8 word, unsigned int shift)
+{
+	return (word << shift) | (word >> (8 - shift));
+}
+
+/**
+ * ror8 - rotate an 8-bit value right
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline __u8 ror8(__u8 word, unsigned int shift)
+{
+	return (word >> shift) | (word << (8 - shift));
+}
+
 static inline unsigned fls_long(unsigned long l)
 {
 	if (sizeof(l) == 4)
-- 
cgit v1.2.3


From 5ac7ec85bcc70ef605657fb2d1106d27ab3bd131 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 28 Mar 2008 14:16:03 -0700
Subject: ext3: don't export ext3_fs.h and jbd.h

Neither of the headers actually compiles when included from userpsace nor
should it be made available as userspace tools should be using the libraries
or at least headers from e2fsprogs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/Kbuild | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 4108b38ebb16..4a446a19295e 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -195,7 +195,6 @@ unifdef-y += ethtool.h
 unifdef-y += eventpoll.h
 unifdef-y += signalfd.h
 unifdef-y += ext2_fs.h
-unifdef-y += ext3_fs.h
 unifdef-y += fb.h
 unifdef-y += fcntl.h
 unifdef-y += filter.h
@@ -248,7 +247,6 @@ unifdef-y += isdn.h
 unifdef-y += isdnif.h
 unifdef-y += isdn_divertif.h
 unifdef-y += isdn_ppp.h
-unifdef-y += jbd.h
 unifdef-y += joystick.h
 unifdef-y += kdev_t.h
 unifdef-y += kd.h
-- 
cgit v1.2.3


From 3ec25ebd69dc120d0590e64caaf1477aa88c8a93 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 27 Mar 2008 18:37:14 +0900
Subject: libata: ATA_EHI_LPM should be ATA_EH_LPM

EH actions are ATA_EH_* not ATA_EHI_*.  Rename ATA_EHI_LPM to
ATA_EH_LPM.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Kristen Carlson Accardi <kristen.c.accardi@intel.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/libata-core.c | 2 +-
 drivers/ata/libata-eh.c   | 2 +-
 include/linux/libata.h    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index c4248b37ff64..48519887f94a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -972,7 +972,7 @@ static void ata_dev_disable_pm(struct ata_device *dev)
 void ata_lpm_schedule(struct ata_port *ap, enum link_pm policy)
 {
 	ap->pm_policy = policy;
-	ap->link.eh_info.action |= ATA_EHI_LPM;
+	ap->link.eh_info.action |= ATA_EH_LPM;
 	ap->link.eh_info.flags |= ATA_EHI_NO_AUTOPSY;
 	ata_port_schedule_eh(ap);
 }
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 681252fd8143..a5830329eda4 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2748,7 +2748,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 			ehc->i.flags &= ~ATA_EHI_SETMODE;
 		}
 
-		if (ehc->i.action & ATA_EHI_LPM)
+		if (ehc->i.action & ATA_EH_LPM)
 			ata_link_for_each_dev(dev, link)
 				ata_dev_enable_pm(dev, ap->pm_policy);
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 269cdba09578..b064bfeb69ee 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -295,6 +295,7 @@ enum {
 	ATA_EH_SOFTRESET	= (1 << 1),
 	ATA_EH_HARDRESET	= (1 << 2),
 	ATA_EH_ENABLE_LINK	= (1 << 3),
+	ATA_EH_LPM		= (1 << 4),  /* link power management action */
 
 	ATA_EH_RESET_MASK	= ATA_EH_SOFTRESET | ATA_EH_HARDRESET,
 	ATA_EH_PERDEV_MASK	= ATA_EH_REVALIDATE,
@@ -304,7 +305,6 @@ enum {
 	ATA_EHI_RESUME_LINK	= (1 << 1),  /* resume link (reset modifier) */
 	ATA_EHI_NO_AUTOPSY	= (1 << 2),  /* no autopsy */
 	ATA_EHI_QUIET		= (1 << 3),  /* be quiet */
-	ATA_EHI_LPM		= (1 << 4),  /* link power management action */
 
 	ATA_EHI_DID_SOFTRESET	= (1 << 16), /* already soft-reset this port */
 	ATA_EHI_DID_HARDRESET	= (1 << 17), /* already soft-reset this port */
-- 
cgit v1.2.3


From 729d4de96a5c090e40a918a41f63b7fb1b27c240 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Sat, 29 Mar 2008 19:55:17 +0100
Subject: ide: fix defining SUPPORT_VLB_SYNC

We need to check for CONFIG_{CRIS,FRV} not {CRIS,FRV}.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index a3b69c10d667..bc26b2f27359 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -26,7 +26,7 @@
 #include <asm/semaphore.h>
 #include <asm/mutex.h>
 
-#if defined(CRIS) || defined(FRV)
+#if defined(CONFIG_CRIS) || defined(CONFIG_FRV)
 # define SUPPORT_VLB_SYNC 0
 #else
 # define SUPPORT_VLB_SYNC 1
-- 
cgit v1.2.3


From 7d61c4596d11d624efb4bbcbad01f9cf2b321162 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 29 Mar 2008 03:09:28 +0000
Subject: compat_sys_wait4() prototype misannotation

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index a671dbff7a1f..8fa7857e153b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -192,8 +192,8 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		struct compat_timeval __user *tvp);
 
 asmlinkage long compat_sys_wait4(compat_pid_t pid,
-				 compat_uint_t *stat_addr, int options,
-				 struct compat_rusage *ru);
+				 compat_uint_t __user *stat_addr, int options,
+				 struct compat_rusage __user *ru);
 
 #define BITS_PER_COMPAT_LONG    (8*sizeof(compat_long_t))
 
-- 
cgit v1.2.3


From b2ddb9019ea13fb7b62d8e45adcc468376af0de7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 29 Mar 2008 03:09:38 +0000
Subject: dma_page_list ->base_address is a userland pointer

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/dmaengine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 261e43a4c873..34d440698293 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -423,7 +423,7 @@ void dma_async_device_unregister(struct dma_device *device);
 /* --- Helper iov-locking functions --- */
 
 struct dma_page_list {
-	char *base_address;
+	char __user *base_address;
 	int nr_pages;
 	struct page **pages;
 };
-- 
cgit v1.2.3