From 3b2d99429e3386b6e2ac949fc72486509c8bbe36 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Wed, 14 Dec 2005 15:05:00 -0500 Subject: P-state software coordination for ACPI core http://bugzilla.kernel.org/show_bug.cgi?id=5737 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Len Brown --- include/linux/cpufreq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 17866d7e2b71..f7d988366941 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -73,6 +73,8 @@ struct cpufreq_real_policy { struct cpufreq_policy { cpumask_t cpus; /* affected CPUs */ + unsigned int shared_type; /* ANY or ALL affected CPUs + should set cpufreq */ unsigned int cpu; /* cpu nr of registered CPU */ struct cpufreq_cpuinfo cpuinfo;/* see above */ @@ -99,6 +101,8 @@ struct cpufreq_policy { #define CPUFREQ_INCOMPATIBLE (1) #define CPUFREQ_NOTIFY (2) +#define CPUFREQ_SHARED_TYPE_ALL (0) /* All dependent CPUs should set freq */ +#define CPUFREQ_SHARED_TYPE_ANY (1) /* Freq can be set from any dependent CPU */ /******************** cpufreq transition notifiers *******************/ -- cgit v1.2.3 From 651d765d0b2c72d33430487c8b6ef64c60cd2134 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 7 Jun 2006 16:10:19 +1000 Subject: [PATCH] Add a prctl to change the endianness of a process. This new prctl is intended for changing the execution mode of the processor, on processors that support both a little-endian mode and a big-endian mode. It is intended for use by programs such as instruction set emulators (for example an x86 emulator on PowerPC), which may find it convenient to use the processor in an alternate endianness mode when executing translated instructions. Note that this does not imply the existence of a fully-fledged ABI for both endiannesses, or of compatibility code for converting system calls done in the non-native endianness mode. The program is expected to arrange for all of its system call arguments to be presented in the native endianness. Switching between big and little-endian mode will require some care in constructing the instruction sequence for the switch. Generally the instructions up to the instruction that invokes the prctl system call will have to be in the old endianness, and subsequent instructions will have to be in the new endianness. 
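For illustration only (not part of this patch), a user-space caller might exercise the new prctl roughly as follows; the fallback defines simply repeat the values added above for headers that predate them, and, as noted, the instructions following a PR_SET_ENDIAN call must already be encoded in the new endianness, so plain C code cannot just continue past it:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_ENDIAN
#define PR_GET_ENDIAN		19
#define PR_SET_ENDIAN		20
#define PR_ENDIAN_BIG		0
#define PR_ENDIAN_LITTLE	1
#define PR_ENDIAN_PPC_LITTLE	2
#endif

int main(void)
{
	int mode;

	/* The arch implementation is expected to write the mode through arg2. */
	if (prctl(PR_GET_ENDIAN, (unsigned long)&mode, 0, 0, 0))
		perror("PR_GET_ENDIAN");
	else
		printf("current endian mode: %d\n", mode);

	/*
	 * An emulator would follow this call with instructions already
	 * translated to the new endianness; ordinary code should not
	 * expect to keep running normally after the switch.
	 */
	if (prctl(PR_SET_ENDIAN, PR_ENDIAN_PPC_LITTLE, 0, 0, 0))
		perror("PR_SET_ENDIAN");
	return 0;
}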
Signed-off-by: Anton Blanchard Signed-off-by: Paul Mackerras --- include/linux/prctl.h | 7 +++++++ kernel/sys.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index bf022c43a18e..52a9be41250d 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -52,4 +52,11 @@ #define PR_SET_NAME 15 /* Set process name */ #define PR_GET_NAME 16 /* Get process name */ +/* Get/set process endian */ +#define PR_GET_ENDIAN 19 +#define PR_SET_ENDIAN 20 +# define PR_ENDIAN_BIG 0 +# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ +# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936f..12d2d753dc3b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -57,6 +57,12 @@ #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a,b) (-EINVAL) #endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2057,6 +2063,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, return -EFAULT; return 0; } + case PR_GET_ENDIAN: + error = GET_ENDIAN(current, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(current, arg2); + break; + default: error = -EINVAL; break; -- cgit v1.2.3 From 1e92a550e80fef01ebcc0bcd0896109cdb986c72 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 15 Jun 2006 14:11:22 +1000 Subject: [POWERPC] Fix mdelay badness on shared processor partitions On partitioned PPC64 systems where a partition is given 1/10 of a processor, we have seen mdelay() delaying for 10 times longer than it should. The reason is that the generic mdelay(n) does n delays of 1 millisecond each. However, with 1/10 of a processor, we only get a one-millisecond timeslice every 10ms. Thus each 1 millisecond delay loop ends up taking 10ms elapsed time. The solution is just to use the PPC64 udelay function, which uses the timebase to ensure that the delay is based on elapsed time rather than how much processing time the partition has been given. (Yes, the generic mdelay uses the PPC64 udelay, but the problem is that the start time gets reset every millisecond, and each time it gets reset we lose another 9ms.) Signed-off-by: Anton Blanchard Signed-off-by: Paul Mackerras Acked-by: Andrew Morton --- include/asm-powerpc/delay.h | 13 +++++++++++++ include/linux/delay.h | 5 +---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/asm-powerpc/delay.h b/include/asm-powerpc/delay.h index 057a60955474..f9200a65c632 100644 --- a/include/asm-powerpc/delay.h +++ b/include/asm-powerpc/delay.h @@ -17,5 +17,18 @@ extern void __delay(unsigned long loops); extern void udelay(unsigned long usecs); +/* + * On shared processor machines the generic implementation of mdelay can + * result in large errors. While each iteration of the loop inside mdelay + * is supposed to take 1ms, the hypervisor could sleep our partition for + * longer (eg 10ms). With the right timing these errors can add up. + * + * Since there is no 32bit overflow issue on 64bit kernels, just call + * udelay directly. 
+ */ +#ifdef CONFIG_PPC64 +#define mdelay(n) udelay((n) * 1000) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_DELAY_H */ diff --git a/include/linux/delay.h b/include/linux/delay.h index acb74865b973..17ddb55430ae 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -25,10 +25,7 @@ extern unsigned long loops_per_jiffy; #define MAX_UDELAY_MS 5 #endif -#ifdef notdef -#define mdelay(n) (\ - {unsigned long __ms=(n); while (__ms--) udelay(1000);}) -#else +#ifndef mdelay #define mdelay(n) (\ (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) -- cgit v1.2.3 From c34b4c734482dda750deb6089521f7c891b48736 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 9 May 2006 10:52:09 -0700 Subject: [PATCH] PCI: Add PCI_CAP_ID_VNDR Add the vendor-specific extended capability PCI_CAP_ID_VNDR. It will be used by the Myri-10G Ethernet driver (will be submitted soon). Signed-off-by: Brice Goglin Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index d27a78b71297..6bce4a240364 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -197,6 +197,7 @@ #define PCI_CAP_ID_CHSWP 0x06 /* CompactPCI HotSwap */ #define PCI_CAP_ID_PCIX 0x07 /* PCI-X */ #define PCI_CAP_ID_HT_IRQCONF 0x08 /* HyperTransport IRQ Configuration */ +#define PCI_CAP_ID_VNDR 0x09 /* Vendor specific capability */ #define PCI_CAP_ID_SHPC 0x0C /* PCI Standard Hot-Plug Controller */ #define PCI_CAP_ID_EXP 0x10 /* PCI Express */ #define PCI_CAP_ID_MSIX 0x11 /* MSI-X */ -- cgit v1.2.3 From 75acfecaa031c0e1bc412cee4fe58ba49ff3406c Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 1 May 2006 10:43:46 -0500 Subject: [PATCH] PCI: Add pci_assign_resource_fixed -- allow fixed address assignments On some embedded systems the PCI address for a hotplug device is not only known a priori but is required to be at a given PCI address for other masters in the system to be able to access it. An example of such a system would be an FPGA which is set up from user space after the system has booted. The FPGA may be accessed by DSPs in the system and those DSPs expect the FPGA at a fixed PCI address. Added pci_assign_resource_fixed() as a way to allow assignment of the PCI device's BARs at fixed PCI addresses.
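As a rough sketch of the intended use (hypothetical driver code, not part of this patch; the FPGA_* values and function name are invented, and the helper is only built when CONFIG_EMBEDDED is set), a driver that must have BAR 0 at a known bus address could pin the resource before enabling the device:

#include <linux/pci.h>

/* Invented values, for illustration only. */
#define FPGA_FIXED_BASE	0x80000000UL
#define FPGA_BAR_SIZE	0x00100000UL

static int fpga_pin_bar0(struct pci_dev *pdev)
{
	struct resource *res = &pdev->resource[0];
	int ret;

	/* Place BAR 0 where the other bus masters (DSPs) expect the FPGA. */
	res->start = FPGA_FIXED_BASE;
	res->end   = FPGA_FIXED_BASE + FPGA_BAR_SIZE - 1;
	res->flags = IORESOURCE_MEM;

	ret = pci_assign_resource_fixed(pdev, 0);
	if (ret)
		return ret;

	return pci_enable_device(pdev);
}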
Signed-off-by: Kumar Gala Signed-off-by: Greg Kroah-Hartman --- drivers/pci/setup-res.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index ea9277b7f899..577f4b55c46d 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -155,6 +155,46 @@ int pci_assign_resource(struct pci_dev *dev, int resno) return ret; } +#ifdef CONFIG_EMBEDDED +int pci_assign_resource_fixed(struct pci_dev *dev, int resno) +{ + struct pci_bus *bus = dev->bus; + struct resource *res = dev->resource + resno; + unsigned int type_mask; + int i, ret = -EBUSY; + + type_mask = IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; + + for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) { + struct resource *r = bus->resource[i]; + if (!r) + continue; + + /* type_mask must match */ + if ((res->flags ^ r->flags) & type_mask) + continue; + + ret = request_resource(r, res); + + if (ret == 0) + break; + } + + if (ret) { + printk(KERN_ERR "PCI: Failed to allocate %s resource " + "#%d:%llx@%llx for %s\n", + res->flags & IORESOURCE_IO ? "I/O" : "mem", + resno, (unsigned long long)(res->end - res->start + 1), + (unsigned long long)res->start, pci_name(dev)); + } else if (resno < PCI_BRIDGE_RESOURCES) { + pci_update_resource(dev, res, resno); + } + + return ret; +} +EXPORT_SYMBOL_GPL(pci_assign_resource_fixed); +#endif + /* Sort resources by alignment */ void __devinit pdev_sort_resources(struct pci_dev *dev, struct resource_list *head) diff --git a/include/linux/pci.h b/include/linux/pci.h index 6c4bc773f7b7..b9eb9b021d6a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -496,6 +496,7 @@ int pci_set_dma_mask(struct pci_dev *dev, u64 mask); int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask); void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno); int pci_assign_resource(struct pci_dev *dev, int i); +int pci_assign_resource_fixed(struct pci_dev *dev, int i); void pci_restore_bars(struct pci_dev *dev); /* ROM control related routines */ -- cgit v1.2.3 From bd8481e1646d7649fa101ee57a5139b9da3c2436 Mon Sep 17 00:00:00 2001 From: Doug Thompson Date: Mon, 8 May 2006 17:06:09 -0700 Subject: [PATCH] PCI Bus Parity Status-broken hardware attribute, EDAC foundation Currently, the EDAC (error detection and correction) modules that are in the kernel contain some features that need to be moved. After some good feedback on the PCI Parity detection code and interface (http://www.ussg.iu.edu/hypermail/linux/kernel/0603.1/0897.html) this patch ADDs an new attribute to the pci_dev structure: Namely the 'broken_parity_status' bit. When set this indicates that the respective hardware generates false positives of Parity errors. The EDAC "blacklist" solution was inferior and will be removed in a future patch. Also in this patch is a PCI quirk.c entry for an Infiniband PCI-X card which generates false positive parity errors. I am requesting comments on this AND on the possibility of a exposing this 'broken_parity_status' bit to userland via the PCI device sysfs directory for devices. This access would allow for enabling of this feature on new devices and for old devices that have their drivers updated. (SLES 9 SP3 did this on an ATI motherboard video device). There is a need to update such a PCI attribute between kernel releases. This patch just adds a storage place for the attribute and a quirk entry for a known bad PCI device. 
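For illustration, a consumer of the new bit (for instance the refactored EDAC parity scan mentioned below) could skip flagged devices along these lines (sketch only; the function name and message text are made up):

#include <linux/pci.h>

static void pci_parity_check_sketch(struct pci_dev *dev)
{
	u16 status;

	if (dev->broken_parity_status)
		return;	/* device is known to generate false positives */

	pci_read_config_word(dev, PCI_STATUS, &status);
	if (status & PCI_STATUS_DETECTED_PARITY)
		printk(KERN_WARNING "PCI: parity error detected on %s\n",
		       pci_name(dev));
}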
PCI Parity reaper/harvestor operations are in EDAC itself and will be refactored to use this PCI attribute instead of its own mechanisms (which are currently disabled) in the future. Signed-off-by: Doug Thompson Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 11 +++++++++++ include/linux/pci.h | 1 + include/linux/pci_ids.h | 1 + 3 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index d378478612fb..ea48e969a12e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -24,6 +24,17 @@ #include #include "pci.h" +/* The Mellanox Tavor device gives false positive parity errors + * Mark this device with a broken_parity_status, to allow + * PCI scanning code to "skip" this now blacklisted device. + */ +static void __devinit quirk_mellanox_tavor(struct pci_dev *dev) +{ + dev->broken_parity_status = 1; /* This device gives false positives */ +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR,quirk_mellanox_tavor); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX,PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE,quirk_mellanox_tavor); + /* Deal with broken BIOS'es that neglect to enable passive release, which can cause problems in combination with the 82441FX/PPro MTRRs */ static void __devinit quirk_passive_release(struct pci_dev *dev) diff --git a/include/linux/pci.h b/include/linux/pci.h index b9eb9b021d6a..91c37750cd34 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -162,6 +162,7 @@ struct pci_dev { unsigned int is_busmaster:1; /* device is busmaster */ unsigned int no_msi:1; /* device may not use msi */ unsigned int block_ucfg_access:1; /* userspace config space access is blocked */ + unsigned int broken_parity_status:1; /* Device generates false positive parity */ u32 saved_config_space[16]; /* config space saved at suspend time */ struct hlist_head saved_cap_space; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index bcfe9d4f56ae..3d197cdcfa3a 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1946,6 +1946,7 @@ #define PCI_VENDOR_ID_MELLANOX 0x15b3 #define PCI_DEVICE_ID_MELLANOX_TAVOR 0x5a44 +#define PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE 0x5a46 #define PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT 0x6278 #define PCI_DEVICE_ID_MELLANOX_ARBEL 0x6282 #define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c -- cgit v1.2.3 From 74d0a988d3aa359b6b8a8536c8cb92cce02ca5d5 Mon Sep 17 00:00:00 2001 From: Brent Casavant Date: Wed, 10 May 2006 01:49:14 -0700 Subject: [PATCH] PCI: Move various PCI IDs to header file Move various QLogic, Vitesse, and Intel storage controller PCI IDs to the main header file. 
Signed-off-by: Brent Casavant Acked-by: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla1280.c | 24 ------------------------ drivers/scsi/sata_vsc.c | 11 ++++++----- include/linux/pci_ids.h | 9 +++++++++ 3 files changed, 15 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 77bb2351500c..680f6063954b 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -397,30 +397,6 @@ #include "ql1280_fw.h" #include "ql1040_fw.h" - -/* - * Missing PCI ID's - */ -#ifndef PCI_DEVICE_ID_QLOGIC_ISP1080 -#define PCI_DEVICE_ID_QLOGIC_ISP1080 0x1080 -#endif -#ifndef PCI_DEVICE_ID_QLOGIC_ISP1240 -#define PCI_DEVICE_ID_QLOGIC_ISP1240 0x1240 -#endif -#ifndef PCI_DEVICE_ID_QLOGIC_ISP1280 -#define PCI_DEVICE_ID_QLOGIC_ISP1280 0x1280 -#endif -#ifndef PCI_DEVICE_ID_QLOGIC_ISP10160 -#define PCI_DEVICE_ID_QLOGIC_ISP10160 0x1016 -#endif -#ifndef PCI_DEVICE_ID_QLOGIC_ISP12160 -#define PCI_DEVICE_ID_QLOGIC_ISP12160 0x1216 -#endif - -#ifndef PCI_VENDOR_ID_AMI -#define PCI_VENDOR_ID_AMI 0x101e -#endif - #ifndef BITS_PER_LONG #error "BITS_PER_LONG not defined!" #endif diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c index 8a29ce340b47..27d658704cf9 100644 --- a/drivers/scsi/sata_vsc.c +++ b/drivers/scsi/sata_vsc.c @@ -433,13 +433,14 @@ err_out: /* - * 0x1725/0x7174 is the Vitesse VSC-7174 - * 0x8086/0x3200 is the Intel 31244, which is supposed to be identical - * compatibility is untested as of yet + * Intel 31244 is supposed to be identical. + * Compatibility is untested as of yet. */ static const struct pci_device_id vsc_sata_pci_tbl[] = { - { 0x1725, 0x7174, PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 }, - { 0x8086, 0x3200, PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 }, + { PCI_VENDOR_ID_VITESSE, PCI_DEVICE_ID_VITESSE_VSC7174, + PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_GD31244, + PCI_ANY_ID, PCI_ANY_ID, 0x10600, 0xFFFFFF, 0 }, { } }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 3d197cdcfa3a..e526e7b5ea47 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -848,7 +848,12 @@ #define PCI_VENDOR_ID_QLOGIC 0x1077 +#define PCI_DEVICE_ID_QLOGIC_ISP10160 0x1016 #define PCI_DEVICE_ID_QLOGIC_ISP1020 0x1020 +#define PCI_DEVICE_ID_QLOGIC_ISP1080 0x1080 +#define PCI_DEVICE_ID_QLOGIC_ISP12160 0x1216 +#define PCI_DEVICE_ID_QLOGIC_ISP1240 0x1240 +#define PCI_DEVICE_ID_QLOGIC_ISP1280 0x1280 #define PCI_DEVICE_ID_QLOGIC_ISP2100 0x2100 #define PCI_DEVICE_ID_QLOGIC_ISP2200 0x2200 #define PCI_DEVICE_ID_QLOGIC_ISP2300 0x2300 @@ -1970,6 +1975,9 @@ #define PCI_VENDOR_ID_NETCELL 0x169c #define PCI_DEVICE_ID_REVOLUTION 0x0044 +#define PCI_VENDOR_ID_VITESSE 0x1725 +#define PCI_DEVICE_ID_VITESSE_VSC7174 0x7174 + #define PCI_VENDOR_ID_LINKSYS 0x1737 #define PCI_DEVICE_ID_LINKSYS_EG1064 0x1064 @@ -2149,6 +2157,7 @@ #define PCI_DEVICE_ID_INTEL_ICH8_4 0x2815 #define PCI_DEVICE_ID_INTEL_ICH8_5 0x283e #define PCI_DEVICE_ID_INTEL_ICH8_6 0x2850 +#define PCI_DEVICE_ID_INTEL_GD31244 0x3200 #define PCI_DEVICE_ID_INTEL_82855PM_HB 0x3340 #define PCI_DEVICE_ID_INTEL_82830_HB 0x3575 #define PCI_DEVICE_ID_INTEL_82830_CGC 0x3577 -- cgit v1.2.3 From 99dc804d9bcc2c53f4c20c291bf4e185312a1a0c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 26 May 2006 10:58:27 +0800 Subject: [PATCH] PCI: disable msi mode in pci_disable_device Brice said the pci_save_msi_state breaks his driver in his special usage (not in suspend/resume), 
as pci_save_msi_state will disable msi mode. In his usage, pci_save_state will be called at runtime, and later (after the device operates for some time and has an error) pci_restore_state will be called. On the other hand, suspend/resume needs to disable MSI mode, as the device should stop working completely. This patch tries to work around this issue. Drivers are expected to call pci_disable_device at suspend time, after pci_save_state. Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/pci/msi.c | 6 ++++-- drivers/pci/pci.c | 9 ++++++++- include/linux/pci.h | 2 ++ 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 9c69b6966e79..3ec558dc6523 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -453,9 +453,11 @@ static void enable_msi_mode(struct pci_dev *dev, int pos, int type) /* Set enabled bits to single MSI & enable MSI_enable bit */ msi_enable(control, 1); pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msi_enabled = 1; } else { msix_enable(control); pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msix_enabled = 1; } if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { /* PCI Express Endpoint device detected */ @@ -472,9 +474,11 @@ void disable_msi_mode(struct pci_dev *dev, int pos, int type) /* Set enabled bits to single MSI & enable MSI_enable bit */ msi_disable(control); pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msi_enabled = 0; } else { msix_disable(control); pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msix_enabled = 0; } if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { /* PCI Express Endpoint device detected */ @@ -549,7 +553,6 @@ int pci_save_msi_state(struct pci_dev *dev) pci_read_config_dword(dev, pos + PCI_MSI_DATA_32, &cap[i++]); if (control & PCI_MSI_FLAGS_MASKBIT) pci_read_config_dword(dev, pos + PCI_MSI_MASK_BIT, &cap[i++]); - disable_msi_mode(dev, pos, PCI_CAP_ID_MSI); save_state->cap_nr = PCI_CAP_ID_MSI; pci_add_saved_cap(dev, save_state); return 0; @@ -639,7 +642,6 @@ int pci_save_msix_state(struct pci_dev *dev) } dev->irq = temp; - disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); save_state->cap_nr = PCI_CAP_ID_MSIX; pci_add_saved_cap(dev, save_state); return 0; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index aa480370ef10..d408a3c30426 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -551,7 +551,14 @@ void pci_disable_device(struct pci_dev *dev) { u16 pci_command; - + + if (dev->msi_enabled) + disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), + PCI_CAP_ID_MSI); + if (dev->msix_enabled) + disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), + PCI_CAP_ID_MSIX); + pci_read_config_word(dev, PCI_COMMAND, &pci_command); if (pci_command & PCI_COMMAND_MASTER) { pci_command &= ~PCI_COMMAND_MASTER; diff --git a/include/linux/pci.h b/include/linux/pci.h index 91c37750cd34..62a8c22f5f60 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -163,6 +163,8 @@ struct pci_dev { unsigned int no_msi:1; /* device may not use msi */ unsigned int block_ucfg_access:1; /* userspace config space access is blocked */ unsigned int broken_parity_status:1; /* Device generates false positive parity */ + unsigned int msi_enabled:1; + unsigned int msix_enabled:1; u32 saved_config_space[16]; /* config space saved at suspend time */ struct hlist_head saved_cap_space; -- cgit v1.2.3 From cf34a8e07f02c76f3f1232eecb681301a3d7b10b Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 13
Jun 2006 14:35:42 -0400 Subject: [PATCH] PCI: nVidia quirk to make AER PCI-E extended capability visible The nVidia CK804 PCI-E chipset supports the AER extended capability but sometimes fails to link it (with some BIOS or after a warm reboot). It makes the AER cap invisible to pci_find_ext_capability(). The patch adds a quirk to set the missing bit that controls the linking of the capability. By the way, it removes the corresponding code in the myri10ge driver. Signed-off-by: Brice Goglin Signed-off-by: Loic Prylli Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 19 +++++++++++++++++++ include/linux/pci_ids.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index aea4d49bcce3..4364d793f73b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1499,6 +1499,25 @@ static void __devinit quirk_p64h2_1k_io(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1460, quirk_p64h2_1k_io); +/* Under some circumstances, AER is not linked with extended capabilities. + * Force it to be linked by setting the corresponding control bit in the + * config space. + */ +static void __devinit quirk_nvidia_ck804_pcie_aer_ext_cap(struct pci_dev *dev) +{ + uint8_t b; + if (pci_read_config_byte(dev, 0xf41, &b) == 0) { + if (!(b & 0x20)) { + pci_write_config_byte(dev, 0xf41, b | 0x20); + printk(KERN_INFO + "PCI: Linking AER extended capability on %s\n", + pci_name(dev)); + } + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_CK804_PCIE, + quirk_nvidia_ck804_pcie_aer_ext_cap); + EXPORT_SYMBOL(pcie_mch_quirk); #ifdef CONFIG_HOTPLUG EXPORT_SYMBOL(pci_fixup_device); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index e526e7b5ea47..fd54a9d4c3d4 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1023,6 +1023,7 @@ #define PCI_DEVICE_ID_NVIDIA_NVENET_8 0x0056 #define PCI_DEVICE_ID_NVIDIA_NVENET_9 0x0057 #define PCI_DEVICE_ID_NVIDIA_CK804_AUDIO 0x0059 +#define PCI_DEVICE_ID_NVIDIA_CK804_PCIE 0x005d #define PCI_DEVICE_ID_NVIDIA_NFORCE2_SMBUS 0x0064 #define PCI_DEVICE_ID_NVIDIA_NFORCE2_IDE 0x0065 #define PCI_DEVICE_ID_NVIDIA_NVENET_2 0x0066 -- cgit v1.2.3 From 1cdcb6b43fda7424b7435dac8f80b2b5d8a48899 Mon Sep 17 00:00:00 2001 From: Hansjoerg Lipp Date: Sat, 22 Apr 2006 18:36:53 +0200 Subject: [PATCH] TTY: return class device pointer from tty_register_device() Let tty_register_device() return a pointer to the class device it creates. This allows registrants to add their own sysfs files under the class device node. Signed-off-by: Hansjoerg Lipp Signed-off-by: Tilman Schmidt Signed-off-by: Greg Kroah-Hartman --- drivers/char/tty_io.c | 11 +++++++---- include/linux/tty.h | 4 +++- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index a88b94a82b14..8b2a59969868 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2961,12 +2961,14 @@ static struct class *tty_class; * This field is optional, if there is no known struct device for this * tty device it can be set to NULL safely. * + * Returns a pointer to the class device (or ERR_PTR(-EFOO) on error). + * * This call is required to be made to register an individual tty device if * the tty driver's flags have the TTY_DRIVER_NO_DEVFS bit set. If that * bit is not set, this function should not be called. 
*/ -void tty_register_device(struct tty_driver *driver, unsigned index, - struct device *device) +struct class_device *tty_register_device(struct tty_driver *driver, + unsigned index, struct device *device) { char name[64]; dev_t dev = MKDEV(driver->major, driver->minor_start) + index; @@ -2974,7 +2976,7 @@ void tty_register_device(struct tty_driver *driver, unsigned index, if (index >= driver->num) { printk(KERN_ERR "Attempt to register invalid tty line number " " (%d).\n", index); - return; + return ERR_PTR(-EINVAL); } devfs_mk_cdev(dev, S_IFCHR | S_IRUSR | S_IWUSR, @@ -2984,7 +2986,8 @@ void tty_register_device(struct tty_driver *driver, unsigned index, pty_line_name(driver, index, name); else tty_line_name(driver, index, name); - class_device_create(tty_class, NULL, dev, device, "%s", name); + + return class_device_create(tty_class, NULL, dev, device, "%s", name); } /** diff --git a/include/linux/tty.h b/include/linux/tty.h index e898eeb94166..cb35ca50a0a6 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -290,7 +290,9 @@ extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc); extern int tty_unregister_ldisc(int disc); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); -extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); +extern struct class_device *tty_register_device(struct tty_driver *driver, + unsigned index, + struct device *dev); extern void tty_unregister_device(struct tty_driver *driver, unsigned index); extern int tty_read_raw_data(struct tty_struct *tty, unsigned char *bufp, int buflen); -- cgit v1.2.3 From 1740757e8f94c6899705eb6f5434de9404992778 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 2 May 2006 16:59:59 +0200 Subject: [PATCH] Driver Core: remove unused exports Cc: Arjan van de Ven Signed-off-by: Greg Kroah-Hartman --- drivers/base/attribute_container.c | 8 -------- drivers/base/base.h | 2 ++ drivers/base/bus.c | 6 ------ drivers/base/class.c | 6 ++---- include/linux/device.h | 8 -------- 5 files changed, 4 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/attribute_container.c b/drivers/base/attribute_container.c index 2a7d7ae83e1e..22220733f76f 100644 --- a/drivers/base/attribute_container.c +++ b/drivers/base/attribute_container.c @@ -236,7 +236,6 @@ attribute_container_remove_device(struct device *dev, } up(&attribute_container_mutex); } -EXPORT_SYMBOL_GPL(attribute_container_remove_device); /** * attribute_container_device_trigger - execute a trigger for each matching classdev @@ -276,7 +275,6 @@ attribute_container_device_trigger(struct device *dev, } up(&attribute_container_mutex); } -EXPORT_SYMBOL_GPL(attribute_container_device_trigger); /** * attribute_container_trigger - trigger a function for each matching container @@ -304,7 +302,6 @@ attribute_container_trigger(struct device *dev, } up(&attribute_container_mutex); } -EXPORT_SYMBOL_GPL(attribute_container_trigger); /** * attribute_container_add_attrs - add attributes @@ -333,7 +330,6 @@ attribute_container_add_attrs(struct class_device *classdev) return 0; } -EXPORT_SYMBOL_GPL(attribute_container_add_attrs); /** * attribute_container_add_class_device - same function as class_device_add @@ -352,7 +348,6 @@ attribute_container_add_class_device(struct class_device *classdev) return error; return attribute_container_add_attrs(classdev); } -EXPORT_SYMBOL_GPL(attribute_container_add_class_device); /** * 
attribute_container_add_class_device_adapter - simple adapter for triggers @@ -367,7 +362,6 @@ attribute_container_add_class_device_adapter(struct attribute_container *cont, { return attribute_container_add_class_device(classdev); } -EXPORT_SYMBOL_GPL(attribute_container_add_class_device_adapter); /** * attribute_container_remove_attrs - remove any attribute files @@ -389,7 +383,6 @@ attribute_container_remove_attrs(struct class_device *classdev) for (i = 0; attrs[i]; i++) class_device_remove_file(classdev, attrs[i]); } -EXPORT_SYMBOL_GPL(attribute_container_remove_attrs); /** * attribute_container_class_device_del - equivalent of class_device_del @@ -405,7 +398,6 @@ attribute_container_class_device_del(struct class_device *classdev) attribute_container_remove_attrs(classdev); class_device_del(classdev); } -EXPORT_SYMBOL_GPL(attribute_container_class_device_del); /** * attribute_container_find_class_device - find the corresponding class_device diff --git a/drivers/base/base.h b/drivers/base/base.h index bbbc2acd921c..122498aef50b 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -13,6 +13,8 @@ extern int attribute_container_init(void); extern int bus_add_device(struct device * dev); extern void bus_attach_device(struct device * dev); extern void bus_remove_device(struct device * dev); +extern struct bus_type *get_bus(struct bus_type * bus); +extern void put_bus(struct bus_type * bus); extern int bus_add_driver(struct device_driver *); extern void bus_remove_driver(struct device_driver *); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index b27a6067e5a4..64ba9011d1a8 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -745,15 +745,9 @@ EXPORT_SYMBOL_GPL(bus_for_each_dev); EXPORT_SYMBOL_GPL(bus_find_device); EXPORT_SYMBOL_GPL(bus_for_each_drv); -EXPORT_SYMBOL_GPL(bus_add_device); -EXPORT_SYMBOL_GPL(bus_attach_device); -EXPORT_SYMBOL_GPL(bus_remove_device); EXPORT_SYMBOL_GPL(bus_register); EXPORT_SYMBOL_GPL(bus_unregister); EXPORT_SYMBOL_GPL(bus_rescan_devices); -EXPORT_SYMBOL_GPL(get_bus); -EXPORT_SYMBOL_GPL(put_bus); -EXPORT_SYMBOL_GPL(find_bus); EXPORT_SYMBOL_GPL(bus_create_file); EXPORT_SYMBOL_GPL(bus_remove_file); diff --git a/drivers/base/class.c b/drivers/base/class.c index 48ad5df72812..4b598be0e4b6 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -91,14 +91,14 @@ void class_remove_file(struct class * cls, const struct class_attribute * attr) sysfs_remove_file(&cls->subsys.kset.kobj, &attr->attr); } -struct class * class_get(struct class * cls) +static struct class *class_get(struct class *cls) { if (cls) return container_of(subsys_get(&cls->subsys), struct class, subsys); return NULL; } -void class_put(struct class * cls) +static void class_put(struct class * cls) { if (cls) subsys_put(&cls->subsys); @@ -894,8 +894,6 @@ EXPORT_SYMBOL_GPL(class_create_file); EXPORT_SYMBOL_GPL(class_remove_file); EXPORT_SYMBOL_GPL(class_register); EXPORT_SYMBOL_GPL(class_unregister); -EXPORT_SYMBOL_GPL(class_get); -EXPORT_SYMBOL_GPL(class_put); EXPORT_SYMBOL_GPL(class_create); EXPORT_SYMBOL_GPL(class_destroy); diff --git a/include/linux/device.h b/include/linux/device.h index b2e5da2b637b..ade10dd6b779 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -60,11 +60,6 @@ extern void bus_unregister(struct bus_type * bus); extern void bus_rescan_devices(struct bus_type * bus); -extern struct bus_type * get_bus(struct bus_type * bus); -extern void put_bus(struct bus_type * bus); - -extern struct bus_type * find_bus(char * name); - /* iterator helpers 
for buses */ int bus_for_each_dev(struct bus_type * bus, struct device * start, void * data, @@ -163,9 +158,6 @@ struct class { extern int class_register(struct class *); extern void class_unregister(struct class *); -extern struct class * class_get(struct class *); -extern void class_put(struct class *); - struct class_attribute { struct attribute attr; -- cgit v1.2.3 From 670dd90d81f60ef429cbba54ad235e9207f4d444 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 8 May 2006 13:45:57 +0800 Subject: [PATCH] Driver Core: Allow sysdev_class have attributes allow sysdev_class adding attribute. Next patch will use the new API to add an attribute under /sys/device/system/cpu/. Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/base/sys.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/sysdev.h | 18 +++++++++++++++++- 2 files changed, 67 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/sys.c b/drivers/base/sys.c index 6fc23ab127bd..6858178b3aff 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -80,10 +80,59 @@ void sysdev_remove_file(struct sys_device * s, struct sysdev_attribute * a) EXPORT_SYMBOL_GPL(sysdev_create_file); EXPORT_SYMBOL_GPL(sysdev_remove_file); +#define to_sysdev_class(k) container_of(k, struct sysdev_class, kset.kobj) +#define to_sysdev_class_attr(a) container_of(a, \ + struct sysdev_class_attribute, attr) + +static ssize_t sysdev_class_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + struct sysdev_class * class = to_sysdev_class(kobj); + struct sysdev_class_attribute *class_attr = to_sysdev_class_attr(attr); + + if (class_attr->show) + return class_attr->show(class, buffer); + return -EIO; +} + +static ssize_t sysdev_class_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + struct sysdev_class * class = to_sysdev_class(kobj); + struct sysdev_class_attribute * class_attr = to_sysdev_class_attr(attr); + + if (class_attr->store) + return class_attr->store(class, buffer, count); + return -EIO; +} + +static struct sysfs_ops sysfs_class_ops = { + .show = sysdev_class_show, + .store = sysdev_class_store, +}; + +static struct kobj_type ktype_sysdev_class = { + .sysfs_ops = &sysfs_class_ops, +}; + +int sysdev_class_create_file(struct sysdev_class *c, + struct sysdev_class_attribute *a) +{ + return sysfs_create_file(&c->kset.kobj, &a->attr); +} +EXPORT_SYMBOL_GPL(sysdev_class_create_file); + +void sysdev_class_remove_file(struct sysdev_class *c, + struct sysdev_class_attribute *a) +{ + sysfs_remove_file(&c->kset.kobj, &a->attr); +} +EXPORT_SYMBOL_GPL(sysdev_class_remove_file); + /* * declare system_subsys */ -static decl_subsys(system, &ktype_sysdev, NULL); +static decl_subsys(system, &ktype_sysdev_class, NULL); int sysdev_class_register(struct sysdev_class * cls) { diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index 2a4b432e1176..166a2e58c287 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h @@ -37,11 +37,27 @@ struct sysdev_class { struct kset kset; }; +struct sysdev_class_attribute { + struct attribute attr; + ssize_t (*show)(struct sysdev_class *, char *); + ssize_t (*store)(struct sysdev_class *, const char *, size_t); +}; + +#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ +struct sysdev_class_attribute attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +}; + extern int sysdev_class_register(struct sysdev_class *); extern void 
sysdev_class_unregister(struct sysdev_class *); - +extern int sysdev_class_create_file(struct sysdev_class *, + struct sysdev_class_attribute *); +extern void sysdev_class_remove_file(struct sysdev_class *, + struct sysdev_class_attribute *); /** * Auxillary system device drivers. */ -- cgit v1.2.3 From 4039483fd3065920f035eed39ec59085421c0a4f Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Tue, 9 May 2006 12:53:49 +0200 Subject: [PATCH] Driver Core: Add /sys/hypervisor when needed To have a home for all hypervisors, this patch creates /sys/hypervisor. A new config option SYS_HYPERVISOR is introduced, which should to be set by architecture dependent hypervisors (e.g. s390 or Xen). Acked-by: Martin Schwidefsky Signed-off-by: Michael Holzheu Signed-off-by: Greg Kroah-Hartman --- drivers/base/Kconfig | 4 ++++ drivers/base/Makefile | 1 + drivers/base/base.h | 5 +++++ drivers/base/hypervisor.c | 19 +++++++++++++++++++ drivers/base/init.c | 1 + include/linux/kobject.h | 2 ++ 6 files changed, 32 insertions(+) create mode 100644 drivers/base/hypervisor.c (limited to 'include/linux') diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index f0eff3dac58d..80502dc6ed66 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -38,3 +38,7 @@ config DEBUG_DRIVER If you are unsure about this, say N here. endmenu + +config SYS_HYPERVISOR + bool + default n diff --git a/drivers/base/Makefile b/drivers/base/Makefile index e99471d3232b..659cde6c2fb9 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o obj-$(CONFIG_SMP) += topology.o +obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG diff --git a/drivers/base/base.h b/drivers/base/base.h index 122498aef50b..79115eff6e94 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -5,6 +5,11 @@ extern int devices_init(void); extern int buses_init(void); extern int classes_init(void); extern int firmware_init(void); +#ifdef CONFIG_SYS_HYPERVISOR +extern int hypervisor_init(void); +#else +static inline int hypervisor_init(void) { return 0; } +#endif extern int platform_bus_init(void); extern int system_bus_init(void); extern int cpu_dev_init(void); diff --git a/drivers/base/hypervisor.c b/drivers/base/hypervisor.c new file mode 100644 index 000000000000..0c85e9d6a448 --- /dev/null +++ b/drivers/base/hypervisor.c @@ -0,0 +1,19 @@ +/* + * hypervisor.c - /sys/hypervisor subsystem. + * + * This file is released under the GPLv2 + * + */ + +#include +#include + +#include "base.h" + +decl_subsys(hypervisor, NULL, NULL); +EXPORT_SYMBOL_GPL(hypervisor_subsys); + +int __init hypervisor_init(void) +{ + return subsystem_register(&hypervisor_subsys); +} diff --git a/drivers/base/init.c b/drivers/base/init.c index c648914b9cde..37138154f9e8 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -27,6 +27,7 @@ void __init driver_init(void) buses_init(); classes_init(); firmware_init(); + hypervisor_init(); /* These are also core pieces, but must come after the * core core pieces. 
diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c187c53cecd0..2d229327959e 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -190,6 +190,8 @@ struct subsystem _varname##_subsys = { \ /* The global /sys/kernel/ subsystem for people to chain off of */ extern struct subsystem kernel_subsys; +/* The global /sys/hypervisor/ subsystem */ +extern struct subsystem hypervisor_subsys; /** * Helpers for setting the kset of registered objects. -- cgit v1.2.3 From 23681e479129854305da1da32f7f1eaf635ef22c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 14 Jun 2006 12:14:34 -0700 Subject: [PATCH] Driver core: allow struct device to have a dev_t This is the first step in moving class_device to being replaced by struct device. It allows struct device to export a dev_t and makes it easy to dynamically create and destroy struct device as long as they are associated with a specific class. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/base/class.c | 1 + drivers/base/core.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/device.h | 14 +++++ 3 files changed, 176 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/class.c b/drivers/base/class.c index 41a8e0934e3a..50e841a33af0 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -142,6 +142,7 @@ int class_register(struct class * cls) pr_debug("device class '%s': registering\n", cls->name); INIT_LIST_HEAD(&cls->children); + INIT_LIST_HEAD(&cls->devices); INIT_LIST_HEAD(&cls->interfaces); init_MUTEX(&cls->sem); error = kobject_set_name(&cls->subsys.kset.kobj, "%s", cls->name); diff --git a/drivers/base/core.c b/drivers/base/core.c index d5e15a03584e..252cf403f891 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -98,6 +99,8 @@ static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) struct device *dev = to_dev(kobj); if (dev->bus) return 1; + if (dev->class) + return 1; } return 0; } @@ -106,7 +109,11 @@ static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj) { struct device *dev = to_dev(kobj); - return dev->bus->name; + if (dev->bus) + return dev->bus->name; + if (dev->class) + return dev->class->name; + return NULL; } static int dev_uevent(struct kset *kset, struct kobject *kobj, char **envp, @@ -117,6 +124,16 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj, char **envp, int length = 0; int retval = 0; + /* add the major/minor if present */ + if (MAJOR(dev->devt)) { + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "MAJOR=%u", MAJOR(dev->devt)); + add_uevent_var(envp, num_envp, &i, + buffer, buffer_size, &length, + "MINOR=%u", MINOR(dev->devt)); + } + /* add bus name of physical device */ if (dev->bus) add_uevent_var(envp, num_envp, &i, @@ -161,6 +178,12 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr, return count; } +static ssize_t show_dev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return print_dev_t(buf, dev->devt); +} + /* * devices_subsys - structure to be registered with kobject core. 
*/ @@ -231,6 +254,7 @@ void device_initialize(struct device *dev) klist_init(&dev->klist_children, klist_children_get, klist_children_put); INIT_LIST_HEAD(&dev->dma_pools); + INIT_LIST_HEAD(&dev->node); init_MUTEX(&dev->sem); device_init_wakeup(dev, 0); } @@ -274,6 +298,31 @@ int device_add(struct device *dev) dev->uevent_attr.store = store_uevent; device_create_file(dev, &dev->uevent_attr); + if (MAJOR(dev->devt)) { + struct device_attribute *attr; + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) { + error = -ENOMEM; + goto PMError; + } + attr->attr.name = "dev"; + attr->attr.mode = S_IRUGO; + if (dev->driver) + attr->attr.owner = dev->driver->owner; + attr->show = show_dev; + error = device_create_file(dev, attr); + if (error) { + kfree(attr); + goto attrError; + } + + dev->devt_attr = attr; + } + + if (dev->class) + sysfs_create_link(&dev->class->subsys.kset.kobj, &dev->kobj, + dev->bus_id); + if ((error = device_pm_add(dev))) goto PMError; if ((error = bus_add_device(dev))) @@ -292,6 +341,11 @@ int device_add(struct device *dev) BusError: device_pm_remove(dev); PMError: + if (dev->devt_attr) { + device_remove_file(dev, dev->devt_attr); + kfree(dev->devt_attr); + } + attrError: kobject_uevent(&dev->kobj, KOBJ_REMOVE); kobject_del(&dev->kobj); Error: @@ -366,6 +420,10 @@ void device_del(struct device * dev) if (parent) klist_del(&dev->knode_parent); + if (dev->devt_attr) + device_remove_file(dev, dev->devt_attr); + if (dev->class) + sysfs_remove_link(&dev->class->subsys.kset.kobj, dev->bus_id); device_remove_file(dev, &dev->uevent_attr); /* Notify the platform of the removal, in case they @@ -450,3 +508,105 @@ EXPORT_SYMBOL_GPL(put_device); EXPORT_SYMBOL_GPL(device_create_file); EXPORT_SYMBOL_GPL(device_remove_file); + + +static void device_create_release(struct device *dev) +{ + pr_debug("%s called for %s\n", __FUNCTION__, dev->bus_id); + kfree(dev); +} + +/** + * device_create - creates a device and registers it with sysfs + * @cs: pointer to the struct class that this device should be registered to. + * @parent: pointer to the parent struct device of this new device, if any. + * @dev: the dev_t for the char device to be added. + * @fmt: string for the class device's name + * + * This function can be used by char device classes. A struct + * device will be created in sysfs, registered to the specified + * class. + * A "dev" file will be created, showing the dev_t for the device, if + * the dev_t is not 0,0. + * If a pointer to a parent struct device is passed in, the newly + * created struct device will be a child of that device in sysfs. The + * pointer to the struct device will be returned from the call. Any + * further sysfs files that might be required can be created using this + * pointer. + * + * Note: the struct class passed to this function must have previously + * been created with a call to class_create(). + */ +struct device *device_create(struct class *class, struct device *parent, + dev_t devt, char *fmt, ...) 
+{ + va_list args; + struct device *dev = NULL; + int retval = -ENODEV; + + if (class == NULL || IS_ERR(class)) + goto error; + if (parent == NULL) { + printk(KERN_WARNING "%s does not work yet for NULL parents\n", __FUNCTION__); + goto error; + } + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + retval = -ENOMEM; + goto error; + } + + dev->devt = devt; + dev->class = class; + dev->parent = parent; + dev->release = device_create_release; + + va_start(args, fmt); + vsnprintf(dev->bus_id, BUS_ID_SIZE, fmt, args); + va_end(args); + retval = device_register(dev); + if (retval) + goto error; + + /* tie the class to the device */ + down(&class->sem); + list_add_tail(&dev->node, &class->devices); + up(&class->sem); + + return dev; + +error: + kfree(dev); + return ERR_PTR(retval); +} +EXPORT_SYMBOL_GPL(device_create); + +/** + * device_destroy - removes a device that was created with device_create() + * @class: the pointer to the struct class that this device was registered * with. + * @dev: the dev_t of the device that was previously registered. + * + * This call unregisters and cleans up a class device that was created with a + * call to class_device_create() + */ +void device_destroy(struct class *class, dev_t devt) +{ + struct device *dev = NULL; + struct device *dev_tmp; + + down(&class->sem); + list_for_each_entry(dev_tmp, &class->devices, node) { + if (dev_tmp->devt == devt) { + dev = dev_tmp; + break; + } + } + up(&class->sem); + + if (dev) { + list_del_init(&dev->node); + device_unregister(dev); + } +} +EXPORT_SYMBOL_GPL(device_destroy); diff --git a/include/linux/device.h b/include/linux/device.h index ade10dd6b779..b473f4278910 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -142,6 +142,7 @@ struct class { struct subsystem subsys; struct list_head children; + struct list_head devices; struct list_head interfaces; struct semaphore sem; /* locks both the children and interfaces lists */ @@ -305,6 +306,7 @@ struct device { struct kobject kobj; char bus_id[BUS_ID_SIZE]; /* position on parent bus */ struct device_attribute uevent_attr; + struct device_attribute *devt_attr; struct semaphore sem; /* semaphore to synchronize calls to * its driver. @@ -332,6 +334,11 @@ struct device { struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */ + /* class_device migration path */ + struct list_head node; + struct class *class; /* optional*/ + dev_t devt; /* dev_t, creates the sysfs "dev" */ + void (*release)(struct device * dev); }; @@ -373,6 +380,13 @@ extern int device_attach(struct device * dev); extern void driver_attach(struct device_driver * drv); extern void device_reprobe(struct device *dev); +/* + * Easy functions for dynamically creating devices on the fly + */ +extern struct device *device_create(struct class *cls, struct device *parent, + dev_t devt, char *fmt, ...) + __attribute__((format(printf,4,5))); +extern void device_destroy(struct class *cls, dev_t devt); /* * Platform "fixup" functions - allow the platform to have their say -- cgit v1.2.3 From 3e95637a48820ff8bedb33e6439def96ccff1de5 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 16 Jun 2006 17:10:48 -0400 Subject: [PATCH] Driver Core: Make dev_info and friends print the bus name if there is no driver This patch (as721) makes dev_info and related macros print the device's bus name if the device doesn't have a driver, instead of printing just a blank. If the device isn't on a bus either... well, then it does leave a blank space. 
But it will be easier for someone else to change if they want. Cc: Matthew Wilcox Signed-off-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 16 ++++++++++++++++ include/linux/device.h | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index a979bc3f49a9..d0f84ff78776 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -29,6 +29,22 @@ int (*platform_notify_remove)(struct device * dev) = NULL; * sysfs bindings for devices. */ +/** + * dev_driver_string - Return a device's driver name, if at all possible + * @dev: struct device to get the name of + * + * Will return the device's driver's name if it is bound to a device. If + * the device is not bound to a device, it will return the name of the bus + * it is attached to. If it is not attached to a bus either, an empty + * string will be returned. + */ +const char *dev_driver_string(struct device *dev) +{ + return dev->driver ? dev->driver->name : + (dev->bus ? dev->bus->name : ""); +} +EXPORT_SYMBOL_GPL(dev_driver_string); + #define to_dev(obj) container_of(obj, struct device, kobj) #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) diff --git a/include/linux/device.h b/include/linux/device.h index b473f4278910..1e5f30da98bc 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -416,8 +416,9 @@ extern int firmware_register(struct subsystem *); extern void firmware_unregister(struct subsystem *); /* debugging and troubleshooting/diagnostic helpers. */ +extern const char *dev_driver_string(struct device *dev); #define dev_printk(level, dev, format, arg...) \ - printk(level "%s %s: " format , (dev)->driver ? (dev)->driver->name : "" , (dev)->bus_id , ## arg) + printk(level "%s %s: " format , dev_driver_string(dev) , (dev)->bus_id , ## arg) #ifdef DEBUG #define dev_dbg(dev, format, arg...) \ -- cgit v1.2.3 From a5117ba7da37deb09df5eb802dace229b3fb1e9f Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Tue, 6 Jun 2006 23:54:02 +0200 Subject: [PATCH] Driver model: add ISA bus During the recent "isa drivers using platform devices" discussion it was pointed out that (ALSA) ISA drivers ran into the problem of not having the option to fail driver load (device registration rather) upon not finding their hardware due to a probe() error not being passed up through the driver model. In the course of that, I suggested a seperate ISA bus might be best; Russell King agreed and suggested this bus could use the .match() method for the actual device discovery. The attached does this. For this old non (generically) discoverable ISA hardware only the driver itself can do discovery so as a difference with the platform_bus, this isa_bus also distributes match() up to the driver. As another difference: these devices only exist in the driver model due to the driver creating them because it might want to drive them, meaning that all device creation has been made internal as well. The usage model this provides is nice, and has been acked from the ALSA side by Takashi Iwai and Jaroslav Kysela. The ALSA driver module_init's now (for oldisa-only drivers) become: static int __init alsa_card_foo_init(void) { return isa_register_driver(&snd_foo_isa_driver, SNDRV_CARDS); } static void __exit alsa_card_foo_exit(void) { isa_unregister_driver(&snd_foo_isa_driver); } Quite like the other bus models therefore. This removes a lot of duplicated init code from the ALSA ISA drivers. 
The passed in isa_driver struct is the regular driver struct embedding a struct device_driver, the normal probe/remove/shutdown/suspend/resume callbacks, and as indicated that .match callback. The "SNDRV_CARDS" you see being passed in is a "unsigned int ndev" parameter, indicating how many devices to create and call our methods with. The platform_driver callbacks are called with a platform_device param; the isa_driver callbacks are being called with a "struct device *dev, unsigned int id" pair directly -- with the device creation completely internal to the bus it's much cleaner to not leak isa_dev's by passing them in at all. The id is the only thing we ever want other then the struct device * anyways, and it makes for nicer code in the callbacks as well. With this additional .match() callback ISA drivers have all options. If ALSA would want to keep the old non-load behaviour, it could stick all of the old .probe in .match, which would only keep them registered after everything was found to be present and accounted for. If it wanted the behaviour of always loading as it inadvertently did for a bit after the changeover to platform devices, it could just not provide a .match() and do everything in .probe() as before. If it, as Takashi Iwai already suggested earlier as a way of following the model from saner buses more closely, wants to load when a later bind could conceivably succeed, it could use .match() for the prerequisites (such as checking the user wants the card enabled and that port/irq/dma values have been passed in) and .probe() for everything else. This is the nicest model. To the code... This exports only two functions; isa_{,un}register_driver(). isa_register_driver() register's the struct device_driver, and then loops over the passed in ndev creating devices and registering them. This causes the bus match method to be called for them, which is: int isa_bus_match(struct device *dev, struct device_driver *driver) { struct isa_driver *isa_driver = to_isa_driver(driver); if (dev->platform_data == isa_driver) { if (!isa_driver->match || isa_driver->match(dev, to_isa_dev(dev)->id)) return 1; dev->platform_data = NULL; } return 0; } The first thing this does is check if this device is in fact one of this driver's devices by seeing if the device's platform_data pointer is set to this driver. Platform devices compare strings, but we don't need to do that with everything being internal, so isa_register_driver() abuses dev->platform_data as a isa_driver pointer which we can then check here. I believe platform_data is available for this, but if rather not, moving the isa_driver pointer to the private struct isa_dev is ofcourse fine as well. Then, if the the driver did not provide a .match, it matches. If it did, the driver match() method is called to determine a match. If it did _not_ match, dev->platform_data is reset to indicate this to isa_register_driver which can then unregister the device again. If during all this, there's any error, or no devices matched at all everything is backed out again and the error, or -ENODEV, is returned. isa_unregister_driver() just unregisters the matched devices and the driver itself. More global points/questions... - I'm introducing include/linux/isa.h. It was available but is ofcourse a somewhat generic name. Moving more isa stuff over to it in time is ofcourse fine, so can I have it please? :) - I'm using device_initcall() and added the isa.o (dependent on CONFIG_ISA) after the base driver model things in the Makefile. 
Will this do, or I really need to stick it in drivers/base/init.c, inside #ifdef CONFIG_ISA? It's working fine. Lastly -- I also looked, a bit, into integrating with PnP. "Old ISA" could be another pnp_protocol, but this does not seem to be a good match, largely due to the same reason platform_devices weren't -- the devices do not have a life of their own outside the driver, meaning the pnp_protocol {get,set}_resources callbacks would need to callback into driver -- which again means you first need to _have_ that driver. Even if there's clean way around that, you only end up inventing fake but valid-form PnP IDs and generally catering to the PnP layer without any practical advantages over this very simple isa_bus. The thing I also suggested earlier about the user echoing values into /sys to set up the hardware from userspace first is... well, cute, but a horrible idea from a user standpoint. Comments ofcourse appreciated. Hope it's okay. As said, the usage model is nice at least. Signed-off-by: Rene Herman --- drivers/base/Makefile | 1 + drivers/base/isa.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/isa.h | 28 ++++++++ 3 files changed, 209 insertions(+) create mode 100644 drivers/base/isa.c create mode 100644 include/linux/isa.h (limited to 'include/linux') diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 659cde6c2fb9..b539e5e75b56 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -5,6 +5,7 @@ obj-y := core.o sys.o bus.o dd.o \ cpu.o firmware.o init.o map.o dmapool.o \ attribute_container.o transport_class.o obj-y += power/ +obj-$(CONFIG_ISA) += isa.o obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o diff --git a/drivers/base/isa.c b/drivers/base/isa.c new file mode 100644 index 000000000000..d2222397a401 --- /dev/null +++ b/drivers/base/isa.c @@ -0,0 +1,180 @@ +/* + * ISA bus. 
+ */ + +#include +#include +#include +#include +#include +#include + +static struct device isa_bus = { + .bus_id = "isa" +}; + +struct isa_dev { + struct device dev; + struct device *next; + unsigned int id; +}; + +#define to_isa_dev(x) container_of((x), struct isa_dev, dev) + +static int isa_bus_match(struct device *dev, struct device_driver *driver) +{ + struct isa_driver *isa_driver = to_isa_driver(driver); + + if (dev->platform_data == isa_driver) { + if (!isa_driver->match || + isa_driver->match(dev, to_isa_dev(dev)->id)) + return 1; + dev->platform_data = NULL; + } + return 0; +} + +static int isa_bus_probe(struct device *dev) +{ + struct isa_driver *isa_driver = dev->platform_data; + + if (isa_driver->probe) + return isa_driver->probe(dev, to_isa_dev(dev)->id); + + return 0; +} + +static int isa_bus_remove(struct device *dev) +{ + struct isa_driver *isa_driver = dev->platform_data; + + if (isa_driver->remove) + return isa_driver->remove(dev, to_isa_dev(dev)->id); + + return 0; +} + +static void isa_bus_shutdown(struct device *dev) +{ + struct isa_driver *isa_driver = dev->platform_data; + + if (isa_driver->shutdown) + isa_driver->shutdown(dev, to_isa_dev(dev)->id); +} + +static int isa_bus_suspend(struct device *dev, pm_message_t state) +{ + struct isa_driver *isa_driver = dev->platform_data; + + if (isa_driver->suspend) + return isa_driver->suspend(dev, to_isa_dev(dev)->id, state); + + return 0; +} + +static int isa_bus_resume(struct device *dev) +{ + struct isa_driver *isa_driver = dev->platform_data; + + if (isa_driver->resume) + return isa_driver->resume(dev, to_isa_dev(dev)->id); + + return 0; +} + +static struct bus_type isa_bus_type = { + .name = "isa", + .match = isa_bus_match, + .probe = isa_bus_probe, + .remove = isa_bus_remove, + .shutdown = isa_bus_shutdown, + .suspend = isa_bus_suspend, + .resume = isa_bus_resume +}; + +static void isa_dev_release(struct device *dev) +{ + kfree(to_isa_dev(dev)); +} + +void isa_unregister_driver(struct isa_driver *isa_driver) +{ + struct device *dev = isa_driver->devices; + + while (dev) { + struct device *tmp = to_isa_dev(dev)->next; + device_unregister(dev); + dev = tmp; + } + driver_unregister(&isa_driver->driver); +} +EXPORT_SYMBOL_GPL(isa_unregister_driver); + +int isa_register_driver(struct isa_driver *isa_driver, unsigned int ndev) +{ + int error; + unsigned int id; + + isa_driver->driver.bus = &isa_bus_type; + isa_driver->devices = NULL; + + error = driver_register(&isa_driver->driver); + if (error) + return error; + + for (id = 0; id < ndev; id++) { + struct isa_dev *isa_dev; + + isa_dev = kzalloc(sizeof *isa_dev, GFP_KERNEL); + if (!isa_dev) { + error = -ENOMEM; + break; + } + + isa_dev->dev.parent = &isa_bus; + isa_dev->dev.bus = &isa_bus_type; + + snprintf(isa_dev->dev.bus_id, BUS_ID_SIZE, "%s.%u", + isa_driver->driver.name, id); + + isa_dev->dev.platform_data = isa_driver; + isa_dev->dev.release = isa_dev_release; + isa_dev->id = id; + + error = device_register(&isa_dev->dev); + if (error) { + put_device(&isa_dev->dev); + break; + } + + if (isa_dev->dev.platform_data) { + isa_dev->next = isa_driver->devices; + isa_driver->devices = &isa_dev->dev; + } else + device_unregister(&isa_dev->dev); + } + + if (!error && !isa_driver->devices) + error = -ENODEV; + + if (error) + isa_unregister_driver(isa_driver); + + return error; +} +EXPORT_SYMBOL_GPL(isa_register_driver); + +static int __init isa_bus_init(void) +{ + int error; + + error = bus_register(&isa_bus_type); + if (!error) { + error = device_register(&isa_bus); + if (error) 
+ bus_unregister(&isa_bus_type); + } + return error; +} + +device_initcall(isa_bus_init); diff --git a/include/linux/isa.h b/include/linux/isa.h new file mode 100644 index 000000000000..1b855335cb11 --- /dev/null +++ b/include/linux/isa.h @@ -0,0 +1,28 @@ +/* + * ISA bus. + */ + +#ifndef __LINUX_ISA_H +#define __LINUX_ISA_H + +#include +#include + +struct isa_driver { + int (*match)(struct device *, unsigned int); + int (*probe)(struct device *, unsigned int); + int (*remove)(struct device *, unsigned int); + void (*shutdown)(struct device *, unsigned int); + int (*suspend)(struct device *, unsigned int, pm_message_t); + int (*resume)(struct device *, unsigned int); + + struct device_driver driver; + struct device *devices; +}; + +#define to_isa_driver(x) container_of((x), struct isa_driver, driver) + +int isa_register_driver(struct isa_driver *, unsigned int); +void isa_unregister_driver(struct isa_driver *); + +#endif /* __LINUX_ISA_H */ -- cgit v1.2.3 From 782a7a632e4b0581ade665e3d89ee97c8db0f441 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 19 May 2006 13:20:20 -0700 Subject: [PATCH] USB: add usb_interrupt_msg() function for api completeness. Really just a wrapper around usb_bulk_msg() but now it's documented much better. Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 31 +++++++++++++++++++++++++++++++ include/linux/usb.h | 2 ++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 08fb20f06f3e..b2f608b0538d 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -157,6 +157,37 @@ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u } +/** + * usb_interrupt_msg - Builds an interrupt urb, sends it off and waits for completion + * @usb_dev: pointer to the usb device to send the message to + * @pipe: endpoint "pipe" to send the message to + * @data: pointer to the data to send + * @len: length in bytes of the data to send + * @actual_length: pointer to a location to put the actual length transferred in bytes + * @timeout: time in msecs to wait for the message to complete before + * timing out (if 0 the wait is forever) + * Context: !in_interrupt () + * + * This function sends a simple interrupt message to a specified endpoint and + * waits for the message to complete, or timeout. + * + * If successful, it returns 0, otherwise a negative error number. The number + * of actual bytes transferred will be stored in the actual_length paramater. + * + * Don't use this function from within an interrupt context, like a bottom half + * handler. If you need an asynchronous message, or need to send a message + * from within interrupt context, use usb_submit_urb() If a thread in your + * driver uses this call, make sure your disconnect() method can wait for it to + * complete. Since you don't have a handle on the URB used, you can't cancel + * the request. 
+ */ +int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, + void *data, int len, int *actual_length, int timeout) +{ + return usb_bulk_msg(usb_dev, pipe, data, len, actual_length, timeout); +} +EXPORT_SYMBOL_GPL(usb_interrupt_msg); + /** * usb_bulk_msg - Builds a bulk urb, sends it off and waits for completion * @usb_dev: pointer to the usb device to send the message to diff --git a/include/linux/usb.h b/include/linux/usb.h index 1f492c0c7047..317ec9f28bce 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1008,6 +1008,8 @@ void usb_buffer_unmap_sg (struct usb_device *dev, unsigned pipe, extern int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout); +extern int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, + void *data, int len, int *actual_length, int timeout); extern int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout); -- cgit v1.2.3 From 79efa097e75018a2918155f343f0e08e61ee8a8c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 1 Jun 2006 13:33:42 -0400 Subject: [PATCH] usbcore: port reset for composite devices This patch (as699) adds usb_reset_composite_device(), a routine for sending a USB port reset to a device with multiple interfaces owned by different drivers. Drivers are notified about impending and completed resets through two new methods in the usb_driver structure. The patch modifieds the usbfs ioctl code to make it use the new routine instead of usb_reset_device(). Follow-up patches will modify the hub, usb-storage, and usbhid drivers so they can utilize this new API. Signed-off-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 3 +- drivers/usb/core/hub.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++-- drivers/usb/core/usb.c | 1 + include/linux/usb.h | 9 ++++++ 4 files changed, 92 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index b04ede772f2c..df3fb57d71e6 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -823,8 +823,7 @@ static int proc_connectinfo(struct dev_state *ps, void __user *arg) static int proc_resetdevice(struct dev_state *ps) { - return usb_reset_device(ps->dev); - + return usb_reset_composite_device(ps->dev, NULL); } static int proc_setintf(struct dev_state *ps, void __user *arg) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index f41c08946a52..37c67d7e8b84 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -3007,9 +3007,9 @@ static int config_descriptors_changed(struct usb_device *udev) * usb_reset_device - perform a USB port reset to reinitialize a device * @udev: device to reset (not in SUSPENDED or NOTATTACHED state) * - * WARNING - don't reset any device unless drivers for all of its - * interfaces are expecting that reset! Maybe some driver->reset() - * method should eventually help ensure sufficient cooperation. + * WARNING - don't use this routine to reset a composite device + * (one with multiple interfaces owned by separate drivers)! + * Use usb_reset_composite_device() instead. * * Do a port reset, reassign the device's address, and establish its * former operating configuration. 
If the reset fails, or the device's @@ -3125,3 +3125,81 @@ re_enumerate: hub_port_logical_disconnect(parent_hub, port1); return -ENODEV; } + +/** + * usb_reset_composite_device - warn interface drivers and perform a USB port reset + * @udev: device to reset (not in SUSPENDED or NOTATTACHED state) + * @iface: interface bound to the driver making the request (optional) + * + * Warns all drivers bound to registered interfaces (using their pre_reset + * method), performs the port reset, and then lets the drivers know that + * the reset is over (using their post_reset method). + * + * Return value is the same as for usb_reset_device(). + * + * The caller must own the device lock. For example, it's safe to use + * this from a driver probe() routine after downloading new firmware. + * For calls that might not occur during probe(), drivers should lock + * the device using usb_lock_device_for_reset(). + * + * The interface locks are acquired during the pre_reset stage and released + * during the post_reset stage. However if iface is not NULL and is + * currently being probed, we assume that the caller already owns its + * lock. + */ +int usb_reset_composite_device(struct usb_device *udev, + struct usb_interface *iface) +{ + int ret; + struct usb_host_config *config = udev->actconfig; + + if (udev->state == USB_STATE_NOTATTACHED || + udev->state == USB_STATE_SUSPENDED) { + dev_dbg(&udev->dev, "device reset not allowed in state %d\n", + udev->state); + return -EINVAL; + } + + if (iface && iface->condition != USB_INTERFACE_BINDING) + iface = NULL; + + if (config) { + int i; + struct usb_interface *cintf; + struct usb_driver *drv; + + for (i = 0; i < config->desc.bNumInterfaces; ++i) { + cintf = config->interface[i]; + if (cintf != iface) + down(&cintf->dev.sem); + if (device_is_registered(&cintf->dev) && + cintf->dev.driver) { + drv = to_usb_driver(cintf->dev.driver); + if (drv->pre_reset) + (drv->pre_reset)(cintf); + } + } + } + + ret = usb_reset_device(udev); + + if (config) { + int i; + struct usb_interface *cintf; + struct usb_driver *drv; + + for (i = config->desc.bNumInterfaces - 1; i >= 0; --i) { + cintf = config->interface[i]; + if (device_is_registered(&cintf->dev) && + cintf->dev.driver) { + drv = to_usb_driver(cintf->dev.driver); + if (drv->post_reset) + (drv->post_reset)(cintf); + } + if (cintf != iface) + up(&cintf->dev.sem); + } + } + + return ret; +} diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index b7fdc1cd134a..515310751303 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1207,6 +1207,7 @@ EXPORT_SYMBOL(usb_ifnum_to_if); EXPORT_SYMBOL(usb_altnum_to_altsetting); EXPORT_SYMBOL(usb_reset_device); +EXPORT_SYMBOL(usb_reset_composite_device); EXPORT_SYMBOL(__usb_get_extra_descriptor); diff --git a/include/linux/usb.h b/include/linux/usb.h index 317ec9f28bce..5ad30cefe7b2 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -386,6 +386,8 @@ extern int usb_lock_device_for_reset(struct usb_device *udev, /* USB port reset for device reinitialization */ extern int usb_reset_device(struct usb_device *dev); +extern int usb_reset_composite_device(struct usb_device *dev, + struct usb_interface *iface); extern struct usb_device *usb_find_device(u16 vendor_id, u16 product_id); @@ -554,6 +556,10 @@ struct usb_dynids { * do (or don't) show up otherwise in the filesystem. * @suspend: Called when the device is going to be suspended by the system. * @resume: Called when the device is being resumed by the system. 
+ * @pre_reset: Called by usb_reset_composite_device() when the device + * is about to be reset. + * @post_reset: Called by usb_reset_composite_device() after the device + * has been reset. * @id_table: USB drivers use ID table to support hotplugging. * Export this with MODULE_DEVICE_TABLE(usb,...). This must be set * or your driver's probe function will never get called. @@ -592,6 +598,9 @@ struct usb_driver { int (*suspend) (struct usb_interface *intf, pm_message_t message); int (*resume) (struct usb_interface *intf); + void (*pre_reset) (struct usb_interface *intf); + void (*post_reset) (struct usb_interface *intf); + const struct usb_device_id *id_table; struct usb_dynids dynids; -- cgit v1.2.3 From a8c28f2389942bab376e39351d27525499630248 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 13 Jun 2006 09:57:47 -0700 Subject: [PATCH] USB: move to This moves to to reduce some of the clutter of usb header files. Signed-off-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 2 +- drivers/usb/gadget/ether.c | 2 +- drivers/usb/gadget/serial.c | 2 +- drivers/usb/net/cdc_ether.c | 2 +- drivers/usb/net/rndis_host.c | 2 +- drivers/usb/net/zaurus.c | 2 +- include/linux/usb/cdc.h | 205 +++++++++++++++++++++++++++++++++++++++++++ include/linux/usb_cdc.h | 205 ------------------------------------------- 8 files changed, 211 insertions(+), 211 deletions(-) create mode 100644 include/linux/usb/cdc.h delete mode 100644 include/linux/usb_cdc.h (limited to 'include/linux') diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 506aff60dac5..d41dc67ba4cc 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -63,7 +63,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c index fc4684096fcc..078daa026718 100644 --- a/drivers/usb/gadget/ether.c +++ b/drivers/usb/gadget/ether.c @@ -49,7 +49,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c index e477edd681d3..9d6e1d295528 100644 --- a/drivers/usb/gadget/serial.c +++ b/drivers/usb/gadget/serial.c @@ -45,7 +45,7 @@ #include #include -#include +#include #include #include "gadget_chips.h" diff --git a/drivers/usb/net/cdc_ether.c b/drivers/usb/net/cdc_ether.c index 9c6c5b0b01ad..efd195b5912c 100644 --- a/drivers/usb/net/cdc_ether.c +++ b/drivers/usb/net/cdc_ether.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "usbnet.h" diff --git a/drivers/usb/net/rndis_host.c b/drivers/usb/net/rndis_host.c index 94ddfe16fdda..f551546d7521 100644 --- a/drivers/usb/net/rndis_host.c +++ b/drivers/usb/net/rndis_host.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include "usbnet.h" diff --git a/drivers/usb/net/zaurus.c b/drivers/usb/net/zaurus.c index bf2035d329f4..813e470d0600 100644 --- a/drivers/usb/net/zaurus.c +++ b/drivers/usb/net/zaurus.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include "usbnet.h" diff --git a/include/linux/usb/cdc.h b/include/linux/usb/cdc.h new file mode 100644 index 000000000000..ba617c372455 --- /dev/null +++ b/include/linux/usb/cdc.h @@ -0,0 +1,205 @@ +/* + * USB Communications Device Class (CDC) definitions + * + * CDC says how to talk to lots of different types of network adapters, + * notably ethernet adapters and various modems. It's used mostly with + * firmware based USB peripherals. 
+ */ + +#define USB_CDC_SUBCLASS_ACM 0x02 +#define USB_CDC_SUBCLASS_ETHERNET 0x06 +#define USB_CDC_SUBCLASS_WHCM 0x08 +#define USB_CDC_SUBCLASS_DMM 0x09 +#define USB_CDC_SUBCLASS_MDLM 0x0a +#define USB_CDC_SUBCLASS_OBEX 0x0b + +#define USB_CDC_PROTO_NONE 0 + +#define USB_CDC_ACM_PROTO_AT_V25TER 1 +#define USB_CDC_ACM_PROTO_AT_PCCA101 2 +#define USB_CDC_ACM_PROTO_AT_PCCA101_WAKE 3 +#define USB_CDC_ACM_PROTO_AT_GSM 4 +#define USB_CDC_ACM_PROTO_AT_3G 5 +#define USB_CDC_ACM_PROTO_AT_CDMA 6 +#define USB_CDC_ACM_PROTO_VENDOR 0xff + +/*-------------------------------------------------------------------------*/ + +/* + * Class-Specific descriptors ... there are a couple dozen of them + */ + +#define USB_CDC_HEADER_TYPE 0x00 /* header_desc */ +#define USB_CDC_CALL_MANAGEMENT_TYPE 0x01 /* call_mgmt_descriptor */ +#define USB_CDC_ACM_TYPE 0x02 /* acm_descriptor */ +#define USB_CDC_UNION_TYPE 0x06 /* union_desc */ +#define USB_CDC_COUNTRY_TYPE 0x07 +#define USB_CDC_NETWORK_TERMINAL_TYPE 0x0a /* network_terminal_desc */ +#define USB_CDC_ETHERNET_TYPE 0x0f /* ether_desc */ +#define USB_CDC_WHCM_TYPE 0x11 +#define USB_CDC_MDLM_TYPE 0x12 /* mdlm_desc */ +#define USB_CDC_MDLM_DETAIL_TYPE 0x13 /* mdlm_detail_desc */ +#define USB_CDC_DMM_TYPE 0x14 +#define USB_CDC_OBEX_TYPE 0x15 + +/* "Header Functional Descriptor" from CDC spec 5.2.3.1 */ +struct usb_cdc_header_desc { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + __le16 bcdCDC; +} __attribute__ ((packed)); + +/* "Call Management Descriptor" from CDC spec 5.2.3.2 */ +struct usb_cdc_call_mgmt_descriptor { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + __u8 bmCapabilities; +#define USB_CDC_CALL_MGMT_CAP_CALL_MGMT 0x01 +#define USB_CDC_CALL_MGMT_CAP_DATA_INTF 0x02 + + __u8 bDataInterface; +} __attribute__ ((packed)); + +/* "Abstract Control Management Descriptor" from CDC spec 5.2.3.3 */ +struct usb_cdc_acm_descriptor { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + __u8 bmCapabilities; +} __attribute__ ((packed)); + +/* "Union Functional Descriptor" from CDC spec 5.2.3.8 */ +struct usb_cdc_union_desc { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + __u8 bMasterInterface0; + __u8 bSlaveInterface0; + /* ... 
and there could be other slave interfaces */ +} __attribute__ ((packed)); + +/* "Network Channel Terminal Functional Descriptor" from CDC spec 5.2.3.11 */ +struct usb_cdc_network_terminal_desc { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + __u8 bEntityId; + __u8 iName; + __u8 bChannelIndex; + __u8 bPhysicalInterface; +} __attribute__ ((packed)); + +/* "Ethernet Networking Functional Descriptor" from CDC spec 5.2.3.16 */ +struct usb_cdc_ether_desc { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + __u8 iMACAddress; + __le32 bmEthernetStatistics; + __le16 wMaxSegmentSize; + __le16 wNumberMCFilters; + __u8 bNumberPowerFilters; +} __attribute__ ((packed)); + +/* "MDLM Functional Descriptor" from CDC WMC spec 6.7.2.3 */ +struct usb_cdc_mdlm_desc { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + __le16 bcdVersion; + __u8 bGUID[16]; +} __attribute__ ((packed)); + +/* "MDLM Detail Functional Descriptor" from CDC WMC spec 6.7.2.4 */ +struct usb_cdc_mdlm_detail_desc { + __u8 bLength; + __u8 bDescriptorType; + __u8 bDescriptorSubType; + + /* type is associated with mdlm_desc.bGUID */ + __u8 bGuidDescriptorType; + __u8 bDetailData[0]; +} __attribute__ ((packed)); + +/*-------------------------------------------------------------------------*/ + +/* + * Class-Specific Control Requests (6.2) + * + * section 3.6.2.1 table 4 has the ACM profile, for modems. + * section 3.8.2 table 10 has the ethernet profile. + * + * Microsoft's RNDIS stack for Ethernet is a vendor-specific CDC ACM variant, + * heavily dependent on the encapsulated (proprietary) command mechanism. + */ + +#define USB_CDC_SEND_ENCAPSULATED_COMMAND 0x00 +#define USB_CDC_GET_ENCAPSULATED_RESPONSE 0x01 +#define USB_CDC_REQ_SET_LINE_CODING 0x20 +#define USB_CDC_REQ_GET_LINE_CODING 0x21 +#define USB_CDC_REQ_SET_CONTROL_LINE_STATE 0x22 +#define USB_CDC_REQ_SEND_BREAK 0x23 +#define USB_CDC_SET_ETHERNET_MULTICAST_FILTERS 0x40 +#define USB_CDC_SET_ETHERNET_PM_PATTERN_FILTER 0x41 +#define USB_CDC_GET_ETHERNET_PM_PATTERN_FILTER 0x42 +#define USB_CDC_SET_ETHERNET_PACKET_FILTER 0x43 +#define USB_CDC_GET_ETHERNET_STATISTIC 0x44 + +/* Line Coding Structure from CDC spec 6.2.13 */ +struct usb_cdc_line_coding { + __le32 dwDTERate; + __u8 bCharFormat; +#define USB_CDC_1_STOP_BITS 0 +#define USB_CDC_1_5_STOP_BITS 1 +#define USB_CDC_2_STOP_BITS 2 + + __u8 bParityType; +#define USB_CDC_NO_PARITY 0 +#define USB_CDC_ODD_PARITY 1 +#define USB_CDC_EVEN_PARITY 2 +#define USB_CDC_MARK_PARITY 3 +#define USB_CDC_SPACE_PARITY 4 + + __u8 bDataBits; +} __attribute__ ((packed)); + +/* table 62; bits in multicast filter */ +#define USB_CDC_PACKET_TYPE_PROMISCUOUS (1 << 0) +#define USB_CDC_PACKET_TYPE_ALL_MULTICAST (1 << 1) /* no filter */ +#define USB_CDC_PACKET_TYPE_DIRECTED (1 << 2) +#define USB_CDC_PACKET_TYPE_BROADCAST (1 << 3) +#define USB_CDC_PACKET_TYPE_MULTICAST (1 << 4) /* filtered */ + + +/*-------------------------------------------------------------------------*/ + +/* + * Class-Specific Notifications (6.3) sent by interrupt transfers + * + * section 3.8.2 table 11 of the CDC spec lists Ethernet notifications + * section 3.6.2.1 table 5 specifies ACM notifications, accepted by RNDIS + * RNDIS also defines its own bit-incompatible notifications + */ + +#define USB_CDC_NOTIFY_NETWORK_CONNECTION 0x00 +#define USB_CDC_NOTIFY_RESPONSE_AVAILABLE 0x01 +#define USB_CDC_NOTIFY_SERIAL_STATE 0x20 +#define USB_CDC_NOTIFY_SPEED_CHANGE 0x2a + +struct usb_cdc_notification { + __u8 bmRequestType; + __u8 
bNotificationType; + __le16 wValue; + __le16 wIndex; + __le16 wLength; +} __attribute__ ((packed)); + diff --git a/include/linux/usb_cdc.h b/include/linux/usb_cdc.h deleted file mode 100644 index ba617c372455..000000000000 --- a/include/linux/usb_cdc.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * USB Communications Device Class (CDC) definitions - * - * CDC says how to talk to lots of different types of network adapters, - * notably ethernet adapters and various modems. It's used mostly with - * firmware based USB peripherals. - */ - -#define USB_CDC_SUBCLASS_ACM 0x02 -#define USB_CDC_SUBCLASS_ETHERNET 0x06 -#define USB_CDC_SUBCLASS_WHCM 0x08 -#define USB_CDC_SUBCLASS_DMM 0x09 -#define USB_CDC_SUBCLASS_MDLM 0x0a -#define USB_CDC_SUBCLASS_OBEX 0x0b - -#define USB_CDC_PROTO_NONE 0 - -#define USB_CDC_ACM_PROTO_AT_V25TER 1 -#define USB_CDC_ACM_PROTO_AT_PCCA101 2 -#define USB_CDC_ACM_PROTO_AT_PCCA101_WAKE 3 -#define USB_CDC_ACM_PROTO_AT_GSM 4 -#define USB_CDC_ACM_PROTO_AT_3G 5 -#define USB_CDC_ACM_PROTO_AT_CDMA 6 -#define USB_CDC_ACM_PROTO_VENDOR 0xff - -/*-------------------------------------------------------------------------*/ - -/* - * Class-Specific descriptors ... there are a couple dozen of them - */ - -#define USB_CDC_HEADER_TYPE 0x00 /* header_desc */ -#define USB_CDC_CALL_MANAGEMENT_TYPE 0x01 /* call_mgmt_descriptor */ -#define USB_CDC_ACM_TYPE 0x02 /* acm_descriptor */ -#define USB_CDC_UNION_TYPE 0x06 /* union_desc */ -#define USB_CDC_COUNTRY_TYPE 0x07 -#define USB_CDC_NETWORK_TERMINAL_TYPE 0x0a /* network_terminal_desc */ -#define USB_CDC_ETHERNET_TYPE 0x0f /* ether_desc */ -#define USB_CDC_WHCM_TYPE 0x11 -#define USB_CDC_MDLM_TYPE 0x12 /* mdlm_desc */ -#define USB_CDC_MDLM_DETAIL_TYPE 0x13 /* mdlm_detail_desc */ -#define USB_CDC_DMM_TYPE 0x14 -#define USB_CDC_OBEX_TYPE 0x15 - -/* "Header Functional Descriptor" from CDC spec 5.2.3.1 */ -struct usb_cdc_header_desc { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - __le16 bcdCDC; -} __attribute__ ((packed)); - -/* "Call Management Descriptor" from CDC spec 5.2.3.2 */ -struct usb_cdc_call_mgmt_descriptor { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - __u8 bmCapabilities; -#define USB_CDC_CALL_MGMT_CAP_CALL_MGMT 0x01 -#define USB_CDC_CALL_MGMT_CAP_DATA_INTF 0x02 - - __u8 bDataInterface; -} __attribute__ ((packed)); - -/* "Abstract Control Management Descriptor" from CDC spec 5.2.3.3 */ -struct usb_cdc_acm_descriptor { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - __u8 bmCapabilities; -} __attribute__ ((packed)); - -/* "Union Functional Descriptor" from CDC spec 5.2.3.8 */ -struct usb_cdc_union_desc { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - __u8 bMasterInterface0; - __u8 bSlaveInterface0; - /* ... 
and there could be other slave interfaces */ -} __attribute__ ((packed)); - -/* "Network Channel Terminal Functional Descriptor" from CDC spec 5.2.3.11 */ -struct usb_cdc_network_terminal_desc { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - __u8 bEntityId; - __u8 iName; - __u8 bChannelIndex; - __u8 bPhysicalInterface; -} __attribute__ ((packed)); - -/* "Ethernet Networking Functional Descriptor" from CDC spec 5.2.3.16 */ -struct usb_cdc_ether_desc { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - __u8 iMACAddress; - __le32 bmEthernetStatistics; - __le16 wMaxSegmentSize; - __le16 wNumberMCFilters; - __u8 bNumberPowerFilters; -} __attribute__ ((packed)); - -/* "MDLM Functional Descriptor" from CDC WMC spec 6.7.2.3 */ -struct usb_cdc_mdlm_desc { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - __le16 bcdVersion; - __u8 bGUID[16]; -} __attribute__ ((packed)); - -/* "MDLM Detail Functional Descriptor" from CDC WMC spec 6.7.2.4 */ -struct usb_cdc_mdlm_detail_desc { - __u8 bLength; - __u8 bDescriptorType; - __u8 bDescriptorSubType; - - /* type is associated with mdlm_desc.bGUID */ - __u8 bGuidDescriptorType; - __u8 bDetailData[0]; -} __attribute__ ((packed)); - -/*-------------------------------------------------------------------------*/ - -/* - * Class-Specific Control Requests (6.2) - * - * section 3.6.2.1 table 4 has the ACM profile, for modems. - * section 3.8.2 table 10 has the ethernet profile. - * - * Microsoft's RNDIS stack for Ethernet is a vendor-specific CDC ACM variant, - * heavily dependent on the encapsulated (proprietary) command mechanism. - */ - -#define USB_CDC_SEND_ENCAPSULATED_COMMAND 0x00 -#define USB_CDC_GET_ENCAPSULATED_RESPONSE 0x01 -#define USB_CDC_REQ_SET_LINE_CODING 0x20 -#define USB_CDC_REQ_GET_LINE_CODING 0x21 -#define USB_CDC_REQ_SET_CONTROL_LINE_STATE 0x22 -#define USB_CDC_REQ_SEND_BREAK 0x23 -#define USB_CDC_SET_ETHERNET_MULTICAST_FILTERS 0x40 -#define USB_CDC_SET_ETHERNET_PM_PATTERN_FILTER 0x41 -#define USB_CDC_GET_ETHERNET_PM_PATTERN_FILTER 0x42 -#define USB_CDC_SET_ETHERNET_PACKET_FILTER 0x43 -#define USB_CDC_GET_ETHERNET_STATISTIC 0x44 - -/* Line Coding Structure from CDC spec 6.2.13 */ -struct usb_cdc_line_coding { - __le32 dwDTERate; - __u8 bCharFormat; -#define USB_CDC_1_STOP_BITS 0 -#define USB_CDC_1_5_STOP_BITS 1 -#define USB_CDC_2_STOP_BITS 2 - - __u8 bParityType; -#define USB_CDC_NO_PARITY 0 -#define USB_CDC_ODD_PARITY 1 -#define USB_CDC_EVEN_PARITY 2 -#define USB_CDC_MARK_PARITY 3 -#define USB_CDC_SPACE_PARITY 4 - - __u8 bDataBits; -} __attribute__ ((packed)); - -/* table 62; bits in multicast filter */ -#define USB_CDC_PACKET_TYPE_PROMISCUOUS (1 << 0) -#define USB_CDC_PACKET_TYPE_ALL_MULTICAST (1 << 1) /* no filter */ -#define USB_CDC_PACKET_TYPE_DIRECTED (1 << 2) -#define USB_CDC_PACKET_TYPE_BROADCAST (1 << 3) -#define USB_CDC_PACKET_TYPE_MULTICAST (1 << 4) /* filtered */ - - -/*-------------------------------------------------------------------------*/ - -/* - * Class-Specific Notifications (6.3) sent by interrupt transfers - * - * section 3.8.2 table 11 of the CDC spec lists Ethernet notifications - * section 3.6.2.1 table 5 specifies ACM notifications, accepted by RNDIS - * RNDIS also defines its own bit-incompatible notifications - */ - -#define USB_CDC_NOTIFY_NETWORK_CONNECTION 0x00 -#define USB_CDC_NOTIFY_RESPONSE_AVAILABLE 0x01 -#define USB_CDC_NOTIFY_SERIAL_STATE 0x20 -#define USB_CDC_NOTIFY_SPEED_CHANGE 0x2a - -struct usb_cdc_notification { - __u8 bmRequestType; - __u8 
bNotificationType; - __le16 wValue; - __le16 wIndex; - __le16 wLength; -} __attribute__ ((packed)); - -- cgit v1.2.3 From 325a4af60dc945bf2da9cbcdbabb276e312b297c Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 13 Jun 2006 09:59:32 -0700 Subject: [PATCH] USB: move hardware-specific to This moves header files for controller-specific platform data from to to start reducing some clutter. Signed-off-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/isp116x-hcd.c | 2 +- drivers/usb/host/sl811-hcd.c | 2 +- drivers/usb/host/sl811_cs.c | 2 +- include/linux/usb/isp116x.h | 29 +++++++++++++++++++++++++++++ include/linux/usb/sl811.h | 26 ++++++++++++++++++++++++++ include/linux/usb_isp116x.h | 29 ----------------------------- include/linux/usb_sl811.h | 26 -------------------------- 7 files changed, 58 insertions(+), 58 deletions(-) create mode 100644 include/linux/usb/isp116x.h create mode 100644 include/linux/usb/sl811.h delete mode 100644 include/linux/usb_isp116x.h delete mode 100644 include/linux/usb_sl811.h (limited to 'include/linux') diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c index c5e224048efa..14386254c870 100644 --- a/drivers/usb/host/isp116x-hcd.c +++ b/drivers/usb/host/isp116x-hcd.c @@ -63,7 +63,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index a92343052751..6b4bc3f2bd86 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/host/sl811_cs.c b/drivers/usb/host/sl811_cs.c index 302aa1ec312f..54f554e0f0ad 100644 --- a/drivers/usb/host/sl811_cs.c +++ b/drivers/usb/host/sl811_cs.c @@ -27,7 +27,7 @@ #include #include -#include +#include MODULE_AUTHOR("Botond Botyanszki"); MODULE_DESCRIPTION("REX-CFU1U PCMCIA driver for 2.6"); diff --git a/include/linux/usb/isp116x.h b/include/linux/usb/isp116x.h new file mode 100644 index 000000000000..436dd8a2b64a --- /dev/null +++ b/include/linux/usb/isp116x.h @@ -0,0 +1,29 @@ + +/* + * Board initialization code should put one of these into dev->platform_data + * and place the isp116x onto platform_bus. + */ + +struct isp116x_platform_data { + /* Enable internal resistors on downstream ports */ + unsigned sel15Kres:1; + /* On-chip overcurrent detection */ + unsigned oc_enable:1; + /* INT output polarity */ + unsigned int_act_high:1; + /* INT edge or level triggered */ + unsigned int_edge_triggered:1; + /* Enable wakeup by devices on usb bus (e.g. wakeup + by attachment/detachment or by device activity + such as moving a mouse). When chosen, this option + prevents stopping internal clock, increasing + thereby power consumption in suspended state. */ + unsigned remote_wakeup_enable:1; + /* Inter-io delay (ns). The chip is picky about access timings; it + expects at least: + 150ns delay between consecutive accesses to DATA_REG, + 300ns delay between access to ADDR_REG and DATA_REG + OE, WE MUST NOT be changed during these intervals + */ + void (*delay) (struct device * dev, int delay); +}; diff --git a/include/linux/usb/sl811.h b/include/linux/usb/sl811.h new file mode 100644 index 000000000000..397ee3b3d7f3 --- /dev/null +++ b/include/linux/usb/sl811.h @@ -0,0 +1,26 @@ + +/* + * board initialization should put one of these into dev->platform_data + * and place the sl811hs onto platform_bus named "sl811-hcd". 
+ */ + +struct sl811_platform_data { + unsigned can_wakeup:1; + + /* given port_power, msec/2 after power on till power good */ + u8 potpg; + + /* mA/2 power supplied on this port (max = default = 250) */ + u8 power; + + /* sl811 relies on an external source of VBUS current */ + void (*port_power)(struct device *dev, int is_on); + + /* pulse sl811 nRST (probably with a GPIO) */ + void (*reset)(struct device *dev); + + // some boards need something like these: + // int (*check_overcurrent)(struct device *dev); + // void (*clock_enable)(struct device *dev, int is_on); +}; + diff --git a/include/linux/usb_isp116x.h b/include/linux/usb_isp116x.h deleted file mode 100644 index 436dd8a2b64a..000000000000 --- a/include/linux/usb_isp116x.h +++ /dev/null @@ -1,29 +0,0 @@ - -/* - * Board initialization code should put one of these into dev->platform_data - * and place the isp116x onto platform_bus. - */ - -struct isp116x_platform_data { - /* Enable internal resistors on downstream ports */ - unsigned sel15Kres:1; - /* On-chip overcurrent detection */ - unsigned oc_enable:1; - /* INT output polarity */ - unsigned int_act_high:1; - /* INT edge or level triggered */ - unsigned int_edge_triggered:1; - /* Enable wakeup by devices on usb bus (e.g. wakeup - by attachment/detachment or by device activity - such as moving a mouse). When chosen, this option - prevents stopping internal clock, increasing - thereby power consumption in suspended state. */ - unsigned remote_wakeup_enable:1; - /* Inter-io delay (ns). The chip is picky about access timings; it - expects at least: - 150ns delay between consecutive accesses to DATA_REG, - 300ns delay between access to ADDR_REG and DATA_REG - OE, WE MUST NOT be changed during these intervals - */ - void (*delay) (struct device * dev, int delay); -}; diff --git a/include/linux/usb_sl811.h b/include/linux/usb_sl811.h deleted file mode 100644 index 4f2d012d7309..000000000000 --- a/include/linux/usb_sl811.h +++ /dev/null @@ -1,26 +0,0 @@ - -/* - * board initialization should put one of these into dev->platform_data - * and place the sl811hs onto platform_bus named "sl811-hcd". - */ - -struct sl811_platform_data { - unsigned can_wakeup:1; - - /* given port_power, msec/2 after power on till power good */ - u8 potpg; - - /* mA/2 power supplied on this port (max = default = 250) */ - u8 power; - - /* sl811 relies on an external source of VBUS current */ - void (*port_power)(struct device *dev, int is_on); - - /* pulse sl811 nRST (probably with a GPIO) */ - void (*reset)(struct device *dev); - - // some boards need something like these: - // int (*check_overcurrent)(struct device *dev); - // void (*clock_enable)(struct device *dev, int is_on); -}; - -- cgit v1.2.3 From ae0dadcf0f912cbab2ac84caa437454620bf71b2 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 13 Jun 2006 10:04:34 -0700 Subject: [PATCH] USB: move to Move to and remove some redundant includes. 
Signed-off-by: David Brownell Signed-off-by: Greg Kroah-Hartman --- drivers/media/video/usbvideo/konicawc.c | 3 +-- drivers/usb/input/acecad.c | 4 +--- drivers/usb/input/aiptek.c | 4 +--- drivers/usb/input/appletouch.c | 4 +--- drivers/usb/input/ati_remote.c | 4 +--- drivers/usb/input/ati_remote2.c | 2 +- drivers/usb/input/hid-input.c | 4 +--- drivers/usb/input/itmtouch.c | 4 +--- drivers/usb/input/kbtab.c | 5 +---- drivers/usb/input/keyspan_remote.c | 4 +--- drivers/usb/input/mtouchusb.c | 4 +--- drivers/usb/input/powermate.c | 4 +--- drivers/usb/input/touchkitusb.c | 4 +--- drivers/usb/input/usbkbd.c | 4 +--- drivers/usb/input/usbmouse.c | 4 +--- drivers/usb/input/usbtouchscreen.c | 2 +- drivers/usb/input/wacom.c | 5 +---- drivers/usb/input/xpad.c | 4 +--- drivers/usb/input/yealink.c | 4 +--- drivers/usb/storage/onetouch.c | 3 +-- include/linux/usb/input.h | 25 +++++++++++++++++++++++++ include/linux/usb_input.h | 25 ------------------------- 22 files changed, 45 insertions(+), 81 deletions(-) create mode 100644 include/linux/usb/input.h delete mode 100644 include/linux/usb_input.h (limited to 'include/linux') diff --git a/drivers/media/video/usbvideo/konicawc.c b/drivers/media/video/usbvideo/konicawc.c index c11f5d46b114..6f31ecc88843 100644 --- a/drivers/media/video/usbvideo/konicawc.c +++ b/drivers/media/video/usbvideo/konicawc.c @@ -15,8 +15,7 @@ #include #include #include -#include -#include +#include #include "usbvideo.h" diff --git a/drivers/usb/input/acecad.c b/drivers/usb/input/acecad.c index df29b8078b54..18c10e150ef3 100644 --- a/drivers/usb/input/acecad.c +++ b/drivers/usb/input/acecad.c @@ -27,11 +27,9 @@ #include #include -#include #include #include -#include -#include +#include /* * Version Information diff --git a/drivers/usb/input/aiptek.c b/drivers/usb/input/aiptek.c index a6693b0d1c4c..b138dae2b055 100644 --- a/drivers/usb/input/aiptek.c +++ b/drivers/usb/input/aiptek.c @@ -73,11 +73,9 @@ #include #include #include -#include #include #include -#include -#include +#include #include #include #include diff --git a/drivers/usb/input/appletouch.c b/drivers/usb/input/appletouch.c index 4eff8d7a79d4..36855062eacc 100644 --- a/drivers/usb/input/appletouch.c +++ b/drivers/usb/input/appletouch.c @@ -33,9 +33,7 @@ #include #include #include -#include -#include -#include +#include /* Apple has powerbooks which have the keyboard with different Product IDs */ #define APPLE_VENDOR_ID 0x05AC diff --git a/drivers/usb/input/ati_remote.c b/drivers/usb/input/ati_remote.c index 99f986cb6e95..07c8c0e665dd 100644 --- a/drivers/usb/input/ati_remote.c +++ b/drivers/usb/input/ati_remote.c @@ -92,9 +92,7 @@ #include #include #include -#include -#include -#include +#include #include #include diff --git a/drivers/usb/input/ati_remote2.c b/drivers/usb/input/ati_remote2.c index ab1a1ae24be9..ea71de81ca6b 100644 --- a/drivers/usb/input/ati_remote2.c +++ b/drivers/usb/input/ati_remote2.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation. 
*/ -#include +#include #define DRIVER_DESC "ATI/Philips USB RF remote driver" #define DRIVER_VERSION "0.1" diff --git a/drivers/usb/input/hid-input.c b/drivers/usb/input/hid-input.c index 2f665195f4ac..028e1ad89f5d 100644 --- a/drivers/usb/input/hid-input.c +++ b/drivers/usb/input/hid-input.c @@ -29,9 +29,7 @@ #include #include #include -#include -#include -#include +#include #undef DEBUG diff --git a/drivers/usb/input/itmtouch.c b/drivers/usb/input/itmtouch.c index 7618ae5c104f..5c570cc703f3 100644 --- a/drivers/usb/input/itmtouch.c +++ b/drivers/usb/input/itmtouch.c @@ -42,11 +42,9 @@ #include #include #include -#include #include #include -#include -#include +#include /* only an 8 byte buffer necessary for a single packet */ #define ITM_BUFSIZE 8 diff --git a/drivers/usb/input/kbtab.c b/drivers/usb/input/kbtab.c index f6d5cead542b..604ade356ead 100644 --- a/drivers/usb/input/kbtab.c +++ b/drivers/usb/input/kbtab.c @@ -1,12 +1,9 @@ #include #include -#include #include #include -#include -#include +#include #include -#include /* * Version Information diff --git a/drivers/usb/input/keyspan_remote.c b/drivers/usb/input/keyspan_remote.c index 3d911976f378..70af985b5db9 100644 --- a/drivers/usb/input/keyspan_remote.c +++ b/drivers/usb/input/keyspan_remote.c @@ -18,9 +18,7 @@ #include #include #include -#include -#include -#include +#include #define DRIVER_VERSION "v0.1" #define DRIVER_AUTHOR "Michael Downey " diff --git a/drivers/usb/input/mtouchusb.c b/drivers/usb/input/mtouchusb.c index f018953a5485..4fdee4db0729 100644 --- a/drivers/usb/input/mtouchusb.c +++ b/drivers/usb/input/mtouchusb.c @@ -42,11 +42,9 @@ #include #include #include -#include #include #include -#include -#include +#include #define MTOUCHUSB_MIN_XC 0x0 #define MTOUCHUSB_MAX_RAW_XC 0x4000 diff --git a/drivers/usb/input/powermate.c b/drivers/usb/input/powermate.c index fdf0f788062c..b3c0d0c3eae9 100644 --- a/drivers/usb/input/powermate.c +++ b/drivers/usb/input/powermate.c @@ -30,12 +30,10 @@ #include #include -#include #include #include #include -#include -#include +#include #define POWERMATE_VENDOR 0x077d /* Griffin Technology, Inc. 
*/ #define POWERMATE_PRODUCT_NEW 0x0410 /* Griffin PowerMate */ diff --git a/drivers/usb/input/touchkitusb.c b/drivers/usb/input/touchkitusb.c index 697c5e573a11..da7b0bf51aff 100644 --- a/drivers/usb/input/touchkitusb.c +++ b/drivers/usb/input/touchkitusb.c @@ -27,11 +27,9 @@ #include #include #include -#include #include #include -#include -#include +#include #define TOUCHKIT_MIN_XC 0x0 #define TOUCHKIT_MAX_XC 0x07ff diff --git a/drivers/usb/input/usbkbd.c b/drivers/usb/input/usbkbd.c index 2f3edc26cb50..5067a6ae650f 100644 --- a/drivers/usb/input/usbkbd.c +++ b/drivers/usb/input/usbkbd.c @@ -29,10 +29,8 @@ #include #include #include -#include #include -#include -#include +#include /* * Version Information diff --git a/drivers/usb/input/usbmouse.c b/drivers/usb/input/usbmouse.c index af526135d210..446935b671d9 100644 --- a/drivers/usb/input/usbmouse.c +++ b/drivers/usb/input/usbmouse.c @@ -28,11 +28,9 @@ #include #include -#include #include #include -#include -#include +#include /* * Version Information diff --git a/drivers/usb/input/usbtouchscreen.c b/drivers/usb/input/usbtouchscreen.c index e9a07c1e905b..3b175aa482cd 100644 --- a/drivers/usb/input/usbtouchscreen.c +++ b/drivers/usb/input/usbtouchscreen.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #define DRIVER_VERSION "v0.3" diff --git a/drivers/usb/input/wacom.c b/drivers/usb/input/wacom.c index cf84c6096f29..369461a70b72 100644 --- a/drivers/usb/input/wacom.c +++ b/drivers/usb/input/wacom.c @@ -69,13 +69,10 @@ #include #include -#include #include #include -#include -#include +#include #include -#include /* * Version Information diff --git a/drivers/usb/input/xpad.c b/drivers/usb/input/xpad.c index e278489a80c6..cfd4a4e04334 100644 --- a/drivers/usb/input/xpad.c +++ b/drivers/usb/input/xpad.c @@ -56,13 +56,11 @@ #include #include -#include #include #include #include #include -#include -#include +#include #define DRIVER_VERSION "v0.0.5" #define DRIVER_AUTHOR "Marko Friedemann " diff --git a/drivers/usb/input/yealink.c b/drivers/usb/input/yealink.c index 37d2f0ba0319..24aedbb20f03 100644 --- a/drivers/usb/input/yealink.c +++ b/drivers/usb/input/yealink.c @@ -48,13 +48,11 @@ #include #include -#include #include #include #include #include -#include -#include +#include #include "map_to_7segment.h" #include "yealink.h" diff --git a/drivers/usb/storage/onetouch.c b/drivers/usb/storage/onetouch.c index 55ee2d36d585..026a587eb8dd 100644 --- a/drivers/usb/storage/onetouch.c +++ b/drivers/usb/storage/onetouch.c @@ -34,9 +34,8 @@ #include #include #include -#include #include -#include +#include #include "usb.h" #include "onetouch.h" #include "debug.h" diff --git a/include/linux/usb/input.h b/include/linux/usb/input.h new file mode 100644 index 000000000000..716e0cc16043 --- /dev/null +++ b/include/linux/usb/input.h @@ -0,0 +1,25 @@ +#ifndef __USB_INPUT_H +#define __USB_INPUT_H + +/* + * Copyright (C) 2005 Dmitry Torokhov + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ */ + +#include +#include +#include + +static inline void +usb_to_input_id(const struct usb_device *dev, struct input_id *id) +{ + id->bustype = BUS_USB; + id->vendor = le16_to_cpu(dev->descriptor.idVendor); + id->product = le16_to_cpu(dev->descriptor.idProduct); + id->version = le16_to_cpu(dev->descriptor.bcdDevice); +} + +#endif diff --git a/include/linux/usb_input.h b/include/linux/usb_input.h deleted file mode 100644 index 716e0cc16043..000000000000 --- a/include/linux/usb_input.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __USB_INPUT_H -#define __USB_INPUT_H - -/* - * Copyright (C) 2005 Dmitry Torokhov - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - */ - -#include -#include -#include - -static inline void -usb_to_input_id(const struct usb_device *dev, struct input_id *id) -{ - id->bustype = BUS_USB; - id->vendor = le16_to_cpu(dev->descriptor.idVendor); - id->product = le16_to_cpu(dev->descriptor.idProduct); - id->version = le16_to_cpu(dev->descriptor.bcdDevice); -} - -#endif -- cgit v1.2.3 From 9bde7497e0b54178c317fac47a18be7f948dd471 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 14 Jun 2006 12:14:34 -0700 Subject: [PATCH] USB: make endpoints real struct devices This will allow for us to give endpoints a major/minor to create a "usbfs2-like" way to access endpoints directly from userspace in an easier manner than the current usbfs provides us. Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/endpoint.c | 238 ++++++++++++++++++++++++++++---------------- include/linux/usb.h | 4 +- 2 files changed, 153 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/endpoint.c b/drivers/usb/core/endpoint.c index 4c2fe8f723e5..247b5a4913a8 100644 --- a/drivers/usb/core/endpoint.c +++ b/drivers/usb/core/endpoint.c @@ -14,13 +14,14 @@ #include "usb.h" /* endpoint stuff */ -struct ep_object { + +struct ep_device { struct usb_endpoint_descriptor *desc; struct usb_device *udev; - struct kobject kobj; + struct device dev; }; -#define to_ep_object(_kobj) \ - container_of(_kobj, struct ep_object, kobj) +#define to_ep_device(_dev) \ + container_of(_dev, struct ep_device, dev) struct ep_attribute { struct attribute attr; @@ -30,40 +31,37 @@ struct ep_attribute { #define to_ep_attribute(_attr) \ container_of(_attr, struct ep_attribute, attr) -#define EP_ATTR(_name) \ -struct ep_attribute ep_##_name = { \ - .attr = {.name = #_name, .owner = THIS_MODULE, \ - .mode = S_IRUGO}, \ - .show = show_ep_##_name} - #define usb_ep_attr(field, format_string) \ -static ssize_t show_ep_##field(struct usb_device *udev, \ - struct usb_endpoint_descriptor *desc, \ - char *buf) \ +static ssize_t show_ep_##field(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ { \ - return sprintf(buf, format_string, desc->field); \ + struct ep_device *ep = to_ep_device(dev); \ + return sprintf(buf, format_string, ep->desc->field); \ } \ -static EP_ATTR(field); +static DEVICE_ATTR(field, S_IRUGO, show_ep_##field, NULL); usb_ep_attr(bLength, "%02x\n") usb_ep_attr(bEndpointAddress, "%02x\n") usb_ep_attr(bmAttributes, "%02x\n") usb_ep_attr(bInterval, "%02x\n") -static ssize_t show_ep_wMaxPacketSize(struct usb_device *udev, - struct usb_endpoint_descriptor *desc, char *buf) +static ssize_t show_ep_wMaxPacketSize(struct device *dev, + struct device_attribute *attr, char *buf) { + struct ep_device *ep = to_ep_device(dev); return 
sprintf(buf, "%04x\n", - le16_to_cpu(desc->wMaxPacketSize) & 0x07ff); + le16_to_cpu(ep->desc->wMaxPacketSize) & 0x07ff); } -static EP_ATTR(wMaxPacketSize); +static DEVICE_ATTR(wMaxPacketSize, S_IRUGO, show_ep_wMaxPacketSize, NULL); -static ssize_t show_ep_type(struct usb_device *udev, - struct usb_endpoint_descriptor *desc, char *buf) +static ssize_t show_ep_type(struct device *dev, struct device_attribute *attr, + char *buf) { + struct ep_device *ep = to_ep_device(dev); char *type = "unknown"; - switch (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { + switch (ep->desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { case USB_ENDPOINT_XFER_CONTROL: type = "Control"; break; @@ -79,37 +77,38 @@ static ssize_t show_ep_type(struct usb_device *udev, } return sprintf(buf, "%s\n", type); } -static EP_ATTR(type); +static DEVICE_ATTR(type, S_IRUGO, show_ep_type, NULL); -static ssize_t show_ep_interval(struct usb_device *udev, - struct usb_endpoint_descriptor *desc, char *buf) +static ssize_t show_ep_interval(struct device *dev, + struct device_attribute *attr, char *buf) { + struct ep_device *ep = to_ep_device(dev); char unit; unsigned interval = 0; unsigned in; - in = (desc->bEndpointAddress & USB_DIR_IN); + in = (ep->desc->bEndpointAddress & USB_DIR_IN); - switch (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { + switch (ep->desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { case USB_ENDPOINT_XFER_CONTROL: - if (udev->speed == USB_SPEED_HIGH) /* uframes per NAK */ - interval = desc->bInterval; + if (ep->udev->speed == USB_SPEED_HIGH) /* uframes per NAK */ + interval = ep->desc->bInterval; break; case USB_ENDPOINT_XFER_ISOC: - interval = 1 << (desc->bInterval - 1); + interval = 1 << (ep->desc->bInterval - 1); break; case USB_ENDPOINT_XFER_BULK: - if (udev->speed == USB_SPEED_HIGH && !in) /* uframes per NAK */ - interval = desc->bInterval; + if (ep->udev->speed == USB_SPEED_HIGH && !in) /* uframes per NAK */ + interval = ep->desc->bInterval; break; case USB_ENDPOINT_XFER_INT: - if (udev->speed == USB_SPEED_HIGH) - interval = 1 << (desc->bInterval - 1); + if (ep->udev->speed == USB_SPEED_HIGH) + interval = 1 << (ep->desc->bInterval - 1); else - interval = desc->bInterval; + interval = ep->desc->bInterval; break; } - interval *= (udev->speed == USB_SPEED_HIGH) ? 125 : 1000; + interval *= (ep->udev->speed == USB_SPEED_HIGH) ? 
125 : 1000; if (interval % 1000) unit = 'u'; else { @@ -119,95 +118,158 @@ static ssize_t show_ep_interval(struct usb_device *udev, return sprintf(buf, "%d%cs\n", interval, unit); } -static EP_ATTR(interval); +static DEVICE_ATTR(interval, S_IRUGO, show_ep_interval, NULL); -static ssize_t show_ep_direction(struct usb_device *udev, - struct usb_endpoint_descriptor *desc, char *buf) +static ssize_t show_ep_direction(struct device *dev, + struct device_attribute *attr, char *buf) { + struct ep_device *ep = to_ep_device(dev); char *direction; - if ((desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == + if ((ep->desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_CONTROL) direction = "both"; - else if (desc->bEndpointAddress & USB_DIR_IN) + else if (ep->desc->bEndpointAddress & USB_DIR_IN) direction = "in"; else direction = "out"; return sprintf(buf, "%s\n", direction); } -static EP_ATTR(direction); - -static struct attribute *ep_attrs[] = { - &ep_bLength.attr, - &ep_bEndpointAddress.attr, - &ep_bmAttributes.attr, - &ep_bInterval.attr, - &ep_wMaxPacketSize.attr, - &ep_type.attr, - &ep_interval.attr, - &ep_direction.attr, +static DEVICE_ATTR(direction, S_IRUGO, show_ep_direction, NULL); + +static struct attribute *ep_dev_attrs[] = { + &dev_attr_bLength.attr, + &dev_attr_bEndpointAddress.attr, + &dev_attr_bmAttributes.attr, + &dev_attr_bInterval.attr, + &dev_attr_wMaxPacketSize.attr, + &dev_attr_interval.attr, + &dev_attr_type.attr, + &dev_attr_direction.attr, NULL, }; +static struct attribute_group ep_dev_attr_grp = { + .attrs = ep_dev_attrs, +}; -static void ep_object_release(struct kobject *kobj) +static struct endpoint_class { + struct kref kref; + struct class *class; +} *ep_class; + +static int init_endpoint_class(void) { - kfree(to_ep_object(kobj)); + int result = 0; + + if (ep_class != NULL) { + kref_get(&ep_class->kref); + goto exit; + } + + ep_class = kmalloc(sizeof(*ep_class), GFP_KERNEL); + if (!ep_class) { + result = -ENOMEM; + goto exit; + } + + kref_init(&ep_class->kref); + ep_class->class = class_create(THIS_MODULE, "usb_endpoint"); + if (IS_ERR(ep_class->class)) { + result = IS_ERR(ep_class->class); + kfree(ep_class); + ep_class = NULL; + goto exit; + } + +exit: + return result; } -static ssize_t ep_object_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static void release_endpoint_class(struct kref *kref) { - struct ep_object *ep_obj = to_ep_object(kobj); - struct ep_attribute *ep_attr = to_ep_attribute(attr); + /* Ok, we cheat as we know we only have one ep_class */ + class_destroy(ep_class->class); + kfree(ep_class); + ep_class = NULL; +} - return (ep_attr->show)(ep_obj->udev, ep_obj->desc, buf); +static void destroy_endpoint_class(void) +{ + if (ep_class) + kref_put(&ep_class->kref, release_endpoint_class); } -static struct sysfs_ops ep_object_sysfs_ops = { - .show = ep_object_show, -}; +static void ep_device_release(struct device *dev) +{ + struct ep_device *ep_dev = to_ep_device(dev); -static struct kobj_type ep_object_ktype = { - .release = ep_object_release, - .sysfs_ops = &ep_object_sysfs_ops, - .default_attrs = ep_attrs, -}; + dev_dbg(dev, "%s called for %s\n", __FUNCTION__, dev->bus_id); + kfree(ep_dev); +} void usb_create_ep_files(struct device *parent, struct usb_host_endpoint *endpoint, struct usb_device *udev) { - struct ep_object *ep_obj; - struct kobject *kobj; + char name[8]; + struct ep_device *ep_dev; + int minor; + int retval; + + retval = init_endpoint_class(); + if (retval) + goto exit; - ep_obj = kzalloc(sizeof(struct 
ep_object), GFP_KERNEL); - if (!ep_obj) - return; + ep_dev = kzalloc(sizeof(*ep_dev), GFP_KERNEL); + if (!ep_dev) { + retval = -ENOMEM; + goto exit; + } - ep_obj->desc = &endpoint->desc; - ep_obj->udev = udev; + /* fun calculation to determine the minor of this endpoint */ + minor = (((udev->bus->busnum - 1) * 128) * 16) + (udev->devnum - 1); - kobj = &ep_obj->kobj; - kobject_set_name(kobj, "ep_%02x", endpoint->desc.bEndpointAddress); - kobj->parent = &parent->kobj; - kobj->ktype = &ep_object_ktype; + ep_dev->desc = &endpoint->desc; + ep_dev->udev = udev; + ep_dev->dev.devt = MKDEV(442, minor); // FIXME fake number... + ep_dev->dev.class = ep_class->class; + ep_dev->dev.parent = parent; + ep_dev->dev.release = ep_device_release; + snprintf(ep_dev->dev.bus_id, BUS_ID_SIZE, "usbdev%d.%d_ep%02x", + udev->bus->busnum, udev->devnum, + endpoint->desc.bEndpointAddress); - /* Don't use kobject_register, because it generates a hotplug event */ - kobject_init(kobj); - if (kobject_add(kobj) == 0) - endpoint->kobj = kobj; - else - kobject_put(kobj); + retval = device_register(&ep_dev->dev); + if (retval) + goto error; + sysfs_create_group(&ep_dev->dev.kobj, &ep_dev_attr_grp); + + endpoint->ep_dev = ep_dev; + + /* create the symlink to the old-style "ep_XX" directory */ + sprintf(name, "ep_%02x", endpoint->desc.bEndpointAddress); + sysfs_create_link(&parent->kobj, &endpoint->ep_dev->dev.kobj, name); + +exit: + return; +error: + kfree(ep_dev); + return; } void usb_remove_ep_files(struct usb_host_endpoint *endpoint) { - if (endpoint->kobj) { - kobject_del(endpoint->kobj); - kobject_put(endpoint->kobj); - endpoint->kobj = NULL; + if (endpoint->ep_dev) { + char name[8]; + + sprintf(name, "ep_%02x", endpoint->desc.bEndpointAddress); + sysfs_remove_link(&endpoint->ep_dev->dev.parent->kobj, name); + sysfs_remove_group(&endpoint->ep_dev->dev.kobj, &ep_dev_attr_grp); + device_unregister(&endpoint->ep_dev->dev); + endpoint->ep_dev = NULL; } + destroy_endpoint_class(); } diff --git a/include/linux/usb.h b/include/linux/usb.h index 5ad30cefe7b2..46956a72de5d 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -40,6 +40,8 @@ struct usb_driver; * Devices may also have class-specific or vendor-specific descriptors. */ +struct ep_device; + /** * struct usb_host_endpoint - host-side endpoint descriptor and queue * @desc: descriptor for this endpoint, wMaxPacketSize in native byteorder @@ -57,7 +59,7 @@ struct usb_host_endpoint { struct usb_endpoint_descriptor desc; struct list_head urb_list; void *hcpriv; - struct kobject *kobj; /* For sysfs info */ + struct ep_device *ep_dev; /* For sysfs info */ unsigned char *extra; /* Extra descriptors */ int extralen; -- cgit v1.2.3 From c182274ffe1277f4e7c564719a696a37cacf74ea Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 19 Jun 2006 23:59:31 -0700 Subject: [PATCH] USB: move usb_device_class class devices to be real devices This moves the usb class devices that control the usbfs nodes to show up in the proper place in the larger device tree. No userspace changes is needed, this is compatible due to the symlinks generated by the driver core. 
Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 20 ++++++++++---------- include/linux/usb.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 2eda52fc1ebc..3f8e06279c92 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -515,19 +515,19 @@ static int check_ctrlrecip(struct dev_state *ps, unsigned int requesttype, unsig static struct usb_device *usbdev_lookup_minor(int minor) { - struct class_device *class_dev; - struct usb_device *dev = NULL; + struct device *device; + struct usb_device *udev = NULL; down(&usb_device_class->sem); - list_for_each_entry(class_dev, &usb_device_class->children, node) { - if (class_dev->devt == MKDEV(USB_DEVICE_MAJOR, minor)) { - dev = class_dev->class_data; + list_for_each_entry(device, &usb_device_class->devices, node) { + if (device->devt == MKDEV(USB_DEVICE_MAJOR, minor)) { + udev = device->platform_data; break; } } up(&usb_device_class->sem); - return dev; + return udev; }; /* @@ -1580,16 +1580,16 @@ static void usbdev_add(struct usb_device *dev) { int minor = ((dev->bus->busnum-1) * 128) + (dev->devnum-1); - dev->class_dev = class_device_create(usb_device_class, NULL, - MKDEV(USB_DEVICE_MAJOR, minor), &dev->dev, + dev->usbfs_dev = device_create(usb_device_class, &dev->dev, + MKDEV(USB_DEVICE_MAJOR, minor), "usbdev%d.%d", dev->bus->busnum, dev->devnum); - dev->class_dev->class_data = dev; + dev->usbfs_dev->platform_data = dev; } static void usbdev_remove(struct usb_device *dev) { - class_device_unregister(dev->class_dev); + device_unregister(dev->usbfs_dev); } static int usbdev_notify(struct notifier_block *self, unsigned long action, diff --git a/include/linux/usb.h b/include/linux/usb.h index 46956a72de5d..b69b6cfb0bd7 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -360,7 +360,7 @@ struct usb_device { char *serial; /* iSerialNumber string, if present */ struct list_head filelist; - struct class_device *class_dev; + struct device *usbfs_dev; struct dentry *usbfs_dentry; /* usbfs dentry entry for the device */ /* -- cgit v1.2.3 From bd00949647ddcea47ce4ea8bb2cfcfc98ebf9f2a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 20 Jun 2006 13:09:50 -0700 Subject: [PATCH] USB: convert usb class devices to real devices Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/file.c | 13 ++++++------- include/linux/usb.h | 5 +++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c index b263a54a13c0..70898716dd9a 100644 --- a/drivers/usb/core/file.c +++ b/drivers/usb/core/file.c @@ -158,14 +158,13 @@ int usb_register_dev(struct usb_interface *intf, ++temp; else temp = name; - intf->class_dev = class_device_create(usb_class, NULL, - MKDEV(USB_MAJOR, minor), - &intf->dev, "%s", temp); - if (IS_ERR(intf->class_dev)) { + intf->usb_dev = device_create(usb_class, &intf->dev, + MKDEV(USB_MAJOR, minor), "%s", temp); + if (IS_ERR(intf->usb_dev)) { spin_lock (&minor_lock); usb_minors[intf->minor] = NULL; spin_unlock (&minor_lock); - retval = PTR_ERR(intf->class_dev); + retval = PTR_ERR(intf->usb_dev); } exit: return retval; @@ -206,8 +205,8 @@ void usb_deregister_dev(struct usb_interface *intf, spin_unlock (&minor_lock); snprintf(name, BUS_ID_SIZE, class_driver->name, intf->minor - minor_base); - class_device_destroy(usb_class, MKDEV(USB_MAJOR, intf->minor)); - intf->class_dev = NULL; + 
device_destroy(usb_class, MKDEV(USB_MAJOR, intf->minor)); + intf->usb_dev = NULL; intf->minor = -1; } EXPORT_SYMBOL(usb_deregister_dev); diff --git a/include/linux/usb.h b/include/linux/usb.h index b69b6cfb0bd7..8dead32e7ebf 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -103,7 +103,8 @@ enum usb_interface_condition { * @condition: binding state of the interface: not bound, binding * (in probe()), bound to a driver, or unbinding (in disconnect()) * @dev: driver model's view of this device - * @class_dev: driver model's class view of this device. + * @usb_dev: if an interface is bound to the USB major, this will point + * to the sysfs representation for that device. * * USB device drivers attach to interfaces on a physical device. Each * interface encapsulates a single high level function, such as feeding @@ -143,7 +144,7 @@ struct usb_interface { * bound to */ enum usb_interface_condition condition; /* state of binding */ struct device dev; /* interface specific device info */ - struct class_device *class_dev; + struct device *usb_dev; /* pointer to the usb class's device, if any */ }; #define to_usb_interface(d) container_of(d, struct usb_interface, dev) #define interface_to_usbdev(intf) \ -- cgit v1.2.3 From 02e0c5d5c2e00374b6808a42f8eea4ea9baaa216 Mon Sep 17 00:00:00 2001 From: Rudolf Marek Date: Thu, 23 Mar 2006 16:48:09 +0100 Subject: [PATCH] i2c-piix4: Add ATI IXP200/300/400 support This patch adds the ATI IXP southbridges support to i2c-piix4, as it turned out those chips are compatible with it. Signed-off-by: Rudolf Marek Signed-off-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- Documentation/i2c/busses/i2c-piix4 | 2 ++ drivers/i2c/busses/Kconfig | 5 ++++- drivers/i2c/busses/i2c-piix4.c | 6 ++++++ include/linux/pci_ids.h | 3 +++ 4 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4 index a1c8f581afed..6e6c905143a1 100644 --- a/Documentation/i2c/busses/i2c-piix4 +++ b/Documentation/i2c/busses/i2c-piix4 @@ -6,6 +6,8 @@ Supported adapters: Datasheet: Publicly available at the Intel website * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges Datasheet: Only available via NDA from ServerWorks + * ATI IXP southbridges IXP200, IXP300, IXP400 + Datasheet: Not publicly available * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge Datasheet: Publicly available at the SMSC website http://www.smsc.com diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index d6d44946a283..e3450d16d8a4 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -163,7 +163,7 @@ config I2C_PXA_SLAVE I2C bus. 
config I2C_PIIX4 - tristate "Intel PIIX4" + tristate "Intel PIIX4 and compatible (ATI/Serverworks/Broadcom/SMSC)" depends on I2C && PCI help If you say yes to this option, support will be included for the Intel @@ -172,6 +172,9 @@ config I2C_PIIX4 of Broadcom): Intel PIIX4 Intel 440MX + ATI IXP200 + ATI IXP300 + ATI IXP400 Serverworks OSB4 Serverworks CSB5 Serverworks CSB6 diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index d9c7c00e71f9..5f06e81a2087 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -413,6 +413,12 @@ static struct i2c_adapter piix4_adapter = { static struct pci_device_id piix4_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3), .driver_data = 3 }, + { PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP200_SMBUS), + .driver_data = 0 }, + { PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP300_SMBUS), + .driver_data = 0 }, + { PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS), + .driver_data = 0 }, { PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_OSB4), .driver_data = 0 }, { PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB5), diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index bcfe9d4f56ae..489af9d3ce1f 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -352,8 +352,11 @@ #define PCI_DEVICE_ID_ATI_RS480 0x5950 /* ATI IXP Chipset */ #define PCI_DEVICE_ID_ATI_IXP200_IDE 0x4349 +#define PCI_DEVICE_ID_ATI_IXP200_SMBUS 0x4353 +#define PCI_DEVICE_ID_ATI_IXP300_SMBUS 0x4363 #define PCI_DEVICE_ID_ATI_IXP300_IDE 0x4369 #define PCI_DEVICE_ID_ATI_IXP300_SATA 0x436e +#define PCI_DEVICE_ID_ATI_IXP400_SMBUS 0x4372 #define PCI_DEVICE_ID_ATI_IXP400_IDE 0x4376 #define PCI_DEVICE_ID_ATI_IXP400_SATA 0x4379 #define PCI_DEVICE_ID_ATI_IXP400_SATA2 0x437a -- cgit v1.2.3 From 5e9f4f2e5a02bb6908278a819952aa31fffefaa2 Mon Sep 17 00:00:00 2001 From: "Mark A. Greer" Date: Tue, 25 Apr 2006 13:04:54 +0200 Subject: [PATCH] I2C: m41t00: Add support for the ST M41T81 and M41T85 This patch adds support for the ST m41t81 and m41t85 i2c rtc chips to the existing m41t00 driver. Since there is no way to reliably determine what type of rtc chip is in use, the chip type is passed in via platform_data. The i2c address and square wave frequency are passed in via platform_data as well. To accommodate the use of platform_data, a new header file include/linux/m41t00.h has been added. The m41t81 and m41t85 chips halt the updating of their time registers while they are being accessed. They resume when a stop condition exists on the i2c bus or when non-time related regs are accessed. To make the best use of that facility and to make more efficient use of the i2c bus, this patch replaces multiple i2c_smbus_xxx calls with a single i2c_transfer call. Signed-off-by: Mark A. Greer Signed-off-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/chips/m41t00.c | 313 ++++++++++++++++++++++++++++++++++----------- include/linux/m41t00.h | 50 ++++++++ 2 files changed, 291 insertions(+), 72 deletions(-) create mode 100644 include/linux/m41t00.h (limited to 'include/linux') diff --git a/drivers/i2c/chips/m41t00.c b/drivers/i2c/chips/m41t00.c index eb2be3e4416c..2dd0a34d9472 100644 --- a/drivers/i2c/chips/m41t00.c +++ b/drivers/i2c/chips/m41t00.c @@ -1,9 +1,9 @@ /* - * I2C client/driver for the ST M41T00 Real-Time Clock chip. + * I2C client/driver for the ST M41T00 family of i2c rtc chips. * * Author: Mark A. 
Greer * - * 2005 (c) MontaVista Software, Inc. This file is licensed under + * 2005, 2006 (c) MontaVista Software, Inc. This file is licensed under * the terms of the GNU General Public License version 2. This program * is licensed "as is" without any warranty of any kind, whether express * or implied. @@ -19,21 +19,17 @@ #include #include #include -#include #include - +#include +#include #include #include -#define M41T00_DRV_NAME "m41t00" - -static DEFINE_MUTEX(m41t00_mutex); - static struct i2c_driver m41t00_driver; static struct i2c_client *save_client; static unsigned short ignore[] = { I2C_CLIENT_END }; -static unsigned short normal_addr[] = { 0x68, I2C_CLIENT_END }; +static unsigned short normal_addr[] = { I2C_CLIENT_END, I2C_CLIENT_END }; static struct i2c_client_address_data addr_data = { .normal_i2c = normal_addr, @@ -41,34 +37,92 @@ static struct i2c_client_address_data addr_data = { .ignore = ignore, }; +struct m41t00_chip_info { + u8 type; + char *name; + u8 read_limit; + u8 sec; /* Offsets for chip regs */ + u8 min; + u8 hour; + u8 day; + u8 mon; + u8 year; + u8 alarm_mon; + u8 alarm_hour; + u8 sqw; + u8 sqw_freq; +}; + +static struct m41t00_chip_info m41t00_chip_info_tbl[] = { + { + .type = M41T00_TYPE_M41T00, + .name = "m41t00", + .read_limit = 5, + .sec = 0, + .min = 1, + .hour = 2, + .day = 4, + .mon = 5, + .year = 6, + }, + { + .type = M41T00_TYPE_M41T81, + .name = "m41t81", + .read_limit = 1, + .sec = 1, + .min = 2, + .hour = 3, + .day = 5, + .mon = 6, + .year = 7, + .alarm_mon = 0xa, + .alarm_hour = 0xc, + .sqw = 0x13, + }, + { + .type = M41T00_TYPE_M41T85, + .name = "m41t85", + .read_limit = 1, + .sec = 1, + .min = 2, + .hour = 3, + .day = 5, + .mon = 6, + .year = 7, + .alarm_mon = 0xa, + .alarm_hour = 0xc, + .sqw = 0x13, + }, +}; +static struct m41t00_chip_info *m41t00_chip; + ulong m41t00_get_rtc_time(void) { s32 sec, min, hour, day, mon, year; s32 sec1, min1, hour1, day1, mon1, year1; - ulong limit = 10; + u8 reads = 0; + u8 buf[8], msgbuf[1] = { 0 }; /* offset into rtc's regs */ + struct i2c_msg msgs[] = { + { + .addr = save_client->addr, + .flags = 0, + .len = 1, + .buf = msgbuf, + }, + { + .addr = save_client->addr, + .flags = I2C_M_RD, + .len = 8, + .buf = buf, + }, + }; sec = min = hour = day = mon = year = 0; - sec1 = min1 = hour1 = day1 = mon1 = year1 = 0; - mutex_lock(&m41t00_mutex); do { - if (((sec = i2c_smbus_read_byte_data(save_client, 0)) >= 0) - && ((min = i2c_smbus_read_byte_data(save_client, 1)) - >= 0) - && ((hour = i2c_smbus_read_byte_data(save_client, 2)) - >= 0) - && ((day = i2c_smbus_read_byte_data(save_client, 4)) - >= 0) - && ((mon = i2c_smbus_read_byte_data(save_client, 5)) - >= 0) - && ((year = i2c_smbus_read_byte_data(save_client, 6)) - >= 0) - && ((sec == sec1) && (min == min1) && (hour == hour1) - && (day == day1) && (mon == mon1) - && (year == year1))) - - break; + if (i2c_transfer(save_client->adapter, msgs, 2) < 0) + goto read_err; sec1 = sec; min1 = min; @@ -76,21 +130,21 @@ m41t00_get_rtc_time(void) day1 = day; mon1 = mon; year1 = year; - } while (--limit > 0); - mutex_unlock(&m41t00_mutex); - - if (limit == 0) { - dev_warn(&save_client->dev, - "m41t00: can't read rtc chip\n"); - sec = min = hour = day = mon = year = 0; - } - sec &= 0x7f; - min &= 0x7f; - hour &= 0x3f; - day &= 0x3f; - mon &= 0x1f; - year &= 0xff; + sec = buf[m41t00_chip->sec] & 0x7f; + min = buf[m41t00_chip->min] & 0x7f; + hour = buf[m41t00_chip->hour] & 0x3f; + day = buf[m41t00_chip->day] & 0x3f; + mon = buf[m41t00_chip->mon] & 0x1f; + year = 
buf[m41t00_chip->year]; + } while ((++reads < m41t00_chip->read_limit) && ((sec != sec1) + || (min != min1) || (hour != hour1) || (day != day1) + || (mon != mon1) || (year != year1))); + + if ((m41t00_chip->read_limit > 1) && ((sec != sec1) || (min != min1) + || (hour != hour1) || (day != day1) || (mon != mon1) + || (year != year1))) + goto read_err; sec = BCD2BIN(sec); min = BCD2BIN(min); @@ -104,40 +158,60 @@ m41t00_get_rtc_time(void) year += 100; return mktime(year, mon, day, hour, min, sec); + +read_err: + dev_err(&save_client->dev, "m41t00_get_rtc_time: Read error\n"); + return 0; } +EXPORT_SYMBOL_GPL(m41t00_get_rtc_time); static void m41t00_set(void *arg) { struct rtc_time tm; - ulong nowtime = *(ulong *)arg; + int nowtime = *(int *)arg; + s32 sec, min, hour, day, mon, year; + u8 wbuf[9], *buf = &wbuf[1], msgbuf[1] = { 0 }; + struct i2c_msg msgs[] = { + { + .addr = save_client->addr, + .flags = 0, + .len = 1, + .buf = msgbuf, + }, + { + .addr = save_client->addr, + .flags = I2C_M_RD, + .len = 8, + .buf = buf, + }, + }; to_tm(nowtime, &tm); tm.tm_year = (tm.tm_year - 1900) % 100; - tm.tm_sec = BIN2BCD(tm.tm_sec); - tm.tm_min = BIN2BCD(tm.tm_min); - tm.tm_hour = BIN2BCD(tm.tm_hour); - tm.tm_mon = BIN2BCD(tm.tm_mon); - tm.tm_mday = BIN2BCD(tm.tm_mday); - tm.tm_year = BIN2BCD(tm.tm_year); - - mutex_lock(&m41t00_mutex); - if ((i2c_smbus_write_byte_data(save_client, 0, tm.tm_sec & 0x7f) < 0) - || (i2c_smbus_write_byte_data(save_client, 1, tm.tm_min & 0x7f) - < 0) - || (i2c_smbus_write_byte_data(save_client, 2, tm.tm_hour & 0x3f) - < 0) - || (i2c_smbus_write_byte_data(save_client, 4, tm.tm_mday & 0x3f) - < 0) - || (i2c_smbus_write_byte_data(save_client, 5, tm.tm_mon & 0x1f) - < 0) - || (i2c_smbus_write_byte_data(save_client, 6, tm.tm_year & 0xff) - < 0)) - - dev_warn(&save_client->dev,"m41t00: can't write to rtc chip\n"); - - mutex_unlock(&m41t00_mutex); + sec = BIN2BCD(tm.tm_sec); + min = BIN2BCD(tm.tm_min); + hour = BIN2BCD(tm.tm_hour); + day = BIN2BCD(tm.tm_mday); + mon = BIN2BCD(tm.tm_mon); + year = BIN2BCD(tm.tm_year); + + /* Read reg values into buf[0..7]/wbuf[1..8] */ + if (i2c_transfer(save_client->adapter, msgs, 2) < 0) { + dev_err(&save_client->dev, "m41t00_set: Read error\n"); + return; + } + + wbuf[0] = 0; /* offset into rtc's regs */ + buf[m41t00_chip->sec] = (buf[m41t00_chip->sec] & ~0x7f) | (sec & 0x7f); + buf[m41t00_chip->min] = (buf[m41t00_chip->min] & ~0x7f) | (min & 0x7f); + buf[m41t00_chip->hour] = (buf[m41t00_chip->hour] & ~0x3f) | (hour& 0x3f); + buf[m41t00_chip->day] = (buf[m41t00_chip->day] & ~0x3f) | (day & 0x3f); + buf[m41t00_chip->mon] = (buf[m41t00_chip->mon] & ~0x1f) | (mon & 0x1f); + + if (i2c_master_send(save_client, wbuf, 9) < 0) + dev_err(&save_client->dev, "m41t00_set: Write error\n"); } static ulong new_time; @@ -156,6 +230,48 @@ m41t00_set_rtc_time(ulong nowtime) return 0; } +EXPORT_SYMBOL_GPL(m41t00_set_rtc_time); + +/* + ***************************************************************************** + * + * platform_data Driver Interface + * + ***************************************************************************** + */ +static int __init +m41t00_platform_probe(struct platform_device *pdev) +{ + struct m41t00_platform_data *pdata; + int i; + + if (pdev && (pdata = pdev->dev.platform_data)) { + normal_addr[0] = pdata->i2c_addr; + + for (i=0; itype) { + m41t00_chip = &m41t00_chip_info_tbl[i]; + m41t00_chip->sqw_freq = pdata->sqw_freq; + return 0; + } + } + return -ENODEV; +} + +static int __exit +m41t00_platform_remove(struct platform_device *pdev) +{ 
+ return 0; +} + +static struct platform_driver m41t00_platform_driver = { + .probe = m41t00_platform_probe, + .remove = m41t00_platform_remove, + .driver = { + .owner = THIS_MODULE, + .name = M41T00_DRV_NAME, + }, +}; /* ***************************************************************************** @@ -170,23 +286,71 @@ m41t00_probe(struct i2c_adapter *adap, int addr, int kind) struct i2c_client *client; int rc; + if (!i2c_check_functionality(adap, I2C_FUNC_I2C + | I2C_FUNC_SMBUS_BYTE_DATA)) + return 0; + client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL); if (!client) return -ENOMEM; - strlcpy(client->name, M41T00_DRV_NAME, I2C_NAME_SIZE); + strlcpy(client->name, m41t00_chip->name, I2C_NAME_SIZE); client->addr = addr; client->adapter = adap; client->driver = &m41t00_driver; - if ((rc = i2c_attach_client(client)) != 0) { - kfree(client); - return rc; + if ((rc = i2c_attach_client(client))) + goto attach_err; + + if (m41t00_chip->type != M41T00_TYPE_M41T00) { + /* If asked, disable SQW, set SQW frequency & re-enable */ + if (m41t00_chip->sqw_freq) + if (((rc = i2c_smbus_read_byte_data(client, + m41t00_chip->alarm_mon)) < 0) + || ((rc = i2c_smbus_write_byte_data(client, + m41t00_chip->alarm_mon, rc & ~0x40)) <0) + || ((rc = i2c_smbus_write_byte_data(client, + m41t00_chip->sqw, + m41t00_chip->sqw_freq)) < 0) + || ((rc = i2c_smbus_write_byte_data(client, + m41t00_chip->alarm_mon, rc | 0x40)) <0)) + goto sqw_err; + + /* Make sure HT (Halt Update) bit is cleared */ + if ((rc = i2c_smbus_read_byte_data(client, + m41t00_chip->alarm_hour)) < 0) + goto ht_err; + + if (rc & 0x40) + if ((rc = i2c_smbus_write_byte_data(client, + m41t00_chip->alarm_hour, rc & ~0x40))<0) + goto ht_err; } - m41t00_wq = create_singlethread_workqueue("m41t00"); + /* Make sure ST (stop) bit is cleared */ + if ((rc = i2c_smbus_read_byte_data(client, m41t00_chip->sec)) < 0) + goto st_err; + + if (rc & 0x80) + if ((rc = i2c_smbus_write_byte_data(client, m41t00_chip->sec, + rc & ~0x80)) < 0) + goto st_err; + + m41t00_wq = create_singlethread_workqueue(m41t00_chip->name); save_client = client; return 0; + +st_err: + dev_err(&client->dev, "m41t00_probe: Can't clear ST bit\n"); + goto attach_err; +ht_err: + dev_err(&client->dev, "m41t00_probe: Can't clear HT bit\n"); + goto attach_err; +sqw_err: + dev_err(&client->dev, "m41t00_probe: Can't set SQW Frequency\n"); +attach_err: + kfree(client); + return rc; } static int @@ -219,13 +383,18 @@ static struct i2c_driver m41t00_driver = { static int __init m41t00_init(void) { - return i2c_add_driver(&m41t00_driver); + int rc; + + if (!(rc = platform_driver_register(&m41t00_platform_driver))) + rc = i2c_add_driver(&m41t00_driver); + return rc; } static void __exit m41t00_exit(void) { i2c_del_driver(&m41t00_driver); + platform_driver_unregister(&m41t00_platform_driver); } module_init(m41t00_init); diff --git a/include/linux/m41t00.h b/include/linux/m41t00.h new file mode 100644 index 000000000000..b423360ca38e --- /dev/null +++ b/include/linux/m41t00.h @@ -0,0 +1,50 @@ +/* + * Definitions for the ST M41T00 family of i2c rtc chips. + * + * Author: Mark A. Greer + * + * 2005, 2006 (c) MontaVista Software, Inc. This file is licensed under + * the terms of the GNU General Public License version 2. This program + * is licensed "as is" without any warranty of any kind, whether express + * or implied. 
+ */ + +#ifndef _M41T00_H +#define _M41T00_H + +#define M41T00_DRV_NAME "m41t00" +#define M41T00_I2C_ADDR 0x68 + +#define M41T00_TYPE_M41T00 0 +#define M41T00_TYPE_M41T81 81 +#define M41T00_TYPE_M41T85 85 + +struct m41t00_platform_data { + u8 type; + u8 i2c_addr; + u8 sqw_freq; +}; + +/* SQW output disabled, this is default value by power on */ +#define M41T00_SQW_DISABLE (0) + +#define M41T00_SQW_32KHZ (1<<4) /* 32.768 KHz */ +#define M41T00_SQW_8KHZ (2<<4) /* 8.192 KHz */ +#define M41T00_SQW_4KHZ (3<<4) /* 4.096 KHz */ +#define M41T00_SQW_2KHZ (4<<4) /* 2.048 KHz */ +#define M41T00_SQW_1KHZ (5<<4) /* 1.024 KHz */ +#define M41T00_SQW_512HZ (6<<4) /* 512 Hz */ +#define M41T00_SQW_256HZ (7<<4) /* 256 Hz */ +#define M41T00_SQW_128HZ (8<<4) /* 128 Hz */ +#define M41T00_SQW_64HZ (9<<4) /* 64 Hz */ +#define M41T00_SQW_32HZ (10<<4) /* 32 Hz */ +#define M41T00_SQW_16HZ (11<<4) /* 16 Hz */ +#define M41T00_SQW_8HZ (12<<4) /* 8 Hz */ +#define M41T00_SQW_4HZ (13<<4) /* 4 Hz */ +#define M41T00_SQW_2HZ (14<<4) /* 2 Hz */ +#define M41T00_SQW_1HZ (15<<4) /* 1 Hz */ + +extern ulong m41t00_get_rtc_time(void); +extern int m41t00_set_rtc_time(ulong nowtime); + +#endif /* _M41T00_H */ -- cgit v1.2.3 From 5c7ae65899a4c5b05b6277f856018d1eeeb98907 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 25 Apr 2006 14:18:16 +0200 Subject: [PATCH] I2C: i2c-nforce2: Add support for the nForce4 MCP51 and MCP55 Add support for the new nForce4 MCP51 (also known as nForce 410 or 430) and nForce4 MCP55 to the i2c-nforce2 driver. Some code changes were required because the base I/O address registers have changed in these versions. Standard BARs are now being used, while the original nForce2 chips used non-standard ones. Signed-off-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- Documentation/i2c/busses/i2c-nforce2 | 2 ++ drivers/i2c/busses/i2c-nforce2.c | 38 +++++++++++++++++++++++++----------- include/linux/pci_ids.h | 2 ++ 3 files changed, 31 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/busses/i2c-nforce2 b/Documentation/i2c/busses/i2c-nforce2 index d751282d9b2a..cd49c428a3ab 100644 --- a/Documentation/i2c/busses/i2c-nforce2 +++ b/Documentation/i2c/busses/i2c-nforce2 @@ -7,6 +7,8 @@ Supported adapters: * nForce3 250Gb MCP 10de:00E4 * nForce4 MCP 10de:0052 * nForce4 MCP-04 10de:0034 + * nForce4 MCP51 10de:0264 + * nForce4 MCP55 10de:0368 Datasheet: not publically available, but seems to be similar to the AMD-8111 SMBus 2.0 adapter. diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c index 2d80eb26f688..604b49e22df1 100644 --- a/drivers/i2c/busses/i2c-nforce2.c +++ b/drivers/i2c/busses/i2c-nforce2.c @@ -31,6 +31,8 @@ nForce3 250Gb MCP 00E4 nForce4 MCP 0052 nForce4 MCP-04 0034 + nForce4 MCP51 0264 + nForce4 MCP55 0368 This driver supports the 2 SMBuses that are included in the MCP of the nForce2/3/4 chipsets. 
@@ -64,6 +66,7 @@ struct nforce2_smbus { /* * nVidia nForce2 SMBus control register definitions + * (Newer incarnations use standard BARs 4 and 5 instead) */ #define NFORCE_PCI_SMB1 0x50 #define NFORCE_PCI_SMB2 0x54 @@ -259,6 +262,8 @@ static struct pci_device_id nforce2_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE3S_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE4_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SMBUS) }, { 0 } }; @@ -266,19 +271,29 @@ static struct pci_device_id nforce2_ids[] = { MODULE_DEVICE_TABLE (pci, nforce2_ids); -static int __devinit nforce2_probe_smb (struct pci_dev *dev, int reg, - struct nforce2_smbus *smbus, char *name) +static int __devinit nforce2_probe_smb (struct pci_dev *dev, int bar, + int alt_reg, struct nforce2_smbus *smbus, const char *name) { - u16 iobase; int error; - if (pci_read_config_word(dev, reg, &iobase) != PCIBIOS_SUCCESSFUL) { - dev_err(&smbus->adapter.dev, "Error reading PCI config for %s\n", name); - return -1; + smbus->base = pci_resource_start(dev, bar); + if (smbus->base) { + smbus->size = pci_resource_len(dev, bar); + } else { + /* Older incarnations of the device used non-standard BARs */ + u16 iobase; + + if (pci_read_config_word(dev, alt_reg, &iobase) + != PCIBIOS_SUCCESSFUL) { + dev_err(&dev->dev, "Error reading PCI config for %s\n", + name); + return -1; + } + + smbus->base = iobase & PCI_BASE_ADDRESS_IO_MASK; + smbus->size = 8; } - smbus->dev = dev; - smbus->base = iobase & 0xfffc; - smbus->size = 8; + smbus->dev = dev; if (!request_region(smbus->base, smbus->size, nforce2_driver.name)) { dev_err(&smbus->adapter.dev, "Error requesting region %02x .. 
%02X for %s\n", @@ -313,12 +328,13 @@ static int __devinit nforce2_probe(struct pci_dev *dev, const struct pci_device_ pci_set_drvdata(dev, smbuses); /* SMBus adapter 1 */ - res1 = nforce2_probe_smb (dev, NFORCE_PCI_SMB1, &smbuses[0], "SMB1"); + res1 = nforce2_probe_smb(dev, 4, NFORCE_PCI_SMB1, &smbuses[0], "SMB1"); if (res1 < 0) { dev_err(&dev->dev, "Error probing SMB1.\n"); smbuses[0].base = 0; /* to have a check value */ } - res2 = nforce2_probe_smb (dev, NFORCE_PCI_SMB2, &smbuses[1], "SMB2"); + /* SMBus adapter 2 */ + res2 = nforce2_probe_smb(dev, 5, NFORCE_PCI_SMB2, &smbuses[1], "SMB2"); if (res2 < 0) { dev_err(&dev->dev, "Error probing SMB2.\n"); smbuses[1].base = 0; /* to have a check value */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 489af9d3ce1f..d33436097e1d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1130,9 +1130,11 @@ #define PCI_DEVICE_ID_NVIDIA_QUADRO4_900XGL 0x0258 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_750XGL 0x0259 #define PCI_DEVICE_ID_NVIDIA_QUADRO4_700XGL 0x025B +#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SMBUS 0x0264 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE 0x0265 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA 0x0266 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA2 0x0267 +#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SMBUS 0x0368 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE 0x036E #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA 0x037E #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SATA2 0x037F -- cgit v1.2.3 From 18f98b1e3147afdb51e545cc6ff2b016c7d088a7 Mon Sep 17 00:00:00 2001 From: Peter Korsgaard Date: Sun, 4 Jun 2006 20:01:08 +0200 Subject: [PATCH] i2c: New bus driver for the OpenCores I2C controller The following patch adds support for the OpenCores I2C controller IP core (See http://www.opencores.org/projects.cgi/web/i2c/overview). Signed-off-by: Peter Korsgaard Signed-off-by: Andrew Morton Signed-off-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- Documentation/i2c/busses/i2c-ocores | 51 ++++++ drivers/i2c/busses/Kconfig | 11 ++ drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-ocores.c | 343 ++++++++++++++++++++++++++++++++++++ include/linux/i2c-ocores.h | 19 ++ 5 files changed, 425 insertions(+) create mode 100644 Documentation/i2c/busses/i2c-ocores create mode 100644 drivers/i2c/busses/i2c-ocores.c create mode 100644 include/linux/i2c-ocores.h (limited to 'include/linux') diff --git a/Documentation/i2c/busses/i2c-ocores b/Documentation/i2c/busses/i2c-ocores new file mode 100644 index 000000000000..cfcebb10d14e --- /dev/null +++ b/Documentation/i2c/busses/i2c-ocores @@ -0,0 +1,51 @@ +Kernel driver i2c-ocores + +Supported adapters: + * OpenCores.org I2C controller by Richard Herveille (see datasheet link) + Datasheet: http://www.opencores.org/projects.cgi/web/i2c/overview + +Author: Peter Korsgaard + +Description +----------- + +i2c-ocores is an i2c bus driver for the OpenCores.org I2C controller +IP core by Richard Herveille. + +Usage +----- + +i2c-ocores uses the platform bus, so you need to provide a struct +platform_device with the base address and interrupt number. The +dev.platform_data of the device should also point to a struct +ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the +distance between registers and the input clock speed. + +E.G. 
something like: + +static struct resource ocores_resources[] = { + [0] = { + .start = MYI2C_BASEADDR, + .end = MYI2C_BASEADDR + 8, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = MYI2C_IRQ, + .end = MYI2C_IRQ, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct ocores_i2c_platform_data myi2c_data = { + .regstep = 2, /* two bytes between registers */ + .clock_khz = 50000, /* input clock of 50MHz */ +}; + +static struct platform_device myi2c = { + .name = "ocores-i2c", + .dev = { + .platform_data = &myi2c_data, + }, + .num_resources = ARRAY_SIZE(ocores_resources), + .resource = ocores_resources, +}; diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index d25a8cbbec0a..f7af7e9bb7d9 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -276,6 +276,17 @@ config I2C_NFORCE2 This driver can also be built as a module. If so, the module will be called i2c-nforce2. +config I2C_OCORES + tristate "OpenCores I2C Controller" + depends on I2C && EXPERIMENTAL + help + If you say yes to this option, support will be included for the + OpenCores I2C controller. For details see + http://www.opencores.org/projects.cgi/web/i2c/overview + + This driver can also be built as a module. If so, the module + will be called i2c-ocores. + config I2C_PARPORT tristate "Parallel port adapter" depends on I2C && PARPORT diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index b44831dff683..ac56df53155b 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_I2C_POWERMAC) += i2c-powermac.o obj-$(CONFIG_I2C_MPC) += i2c-mpc.o obj-$(CONFIG_I2C_MV64XXX) += i2c-mv64xxx.o obj-$(CONFIG_I2C_NFORCE2) += i2c-nforce2.o +obj-$(CONFIG_I2C_OCORES) += i2c-ocores.o obj-$(CONFIG_I2C_PARPORT) += i2c-parport.o obj-$(CONFIG_I2C_PARPORT_LIGHT) += i2c-parport-light.o obj-$(CONFIG_I2C_PCA_ISA) += i2c-pca-isa.o diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c new file mode 100644 index 000000000000..d5c0610bfe6d --- /dev/null +++ b/drivers/i2c/busses/i2c-ocores.c @@ -0,0 +1,343 @@ +/* + * i2c-ocores.c: I2C bus driver for OpenCores I2C controller + * (http://www.opencores.org/projects.cgi/web/i2c/overview). + * + * Peter Korsgaard + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ocores_i2c { + void __iomem *base; + int regstep; + wait_queue_head_t wait; + struct i2c_adapter adap; + struct i2c_msg *msg; + int pos; + int nmsgs; + int state; /* see STATE_ */ +}; + +/* registers */ +#define OCI2C_PRELOW 0 +#define OCI2C_PREHIGH 1 +#define OCI2C_CONTROL 2 +#define OCI2C_DATA 3 +#define OCI2C_CMD 4 +#define OCI2C_STATUS 4 + +#define OCI2C_CTRL_IEN 0x40 +#define OCI2C_CTRL_EN 0x80 + +#define OCI2C_CMD_START 0x91 +#define OCI2C_CMD_STOP 0x41 +#define OCI2C_CMD_READ 0x21 +#define OCI2C_CMD_WRITE 0x11 +#define OCI2C_CMD_READ_ACK 0x21 +#define OCI2C_CMD_READ_NACK 0x29 +#define OCI2C_CMD_IACK 0x01 + +#define OCI2C_STAT_IF 0x01 +#define OCI2C_STAT_TIP 0x02 +#define OCI2C_STAT_ARBLOST 0x20 +#define OCI2C_STAT_BUSY 0x40 +#define OCI2C_STAT_NACK 0x80 + +#define STATE_DONE 0 +#define STATE_START 1 +#define STATE_WRITE 2 +#define STATE_READ 3 +#define STATE_ERROR 4 + +static inline void oc_setreg(struct ocores_i2c *i2c, int reg, u8 value) +{ + iowrite8(value, i2c->base + reg * i2c->regstep); +} + +static inline u8 oc_getreg(struct ocores_i2c *i2c, int reg) +{ + return ioread8(i2c->base + reg * i2c->regstep); +} + +static void ocores_process(struct ocores_i2c *i2c) +{ + struct i2c_msg *msg = i2c->msg; + u8 stat = oc_getreg(i2c, OCI2C_STATUS); + + if ((i2c->state == STATE_DONE) || (i2c->state == STATE_ERROR)) { + /* stop has been sent */ + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK); + wake_up(&i2c->wait); + return; + } + + /* error? */ + if (stat & OCI2C_STAT_ARBLOST) { + i2c->state = STATE_ERROR; + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_STOP); + return; + } + + if ((i2c->state == STATE_START) || (i2c->state == STATE_WRITE)) { + i2c->state = + (msg->flags & I2C_M_RD) ? STATE_READ : STATE_WRITE; + + if (stat & OCI2C_STAT_NACK) { + i2c->state = STATE_ERROR; + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_STOP); + return; + } + } else + msg->buf[i2c->pos++] = oc_getreg(i2c, OCI2C_DATA); + + /* end of msg? */ + if (i2c->pos == msg->len) { + i2c->nmsgs--; + i2c->msg++; + i2c->pos = 0; + msg = i2c->msg; + + if (i2c->nmsgs) { /* end? */ + /* send start? */ + if (!(msg->flags & I2C_M_NOSTART)) { + u8 addr = (msg->addr << 1); + + if (msg->flags & I2C_M_RD) + addr |= 1; + + i2c->state = STATE_START; + + oc_setreg(i2c, OCI2C_DATA, addr); + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_START); + return; + } else + i2c->state = (msg->flags & I2C_M_RD) + ? STATE_READ : STATE_WRITE; + } else { + i2c->state = STATE_DONE; + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_STOP); + return; + } + } + + if (i2c->state == STATE_READ) { + oc_setreg(i2c, OCI2C_CMD, i2c->pos == (msg->len-1) ? + OCI2C_CMD_READ_NACK : OCI2C_CMD_READ_ACK); + } else { + oc_setreg(i2c, OCI2C_DATA, msg->buf[i2c->pos++]); + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_WRITE); + } +} + +static irqreturn_t ocores_isr(int irq, void *dev_id, struct pt_regs *regs) +{ + struct ocores_i2c *i2c = dev_id; + + ocores_process(i2c); + + return IRQ_HANDLED; +} + +static int ocores_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) +{ + struct ocores_i2c *i2c = i2c_get_adapdata(adap); + + i2c->msg = msgs; + i2c->pos = 0; + i2c->nmsgs = num; + i2c->state = STATE_START; + + oc_setreg(i2c, OCI2C_DATA, + (i2c->msg->addr << 1) | + ((i2c->msg->flags & I2C_M_RD) ? 
1:0)); + + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_START); + + if (wait_event_timeout(i2c->wait, (i2c->state == STATE_ERROR) || + (i2c->state == STATE_DONE), HZ)) + return (i2c->state == STATE_DONE) ? num : -EIO; + else + return -ETIMEDOUT; +} + +static void ocores_init(struct ocores_i2c *i2c, + struct ocores_i2c_platform_data *pdata) +{ + int prescale; + u8 ctrl = oc_getreg(i2c, OCI2C_CONTROL); + + /* make sure the device is disabled */ + oc_setreg(i2c, OCI2C_CONTROL, ctrl & ~(OCI2C_CTRL_EN|OCI2C_CTRL_IEN)); + + prescale = (pdata->clock_khz / (5*100)) - 1; + oc_setreg(i2c, OCI2C_PRELOW, prescale & 0xff); + oc_setreg(i2c, OCI2C_PREHIGH, prescale >> 8); + + /* Init the device */ + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK); + oc_setreg(i2c, OCI2C_CONTROL, ctrl | OCI2C_CTRL_IEN | OCI2C_CTRL_EN); +} + + +static u32 ocores_func(struct i2c_adapter *adap) +{ + return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; +} + +static struct i2c_algorithm ocores_algorithm = { + .master_xfer = ocores_xfer, + .functionality = ocores_func, +}; + +static struct i2c_adapter ocores_adapter = { + .owner = THIS_MODULE, + .name = "i2c-ocores", + .class = I2C_CLASS_HWMON, + .algo = &ocores_algorithm, + .timeout = 2, + .retries = 1, +}; + + +static int __devinit ocores_i2c_probe(struct platform_device *pdev) +{ + struct ocores_i2c *i2c; + struct ocores_i2c_platform_data *pdata; + struct resource *res, *res2; + int ret; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + res2 = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!res2) + return -ENODEV; + + pdata = (struct ocores_i2c_platform_data*) pdev->dev.platform_data; + if (!pdata) + return -ENODEV; + + i2c = kzalloc(sizeof(*i2c), GFP_KERNEL); + if (!i2c) + return -ENOMEM; + + if (!request_mem_region(res->start, res->end - res->start + 1, + pdev->name)) { + dev_err(&pdev->dev, "Memory region busy\n"); + ret = -EBUSY; + goto request_mem_failed; + } + + i2c->base = ioremap(res->start, res->end - res->start + 1); + if (!i2c->base) { + dev_err(&pdev->dev, "Unable to map registers\n"); + ret = -EIO; + goto map_failed; + } + + i2c->regstep = pdata->regstep; + ocores_init(i2c, pdata); + + init_waitqueue_head(&i2c->wait); + ret = request_irq(res2->start, ocores_isr, 0, pdev->name, i2c); + if (ret) { + dev_err(&pdev->dev, "Cannot claim IRQ\n"); + goto request_irq_failed; + } + + /* hook up driver to tree */ + platform_set_drvdata(pdev, i2c); + i2c->adap = ocores_adapter; + i2c_set_adapdata(&i2c->adap, i2c); + i2c->adap.dev.parent = &pdev->dev; + + /* add i2c adapter to i2c tree */ + ret = i2c_add_adapter(&i2c->adap); + if (ret) { + dev_err(&pdev->dev, "Failed to add adapter\n"); + goto add_adapter_failed; + } + + return 0; + +add_adapter_failed: + free_irq(res2->start, i2c); +request_irq_failed: + iounmap(i2c->base); +map_failed: + release_mem_region(res->start, res->end - res->start + 1); +request_mem_failed: + kfree(i2c); + + return ret; +} + +static int __devexit ocores_i2c_remove(struct platform_device* pdev) +{ + struct ocores_i2c *i2c = platform_get_drvdata(pdev); + struct resource *res; + + /* disable i2c logic */ + oc_setreg(i2c, OCI2C_CONTROL, oc_getreg(i2c, OCI2C_CONTROL) + & ~(OCI2C_CTRL_EN|OCI2C_CTRL_IEN)); + + /* remove adapter & data */ + i2c_del_adapter(&i2c->adap); + platform_set_drvdata(pdev, NULL); + + res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (res) + free_irq(res->start, i2c); + + iounmap(i2c->base); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (res) + release_mem_region(res->start, 
res->end - res->start + 1); + + kfree(i2c); + + return 0; +} + +static struct platform_driver ocores_i2c_driver = { + .probe = ocores_i2c_probe, + .remove = __devexit_p(ocores_i2c_remove), + .driver = { + .owner = THIS_MODULE, + .name = "ocores-i2c", + }, +}; + +static int __init ocores_i2c_init(void) +{ + return platform_driver_register(&ocores_i2c_driver); +} + +static void __exit ocores_i2c_exit(void) +{ + platform_driver_unregister(&ocores_i2c_driver); +} + +module_init(ocores_i2c_init); +module_exit(ocores_i2c_exit); + +MODULE_AUTHOR("Peter Korsgaard "); +MODULE_DESCRIPTION("OpenCores I2C bus driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/i2c-ocores.h b/include/linux/i2c-ocores.h new file mode 100644 index 000000000000..8ed591b0887e --- /dev/null +++ b/include/linux/i2c-ocores.h @@ -0,0 +1,19 @@ +/* + * i2c-ocores.h - definitions for the i2c-ocores interface + * + * Peter Korsgaard + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#ifndef _LINUX_I2C_OCORES_H +#define _LINUX_I2C_OCORES_H + +struct ocores_i2c_platform_data { + u32 regstep; /* distance between registers */ + u32 clock_khz; /* input clock in kHz */ +}; + +#endif /* _LINUX_I2C_OCORES_H */ -- cgit v1.2.3 From 46f5ed753fac512f73069bd07455555b41a8a06e Mon Sep 17 00:00:00 2001 From: Krzysztof Halasa Date: Mon, 12 Jun 2006 21:42:20 +0200 Subject: [PATCH] i2c: Mark block write buffers as const The attached patch marks i2c_smbus_write_block_data() and i2c_smbus_write_i2c_block_data() buffers as const. Signed-off-by: Krzysztof Halasa Signed-off-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/i2c-core.c | 4 ++-- include/linux/i2c.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 45e2cdf54736..a45155f799d4 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -916,7 +916,7 @@ s32 i2c_smbus_write_word_data(struct i2c_client *client, u8 command, u16 value) } s32 i2c_smbus_write_block_data(struct i2c_client *client, u8 command, - u8 length, u8 *values) + u8 length, const u8 *values) { union i2c_smbus_data data; @@ -944,7 +944,7 @@ s32 i2c_smbus_read_i2c_block_data(struct i2c_client *client, u8 command, u8 *val } s32 i2c_smbus_write_i2c_block_data(struct i2c_client *client, u8 command, - u8 length, u8 *values) + u8 length, const u8 *values) { union i2c_smbus_data data; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 0510430e00db..526ddc8eecfb 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -97,13 +97,13 @@ extern s32 i2c_smbus_write_word_data(struct i2c_client * client, u8 command, u16 value); extern s32 i2c_smbus_write_block_data(struct i2c_client * client, u8 command, u8 length, - u8 *values); + const u8 *values); /* Returns the number of read bytes */ extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client, u8 command, u8 *values); extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client, u8 command, u8 length, - u8 *values); + const u8 *values); /* * A driver is capable of handling one or more physical devices present on -- cgit v1.2.3 From b6043fcab4b2b06b9fcde4c783ab253cdc2c1129 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Thu, 23 Mar 2006 19:11:58 +0300 Subject: [PATCH] w1: Move w1-connector definitions into linux/include/connector.h Signed-off-by: Evgeniy Polyakov 
Signed-off-by: Greg Kroah-Hartman --- drivers/w1/w1_netlink.h | 3 --- include/linux/connector.h | 5 ++++- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/w1/w1_netlink.h b/drivers/w1/w1_netlink.h index 5644221f9a44..56122b9e9294 100644 --- a/drivers/w1/w1_netlink.h +++ b/drivers/w1/w1_netlink.h @@ -27,9 +27,6 @@ #include "w1.h" -#define CN_W1_IDX 3 -#define CN_W1_VAL 1 - enum w1_netlink_message_types { W1_SLAVE_ADD = 0, W1_SLAVE_REMOVE, diff --git a/include/linux/connector.h b/include/linux/connector.h index ad1a22c1c42e..4c02119c6ab9 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -34,8 +34,11 @@ #define CN_VAL_PROC 0x1 #define CN_IDX_CIFS 0x2 #define CN_VAL_CIFS 0x1 +#define CN_W1_IDX 0x3 /* w1 communication */ +#define CN_W1_VAL 0x1 -#define CN_NETLINK_USERS 1 + +#define CN_NETLINK_USERS 4 /* * Maximum connector's message size. -- cgit v1.2.3 From bb5427b5466782ba0bbf56a4ed752e08b65a5d08 Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Thu, 23 Mar 2006 19:11:58 +0300 Subject: [PATCH] w1: netlink: Mark netlink group 1 as unused. netlink_w1 was moved to connector. Signed-off-by: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- include/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 87b8a5703ebc..855b44668caa 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -5,7 +5,7 @@ #include #define NETLINK_ROUTE 0 /* Routing/device hook */ -#define NETLINK_W1 1 /* 1-wire subsystem */ +#define NETLINK_UNUSED 1 /* Unused number */ #define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ #define NETLINK_FIREWALL 3 /* Firewalling hook */ #define NETLINK_INET_DIAG 4 /* INET socket monitoring */ -- cgit v1.2.3 From d720024e94de4e8b7f10ee83c532926f3ad5d708 Mon Sep 17 00:00:00 2001 From: Michael LeMay Date: Thu, 22 Jun 2006 14:47:17 -0700 Subject: [PATCH] selinux: add hooks for key subsystem Introduce SELinux hooks to support the access key retention subsystem within the kernel. Incorporate new flask headers from a modified version of the SELinux reference policy, with support for the new security class representing retained keys. Extend the "key_alloc" security hook with a task parameter representing the intended ownership context for the key being allocated. Attach security information to root's default keyrings within the SELinux initialization routine. Has passed David's testsuite. 
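For code that allocates keys directly, the visible change is the extra task_struct argument naming the intended owner context, normally just current. A rough sketch of an updated call site follows; the description strings are hypothetical, but the signatures match the include/linux/key.h hunk below:

    struct key *keyring, *key;

    /* new keyring owned and labeled on behalf of the calling task */
    keyring = keyring_alloc("_example", current->fsuid, current->fsgid,
                            current /* security context donor */, 1, NULL);

    /* individual key: same extra argument, inserted before the permission mask */
    key = key_alloc(&key_type_user, "example:desc",
                    current->fsuid, current->fsgid, current,
                    KEY_POS_ALL | KEY_USR_VIEW, 0);

Security modules that implement the new hook receive that same task pointer as the second parameter of security_ops->key_alloc(); that is how selinux_key_alloc() in the hunks below copies the task's SID into the key's security blob.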
Signed-off-by: Michael LeMay Signed-off-by: David Howells Signed-off-by: James Morris Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/keys.txt | 29 +++++++++++++ include/linux/key.h | 18 +++++--- include/linux/security.h | 10 +++-- kernel/user.c | 2 +- security/dummy.c | 2 +- security/keys/key.c | 8 ++-- security/keys/keyring.c | 5 ++- security/keys/process_keys.c | 15 ++++--- security/keys/request_key.c | 6 ++- security/keys/request_key_auth.c | 2 +- security/selinux/hooks.c | 64 ++++++++++++++++++++++++++++ security/selinux/include/av_perm_to_string.h | 6 +++ security/selinux/include/av_permissions.h | 8 ++++ security/selinux/include/class_to_string.h | 1 + security/selinux/include/flask.h | 1 + security/selinux/include/objsec.h | 5 +++ 16 files changed, 155 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/Documentation/keys.txt b/Documentation/keys.txt index aaa01b0e3ee9..703020012708 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -19,6 +19,7 @@ This document has the following sections: - Key overview - Key service overview - Key access permissions + - SELinux support - New procfs files - Userspace system call interface - Kernel services @@ -232,6 +233,34 @@ For changing the ownership, group ID or permissions mask, being the owner of the key or having the sysadmin capability is sufficient. +=============== +SELINUX SUPPORT +=============== + +The security class "key" has been added to SELinux so that mandatory access +controls can be applied to keys created within various contexts. This support +is preliminary, and is likely to change quite significantly in the near future. +Currently, all of the basic permissions explained above are provided in SELinux +as well; SE Linux is simply invoked after all basic permission checks have been +performed. + +Each key is labeled with the same context as the task to which it belongs. +Typically, this is the same task that was running when the key was created. +The default keyrings are handled differently, but in a way that is very +intuitive: + + (*) The user and user session keyrings that are created when the user logs in + are currently labeled with the context of the login manager. + + (*) The keyrings associated with new threads are each labeled with the context + of their associated thread, and both session and process keyrings are + handled similarly. + +Note, however, that the default keyrings associated with the root user are +labeled with the default kernel context, since they are created early in the +boot process, before root has a chance to log in. 
+ + ================ NEW PROCFS FILES ================ diff --git a/include/linux/key.h b/include/linux/key.h index cbf464ad9589..8c275d12ef63 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -241,8 +241,9 @@ extern void unregister_key_type(struct key_type *ktype); extern struct key *key_alloc(struct key_type *type, const char *desc, - uid_t uid, gid_t gid, key_perm_t perm, - int not_in_quota); + uid_t uid, gid_t gid, + struct task_struct *ctx, + key_perm_t perm, int not_in_quota); extern int key_payload_reserve(struct key *key, size_t datalen); extern int key_instantiate_and_link(struct key *key, const void *data, @@ -292,7 +293,9 @@ extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - int not_in_quota, struct key *dest); + struct task_struct *ctx, + int not_in_quota, + struct key *dest); extern int keyring_clear(struct key *keyring); @@ -313,7 +316,8 @@ extern void keyring_replace_payload(struct key *key, void *replacement); * the userspace interface */ extern struct key root_user_keyring, root_session_keyring; -extern int alloc_uid_keyring(struct user_struct *user); +extern int alloc_uid_keyring(struct user_struct *user, + struct task_struct *ctx); extern void switch_uid_keyring(struct user_struct *new_user); extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); extern int copy_thread_group_keys(struct task_struct *tsk); @@ -342,7 +346,7 @@ extern void key_init(void); #define make_key_ref(k) ({ NULL; }) #define key_ref_to_ptr(k) ({ NULL; }) #define is_key_possessed(k) 0 -#define alloc_uid_keyring(u) 0 +#define alloc_uid_keyring(u,c) 0 #define switch_uid_keyring(u) do { } while(0) #define __install_session_keyring(t, k) ({ NULL; }) #define copy_keys(f,t) 0 @@ -355,6 +359,10 @@ extern void key_init(void); #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) +/* Initial keyrings */ +extern struct key root_user_keyring; +extern struct key root_session_keyring; + #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ #endif /* _LINUX_KEY_H */ diff --git a/include/linux/security.h b/include/linux/security.h index 4dfb1b84a9b3..47722d355532 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1313,7 +1313,7 @@ struct security_operations { /* key management security hooks */ #ifdef CONFIG_KEYS - int (*key_alloc)(struct key *key); + int (*key_alloc)(struct key *key, struct task_struct *tsk); void (*key_free)(struct key *key); int (*key_permission)(key_ref_t key_ref, struct task_struct *context, @@ -3008,9 +3008,10 @@ static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY -static inline int security_key_alloc(struct key *key) +static inline int security_key_alloc(struct key *key, + struct task_struct *tsk) { - return security_ops->key_alloc(key); + return security_ops->key_alloc(key, tsk); } static inline void security_key_free(struct key *key) @@ -3027,7 +3028,8 @@ static inline int security_key_permission(key_ref_t key_ref, #else -static inline int security_key_alloc(struct key *key) +static inline int security_key_alloc(struct key *key, + struct task_struct *tsk) { return 0; } diff --git a/kernel/user.c b/kernel/user.c index 4b1eb745afa1..6408c0424291 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) new->mq_bytes = 0; new->locked_shm = 0; - if (alloc_uid_keyring(new) < 0) { + if (alloc_uid_keyring(new, 
current) < 0) { kmem_cache_free(uid_cachep, new); return NULL; } diff --git a/security/dummy.c b/security/dummy.c index 64f6da0f422e..6de4a4a5eb13 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -860,7 +860,7 @@ static int dummy_setprocattr(struct task_struct *p, char *name, void *value, siz } #ifdef CONFIG_KEYS -static inline int dummy_key_alloc(struct key *key) +static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx) { return 0; } diff --git a/security/keys/key.c b/security/keys/key.c index 3fdc49c6a02c..14a15abb7735 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -247,8 +247,8 @@ static inline void key_alloc_serial(struct key *key) * instantiate the key or discard it before returning */ struct key *key_alloc(struct key_type *type, const char *desc, - uid_t uid, gid_t gid, key_perm_t perm, - int not_in_quota) + uid_t uid, gid_t gid, struct task_struct *ctx, + key_perm_t perm, int not_in_quota) { struct key_user *user = NULL; struct key *key; @@ -318,7 +318,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, #endif /* let the security module know about the key */ - ret = security_key_alloc(key); + ret = security_key_alloc(key, ctx); if (ret < 0) goto security_error; @@ -822,7 +822,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, /* allocate a new key */ key = key_alloc(ktype, description, current->fsuid, current->fsgid, - perm, not_in_quota); + current, perm, not_in_quota); if (IS_ERR(key)) { key_ref = ERR_PTR(PTR_ERR(key)); goto error_3; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index bffa924c1f88..1357207fc9df 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -240,13 +240,14 @@ static long keyring_read(const struct key *keyring, * allocate a keyring and link into the destination keyring */ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - int not_in_quota, struct key *dest) + struct task_struct *ctx, int not_in_quota, + struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, + uid, gid, ctx, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, not_in_quota); diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 217a0bef3c82..a50a91332fe1 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -67,7 +67,8 @@ struct key root_session_keyring = { /* * allocate the keyrings to be associated with a UID */ -int alloc_uid_keyring(struct user_struct *user) +int alloc_uid_keyring(struct user_struct *user, + struct task_struct *ctx) { struct key *uid_keyring, *session_keyring; char buf[20]; @@ -76,7 +77,7 @@ int alloc_uid_keyring(struct user_struct *user) /* concoct a default session keyring */ sprintf(buf, "_uid_ses.%u", user->uid); - session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, 0, NULL); + session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error; @@ -86,7 +87,7 @@ int alloc_uid_keyring(struct user_struct *user) * keyring */ sprintf(buf, "_uid.%u", user->uid); - uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, 0, + uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, session_keyring); if (IS_ERR(uid_keyring)) { key_put(session_keyring); @@ -143,7 +144,7 @@ int install_thread_keyring(struct task_struct *tsk) sprintf(buf, "_tid.%u", tsk->pid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, 
tsk->gid, tsk, 1, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error; @@ -177,7 +178,7 @@ int install_process_keyring(struct task_struct *tsk) if (!tsk->signal->process_keyring) { sprintf(buf, "_pid.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error; @@ -217,7 +218,7 @@ static int install_session_keyring(struct task_struct *tsk, if (!keyring) { sprintf(buf, "_ses.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } @@ -717,7 +718,7 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, 0); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, tsk->uid, tsk->gid, 0, NULL); + keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, 0, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index f030a0ccbb93..eab66a06ca53 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -48,7 +48,8 @@ static int call_sbin_request_key(struct key *key, /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); - keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL); + keyring = keyring_alloc(desc, current->fsuid, current->fsgid, + current, 1, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; @@ -137,7 +138,8 @@ static struct key *__request_key_construction(struct key_type *type, /* create a key and add it to the queue */ key = key_alloc(type, description, - current->fsuid, current->fsgid, KEY_POS_ALL, 0); + current->fsuid, current->fsgid, + current, KEY_POS_ALL, 0); if (IS_ERR(key)) goto alloc_failed; diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index cce6ba6b0323..0ecc2e8d2bd0 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -148,7 +148,7 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info) sprintf(desc, "%x", target->serial); authkey = key_alloc(&key_type_request_key_auth, desc, - current->fsuid, current->fsgid, + current->fsuid, current->fsgid, current, KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, 1); if (IS_ERR(authkey)) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 54adc9d31e92..524915dfda64 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4252,6 +4252,57 @@ static int selinux_setprocattr(struct task_struct *p, return size; } +#ifdef CONFIG_KEYS + +static int selinux_key_alloc(struct key *k, struct task_struct *tsk) +{ + struct task_security_struct *tsec = tsk->security; + struct key_security_struct *ksec; + + ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); + if (!ksec) + return -ENOMEM; + + ksec->obj = k; + ksec->sid = tsec->sid; + k->security = ksec; + + return 0; +} + +static void selinux_key_free(struct key *k) +{ + struct key_security_struct *ksec = k->security; + + k->security = NULL; + kfree(ksec); +} + +static int selinux_key_permission(key_ref_t key_ref, + struct task_struct *ctx, + key_perm_t perm) +{ + struct key *key; + struct task_security_struct *tsec; + struct key_security_struct *ksec; + + key = key_ref_to_ptr(key_ref); + + tsec = ctx->security; + ksec = 
key->security; + + /* if no specific permissions are requested, we skip the + permission check. No serious, additional covert channels + appear to be created. */ + if (perm == 0) + return 0; + + return avc_has_perm(tsec->sid, ksec->sid, + SECCLASS_KEY, perm, NULL); +} + +#endif + static struct security_operations selinux_ops = { .ptrace = selinux_ptrace, .capget = selinux_capget, @@ -4406,6 +4457,12 @@ static struct security_operations selinux_ops = { .xfrm_state_delete_security = selinux_xfrm_state_delete, .xfrm_policy_lookup = selinux_xfrm_policy_lookup, #endif + +#ifdef CONFIG_KEYS + .key_alloc = selinux_key_alloc, + .key_free = selinux_key_free, + .key_permission = selinux_key_permission, +#endif }; static __init int selinux_init(void) @@ -4441,6 +4498,13 @@ static __init int selinux_init(void) } else { printk(KERN_INFO "SELinux: Starting in permissive mode\n"); } + +#ifdef CONFIG_KEYS + /* Add security information to initial keyrings */ + security_key_alloc(&root_user_keyring, current); + security_key_alloc(&root_session_keyring, current); +#endif + return 0; } diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index 70ee65a58817..bc020bde6c86 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -242,3 +242,9 @@ S_(SECCLASS_PACKET, PACKET__SEND, "send") S_(SECCLASS_PACKET, PACKET__RECV, "recv") S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") + S_(SECCLASS_KEY, KEY__VIEW, "view") + S_(SECCLASS_KEY, KEY__READ, "read") + S_(SECCLASS_KEY, KEY__WRITE, "write") + S_(SECCLASS_KEY, KEY__SEARCH, "search") + S_(SECCLASS_KEY, KEY__LINK, "link") + S_(SECCLASS_KEY, KEY__SETATTR, "setattr") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index 1d9cf3d306bc..1205227a3a33 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -959,3 +959,11 @@ #define PACKET__SEND 0x00000001UL #define PACKET__RECV 0x00000002UL #define PACKET__RELABELTO 0x00000004UL + +#define KEY__VIEW 0x00000001UL +#define KEY__READ 0x00000002UL +#define KEY__WRITE 0x00000004UL +#define KEY__SEARCH 0x00000008UL +#define KEY__LINK 0x00000010UL +#define KEY__SETATTR 0x00000020UL + diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h index 3aec75fee4f7..24303b61309f 100644 --- a/security/selinux/include/class_to_string.h +++ b/security/selinux/include/class_to_string.h @@ -60,3 +60,4 @@ S_("netlink_kobject_uevent_socket") S_("appletalk_socket") S_("packet") + S_("key") diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h index a0eb9e281d18..95887aed2a68 100644 --- a/security/selinux/include/flask.h +++ b/security/selinux/include/flask.h @@ -62,6 +62,7 @@ #define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55 #define SECCLASS_APPLETALK_SOCKET 56 #define SECCLASS_PACKET 57 +#define SECCLASS_KEY 58 /* * Security identifier indices for initial entities diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 54c030778882..8f5547ad1856 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -99,6 +99,11 @@ struct sk_security_struct { u32 peer_sid; /* SID of peer */ }; +struct key_security_struct { + struct key *obj; /* back pointer */ + u32 sid; /* SID of key */ +}; + extern unsigned int selinux_checkreqprot; #endif /* _SELINUX_OBJSEC_H_ */ -- cgit v1.2.3 From 
04c567d9313e4927b9835361d8ac0318ce65af6b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Jun 2006 14:47:18 -0700 Subject: [PATCH] Keys: Fix race between two instantiators of a key

Add a revocation notification method to the key type and call it whilst the key's semaphore is still write-locked, after setting the revocation flag. The patch then uses this to maintain a reference on the task_struct of the process that calls request_key() for as long as the authorisation key remains unrevoked.

This fixes a potential race between two processes, both of which have assumed the authority to instantiate a key (one may have forked the other, for example). The problem is that there's no locking around the check for revocation of the auth key and the use of the task_struct it points to, nor does the auth key keep a reference on the task_struct.

Access to the "context" pointer in the auth key must thenceforth be done with the auth key semaphore held. The revocation method is called with the target key semaphore held write-locked, and the search of the context process's keyrings is done with the auth key semaphore read-locked.

The check for the revocation state of the auth key just prior to searching it is done after the auth key is read-locked for the search. This ensures that the auth key can't be revoked between the check and the search.

The revocation notification method is added so that the context task_struct can be released as soon as instantiation happens rather than waiting for the auth key to be destroyed, thus avoiding the unnecessary pinning of the requesting process.

Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/keys.txt | 10 +++++++++ include/linux/key.h | 5 +++++ security/keys/key.c | 4 ++++ security/keys/process_keys.c | 42 +++++++++++++++++++++++-------------- security/keys/request_key_auth.c | 45 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 89 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/keys.txt b/Documentation/keys.txt index 703020012708..3bbe157b45e4 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -964,6 +964,16 @@ The structure has a number of fields, some of which are mandatory: It is not safe to sleep in this method; the caller may hold spinlocks. + (*) void (*revoke)(struct key *key); + + This method is optional. It is called to discard part of the payload + data upon a key being revoked. The caller will have the key semaphore + write-locked. + + It is safe to sleep in this method, though care should be taken to avoid + a deadlock against the key semaphore. + + (*) void (*destroy)(struct key *key); This method is optional.
It is called to discard the payload data on a key diff --git a/include/linux/key.h b/include/linux/key.h index 8c275d12ef63..e81ebf910d0b 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -205,6 +205,11 @@ struct key_type { /* match a key against a description */ int (*match)(const struct key *key, const void *desc); + /* clear some of the data from a key on revokation (optional) + * - the key's semaphore will be write-locked by the caller + */ + void (*revoke)(struct key *key); + /* clear the data from a key (optional) */ void (*destroy)(struct key *key); diff --git a/security/keys/key.c b/security/keys/key.c index 14a15abb7735..51f851557389 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -907,6 +907,10 @@ void key_revoke(struct key *key) * it */ down_write(&key->sem); set_bit(KEY_FLAG_REVOKED, &key->flags); + + if (key->type->revoke) + key->type->revoke(key); + up_write(&key->sem); } /* end key_revoke() */ diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index a50a91332fe1..4d9825f9962c 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -391,6 +391,8 @@ key_ref_t search_process_keyrings(struct key_type *type, struct request_key_auth *rka; key_ref_t key_ref, ret, err; + might_sleep(); + /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if @@ -496,27 +498,35 @@ key_ref_t search_process_keyrings(struct key_type *type, */ if (context->request_key_auth && context == current && - type != &key_type_request_key_auth && - key_validate(context->request_key_auth) == 0 + type != &key_type_request_key_auth ) { - rka = context->request_key_auth->payload.data; + /* defend against the auth key being revoked */ + down_read(&context->request_key_auth->sem); - key_ref = search_process_keyrings(type, description, match, - rka->context); + if (key_validate(context->request_key_auth) == 0) { + rka = context->request_key_auth->payload.data; - if (!IS_ERR(key_ref)) - goto found; + key_ref = search_process_keyrings(type, description, + match, rka->context); - switch (PTR_ERR(key_ref)) { - case -EAGAIN: /* no key */ - if (ret) + up_read(&context->request_key_auth->sem); + + if (!IS_ERR(key_ref)) + goto found; + + switch (PTR_ERR(key_ref)) { + case -EAGAIN: /* no key */ + if (ret) + break; + case -ENOKEY: /* negative key */ + ret = key_ref; break; - case -ENOKEY: /* negative key */ - ret = key_ref; - break; - default: - err = key_ref; - break; + default: + err = key_ref; + break; + } + } else { + up_read(&context->request_key_auth->sem); } } diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 0ecc2e8d2bd0..cb9817ced3fd 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -20,6 +20,7 @@ static int request_key_auth_instantiate(struct key *, const void *, size_t); static void request_key_auth_describe(const struct key *, struct seq_file *); +static void request_key_auth_revoke(struct key *); static void request_key_auth_destroy(struct key *); static long request_key_auth_read(const struct key *, char __user *, size_t); @@ -31,6 +32,7 @@ struct key_type key_type_request_key_auth = { .def_datalen = sizeof(struct request_key_auth), .instantiate = request_key_auth_instantiate, .describe = request_key_auth_describe, + .revoke = request_key_auth_revoke, .destroy = request_key_auth_destroy, .read = request_key_auth_read, }; @@ 
-91,6 +93,24 @@ static long request_key_auth_read(const struct key *key, } /* end request_key_auth_read() */ +/*****************************************************************************/ +/* + * handle revocation of an authorisation token key + * - called with the key sem write-locked + */ +static void request_key_auth_revoke(struct key *key) +{ + struct request_key_auth *rka = key->payload.data; + + kenter("{%d}", key->serial); + + if (rka->context) { + put_task_struct(rka->context); + rka->context = NULL; + } + +} /* end request_key_auth_revoke() */ + /*****************************************************************************/ /* * destroy an instantiation authorisation token key @@ -101,6 +121,11 @@ static void request_key_auth_destroy(struct key *key) kenter("{%d}", key->serial); + if (rka->context) { + put_task_struct(rka->context); + rka->context = NULL; + } + key_put(rka->target_key); kfree(rka); @@ -131,14 +156,26 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info) * another process */ if (current->request_key_auth) { /* it is - use that instantiation context here too */ + down_read(¤t->request_key_auth->sem); + + /* if the auth key has been revoked, then the key we're + * servicing is already instantiated */ + if (test_bit(KEY_FLAG_REVOKED, + ¤t->request_key_auth->flags)) + goto auth_key_revoked; + irka = current->request_key_auth->payload.data; rka->context = irka->context; rka->pid = irka->pid; + get_task_struct(rka->context); + + up_read(¤t->request_key_auth->sem); } else { /* it isn't - use this process as the context */ rka->context = current; rka->pid = current->pid; + get_task_struct(rka->context); } rka->target_key = key_get(target); @@ -161,9 +198,15 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info) if (ret < 0) goto error_inst; - kleave(" = {%d})", authkey->serial); + kleave(" = {%d}", authkey->serial); return authkey; +auth_key_revoked: + up_read(¤t->request_key_auth->sem); + kfree(rka); + kleave("= -EKEYREVOKED"); + return ERR_PTR(-EKEYREVOKED); + error_inst: key_revoke(authkey); key_put(authkey); -- cgit v1.2.3 From 0e5b3781591cc954037c08ef78edf7f1192d38c5 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Thu, 22 Jun 2006 14:47:20 -0700 Subject: [PATCH] PCI: Add PCI_CAP_ID_VNDR Add the vendor-specific extended capability PCI_CAP_ID_VNDR. It is required by the Myri-10G Ethernet driver. Signed-off-by: Brice Goglin Signed-off-by: Greg Kroah-Hartman Cc: Jeff Garzik Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h index d27a78b71297..6bce4a240364 100644 --- a/include/linux/pci_regs.h +++ b/include/linux/pci_regs.h @@ -197,6 +197,7 @@ #define PCI_CAP_ID_CHSWP 0x06 /* CompactPCI HotSwap */ #define PCI_CAP_ID_PCIX 0x07 /* PCI-X */ #define PCI_CAP_ID_HT_IRQCONF 0x08 /* HyperTransport IRQ Configuration */ +#define PCI_CAP_ID_VNDR 0x09 /* Vendor specific capability */ #define PCI_CAP_ID_SHPC 0x0C /* PCI Standard Hot-Plug Controller */ #define PCI_CAP_ID_EXP 0x10 /* PCI Express */ #define PCI_CAP_ID_MSIX 0x11 /* MSI-X */ -- cgit v1.2.3 From c89681ed7d0e4a61d35bdc12c06c6733b718b2cb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 22 Jun 2006 14:47:22 -0700 Subject: [PATCH] remove steal_locks() This patch removes the steal_locks() function. 
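The application pattern whose semantics are discussed below can be pictured with a minimal, hypothetical userspace sketch (not part of this patch; the file name, error handling and stack setup are illustrative only): two tasks share a file-descriptor table via clone(CLONE_FILES), and the child takes a POSIX lock before calling execve().

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <fcntl.h>
	#include <unistd.h>

	static char stack[64 * 1024];

	/* runs with the parent's file descriptor table (CLONE_FILES) */
	static int child(void *arg)
	{
		int fd = *(int *)arg;
		struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };

		fcntl(fd, F_SETLK, &fl);	/* POSIX lock taken before execve */
		execl("/bin/true", "true", (char *)NULL);
		return 1;			/* only reached if execve fails */
	}

	int main(void)
	{
		int fd = open("/tmp/lockfile", O_RDWR | O_CREAT, 0600);

		clone(child, stack + sizeof(stack), CLONE_FILES | SIGCHLD, &fd);
		/* before this patch, steal_locks() rewrote the ownership of
		 * locks like the one above when the child exec'd */
		return 0;
	}
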
steal_locks() doesn't work correctly with any filesystem that does its own lock management, including NFS, CIFS, etc. In addition, it has weird semantics on local filesystems in case tasks sharing file-descriptor tables are doing POSIX locking operations in parallel to execve().

The steal_locks() function has an effect on applications doing:

	clone(CLONE_FILES)
	/* in child */
	lock
	execve
	lock

POSIX locks acquired before execve (by "child", "parent" or any further task sharing files_struct) will after the execve be owned exclusively by "child".

According to Chris Wright, some LSB/LTP-style test suite fails without the stealing behavior, but there's no known real-world application that would also fail.

Apps using NPTL are not affected, since all other threads are killed before execve.

Apps using LinuxThreads are only affected if they

 - have multiple threads during exec (LinuxThreads doesn't kill other threads, the app may do it with pthread_kill_other_threads_np())
 - rely on POSIX locks being inherited across exec

Both conditions are documented, but not their interaction.

Apps using clone() natively are affected if they

 - use clone(CLONE_FILES)
 - rely on POSIX locks being inherited across exec

The above scenarios are unlikely, but possible. If the patch is vetoed, there's a plan B that involves mostly keeping the weird stealing semantics, but changing the way lock ownership is handled so that network and local filesystems work consistently. That would add more complexity though, so this solution seems to be preferred by most people.

Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Cc: Matthew Wilcox Cc: Chris Wright Cc: Christoph Hellwig Cc: Steven French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 1 - fs/binfmt_misc.c | 1 - fs/exec.c | 1 - fs/locks.c | 57 ------------------------------------------------------ include/linux/fs.h | 1 - 5 files changed, 61 deletions(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 537893a16014..8a04216e8b4d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -759,7 +759,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Discard our unneeded old files struct */ if (files) { - steal_locks(files); put_files_struct(files); files = NULL; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index d73d75591a39..599f36fd0f67 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -203,7 +203,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto _error; if (files) { - steal_locks(files); put_files_struct(files); files = NULL; } diff --git a/fs/exec.c b/fs/exec.c index d07858c0b7c4..0b88bf646143 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -866,7 +866,6 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ /* This is the point of no return */ - steal_locks(files); put_files_struct(files); current->sas_ss_sp = current->sas_ss_size = 0; diff --git a/fs/locks.c b/fs/locks.c index ab61a8b54829..69435c68c1ed 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2206,63 +2206,6 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) EXPORT_SYMBOL(lock_may_write); -static inline void __steal_locks(struct file *file, fl_owner_t from) -{ - struct inode *inode = file->f_dentry->d_inode; - struct file_lock *fl = inode->i_flock; - - while (fl) { - if (fl->fl_file == file && fl->fl_owner == from) - fl->fl_owner = current->files; - fl = fl->fl_next; - } -} - -/* When getting ready for executing a
binary, we make sure that current - * has a files_struct on its own. Before dropping the old files_struct, - * we take over ownership of all locks for all file descriptors we own. - * Note that we may accidentally steal a lock for a file that a sibling - * has created since the unshare_files() call. - */ -void steal_locks(fl_owner_t from) -{ - struct files_struct *files = current->files; - int i, j; - struct fdtable *fdt; - - if (from == files) - return; - - lock_kernel(); - j = 0; - - /* - * We are not taking a ref to the file structures, so - * we need to acquire ->file_lock. - */ - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - for (;;) { - unsigned long set; - i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) - break; - set = fdt->open_fds->fds_bits[j++]; - while (set) { - if (set & 1) { - struct file *file = fdt->fd[i]; - if (file) - __steal_locks(file, from); - } - i++; - set >>= 1; - } - } - spin_unlock(&files->file_lock); - unlock_kernel(); -} -EXPORT_SYMBOL(steal_locks); - static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", diff --git a/include/linux/fs.h b/include/linux/fs.h index ecc8c2c3d8ca..73c7d6f04b31 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -782,7 +782,6 @@ extern int setlease(struct file *, long, struct file_lock **); extern int lease_modify(struct file_lock **, int); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); -extern void steal_locks(fl_owner_t from); struct fasync_struct { int magic; -- cgit v1.2.3 From 0feae5c47aabdde59cbbec32d150e17102de37f0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 22 Jun 2006 14:47:28 -0700 Subject: [PATCH] Fix dcache race during umount

The race is that the shrink_dcache_memory shrinker could get called while a filesystem is being unmounted, and could try to prune a dentry belonging to that filesystem. If it does, then it will call into iput on the inode while the dentry is no longer able to be found by the umounting process. If iput takes a while, generic_shutdown_super could get all the way through shrink_dcache_parent and shrink_dcache_anon and invalidate_inodes without ever waiting on this particular inode. Eventually the superblock gets freed anyway and if the iput tried to touch it (which some filesystems certainly do), it will lose. The promised "Self-destruct in 5 seconds" doesn't lead to a nice day.

The race is closed by holding s_umount while calling prune_one_dentry on someone else's dentry. As a down_read_trylock is used, shrink_dcache_memory will no longer try to prune the dentry of a filesystem that is being unmounted, and unmount will not be able to start until any such active prune_one_dentry completes.

This requires that prune_dcache *knows* which filesystem (if any) it is doing the prune on behalf of, so that it can be careful of other filesystems. shrink_dcache_memory isn't called on behalf of any filesystem, and so is careful of everything. shrink_dcache_anon is now passed a super_block rather than the s_anon list out of the superblock, so it can get the s_anon list itself, and can pass the superblock down to prune_dcache.

If prune_dcache finds a dentry that it cannot free, it leaves it where it is (at the tail of the list) and exits, on the assumption that some other thread will be removing that dentry soon.
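In outline, the guard described above amounts to the following condensed sketch (not the patch itself; it assumes the context of fs/dcache.c, and the real code in the diff below also handles dcache_lock, d_lock and the fast path for the caller's own superblock):

	/* Sketch: only prune a dentry belonging to another superblock if that
	 * superblock can be pinned against umount for the duration. */
	static int prune_foreign_dentry(struct dentry *dentry)
	{
		struct rw_semaphore *s_umount = &dentry->d_sb->s_umount;

		if (!down_read_trylock(s_umount))
			return 0;		/* umount already in progress: skip */

		if (dentry->d_sb->s_root == NULL) {
			up_read(s_umount);	/* superblock being shut down: skip */
			return 0;
		}

		prune_one_dentry(dentry);	/* safe: umount must now wait for us */
		up_read(s_umount);
		return 1;
	}
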
To try to make sure that some work gets done, a limited number of dnetries which are untouchable are skipped over while choosing the dentry to work on. I believe this race was first found by Kirill Korotaev. Cc: Jan Blunck Acked-by: Kirill Korotaev Cc: Olaf Hering Acked-by: Balbir Singh Signed-off-by: Neil Brown Signed-off-by: Balbir Singh Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 66 +++++++++++++++++++++++++++++++++++++++++++++----- fs/super.c | 2 +- include/linux/dcache.h | 2 +- 3 files changed, 62 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 940d188e5d14..385f5dbc4b0c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -382,6 +382,8 @@ static inline void prune_one_dentry(struct dentry * dentry) /** * prune_dcache - shrink the dcache * @count: number of entries to try and free + * @sb: if given, ignore dentries for other superblocks + * which are being unmounted. * * Shrink the dcache. This is done when we need * more memory, or simply when we need to unmount @@ -392,16 +394,29 @@ static inline void prune_one_dentry(struct dentry * dentry) * all the dentries are in use. */ -static void prune_dcache(int count) +static void prune_dcache(int count, struct super_block *sb) { spin_lock(&dcache_lock); for (; count ; count--) { struct dentry *dentry; struct list_head *tmp; + struct rw_semaphore *s_umount; cond_resched_lock(&dcache_lock); tmp = dentry_unused.prev; + if (unlikely(sb)) { + /* Try to find a dentry for this sb, but don't try + * too hard, if they aren't near the tail they will + * be moved down again soon + */ + int skip = count; + while (skip && tmp != &dentry_unused && + list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { + skip--; + tmp = tmp->prev; + } + } if (tmp == &dentry_unused) break; list_del_init(tmp); @@ -427,7 +442,45 @@ static void prune_dcache(int count) spin_unlock(&dentry->d_lock); continue; } - prune_one_dentry(dentry); + /* + * If the dentry is not DCACHED_REFERENCED, it is time + * to remove it from the dcache, provided the super block is + * NULL (which means we are trying to reclaim memory) + * or this dentry belongs to the same super block that + * we want to shrink. + */ + /* + * If this dentry is for "my" filesystem, then I can prune it + * without taking the s_umount lock (I already hold it). + */ + if (sb && dentry->d_sb == sb) { + prune_one_dentry(dentry); + continue; + } + /* + * ...otherwise we need to be sure this filesystem isn't being + * unmounted, otherwise we could race with + * generic_shutdown_super(), and end up holding a reference to + * an inode while the filesystem is unmounted. + * So we try to get s_umount, and make sure s_root isn't NULL. + * (Take a local copy of s_umount to avoid a use-after-free of + * `dentry'). + */ + s_umount = &dentry->d_sb->s_umount; + if (down_read_trylock(s_umount)) { + if (dentry->d_sb->s_root != NULL) { + prune_one_dentry(dentry); + up_read(s_umount); + continue; + } + up_read(s_umount); + } + spin_unlock(&dentry->d_lock); + /* Cannot remove the first dentry, and it isn't appropriate + * to move it to the head of the list, so give up, and try + * later + */ + break; } spin_unlock(&dcache_lock); } @@ -630,7 +683,7 @@ void shrink_dcache_parent(struct dentry * parent) int found; while ((found = select_parent(parent)) != 0) - prune_dcache(found); + prune_dcache(found, parent->d_sb); } /** @@ -643,9 +696,10 @@ void shrink_dcache_parent(struct dentry * parent) * done under dcache_lock. 
* */ -void shrink_dcache_anon(struct hlist_head *head) +void shrink_dcache_anon(struct super_block *sb) { struct hlist_node *lp; + struct hlist_head *head = &sb->s_anon; int found; do { found = 0; @@ -668,7 +722,7 @@ void shrink_dcache_anon(struct hlist_head *head) } } spin_unlock(&dcache_lock); - prune_dcache(found); + prune_dcache(found, sb); } while(found); } @@ -689,7 +743,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; - prune_dcache(nr); + prune_dcache(nr, NULL); } return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } diff --git a/fs/super.c b/fs/super.c index a66f66bb8049..9d5c2add7228 100644 --- a/fs/super.c +++ b/fs/super.c @@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb) if (root) { sb->s_root = NULL; shrink_dcache_parent(root); - shrink_dcache_anon(&sb->s_anon); + shrink_dcache_anon(sb); dput(root); fsync_super(sb); lock_super(sb); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 836325ee0931..46d0e079735d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -217,7 +217,7 @@ extern struct dentry * d_alloc_anon(struct inode *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_anon(struct hlist_head *); +extern void shrink_dcache_anon(struct super_block *); extern int d_invalidate(struct dentry *); /* only used at mount-time */ -- cgit v1.2.3 From 4f3865fb57a04db7cca068fed1c15badc064a302 Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Thu, 22 Jun 2006 14:47:34 -0700 Subject: [PATCH] zlib_inflate: Upgrade library code to a recent version Upgrade the zlib_inflate implementation in the kernel from a patched version 1.1.3/4 to a patched 1.2.3. The code in the kernel is about seven years old and I noticed that the external zlib library's inflate performance was significantly faster (~50%) than the code in the kernel on ARM (and faster again on x86_32). For comparison the newer deflate code is 20% slower on ARM and 50% slower on x86_32 but gives an approx 1% compression ratio improvement. I don't consider this to be an improvement for kernel use so have no plans to change the zlib_deflate code. Various changes have been made to the zlib code in the kernel, the most significant being the extra functions/flush option used by ppp_deflate. This update reimplements the features PPP needs to ensure it continues to work. This code has been tested on ARM under both JFFS2 (with zlib compression enabled) and ppp_deflate and on x86_32. JFFS2 sees an approx. 10% real world file read speed improvement. This patch also removes ZLIB_VERSION as it no longer has a correct value. We don't need version checks anyway as the kernel's module handling will take care of that for us. This removal is also more in keeping with the zlib author's wishes (http://www.zlib.net/zlib_faq.html#faq24) and I've added something to the zlib.h header to note its a modified version. 
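For illustration only (not part of this patch), an in-kernel user drives the inflate side roughly as follows after this change: the caller still supplies the workspace, but there is no ZLIB_VERSION handshake. Function and buffer names other than the zlib_* API are assumptions made for the sketch, and error handling is abbreviated.

	#include <linux/zlib.h>
	#include <linux/vmalloc.h>
	#include <linux/errno.h>

	/* Decompress a zlib-wrapped buffer in one shot (JFFS2-style usage). */
	static int example_inflate(void *dst, unsigned int dstlen,
				   void *src, unsigned int srclen)
	{
		z_stream strm;
		int ret;

		strm.workspace = vmalloc(zlib_inflate_workspacesize());
		if (!strm.workspace)
			return -ENOMEM;

		strm.next_in = src;
		strm.avail_in = srclen;
		strm.next_out = dst;
		strm.avail_out = dstlen;

		/* now expands to zlib_inflateInit2(&strm, DEF_WBITS); no version string */
		ret = zlib_inflateInit(&strm);
		if (ret == Z_OK) {
			ret = zlib_inflate(&strm, Z_FINISH);
			zlib_inflateEnd(&strm);
		}

		vfree(strm.workspace);
		return ret == Z_STREAM_END ? 0 : -EIO;
	}
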
Signed-off-by: Richard Purdie Acked-by: Joern Engel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/boot/Makefile | 4 +- arch/ppc/boot/lib/Makefile | 2 +- arch/xtensa/boot/lib/Makefile | 2 +- include/linux/zconf.h | 12 + include/linux/zlib.h | 209 +++++--- include/linux/zutil.h | 12 - lib/zlib_deflate/deflate.c | 25 +- lib/zlib_deflate/deflate_syms.c | 3 +- lib/zlib_inflate/Makefile | 4 +- lib/zlib_inflate/infblock.c | 365 ------------- lib/zlib_inflate/infblock.h | 48 -- lib/zlib_inflate/infcodes.c | 202 -------- lib/zlib_inflate/infcodes.h | 33 -- lib/zlib_inflate/inffast.c | 462 +++++++++++------ lib/zlib_inflate/inffast.h | 12 +- lib/zlib_inflate/inffixed.h | 94 ++++ lib/zlib_inflate/inflate.c | 1086 +++++++++++++++++++++++++++++++-------- lib/zlib_inflate/inflate.h | 107 ++++ lib/zlib_inflate/inflate_syms.c | 3 +- lib/zlib_inflate/inflate_sync.c | 152 ------ lib/zlib_inflate/inftrees.c | 683 +++++++++++------------- lib/zlib_inflate/inftrees.h | 99 ++-- lib/zlib_inflate/infutil.c | 88 ---- lib/zlib_inflate/infutil.h | 176 +------ 24 files changed, 1877 insertions(+), 2006 deletions(-) delete mode 100644 lib/zlib_inflate/infblock.c delete mode 100644 lib/zlib_inflate/infblock.h delete mode 100644 lib/zlib_inflate/infcodes.c delete mode 100644 lib/zlib_inflate/infcodes.h create mode 100644 lib/zlib_inflate/inffixed.h create mode 100644 lib/zlib_inflate/inflate.h delete mode 100644 lib/zlib_inflate/inflate_sync.c delete mode 100644 lib/zlib_inflate/infutil.c (limited to 'include/linux') diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 840ae595a617..d961bfeed05f 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -29,8 +29,8 @@ OBJCOPYFLAGS := contents,alloc,load,readonly,data OBJCOPY_COFF_ARGS := -O aixcoff-rs6000 --set-start 0x500000 OBJCOPY_MIB_ARGS := -O aixcoff-rs6000 -R .stab -R .stabstr -R .comment -zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c -zlibheader := infblock.h infcodes.h inffast.h inftrees.h infutil.h +zlib := inffast.c inflate.c inftrees.c +zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h $(addprefix $(obj)/,$(zlib) main.o): $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader)) diff --git a/arch/ppc/boot/lib/Makefile b/arch/ppc/boot/lib/Makefile index 80c84d562fa4..2f995f712ec5 100644 --- a/arch/ppc/boot/lib/Makefile +++ b/arch/ppc/boot/lib/Makefile @@ -5,7 +5,7 @@ CFLAGS_kbd.o := -Idrivers/char CFLAGS_vreset.o := -Iarch/ppc/boot/include -zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c +zlib := inffast.c inflate.c inftrees.c lib-y += $(zlib:.c=.o) div64.o lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o diff --git a/arch/xtensa/boot/lib/Makefile b/arch/xtensa/boot/lib/Makefile index 9e73bb8aeb7a..d3d2aa2d883a 100644 --- a/arch/xtensa/boot/lib/Makefile +++ b/arch/xtensa/boot/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for some libs needed by zImage. # -zlib := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c +zlib := inffast.c inflate.c inftrees.c lib-y += $(zlib:.c=.o) zmem.o diff --git a/include/linux/zconf.h b/include/linux/zconf.h index f1cfd66b9554..0beb75e38caa 100644 --- a/include/linux/zconf.h +++ b/include/linux/zconf.h @@ -33,6 +33,18 @@ */ #ifndef MAX_WBITS # define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* default windowBits for decompression. 
MAX_WBITS is for compression only */ +#ifndef DEF_WBITS +# define DEF_WBITS MAX_WBITS +#endif + +/* default memLevel */ +#if MAX_MEM_LEVEL >= 8 +# define DEF_MEM_LEVEL 8 +#else +# define DEF_MEM_LEVEL MAX_MEM_LEVEL #endif /* Type declarations */ diff --git a/include/linux/zlib.h b/include/linux/zlib.h index 4fa32f0d4df8..9e3192a7dc6f 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -1,7 +1,6 @@ /* zlib.h -- interface of the 'zlib' general purpose compression library - version 1.1.3, July 9th, 1998 - Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler + Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages @@ -24,7 +23,7 @@ The data format used by the zlib library is described by RFCs (Request for - Comments) 1950 to 1952 in the files ftp://ds.internic.net/rfc/rfc1950.txt + Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). */ @@ -33,7 +32,22 @@ #include -#define ZLIB_VERSION "1.1.3" +/* zlib deflate based on ZLIB_VERSION "1.1.3" */ +/* zlib inflate based on ZLIB_VERSION "1.2.3" */ + +/* + This is a modified version of zlib for use inside the Linux kernel. + The main changes are to perform all memory allocation in advance. + + Inflation Changes: + * Z_PACKET_FLUSH is added and used by ppp_deflate. Before returning + this checks there is no more input data available and the next data + is a STORED block. It also resets the mode to be read for the next + data, all as per PPP requirements. + * Addition of zlib_inflateIncomp which copies incompressible data into + the history window and adjusts the accoutning without calling + zlib_inflate itself to inflate the data. +*/ /* The 'zlib' compression library provides in-memory compression and @@ -48,9 +62,18 @@ application must provide more input and/or consume the output (providing more output space) before each call. + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + The library also supports reading and writing files in gzip (.gz) format with an interface similar to that of stdio. + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + The library does not install any signal handler. The decoder checks the consistency of the compressed data, so the library should never crash even in case of corrupted input. @@ -119,7 +142,8 @@ typedef z_stream *z_streamp; #define Z_SYNC_FLUSH 3 #define Z_FULL_FLUSH 4 #define Z_FINISH 5 -/* Allowed flush values; see deflate() below for details */ +#define Z_BLOCK 6 /* Only for inflate at present */ +/* Allowed flush values; see deflate() and inflate() below for details */ #define Z_OK 0 #define Z_STREAM_END 1 @@ -155,13 +179,6 @@ typedef z_stream *z_streamp; /* basic functions */ -extern const char * zlib_zlibVersion (void); -/* The application can compare zlibVersion and ZLIB_VERSION for consistency. - If the first character differs, the library code actually used is - not compatible with the zlib.h header file used by the application. 
- This check is automatically made by deflateInit and inflateInit. - */ - extern int zlib_deflate_workspacesize (void); /* Returns the number of bytes that needs to be allocated for a per- @@ -315,9 +332,9 @@ extern int zlib_inflateInit (z_streamp strm); extern int zlib_inflate (z_streamp strm, int flush); /* inflate decompresses as much data as possible, and stops when the input - buffer becomes empty or the output buffer becomes full. It may some - introduce some output latency (reading input without producing any output) - except when forced to flush. + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. The detailed semantics are as follows. inflate performs one or both of the following actions: @@ -341,11 +358,26 @@ extern int zlib_inflate (z_streamp strm, int flush); must be called again after making room in the output buffer because there might be more output pending. - If the parameter flush is set to Z_SYNC_FLUSH, inflate flushes as much - output as possible to the output buffer. The flushing behavior of inflate is - not specified for values of the flush parameter other than Z_SYNC_FLUSH - and Z_FINISH, but the current implementation actually flushes as much output - as possible anyway. + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, + Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() stop + if and when it gets to the next deflate block boundary. When decoding the + zlib or gzip format, this will cause inflate() to return immediately after + the header and before the first block. When doing a raw inflate, inflate() + will go ahead and process the first block, and will return when it gets to + the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 + if inflate() is currently decoding the last block in the deflate stream, + plus 128 if inflate() returned immediately after decoding an end-of-block + code or decoding the complete header up to just before the first byte of the + deflate stream. The end-of-block will not be indicated until all of the + uncompressed data from that block has been written to strm->next_out. The + number of unused bits may in general be greater than seven, except when + bit 7 of data_type is set, in which case the number of unused bits will be + less than eight. inflate() should normally be called until it returns Z_STREAM_END or an error. However if all decompression is to be performed in a single step @@ -355,29 +387,44 @@ extern int zlib_inflate (z_streamp strm, int flush); uncompressed data. (The size of the uncompressed data may have been saved by the compressor for this purpose.) The next operation on this stream must be inflateEnd to deallocate the decompression state. The use of Z_FINISH - is never required, but can be used to inform inflate that a faster routine + is never required, but can be used to inform inflate that a faster approach may be used for the single inflate() call. 
- If a preset dictionary is needed at this point (see inflateSetDictionary - below), inflate sets strm-adler to the adler32 checksum of the - dictionary chosen by the compressor and returns Z_NEED_DICT; otherwise - it sets strm->adler to the adler32 checksum of all output produced - so far (that is, total_out bytes) and returns Z_OK, Z_STREAM_END or - an error code as described below. At the end of the stream, inflate() - checks that its computed adler32 checksum is equal to that saved by the - compressor and returns Z_STREAM_END only if the checksum is correct. + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the only effect of the flush parameter in this implementation + is on the return value of inflate(), as noted below, or when it returns early + because Z_BLOCK is used. + + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the adler32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the adler32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() will decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically. Any information + contained in the gzip header is not retained, so applications that need that + information should instead use raw inflate, see inflateInit2() below, or + inflateBack() and perform their own processing of the gzip header and + trailer. inflate() returns Z_OK if some progress has been made (more input processed or more output produced), Z_STREAM_END if the end of the compressed data has been reached and all uncompressed output has been produced, Z_NEED_DICT if a preset dictionary is needed at this point, Z_DATA_ERROR if the input data was - corrupted (input stream not conforming to the zlib format or incorrect - adler32 checksum), Z_STREAM_ERROR if the stream structure was inconsistent - (for example if next_in or next_out was NULL), Z_MEM_ERROR if there was not - enough memory, Z_BUF_ERROR if no progress is possible or if there was not - enough room in the output buffer when Z_FINISH is used. In the Z_DATA_ERROR - case, the application may then call inflateSync to look for a good - compression block. + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may then + call inflateSync() to look for a good compression block if a partial recovery + of the data is desired. */ @@ -547,16 +594,36 @@ extern int inflateInit2 (z_streamp strm, int windowBits); The windowBits parameter is the base two logarithm of the maximum window size (the size of the history buffer). 
It should be in the range 8..15 for this version of the library. The default value is 15 if inflateInit is used - instead. If a compressed stream with a larger window size is given as - input, inflate() will return with the error code Z_DATA_ERROR instead of - trying to allocate a larger window. - - inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough - memory, Z_STREAM_ERROR if a parameter is invalid (such as a negative - memLevel). msg is set to null if there is no error message. inflateInit2 - does not perform any decompression apart from reading the zlib header if - present: this will be done by inflate(). (So next_in and avail_in may be - modified, but next_out and avail_out are unchanged.) + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an adler32 or a crc32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() applies to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is + a crc32 instead of an adler32. + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg + is set to null if there is no error message. inflateInit2 does not perform + any decompression apart from reading the zlib header if present: this will + be done by inflate(). (So next_in and avail_in may be modified, but next_out + and avail_out are unchanged.) */ extern int zlib_inflateSetDictionary (z_streamp strm, @@ -564,16 +631,19 @@ extern int zlib_inflateSetDictionary (z_streamp strm, uInt dictLength); /* Initializes the decompression dictionary from the given uncompressed byte - sequence. This function must be called immediately after a call of inflate - if this call returned Z_NEED_DICT. The dictionary chosen by the compressor - can be determined from the Adler32 value returned by this call of - inflate. The compressor and decompressor must use exactly the same - dictionary (see deflateSetDictionary). + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the adler32 value returned by that call of inflate. 
+ The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called + immediately after inflateInit2() or inflateReset() and before any call of + inflate() to set the dictionary. The application must insure that the + dictionary that was used for compression is provided. inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a parameter is invalid (such as NULL dictionary) or the stream state is inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the - expected one (incorrect Adler32 value). inflateSetDictionary does not + expected one (incorrect adler32 value). inflateSetDictionary does not perform any decompression: this will be done by subsequent calls of inflate(). */ @@ -614,40 +684,19 @@ extern int zlib_inflateIncomp (z_stream *strm); containing the data at next_in (except that the data is not output). */ - /* various hacks, don't look :) */ - -/* deflateInit and inflateInit are macros to allow checking the zlib version - * and the compiler's view of z_stream: - */ -extern int zlib_deflateInit_ (z_streamp strm, int level, - const char *version, int stream_size); -extern int zlib_inflateInit_ (z_streamp strm, - const char *version, int stream_size); -extern int zlib_deflateInit2_ (z_streamp strm, int level, int method, - int windowBits, int memLevel, - int strategy, const char *version, - int stream_size); -extern int zlib_inflateInit2_ (z_streamp strm, int windowBits, - const char *version, int stream_size); #define zlib_deflateInit(strm, level) \ - zlib_deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) + zlib_deflateInit2((strm), (level), Z_DEFLATED, MAX_WBITS, \ + DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY) #define zlib_inflateInit(strm) \ - zlib_inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) -#define zlib_deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ - zlib_deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ - (strategy), ZLIB_VERSION, sizeof(z_stream)) -#define zlib_inflateInit2(strm, windowBits) \ - zlib_inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) + zlib_inflateInit2((strm), DEF_WBITS) +extern int zlib_deflateInit2(z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy); +extern int zlib_inflateInit2(z_streamp strm, int windowBits); #if !defined(_Z_UTIL_H) && !defined(NO_DUMMY_DECL) struct internal_state {int dummy;}; /* hack for buggy compilers */ #endif -extern const char * zlib_zError (int err); -#if 0 -extern int zlib_inflateSyncPoint (z_streamp z); -#endif -extern const uLong * zlib_get_crc_table (void); - #endif /* _ZLIB_H */ diff --git a/include/linux/zutil.h b/include/linux/zutil.h index ee0c59cf2136..6adfa9a6ffe9 100644 --- a/include/linux/zutil.h +++ b/include/linux/zutil.h @@ -23,18 +23,6 @@ typedef unsigned long ulg; /* common constants */ -#ifndef DEF_WBITS -# define DEF_WBITS MAX_WBITS -#endif -/* default windowBits for decompression. 
MAX_WBITS is for compression only */ - -#if MAX_MEM_LEVEL >= 8 -# define DEF_MEM_LEVEL 8 -#else -# define DEF_MEM_LEVEL MAX_MEM_LEVEL -#endif -/* default memLevel */ - #define STORED_BLOCK 0 #define STATIC_TREES 1 #define DYN_TREES 2 diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index 1653dd9bb01a..c3e4a2baf835 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -164,34 +164,17 @@ static const config configuration_table[10] = { memset((char *)s->head, 0, (unsigned)(s->hash_size-1)*sizeof(*s->head)); /* ========================================================================= */ -int zlib_deflateInit_( - z_streamp strm, - int level, - const char *version, - int stream_size -) -{ - return zlib_deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, - DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY, version, stream_size); - /* To do: ignore strm->next_in if we use it as window */ -} - -/* ========================================================================= */ -int zlib_deflateInit2_( +int zlib_deflateInit2( z_streamp strm, int level, int method, int windowBits, int memLevel, - int strategy, - const char *version, - int stream_size + int strategy ) { deflate_state *s; int noheader = 0; - static char* my_version = ZLIB_VERSION; deflate_workspace *mem; ush *overlay; @@ -199,10 +182,6 @@ int zlib_deflateInit2_( * output size for (length,distance) codes is <= 24 bits. */ - if (version == NULL || version[0] != my_version[0] || - stream_size != sizeof(z_stream)) { - return Z_VERSION_ERROR; - } if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; diff --git a/lib/zlib_deflate/deflate_syms.c b/lib/zlib_deflate/deflate_syms.c index 767b573d1ef6..ccfe25f3920f 100644 --- a/lib/zlib_deflate/deflate_syms.c +++ b/lib/zlib_deflate/deflate_syms.c @@ -12,8 +12,7 @@ EXPORT_SYMBOL(zlib_deflate_workspacesize); EXPORT_SYMBOL(zlib_deflate); -EXPORT_SYMBOL(zlib_deflateInit_); -EXPORT_SYMBOL(zlib_deflateInit2_); +EXPORT_SYMBOL(zlib_deflateInit2); EXPORT_SYMBOL(zlib_deflateEnd); EXPORT_SYMBOL(zlib_deflateReset); MODULE_LICENSE("GPL"); diff --git a/lib/zlib_inflate/Makefile b/lib/zlib_inflate/Makefile index 221c139e0df1..bf065482fa67 100644 --- a/lib/zlib_inflate/Makefile +++ b/lib/zlib_inflate/Makefile @@ -15,5 +15,5 @@ obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate.o -zlib_inflate-objs := infblock.o infcodes.o inffast.o inflate.o \ - inflate_sync.o inftrees.o infutil.o inflate_syms.o +zlib_inflate-objs := inffast.o inflate.o \ + inftrees.o inflate_syms.o diff --git a/lib/zlib_inflate/infblock.c b/lib/zlib_inflate/infblock.c deleted file mode 100644 index c16cdeff51aa..000000000000 --- a/lib/zlib_inflate/infblock.c +++ /dev/null @@ -1,365 +0,0 @@ -/* infblock.c -- interpret and process block types to last block - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include -#include "infblock.h" -#include "inftrees.h" -#include "infcodes.h" -#include "infutil.h" - -struct inflate_codes_state; - -/* simplify the use of the inflate_huft type with some defines */ -#define exop word.what.Exop -#define bits word.what.Bits - -/* Table for deflate from PKZIP's appnote.txt. */ -static const uInt border[] = { /* Order of the bit length code lengths */ - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; - -/* - Notes beyond the 1.93a appnote.txt: - - 1. Distance pointers never point before the beginning of the output - stream. - 2. Distance pointers can point back across blocks, up to 32k away. - 3. 
There is an implied maximum of 7 bits for the bit length table and - 15 bits for the actual data. - 4. If only one code exists, then it is encoded using one bit. (Zero - would be more efficient, but perhaps a little confusing.) If two - codes exist, they are coded using one bit each (0 and 1). - 5. There is no way of sending zero distance codes--a dummy must be - sent if there are none. (History: a pre 2.0 version of PKZIP would - store blocks with no distance codes, but this was discovered to be - too harsh a criterion.) Valid only for 1.93a. 2.04c does allow - zero distance codes, which is sent as one code of zero bits in - length. - 6. There are up to 286 literal/length codes. Code 256 represents the - end-of-block. Note however that the static length tree defines - 288 codes just to fill out the Huffman codes. Codes 286 and 287 - cannot be used though, since there is no length base or extra bits - defined for them. Similarily, there are up to 30 distance codes. - However, static trees define 32 codes (all 5 bits) to fill out the - Huffman codes, but the last two had better not show up in the data. - 7. Unzip can check dynamic Huffman blocks for complete code sets. - The exception is that a single code would not be complete (see #4). - 8. The five bits following the block type is really the number of - literal codes sent minus 257. - 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits - (1+6+6). Therefore, to output three times the length, you output - three codes (1+1+1), whereas to output four times the same length, - you only need two codes (1+3). Hmm. - 10. In the tree reconstruction algorithm, Code = Code + Increment - only if BitLength(i) is not zero. (Pretty obvious.) - 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) - 12. Note: length code 284 can represent 227-258, but length code 285 - really is 258. The last length deserves its own, short code - since it gets used a lot in very redundant files. The length - 258 is special since 258 - 3 (the min match length) is 255. - 13. The literal/length and distance code bit lengths are read as a - single stream of lengths. It is possible (and advantageous) for - a repeat code (16, 17, or 18) to go across the boundary between - the two sets of lengths. 
- */ - - -void zlib_inflate_blocks_reset( - inflate_blocks_statef *s, - z_streamp z, - uLong *c -) -{ - if (c != NULL) - *c = s->check; - if (s->mode == CODES) - zlib_inflate_codes_free(s->sub.decode.codes, z); - s->mode = TYPE; - s->bitk = 0; - s->bitb = 0; - s->read = s->write = s->window; - if (s->checkfn != NULL) - z->adler = s->check = (*s->checkfn)(0L, NULL, 0); -} - -inflate_blocks_statef *zlib_inflate_blocks_new( - z_streamp z, - check_func c, - uInt w -) -{ - inflate_blocks_statef *s; - - s = &WS(z)->working_blocks_state; - s->hufts = WS(z)->working_hufts; - s->window = WS(z)->working_window; - s->end = s->window + w; - s->checkfn = c; - s->mode = TYPE; - zlib_inflate_blocks_reset(s, z, NULL); - return s; -} - - -int zlib_inflate_blocks( - inflate_blocks_statef *s, - z_streamp z, - int r -) -{ - uInt t; /* temporary storage */ - uLong b; /* bit buffer */ - uInt k; /* bits in bit buffer */ - Byte *p; /* input data pointer */ - uInt n; /* bytes available there */ - Byte *q; /* output window write pointer */ - uInt m; /* bytes to end of window or read pointer */ - - /* copy input/output information to locals (UPDATE macro restores) */ - LOAD - - /* process input based on current state */ - while (1) switch (s->mode) - { - case TYPE: - NEEDBITS(3) - t = (uInt)b & 7; - s->last = t & 1; - switch (t >> 1) - { - case 0: /* stored */ - DUMPBITS(3) - t = k & 7; /* go to byte boundary */ - DUMPBITS(t) - s->mode = LENS; /* get length of stored block */ - break; - case 1: /* fixed */ - { - uInt bl, bd; - inflate_huft *tl, *td; - - zlib_inflate_trees_fixed(&bl, &bd, &tl, &td, s->hufts, z); - s->sub.decode.codes = zlib_inflate_codes_new(bl, bd, tl, td, z); - if (s->sub.decode.codes == NULL) - { - r = Z_MEM_ERROR; - LEAVE - } - } - DUMPBITS(3) - s->mode = CODES; - break; - case 2: /* dynamic */ - DUMPBITS(3) - s->mode = TABLE; - break; - case 3: /* illegal */ - DUMPBITS(3) - s->mode = B_BAD; - z->msg = (char*)"invalid block type"; - r = Z_DATA_ERROR; - LEAVE - } - break; - case LENS: - NEEDBITS(32) - if ((((~b) >> 16) & 0xffff) != (b & 0xffff)) - { - s->mode = B_BAD; - z->msg = (char*)"invalid stored block lengths"; - r = Z_DATA_ERROR; - LEAVE - } - s->sub.left = (uInt)b & 0xffff; - b = k = 0; /* dump bits */ - s->mode = s->sub.left ? STORED : (s->last ? DRY : TYPE); - break; - case STORED: - if (n == 0) - LEAVE - NEEDOUT - t = s->sub.left; - if (t > n) t = n; - if (t > m) t = m; - memcpy(q, p, t); - p += t; n -= t; - q += t; m -= t; - if ((s->sub.left -= t) != 0) - break; - s->mode = s->last ? 
DRY : TYPE; - break; - case TABLE: - NEEDBITS(14) - s->sub.trees.table = t = (uInt)b & 0x3fff; -#ifndef PKZIP_BUG_WORKAROUND - if ((t & 0x1f) > 29 || ((t >> 5) & 0x1f) > 29) - { - s->mode = B_BAD; - z->msg = (char*)"too many length or distance symbols"; - r = Z_DATA_ERROR; - LEAVE - } -#endif - { - s->sub.trees.blens = WS(z)->working_blens; - } - DUMPBITS(14) - s->sub.trees.index = 0; - s->mode = BTREE; - case BTREE: - while (s->sub.trees.index < 4 + (s->sub.trees.table >> 10)) - { - NEEDBITS(3) - s->sub.trees.blens[border[s->sub.trees.index++]] = (uInt)b & 7; - DUMPBITS(3) - } - while (s->sub.trees.index < 19) - s->sub.trees.blens[border[s->sub.trees.index++]] = 0; - s->sub.trees.bb = 7; - t = zlib_inflate_trees_bits(s->sub.trees.blens, &s->sub.trees.bb, - &s->sub.trees.tb, s->hufts, z); - if (t != Z_OK) - { - r = t; - if (r == Z_DATA_ERROR) - s->mode = B_BAD; - LEAVE - } - s->sub.trees.index = 0; - s->mode = DTREE; - case DTREE: - while (t = s->sub.trees.table, - s->sub.trees.index < 258 + (t & 0x1f) + ((t >> 5) & 0x1f)) - { - inflate_huft *h; - uInt i, j, c; - - t = s->sub.trees.bb; - NEEDBITS(t) - h = s->sub.trees.tb + ((uInt)b & zlib_inflate_mask[t]); - t = h->bits; - c = h->base; - if (c < 16) - { - DUMPBITS(t) - s->sub.trees.blens[s->sub.trees.index++] = c; - } - else /* c == 16..18 */ - { - i = c == 18 ? 7 : c - 14; - j = c == 18 ? 11 : 3; - NEEDBITS(t + i) - DUMPBITS(t) - j += (uInt)b & zlib_inflate_mask[i]; - DUMPBITS(i) - i = s->sub.trees.index; - t = s->sub.trees.table; - if (i + j > 258 + (t & 0x1f) + ((t >> 5) & 0x1f) || - (c == 16 && i < 1)) - { - s->mode = B_BAD; - z->msg = (char*)"invalid bit length repeat"; - r = Z_DATA_ERROR; - LEAVE - } - c = c == 16 ? s->sub.trees.blens[i - 1] : 0; - do { - s->sub.trees.blens[i++] = c; - } while (--j); - s->sub.trees.index = i; - } - } - s->sub.trees.tb = NULL; - { - uInt bl, bd; - inflate_huft *tl, *td; - inflate_codes_statef *c; - - bl = 9; /* must be <= 9 for lookahead assumptions */ - bd = 6; /* must be <= 9 for lookahead assumptions */ - t = s->sub.trees.table; - t = zlib_inflate_trees_dynamic(257 + (t & 0x1f), 1 + ((t >> 5) & 0x1f), - s->sub.trees.blens, &bl, &bd, &tl, &td, - s->hufts, z); - if (t != Z_OK) - { - if (t == (uInt)Z_DATA_ERROR) - s->mode = B_BAD; - r = t; - LEAVE - } - if ((c = zlib_inflate_codes_new(bl, bd, tl, td, z)) == NULL) - { - r = Z_MEM_ERROR; - LEAVE - } - s->sub.decode.codes = c; - } - s->mode = CODES; - case CODES: - UPDATE - if ((r = zlib_inflate_codes(s, z, r)) != Z_STREAM_END) - return zlib_inflate_flush(s, z, r); - r = Z_OK; - zlib_inflate_codes_free(s->sub.decode.codes, z); - LOAD - if (!s->last) - { - s->mode = TYPE; - break; - } - s->mode = DRY; - case DRY: - FLUSH - if (s->read != s->write) - LEAVE - s->mode = B_DONE; - case B_DONE: - r = Z_STREAM_END; - LEAVE - case B_BAD: - r = Z_DATA_ERROR; - LEAVE - default: - r = Z_STREAM_ERROR; - LEAVE - } -} - - -int zlib_inflate_blocks_free( - inflate_blocks_statef *s, - z_streamp z -) -{ - zlib_inflate_blocks_reset(s, z, NULL); - return Z_OK; -} - - -#if 0 -void zlib_inflate_set_dictionary( - inflate_blocks_statef *s, - const Byte *d, - uInt n -) -{ - memcpy(s->window, d, n); - s->read = s->write = s->window + n; -} -#endif /* 0 */ - - -/* Returns true if inflate is currently at the end of a block generated - * by Z_SYNC_FLUSH or Z_FULL_FLUSH. 
- * IN assertion: s != NULL - */ -#if 0 -int zlib_inflate_blocks_sync_point( - inflate_blocks_statef *s -) -{ - return s->mode == LENS; -} -#endif /* 0 */ diff --git a/lib/zlib_inflate/infblock.h b/lib/zlib_inflate/infblock.h deleted file mode 100644 index ceee60b5107c..000000000000 --- a/lib/zlib_inflate/infblock.h +++ /dev/null @@ -1,48 +0,0 @@ -/* infblock.h -- header to use infblock.c - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -/* WARNING: this file should *not* be used by applications. It is - part of the implementation of the compression library and is - subject to change. Applications should only use zlib.h. - */ - -#ifndef _INFBLOCK_H -#define _INFBLOCK_H - -struct inflate_blocks_state; -typedef struct inflate_blocks_state inflate_blocks_statef; - -extern inflate_blocks_statef * zlib_inflate_blocks_new ( - z_streamp z, - check_func c, /* check function */ - uInt w); /* window size */ - -extern int zlib_inflate_blocks ( - inflate_blocks_statef *, - z_streamp , - int); /* initial return code */ - -extern void zlib_inflate_blocks_reset ( - inflate_blocks_statef *, - z_streamp , - uLong *); /* check value on output */ - -extern int zlib_inflate_blocks_free ( - inflate_blocks_statef *, - z_streamp); - -#if 0 -extern void zlib_inflate_set_dictionary ( - inflate_blocks_statef *s, - const Byte *d, /* dictionary */ - uInt n); /* dictionary length */ -#endif /* 0 */ - -#if 0 -extern int zlib_inflate_blocks_sync_point ( - inflate_blocks_statef *s); -#endif /* 0 */ - -#endif /* _INFBLOCK_H */ diff --git a/lib/zlib_inflate/infcodes.c b/lib/zlib_inflate/infcodes.c deleted file mode 100644 index 07cd7591cbb7..000000000000 --- a/lib/zlib_inflate/infcodes.c +++ /dev/null @@ -1,202 +0,0 @@ -/* infcodes.c -- process literals and length/distance pairs - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include -#include "inftrees.h" -#include "infblock.h" -#include "infcodes.h" -#include "infutil.h" -#include "inffast.h" - -/* simplify the use of the inflate_huft type with some defines */ -#define exop word.what.Exop -#define bits word.what.Bits - -inflate_codes_statef *zlib_inflate_codes_new( - uInt bl, - uInt bd, - inflate_huft *tl, - inflate_huft *td, /* need separate declaration for Borland C++ */ - z_streamp z -) -{ - inflate_codes_statef *c; - - c = &WS(z)->working_state; - { - c->mode = START; - c->lbits = (Byte)bl; - c->dbits = (Byte)bd; - c->ltree = tl; - c->dtree = td; - } - return c; -} - - -int zlib_inflate_codes( - inflate_blocks_statef *s, - z_streamp z, - int r -) -{ - uInt j; /* temporary storage */ - inflate_huft *t; /* temporary pointer */ - uInt e; /* extra bits or operation */ - uLong b; /* bit buffer */ - uInt k; /* bits in bit buffer */ - Byte *p; /* input data pointer */ - uInt n; /* bytes available there */ - Byte *q; /* output window write pointer */ - uInt m; /* bytes to end of window or read pointer */ - Byte *f; /* pointer to copy strings from */ - inflate_codes_statef *c = s->sub.decode.codes; /* codes state */ - - /* copy input/output information to locals (UPDATE macro restores) */ - LOAD - - /* process input and output based on current state */ - while (1) switch (c->mode) - { /* waiting for "i:"=input, "o:"=output, "x:"=nothing */ - case START: /* x: set up for LEN */ -#ifndef SLOW - if (m >= 258 && n >= 10) - { - UPDATE - r = zlib_inflate_fast(c->lbits, c->dbits, c->ltree, c->dtree, s, z); - LOAD - if (r != Z_OK) - { - 
c->mode = r == Z_STREAM_END ? WASH : BADCODE; - break; - } - } -#endif /* !SLOW */ - c->sub.code.need = c->lbits; - c->sub.code.tree = c->ltree; - c->mode = LEN; - case LEN: /* i: get length/literal/eob next */ - j = c->sub.code.need; - NEEDBITS(j) - t = c->sub.code.tree + ((uInt)b & zlib_inflate_mask[j]); - DUMPBITS(t->bits) - e = (uInt)(t->exop); - if (e == 0) /* literal */ - { - c->sub.lit = t->base; - c->mode = LIT; - break; - } - if (e & 16) /* length */ - { - c->sub.copy.get = e & 15; - c->len = t->base; - c->mode = LENEXT; - break; - } - if ((e & 64) == 0) /* next table */ - { - c->sub.code.need = e; - c->sub.code.tree = t + t->base; - break; - } - if (e & 32) /* end of block */ - { - c->mode = WASH; - break; - } - c->mode = BADCODE; /* invalid code */ - z->msg = (char*)"invalid literal/length code"; - r = Z_DATA_ERROR; - LEAVE - case LENEXT: /* i: getting length extra (have base) */ - j = c->sub.copy.get; - NEEDBITS(j) - c->len += (uInt)b & zlib_inflate_mask[j]; - DUMPBITS(j) - c->sub.code.need = c->dbits; - c->sub.code.tree = c->dtree; - c->mode = DIST; - case DIST: /* i: get distance next */ - j = c->sub.code.need; - NEEDBITS(j) - t = c->sub.code.tree + ((uInt)b & zlib_inflate_mask[j]); - DUMPBITS(t->bits) - e = (uInt)(t->exop); - if (e & 16) /* distance */ - { - c->sub.copy.get = e & 15; - c->sub.copy.dist = t->base; - c->mode = DISTEXT; - break; - } - if ((e & 64) == 0) /* next table */ - { - c->sub.code.need = e; - c->sub.code.tree = t + t->base; - break; - } - c->mode = BADCODE; /* invalid code */ - z->msg = (char*)"invalid distance code"; - r = Z_DATA_ERROR; - LEAVE - case DISTEXT: /* i: getting distance extra */ - j = c->sub.copy.get; - NEEDBITS(j) - c->sub.copy.dist += (uInt)b & zlib_inflate_mask[j]; - DUMPBITS(j) - c->mode = COPY; - case COPY: /* o: copying bytes in window, waiting for space */ - f = q - c->sub.copy.dist; - while (f < s->window) /* modulo window size-"while" instead */ - f += s->end - s->window; /* of "if" handles invalid distances */ - while (c->len) - { - NEEDOUT - OUTBYTE(*f++) - if (f == s->end) - f = s->window; - c->len--; - } - c->mode = START; - break; - case LIT: /* o: got literal, waiting for output space */ - NEEDOUT - OUTBYTE(c->sub.lit) - c->mode = START; - break; - case WASH: /* o: got eob, possibly more output */ - if (k > 7) /* return unused byte, if any */ - { - k -= 8; - n++; - p--; /* can always return one */ - } - FLUSH - if (s->read != s->write) - LEAVE - c->mode = END; - case END: - r = Z_STREAM_END; - LEAVE - case BADCODE: /* x: got error */ - r = Z_DATA_ERROR; - LEAVE - default: - r = Z_STREAM_ERROR; - LEAVE - } -#ifdef NEED_DUMMY_RETURN - return Z_STREAM_ERROR; /* Some dumb compilers complain without this */ -#endif -} - - -void zlib_inflate_codes_free( - inflate_codes_statef *c, - z_streamp z -) -{ -} diff --git a/lib/zlib_inflate/infcodes.h b/lib/zlib_inflate/infcodes.h deleted file mode 100644 index 5cff417523b0..000000000000 --- a/lib/zlib_inflate/infcodes.h +++ /dev/null @@ -1,33 +0,0 @@ -/* infcodes.h -- header to use infcodes.c - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -/* WARNING: this file should *not* be used by applications. It is - part of the implementation of the compression library and is - subject to change. Applications should only use zlib.h. 
- */ - -#ifndef _INFCODES_H -#define _INFCODES_H - -#include "infblock.h" - -struct inflate_codes_state; -typedef struct inflate_codes_state inflate_codes_statef; - -extern inflate_codes_statef *zlib_inflate_codes_new ( - uInt, uInt, - inflate_huft *, inflate_huft *, - z_streamp ); - -extern int zlib_inflate_codes ( - inflate_blocks_statef *, - z_streamp , - int); - -extern void zlib_inflate_codes_free ( - inflate_codes_statef *, - z_streamp ); - -#endif /* _INFCODES_H */ diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c index 0bd7623fc85a..02a16eacb72d 100644 --- a/lib/zlib_inflate/inffast.c +++ b/lib/zlib_inflate/inffast.c @@ -1,176 +1,312 @@ -/* inffast.c -- process literals and length/distance pairs fast - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h +/* inffast.c -- fast decoding + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h */ #include #include "inftrees.h" -#include "infblock.h" -#include "infcodes.h" -#include "infutil.h" +#include "inflate.h" #include "inffast.h" -struct inflate_codes_state; - -/* simplify the use of the inflate_huft type with some defines */ -#define exop word.what.Exop -#define bits word.what.Bits - -/* macros for bit input with no checking and for returning unused bytes */ -#define GRABBITS(j) {while(k<(j)){b|=((uLong)NEXTBYTE)<avail_in-n;c=(k>>3)>3:c;n+=c;p-=c;k-=c<<3;} - -/* Called with number of bytes left to write in window at least 258 - (the maximum string length) and number of input bytes available - at least ten. The ten bytes are six bytes for the longest length/ - distance pair plus four bytes for overloading the bit buffer. */ - -int zlib_inflate_fast( - uInt bl, - uInt bd, - inflate_huft *tl, - inflate_huft *td, /* need separate declaration for Borland C++ */ - inflate_blocks_statef *s, - z_streamp z -) +#ifndef ASMINF + +/* Allow machine dependent optimization for post-increment or pre-increment. + Based on testing to date, + Pre-increment preferred for: + - PowerPC G3 (Adler) + - MIPS R5000 (Randers-Pehrson) + Post-increment preferred for: + - none + No measurable difference: + - Pentium III (Anderson) + - M68060 (Nikl) + */ +#ifdef POSTINC +# define OFF 0 +# define PUP(a) *(a)++ +#else +# define OFF 1 +# define PUP(a) *++(a) +#endif + +/* + Decode literal, length, and distance codes and write out the resulting + literal and match bytes until either not enough input or output is + available, an end-of-block is encountered, or a data error is encountered. + When large enough input and output buffers are supplied to inflate(), for + example, a 16K input buffer and a 64K output buffer, more than 95% of the + inflate execution time is spent in this routine. + + Entry assumptions: + + state->mode == LEN + strm->avail_in >= 6 + strm->avail_out >= 258 + start >= strm->avail_out + state->bits < 8 + + On return, state->mode is one of: + + LEN -- ran out of enough output space or enough available input + TYPE -- reached end of block code, inflate() to interpret next block + BAD -- error in block data + + Notes: + + - The maximum input bits used by a length/distance pair is 15 bits for the + length code, 5 bits for the length extra, 15 bits for the distance code, + and 13 bits for the distance extra. This totals 48 bits, or six bytes. + Therefore if strm->avail_in >= 6, then there is enough input to avoid + checking for available input while decoding. 
+ + - The maximum bytes that a single length/distance pair can output is 258 + bytes, which is the maximum length that can be coded. inflate_fast() + requires strm->avail_out >= 258 for each loop to avoid checking for + output space. + */ +void inflate_fast(strm, start) +z_streamp strm; +unsigned start; /* inflate()'s starting value for strm->avail_out */ { - inflate_huft *t; /* temporary pointer */ - uInt e; /* extra bits or operation */ - uLong b; /* bit buffer */ - uInt k; /* bits in bit buffer */ - Byte *p; /* input data pointer */ - uInt n; /* bytes available there */ - Byte *q; /* output window write pointer */ - uInt m; /* bytes to end of window or read pointer */ - uInt ml; /* mask for literal/length tree */ - uInt md; /* mask for distance tree */ - uInt c; /* bytes to copy */ - uInt d; /* distance back to copy from */ - Byte *r; /* copy source pointer */ - - /* load input, output, bit values */ - LOAD - - /* initialize masks */ - ml = zlib_inflate_mask[bl]; - md = zlib_inflate_mask[bd]; - - /* do until not enough input or output space for fast loop */ - do { /* assume called with m >= 258 && n >= 10 */ - /* get literal/length code */ - GRABBITS(20) /* max bits for literal/length code */ - if ((e = (t = tl + ((uInt)b & ml))->exop) == 0) - { - DUMPBITS(t->bits) - *q++ = (Byte)t->base; - m--; - continue; - } + struct inflate_state *state; + unsigned char *in; /* local strm->next_in */ + unsigned char *last; /* while in < last, enough input available */ + unsigned char *out; /* local strm->next_out */ + unsigned char *beg; /* inflate()'s initial strm->next_out */ + unsigned char *end; /* while out < end, enough space available */ +#ifdef INFLATE_STRICT + unsigned dmax; /* maximum distance from zlib header */ +#endif + unsigned wsize; /* window size or zero if not using window */ + unsigned whave; /* valid bytes in the window */ + unsigned write; /* window write index */ + unsigned char *window; /* allocated sliding window, if wsize != 0 */ + unsigned long hold; /* local strm->hold */ + unsigned bits; /* local strm->bits */ + code const *lcode; /* local strm->lencode */ + code const *dcode; /* local strm->distcode */ + unsigned lmask; /* mask for first level of length codes */ + unsigned dmask; /* mask for first level of distance codes */ + code this; /* retrieved table entry */ + unsigned op; /* code bits, operation, extra bits, or */ + /* window position, window bytes to copy */ + unsigned len; /* match length, unused bytes */ + unsigned dist; /* match distance */ + unsigned char *from; /* where to copy match from */ + + /* copy state to local variables */ + state = (struct inflate_state *)strm->state; + in = strm->next_in - OFF; + last = in + (strm->avail_in - 5); + out = strm->next_out - OFF; + beg = out - (start - strm->avail_out); + end = out + (strm->avail_out - 257); +#ifdef INFLATE_STRICT + dmax = state->dmax; +#endif + wsize = state->wsize; + whave = state->whave; + write = state->write; + window = state->window; + hold = state->hold; + bits = state->bits; + lcode = state->lencode; + dcode = state->distcode; + lmask = (1U << state->lenbits) - 1; + dmask = (1U << state->distbits) - 1; + + /* decode literals and length/distances until end-of-block or not enough + input data or output space */ do { - DUMPBITS(t->bits) - if (e & 16) - { - /* get extra bits for length */ - e &= 15; - c = t->base + ((uInt)b & zlib_inflate_mask[e]); - DUMPBITS(e) - - /* decode distance base of block to copy */ - GRABBITS(15); /* max bits for distance code */ - e = (t = td + ((uInt)b & md))->exop; - 
do { - DUMPBITS(t->bits) - if (e & 16) - { - /* get extra bits to add to distance base */ - e &= 15; - GRABBITS(e) /* get extra bits (up to 13) */ - d = t->base + ((uInt)b & zlib_inflate_mask[e]); - DUMPBITS(e) - - /* do the copy */ - m -= c; - r = q - d; - if (r < s->window) /* wrap if needed */ - { - do { - r += s->end - s->window; /* force pointer in window */ - } while (r < s->window); /* covers invalid distances */ - e = s->end - r; - if (c > e) - { - c -= e; /* wrapped copy */ - do { - *q++ = *r++; - } while (--e); - r = s->window; - do { - *q++ = *r++; - } while (--c); - } - else /* normal copy */ - { - *q++ = *r++; c--; - *q++ = *r++; c--; - do { - *q++ = *r++; - } while (--c); - } + if (bits < 15) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + this = lcode[hold & lmask]; + dolen: + op = (unsigned)(this.bits); + hold >>= op; + bits -= op; + op = (unsigned)(this.op); + if (op == 0) { /* literal */ + PUP(out) = (unsigned char)(this.val); + } + else if (op & 16) { /* length base */ + len = (unsigned)(this.val); + op &= 15; /* number of extra bits */ + if (op) { + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + len += (unsigned)hold & ((1U << op) - 1); + hold >>= op; + bits -= op; + } + if (bits < 15) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + this = dcode[hold & dmask]; + dodist: + op = (unsigned)(this.bits); + hold >>= op; + bits -= op; + op = (unsigned)(this.op); + if (op & 16) { /* distance base */ + dist = (unsigned)(this.val); + op &= 15; /* number of extra bits */ + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + if (bits < op) { + hold += (unsigned long)(PUP(in)) << bits; + bits += 8; + } + } + dist += (unsigned)hold & ((1U << op) - 1); +#ifdef INFLATE_STRICT + if (dist > dmax) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } +#endif + hold >>= op; + bits -= op; + op = (unsigned)(out - beg); /* max distance in output */ + if (dist > op) { /* see if copy from window */ + op = dist - op; /* distance back in window */ + if (op > whave) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } + from = window - OFF; + if (write == 0) { /* very common case */ + from += wsize - op; + if (op < len) { /* some from window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + else if (write < op) { /* wrap around window */ + from += wsize + write - op; + op -= write; + if (op < len) { /* some from end of window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = window - OFF; + if (write < len) { /* some from start of window */ + op = write; + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + } + else { /* contiguous in window */ + from += write - op; + if (op < len) { /* some from window */ + len -= op; + do { + PUP(out) = PUP(from); + } while (--op); + from = out - dist; /* rest from output */ + } + } + while (len > 2) { + PUP(out) = PUP(from); + PUP(out) = PUP(from); + PUP(out) = PUP(from); + len -= 3; + } + if (len) { + PUP(out) = PUP(from); + if (len > 1) + PUP(out) = PUP(from); + } + } + else { + from = out - dist; /* copy direct from output */ + do { /* minimum length is three */ + PUP(out) = PUP(from); + PUP(out) = PUP(from); + PUP(out) = 
PUP(from); + len -= 3; + } while (len > 2); + if (len) { + PUP(out) = PUP(from); + if (len > 1) + PUP(out) = PUP(from); + } + } + } + else if ((op & 64) == 0) { /* 2nd level distance code */ + this = dcode[this.val + (hold & ((1U << op) - 1))]; + goto dodist; } - else /* normal copy */ - { - *q++ = *r++; c--; - *q++ = *r++; c--; - do { - *q++ = *r++; - } while (--c); + else { + strm->msg = (char *)"invalid distance code"; + state->mode = BAD; + break; } + } + else if ((op & 64) == 0) { /* 2nd level length code */ + this = lcode[this.val + (hold & ((1U << op) - 1))]; + goto dolen; + } + else if (op & 32) { /* end-of-block */ + state->mode = TYPE; break; - } - else if ((e & 64) == 0) - { - t += t->base; - e = (t += ((uInt)b & zlib_inflate_mask[e]))->exop; - } - else - { - z->msg = (char*)"invalid distance code"; - UNGRAB - UPDATE - return Z_DATA_ERROR; - } - } while (1); - break; - } - if ((e & 64) == 0) - { - t += t->base; - if ((e = (t += ((uInt)b & zlib_inflate_mask[e]))->exop) == 0) - { - DUMPBITS(t->bits) - *q++ = (Byte)t->base; - m--; - break; } - } - else if (e & 32) - { - UNGRAB - UPDATE - return Z_STREAM_END; - } - else - { - z->msg = (char*)"invalid literal/length code"; - UNGRAB - UPDATE - return Z_DATA_ERROR; - } - } while (1); - } while (m >= 258 && n >= 10); - - /* not enough input or output--restore pointers and return */ - UNGRAB - UPDATE - return Z_OK; + else { + strm->msg = (char *)"invalid literal/length code"; + state->mode = BAD; + break; + } + } while (in < last && out < end); + + /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ + len = bits >> 3; + in -= len; + bits -= len << 3; + hold &= (1U << bits) - 1; + + /* update state and return */ + strm->next_in = in + OFF; + strm->next_out = out + OFF; + strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); + strm->avail_out = (unsigned)(out < end ? + 257 + (end - out) : 257 - (out - end)); + state->hold = hold; + state->bits = bits; + return; } + +/* + inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): + - Using bit fields for code structure + - Different op definition to avoid & for extra bits (do & for table bits) + - Three separate decoding do-loops for direct, window, and write == 0 + - Special case for distance > 1 copies to do overlapped load and store copy + - Explicit branch predictions (based on measured branch probabilities) + - Deferring match copy and interspersed it with decoding subsequent codes + - Swapping literal/length else + - Swapping window/direct else + - Larger unrolled copy loops (three is about right) + - Moving len -= 3 statement into middle of loop + */ + +#endif /* !ASMINF */ diff --git a/lib/zlib_inflate/inffast.h b/lib/zlib_inflate/inffast.h index fc720f0fa7f5..40315d9fddc4 100644 --- a/lib/zlib_inflate/inffast.h +++ b/lib/zlib_inflate/inffast.h @@ -1,6 +1,6 @@ /* inffast.h -- header to use inffast.c - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h + * Copyright (C) 1995-2003 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h */ /* WARNING: this file should *not* be used by applications. It is @@ -8,10 +8,4 @@ subject to change. Applications should only use zlib.h. 
*/ -extern int zlib_inflate_fast ( - uInt, - uInt, - inflate_huft *, - inflate_huft *, - inflate_blocks_statef *, - z_streamp ); +void inflate_fast (z_streamp strm, unsigned start); diff --git a/lib/zlib_inflate/inffixed.h b/lib/zlib_inflate/inffixed.h new file mode 100644 index 000000000000..75ed4b5978de --- /dev/null +++ b/lib/zlib_inflate/inffixed.h @@ -0,0 +1,94 @@ + /* inffixed.h -- table for decoding fixed codes + * Generated automatically by makefixed(). + */ + + /* WARNING: this file should *not* be used by applications. It + is part of the implementation of the compression library and + is subject to change. Applications should only use zlib.h. + */ + + static const code lenfix[512] = { + {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48}, + {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128}, + {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59}, + {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176}, + {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20}, + {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100}, + {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8}, + {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216}, + {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76}, + {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114}, + {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2}, + {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148}, + {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42}, + {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86}, + {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15}, + {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236}, + {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62}, + {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142}, + {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31}, + {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162}, + {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25}, + {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105}, + {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4}, + {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202}, + {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69}, + {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125}, + {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13}, + {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195}, + {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35}, + {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91}, + {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19}, + {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246}, + {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55}, + {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135}, + {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99}, + {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190}, + {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16}, + {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96}, + {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6}, + {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209}, + 
{17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72}, + {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116}, + {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4}, + {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153}, + {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44}, + {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82}, + {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11}, + {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229}, + {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58}, + {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138}, + {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51}, + {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173}, + {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30}, + {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110}, + {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0}, + {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195}, + {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65}, + {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121}, + {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9}, + {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258}, + {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37}, + {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93}, + {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23}, + {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251}, + {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51}, + {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131}, + {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67}, + {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183}, + {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23}, + {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103}, + {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9}, + {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223}, + {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79}, + {0,9,255} + }; + + static const code distfix[32] = { + {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025}, + {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193}, + {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385}, + {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577}, + {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073}, + {22,5,193},{64,5,0} + }; diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c index 31b9e9054bf7..7f922dccf1a5 100644 --- a/lib/zlib_inflate/inflate.c +++ b/lib/zlib_inflate/inflate.c @@ -1,89 +1,148 @@ -/* inflate.c -- zlib interface to inflate modules - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h +/* inflate.c -- zlib decompression + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Based on zlib 1.2.3 but modified for the Linux Kernel by + * Richard Purdie + * + * Changes mainly for static instead of dynamic memory allocation + * */ #include -#include "infblock.h" +#include "inftrees.h" +#include "inflate.h" +#include "inffast.h" #include "infutil.h" int zlib_inflate_workspacesize(void) { - return 
sizeof(struct inflate_workspace); + return sizeof(struct inflate_workspace); } +int zlib_inflateReset(z_streamp strm) +{ + struct inflate_state *state; + + if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; + state = (struct inflate_state *)strm->state; + strm->total_in = strm->total_out = state->total = 0; + strm->msg = NULL; + strm->adler = 1; /* to support ill-conceived Java test suite */ + state->mode = HEAD; + state->last = 0; + state->havedict = 0; + state->dmax = 32768U; + state->hold = 0; + state->bits = 0; + state->lencode = state->distcode = state->next = state->codes; -int zlib_inflateReset( - z_streamp z -) + /* Initialise Window */ + state->wsize = 1U << state->wbits; + state->write = 0; + state->whave = 0; + + return Z_OK; +} + +#if 0 +int zlib_inflatePrime(z_streamp strm, int bits, int value) { - if (z == NULL || z->state == NULL || z->workspace == NULL) - return Z_STREAM_ERROR; - z->total_in = z->total_out = 0; - z->msg = NULL; - z->state->mode = z->state->nowrap ? BLOCKS : METHOD; - zlib_inflate_blocks_reset(z->state->blocks, z, NULL); - return Z_OK; + struct inflate_state *state; + + if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; + state = (struct inflate_state *)strm->state; + if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR; + value &= (1L << bits) - 1; + state->hold += value << state->bits; + state->bits += bits; + return Z_OK; } +#endif + +int zlib_inflateInit2(z_streamp strm, int windowBits) +{ + struct inflate_state *state; + + if (strm == NULL) return Z_STREAM_ERROR; + strm->msg = NULL; /* in case we return an error */ + + state = &WS(strm)->inflate_state; + strm->state = (struct internal_state *)state; + + if (windowBits < 0) { + state->wrap = 0; + windowBits = -windowBits; + } + else { + state->wrap = (windowBits >> 4) + 1; + } + if (windowBits < 8 || windowBits > 15) { + return Z_STREAM_ERROR; + } + state->wbits = (unsigned)windowBits; + state->window = &WS(strm)->working_window[0]; + return zlib_inflateReset(strm); +} -int zlib_inflateEnd( - z_streamp z -) +/* + Return state with length and distance decoding tables and index sizes set to + fixed code decoding. This returns fixed tables from inffixed.h. + */ +static void zlib_fixedtables(struct inflate_state *state) { - if (z == NULL || z->state == NULL || z->workspace == NULL) - return Z_STREAM_ERROR; - if (z->state->blocks != NULL) - zlib_inflate_blocks_free(z->state->blocks, z); - z->state = NULL; - return Z_OK; +# include "inffixed.h" + state->lencode = lenfix; + state->lenbits = 9; + state->distcode = distfix; + state->distbits = 5; } -int zlib_inflateInit2_( - z_streamp z, - int w, - const char *version, - int stream_size -) +/* + Update the window with the last wsize (normally 32K) bytes written before + returning. This is only called when a window is already in use, or when + output has been written during this inflate call, but the end of the deflate + stream has not been reached yet. It is also called to window dictionary data + when a dictionary is loaded. + + Providing output buffers larger than 32K to inflate() should provide a speed + advantage, since only the last 32K of output is copied to the sliding window + upon return from inflate(), and since all distances after the first 32K of + output will fall in the output data, making match copies simpler and faster. + The advantage may be dependent on the size of the processor's data caches. 
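+
+   As a concrete illustration (the numbers are purely made up): with
+   wsize == 32768, a window write index of 30000 and 5000 bytes of new
+   output, the first 2768 bytes are copied to window + 30000, the
+   remaining 2232 wrap around to the start of the window, and on return
+   write == 2232 and whave == 32768.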
+ */ +static void zlib_updatewindow(z_streamp strm, unsigned out) { - if (version == NULL || version[0] != ZLIB_VERSION[0] || - stream_size != sizeof(z_stream) || z->workspace == NULL) - return Z_VERSION_ERROR; - - /* initialize state */ - z->msg = NULL; - z->state = &WS(z)->internal_state; - z->state->blocks = NULL; - - /* handle undocumented nowrap option (no zlib header or check) */ - z->state->nowrap = 0; - if (w < 0) - { - w = - w; - z->state->nowrap = 1; - } - - /* set window size */ - if (w < 8 || w > 15) - { - zlib_inflateEnd(z); - return Z_STREAM_ERROR; - } - z->state->wbits = (uInt)w; - - /* create inflate_blocks state */ - if ((z->state->blocks = - zlib_inflate_blocks_new(z, z->state->nowrap ? NULL : zlib_adler32, (uInt)1 << w)) - == NULL) - { - zlib_inflateEnd(z); - return Z_MEM_ERROR; - } - - /* reset state */ - zlib_inflateReset(z); - return Z_OK; + struct inflate_state *state; + unsigned copy, dist; + + state = (struct inflate_state *)strm->state; + + /* copy state->wsize or less output bytes into the circular window */ + copy = out - strm->avail_out; + if (copy >= state->wsize) { + memcpy(state->window, strm->next_out - state->wsize, state->wsize); + state->write = 0; + state->whave = state->wsize; + } + else { + dist = state->wsize - state->write; + if (dist > copy) dist = copy; + memcpy(state->window + state->write, strm->next_out - copy, dist); + copy -= dist; + if (copy) { + memcpy(state->window, strm->next_out - copy, copy); + state->write = copy; + state->whave = state->wsize; + } + else { + state->write += dist; + if (state->write == state->wsize) state->write = 0; + if (state->whave < state->wsize) state->whave += dist; + } + } } @@ -91,157 +150,764 @@ int zlib_inflateInit2_( * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ -static int zlib_inflate_packet_flush(inflate_blocks_statef *s) +/* + Returns true if inflate is currently at the end of a block generated by + Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP + implementation to provide an additional safety check. PPP uses + Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored + block. When decompressing, PPP checks that at the end of input packet, + inflate is waiting for these length bytes. 
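+
+   From the decompressing side this amounts to something like the
+   following (a sketch only; the error handling and variable name are
+   illustrative, not taken from a real driver):
+
+      err = zlib_inflate(strm, Z_PACKET_FLUSH);
+      /* Z_OK: the packet ended exactly at the point PPP expects;
+         Z_DATA_ERROR: the packet was truncated or otherwise malformed */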
+ */ +static int zlib_inflateSyncPacket(z_streamp strm) { - if (s->mode != LENS) - return Z_DATA_ERROR; - s->mode = TYPE; + struct inflate_state *state; + + if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; + state = (struct inflate_state *)strm->state; + + if (state->mode == STORED && state->bits == 0) { + state->mode = TYPE; + return Z_OK; + } + return Z_DATA_ERROR; +} + +/* Macros for inflate(): */ + +/* check function to use adler32() for zlib or crc32() for gzip */ +#define UPDATE(check, buf, len) zlib_adler32(check, buf, len) + +/* Load registers with state in inflate() for speed */ +#define LOAD() \ + do { \ + put = strm->next_out; \ + left = strm->avail_out; \ + next = strm->next_in; \ + have = strm->avail_in; \ + hold = state->hold; \ + bits = state->bits; \ + } while (0) + +/* Restore state from registers in inflate() */ +#define RESTORE() \ + do { \ + strm->next_out = put; \ + strm->avail_out = left; \ + strm->next_in = next; \ + strm->avail_in = have; \ + state->hold = hold; \ + state->bits = bits; \ + } while (0) + +/* Clear the input bit accumulator */ +#define INITBITS() \ + do { \ + hold = 0; \ + bits = 0; \ + } while (0) + +/* Get a byte of input into the bit accumulator, or return from inflate() + if there is no input available. */ +#define PULLBYTE() \ + do { \ + if (have == 0) goto inf_leave; \ + have--; \ + hold += (unsigned long)(*next++) << bits; \ + bits += 8; \ + } while (0) + +/* Assure that there are at least n bits in the bit accumulator. If there is + not enough available input to do that, then return from inflate(). */ +#define NEEDBITS(n) \ + do { \ + while (bits < (unsigned)(n)) \ + PULLBYTE(); \ + } while (0) + +/* Return the low n bits of the bit accumulator (n < 16) */ +#define BITS(n) \ + ((unsigned)hold & ((1U << (n)) - 1)) + +/* Remove n bits from the bit accumulator */ +#define DROPBITS(n) \ + do { \ + hold >>= (n); \ + bits -= (unsigned)(n); \ + } while (0) + +/* Remove zero to seven bits as needed to go to a byte boundary */ +#define BYTEBITS() \ + do { \ + hold >>= bits & 7; \ + bits -= bits & 7; \ + } while (0) + +/* Reverse the bytes in a 32-bit value */ +#define REVERSE(q) \ + ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \ + (((q) & 0xff00) << 8) + (((q) & 0xff) << 24)) + +/* + inflate() uses a state machine to process as much input data and generate as + much output data as possible before returning. The state machine is + structured roughly as follows: + + for (;;) switch (state) { + ... + case STATEn: + if (not enough input data or output space to make progress) + return; + ... make progress ... + state = STATEm; + break; + ... + } + + so when inflate() is called again, the same case is attempted again, and + if the appropriate resources are provided, the machine proceeds to the + next state. The NEEDBITS() macro is usually the way the state evaluates + whether it can proceed or should return. NEEDBITS() does the return if + the requested bits are not available. The typical use of the BITS macros + is: + + NEEDBITS(n); + ... do something with BITS(n) ... + DROPBITS(n); + + where NEEDBITS(n) either returns from inflate() if there isn't enough + input left to load n bits into the accumulator, or it continues. BITS(n) + gives the low n bits in the accumulator. When done, DROPBITS(n) drops + the low n bits off the accumulator. INITBITS() clears the accumulator + and sets the number of available bits to zero. BYTEBITS() discards just + enough bits to put the accumulator on a byte boundary. 
After BYTEBITS() + and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. + + NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return + if there is no input available. The decoding of variable length codes uses + PULLBYTE() directly in order to pull just enough bytes to decode the next + code, and no more. + + Some states loop until they get enough input, making sure that enough + state information is maintained to continue the loop where it left off + if NEEDBITS() returns in the loop. For example, want, need, and keep + would all have to actually be part of the saved state in case NEEDBITS() + returns: + + case STATEw: + while (want < need) { + NEEDBITS(n); + keep[want++] = BITS(n); + DROPBITS(n); + } + state = STATEx; + case STATEx: + + As shown above, if the next state is also the next case, then the break + is omitted. + + A state may also return if there is not enough output space available to + complete that state. Those states are copying stored data, writing a + literal byte, and copying a matching string. + + When returning, a "goto inf_leave" is used to update the total counters, + update the check value, and determine whether any progress has been made + during that inflate() call in order to return the proper return code. + Progress is defined as a change in either strm->avail_in or strm->avail_out. + When there is a window, goto inf_leave will update the window with the last + output written. If a goto inf_leave occurs in the middle of decompression + and there is no window currently, goto inf_leave will create one and copy + output to the window for the next call of inflate(). + + In this implementation, the flush parameter of inflate() only affects the + return code (per zlib.h). inflate() always writes as much as possible to + strm->next_out, given the space available and the provided input--the effect + documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers + the allocation of and copying into a sliding window until necessary, which + provides the effect documented in zlib.h for Z_FINISH when the entire input + stream available. So the only thing the flush parameter actually does is: + when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it + will return Z_BUF_ERROR if it has not reached the end of the stream. 
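+
+   As a rough caller-side sketch (the separately allocated workspace is
+   the kernel convention for these routines; the buffer names src/dst and
+   the single-shot Z_FINISH pattern are only illustrative):
+
+      z_stream strm;
+      int ret;
+
+      strm.workspace = vmalloc(zlib_inflate_workspacesize());
+      if (strm.workspace == NULL)
+          return -ENOMEM;
+      zlib_inflateInit2(&strm, MAX_WBITS);
+      strm.next_in = src;          /* compressed data */
+      strm.avail_in = src_len;
+      strm.next_out = dst;         /* buffer for decompressed data */
+      strm.avail_out = dst_len;
+      ret = zlib_inflate(&strm, Z_FINISH);
+      zlib_inflateEnd(&strm);
+      vfree(strm.workspace);
+      /* ret == Z_STREAM_END when the whole stream fit in dst */
+
+   A streaming caller would instead loop on zlib_inflate(strm, Z_NO_FLUSH),
+   refilling next_in/avail_in and draining next_out/avail_out between
+   calls, until Z_STREAM_END (or an error) is returned.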
+ */ + +int zlib_inflate(z_streamp strm, int flush) +{ + struct inflate_state *state; + unsigned char *next; /* next input */ + unsigned char *put; /* next output */ + unsigned have, left; /* available input and output */ + unsigned long hold; /* bit buffer */ + unsigned bits; /* bits in bit buffer */ + unsigned in, out; /* save starting available input and output */ + unsigned copy; /* number of stored or match bytes to copy */ + unsigned char *from; /* where to copy match bytes from */ + code this; /* current decoding table entry */ + code last; /* parent table entry */ + unsigned len; /* length to copy for repeats, bits to drop */ + int ret; /* return code */ + static const unsigned short order[19] = /* permutation of code lengths */ + {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + + if (strm == NULL || strm->state == NULL || strm->next_out == NULL || + (strm->next_in == NULL && strm->avail_in != 0)) + return Z_STREAM_ERROR; + + state = (struct inflate_state *)strm->state; + + if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ + LOAD(); + in = have; + out = left; + ret = Z_OK; + for (;;) + switch (state->mode) { + case HEAD: + if (state->wrap == 0) { + state->mode = TYPEDO; + break; + } + NEEDBITS(16); + if ( + ((BITS(8) << 8) + (hold >> 8)) % 31) { + strm->msg = (char *)"incorrect header check"; + state->mode = BAD; + break; + } + if (BITS(4) != Z_DEFLATED) { + strm->msg = (char *)"unknown compression method"; + state->mode = BAD; + break; + } + DROPBITS(4); + len = BITS(4) + 8; + if (len > state->wbits) { + strm->msg = (char *)"invalid window size"; + state->mode = BAD; + break; + } + state->dmax = 1U << len; + strm->adler = state->check = zlib_adler32(0L, NULL, 0); + state->mode = hold & 0x200 ? DICTID : TYPE; + INITBITS(); + break; + case DICTID: + NEEDBITS(32); + strm->adler = state->check = REVERSE(hold); + INITBITS(); + state->mode = DICT; + case DICT: + if (state->havedict == 0) { + RESTORE(); + return Z_NEED_DICT; + } + strm->adler = state->check = zlib_adler32(0L, NULL, 0); + state->mode = TYPE; + case TYPE: + if (flush == Z_BLOCK) goto inf_leave; + case TYPEDO: + if (state->last) { + BYTEBITS(); + state->mode = CHECK; + break; + } + NEEDBITS(3); + state->last = BITS(1); + DROPBITS(1); + switch (BITS(2)) { + case 0: /* stored block */ + state->mode = STORED; + break; + case 1: /* fixed block */ + zlib_fixedtables(state); + state->mode = LEN; /* decode codes */ + break; + case 2: /* dynamic block */ + state->mode = TABLE; + break; + case 3: + strm->msg = (char *)"invalid block type"; + state->mode = BAD; + } + DROPBITS(2); + break; + case STORED: + BYTEBITS(); /* go to byte boundary */ + NEEDBITS(32); + if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { + strm->msg = (char *)"invalid stored block lengths"; + state->mode = BAD; + break; + } + state->length = (unsigned)hold & 0xffff; + INITBITS(); + state->mode = COPY; + case COPY: + copy = state->length; + if (copy) { + if (copy > have) copy = have; + if (copy > left) copy = left; + if (copy == 0) goto inf_leave; + memcpy(put, next, copy); + have -= copy; + next += copy; + left -= copy; + put += copy; + state->length -= copy; + break; + } + state->mode = TYPE; + break; + case TABLE: + NEEDBITS(14); + state->nlen = BITS(5) + 257; + DROPBITS(5); + state->ndist = BITS(5) + 1; + DROPBITS(5); + state->ncode = BITS(4) + 4; + DROPBITS(4); +#ifndef PKZIP_BUG_WORKAROUND + if (state->nlen > 286 || state->ndist > 30) { + strm->msg = (char *)"too many length or distance symbols"; + state->mode = BAD; + 
break; + } +#endif + state->have = 0; + state->mode = LENLENS; + case LENLENS: + while (state->have < state->ncode) { + NEEDBITS(3); + state->lens[order[state->have++]] = (unsigned short)BITS(3); + DROPBITS(3); + } + while (state->have < 19) + state->lens[order[state->have++]] = 0; + state->next = state->codes; + state->lencode = (code const *)(state->next); + state->lenbits = 7; + ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), + &(state->lenbits), state->work); + if (ret) { + strm->msg = (char *)"invalid code lengths set"; + state->mode = BAD; + break; + } + state->have = 0; + state->mode = CODELENS; + case CODELENS: + while (state->have < state->nlen + state->ndist) { + for (;;) { + this = state->lencode[BITS(state->lenbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if (this.val < 16) { + NEEDBITS(this.bits); + DROPBITS(this.bits); + state->lens[state->have++] = this.val; + } + else { + if (this.val == 16) { + NEEDBITS(this.bits + 2); + DROPBITS(this.bits); + if (state->have == 0) { + strm->msg = (char *)"invalid bit length repeat"; + state->mode = BAD; + break; + } + len = state->lens[state->have - 1]; + copy = 3 + BITS(2); + DROPBITS(2); + } + else if (this.val == 17) { + NEEDBITS(this.bits + 3); + DROPBITS(this.bits); + len = 0; + copy = 3 + BITS(3); + DROPBITS(3); + } + else { + NEEDBITS(this.bits + 7); + DROPBITS(this.bits); + len = 0; + copy = 11 + BITS(7); + DROPBITS(7); + } + if (state->have + copy > state->nlen + state->ndist) { + strm->msg = (char *)"invalid bit length repeat"; + state->mode = BAD; + break; + } + while (copy--) + state->lens[state->have++] = (unsigned short)len; + } + } + + /* handle error breaks in while */ + if (state->mode == BAD) break; + + /* build code tables */ + state->next = state->codes; + state->lencode = (code const *)(state->next); + state->lenbits = 9; + ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), + &(state->lenbits), state->work); + if (ret) { + strm->msg = (char *)"invalid literal/lengths set"; + state->mode = BAD; + break; + } + state->distcode = (code const *)(state->next); + state->distbits = 6; + ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, + &(state->next), &(state->distbits), state->work); + if (ret) { + strm->msg = (char *)"invalid distances set"; + state->mode = BAD; + break; + } + state->mode = LEN; + case LEN: + if (have >= 6 && left >= 258) { + RESTORE(); + inflate_fast(strm, out); + LOAD(); + break; + } + for (;;) { + this = state->lencode[BITS(state->lenbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if (this.op && (this.op & 0xf0) == 0) { + last = this; + for (;;) { + this = state->lencode[last.val + + (BITS(last.bits + last.op) >> last.bits)]; + if ((unsigned)(last.bits + this.bits) <= bits) break; + PULLBYTE(); + } + DROPBITS(last.bits); + } + DROPBITS(this.bits); + state->length = (unsigned)this.val; + if ((int)(this.op) == 0) { + state->mode = LIT; + break; + } + if (this.op & 32) { + state->mode = TYPE; + break; + } + if (this.op & 64) { + strm->msg = (char *)"invalid literal/length code"; + state->mode = BAD; + break; + } + state->extra = (unsigned)(this.op) & 15; + state->mode = LENEXT; + case LENEXT: + if (state->extra) { + NEEDBITS(state->extra); + state->length += BITS(state->extra); + DROPBITS(state->extra); + } + state->mode = DIST; + case DIST: + for (;;) { + this = state->distcode[BITS(state->distbits)]; + if ((unsigned)(this.bits) <= bits) break; + PULLBYTE(); + } + if ((this.op & 0xf0) == 0) { + 
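/* 2nd level distance code: follow the link into the sub-table */ +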
last = this; + for (;;) { + this = state->distcode[last.val + + (BITS(last.bits + last.op) >> last.bits)]; + if ((unsigned)(last.bits + this.bits) <= bits) break; + PULLBYTE(); + } + DROPBITS(last.bits); + } + DROPBITS(this.bits); + if (this.op & 64) { + strm->msg = (char *)"invalid distance code"; + state->mode = BAD; + break; + } + state->offset = (unsigned)this.val; + state->extra = (unsigned)(this.op) & 15; + state->mode = DISTEXT; + case DISTEXT: + if (state->extra) { + NEEDBITS(state->extra); + state->offset += BITS(state->extra); + DROPBITS(state->extra); + } +#ifdef INFLATE_STRICT + if (state->offset > state->dmax) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } +#endif + if (state->offset > state->whave + out - left) { + strm->msg = (char *)"invalid distance too far back"; + state->mode = BAD; + break; + } + state->mode = MATCH; + case MATCH: + if (left == 0) goto inf_leave; + copy = out - left; + if (state->offset > copy) { /* copy from window */ + copy = state->offset - copy; + if (copy > state->write) { + copy -= state->write; + from = state->window + (state->wsize - copy); + } + else + from = state->window + (state->write - copy); + if (copy > state->length) copy = state->length; + } + else { /* copy from output */ + from = put - state->offset; + copy = state->length; + } + if (copy > left) copy = left; + left -= copy; + state->length -= copy; + do { + *put++ = *from++; + } while (--copy); + if (state->length == 0) state->mode = LEN; + break; + case LIT: + if (left == 0) goto inf_leave; + *put++ = (unsigned char)(state->length); + left--; + state->mode = LEN; + break; + case CHECK: + if (state->wrap) { + NEEDBITS(32); + out -= left; + strm->total_out += out; + state->total += out; + if (out) + strm->adler = state->check = + UPDATE(state->check, put - out, out); + out = left; + if (( + REVERSE(hold)) != state->check) { + strm->msg = (char *)"incorrect data check"; + state->mode = BAD; + break; + } + INITBITS(); + } + state->mode = DONE; + case DONE: + ret = Z_STREAM_END; + goto inf_leave; + case BAD: + ret = Z_DATA_ERROR; + goto inf_leave; + case MEM: + return Z_MEM_ERROR; + case SYNC: + default: + return Z_STREAM_ERROR; + } + + /* + Return from inflate(), updating the total counts and the check value. + If there was no progress during the inflate() call, return a buffer + error. Call zlib_updatewindow() to create and/or update the window state. + */ + inf_leave: + RESTORE(); + if (state->wsize || (state->mode < CHECK && out != strm->avail_out)) + zlib_updatewindow(strm, out); + + in -= strm->avail_in; + out -= strm->avail_out; + strm->total_in += in; + strm->total_out += out; + state->total += out; + if (state->wrap && out) + strm->adler = state->check = + UPDATE(state->check, strm->next_out - out, out); + + strm->data_type = state->bits + (state->last ? 64 : 0) + + (state->mode == TYPE ? 
128 : 0); + if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) + ret = Z_BUF_ERROR; + + if (flush == Z_PACKET_FLUSH && ret == Z_OK && + (strm->avail_out != 0 || strm->avail_in == 0)) + return zlib_inflateSyncPacket(strm); + return ret; +} + +int zlib_inflateEnd(z_streamp strm) +{ + if (strm == NULL || strm->state == NULL) + return Z_STREAM_ERROR; return Z_OK; } +#if 0 +int zlib_inflateSetDictionary(z_streamp strm, const Byte *dictionary, + uInt dictLength) +{ + struct inflate_state *state; + unsigned long id; + + /* check state */ + if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; + state = (struct inflate_state *)strm->state; + if (state->wrap != 0 && state->mode != DICT) + return Z_STREAM_ERROR; + + /* check for correct dictionary id */ + if (state->mode == DICT) { + id = zlib_adler32(0L, NULL, 0); + id = zlib_adler32(id, dictionary, dictLength); + if (id != state->check) + return Z_DATA_ERROR; + } + + /* copy dictionary to window */ + zlib_updatewindow(strm, strm->avail_out); -int zlib_inflateInit_( - z_streamp z, - const char *version, - int stream_size -) + if (dictLength > state->wsize) { + memcpy(state->window, dictionary + dictLength - state->wsize, + state->wsize); + state->whave = state->wsize; + } + else { + memcpy(state->window + state->wsize - dictLength, dictionary, + dictLength); + state->whave = dictLength; + } + state->havedict = 1; + return Z_OK; +} +#endif + +#if 0 +/* + Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found + or when out of input. When called, *have is the number of pattern bytes + found in order so far, in 0..3. On return *have is updated to the new + state. If on return *have equals four, then the pattern was found and the + return value is how many bytes were read including the last byte of the + pattern. If *have is less than four, then the pattern has not been found + yet and the return value is len. In the latter case, zlib_syncsearch() can be + called again with more data and the *have state. *have is initialized to + zero for the first call. + */ +static unsigned zlib_syncsearch(unsigned *have, unsigned char *buf, + unsigned len) { - return zlib_inflateInit2_(z, DEF_WBITS, version, stream_size); + unsigned got; + unsigned next; + + got = *have; + next = 0; + while (next < len && got < 4) { + if ((int)(buf[next]) == (got < 2 ? 
0 : 0xff)) + got++; + else if (buf[next]) + got = 0; + else + got = 4 - got; + next++; + } + *have = got; + return next; } +#endif -#undef NEEDBYTE -#undef NEXTBYTE -#define NEEDBYTE {if(z->avail_in==0)goto empty;r=trv;} -#define NEXTBYTE (z->avail_in--,z->total_in++,*z->next_in++) +#if 0 +int zlib_inflateSync(z_streamp strm) +{ + unsigned len; /* number of bytes to look at or looked at */ + unsigned long in, out; /* temporary to save total_in and total_out */ + unsigned char buf[4]; /* to restore bit buffer to byte string */ + struct inflate_state *state; + + /* check parameters */ + if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; + state = (struct inflate_state *)strm->state; + if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; + + /* if first time, start search in bit buffer */ + if (state->mode != SYNC) { + state->mode = SYNC; + state->hold <<= state->bits & 7; + state->bits -= state->bits & 7; + len = 0; + while (state->bits >= 8) { + buf[len++] = (unsigned char)(state->hold); + state->hold >>= 8; + state->bits -= 8; + } + state->have = 0; + zlib_syncsearch(&(state->have), buf, len); + } + + /* search available input */ + len = zlib_syncsearch(&(state->have), strm->next_in, strm->avail_in); + strm->avail_in -= len; + strm->next_in += len; + strm->total_in += len; + + /* return no joy or set up to restart inflate() on a new block */ + if (state->have != 4) return Z_DATA_ERROR; + in = strm->total_in; out = strm->total_out; + zlib_inflateReset(strm); + strm->total_in = in; strm->total_out = out; + state->mode = TYPE; + return Z_OK; +} +#endif -int zlib_inflate( - z_streamp z, - int f -) +/* + * This subroutine adds the data at next_in/avail_in to the output history + * without performing any output. The output buffer must be "caught up"; + * i.e. no pending output but this should always be the case. The state must + * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, + * the output will also be caught up, and the checksum will have been updated + * if need be. + */ +int zlib_inflateIncomp(z_stream *z) { - int r, trv; - uInt b; - - if (z == NULL || z->state == NULL || z->next_in == NULL) - return Z_STREAM_ERROR; - trv = f == Z_FINISH ? 
Z_BUF_ERROR : Z_OK; - r = Z_BUF_ERROR; - while (1) switch (z->state->mode) - { - case METHOD: - NEEDBYTE - if (((z->state->sub.method = NEXTBYTE) & 0xf) != Z_DEFLATED) - { - z->state->mode = I_BAD; - z->msg = (char*)"unknown compression method"; - z->state->sub.marker = 5; /* can't try inflateSync */ - break; - } - if ((z->state->sub.method >> 4) + 8 > z->state->wbits) - { - z->state->mode = I_BAD; - z->msg = (char*)"invalid window size"; - z->state->sub.marker = 5; /* can't try inflateSync */ - break; - } - z->state->mode = FLAG; - case FLAG: - NEEDBYTE - b = NEXTBYTE; - if (((z->state->sub.method << 8) + b) % 31) - { - z->state->mode = I_BAD; - z->msg = (char*)"incorrect header check"; - z->state->sub.marker = 5; /* can't try inflateSync */ - break; - } - if (!(b & PRESET_DICT)) - { - z->state->mode = BLOCKS; - break; - } - z->state->mode = DICT4; - case DICT4: - NEEDBYTE - z->state->sub.check.need = (uLong)NEXTBYTE << 24; - z->state->mode = DICT3; - case DICT3: - NEEDBYTE - z->state->sub.check.need += (uLong)NEXTBYTE << 16; - z->state->mode = DICT2; - case DICT2: - NEEDBYTE - z->state->sub.check.need += (uLong)NEXTBYTE << 8; - z->state->mode = DICT1; - case DICT1: - NEEDBYTE - z->state->sub.check.need += (uLong)NEXTBYTE; - z->adler = z->state->sub.check.need; - z->state->mode = DICT0; - return Z_NEED_DICT; - case DICT0: - z->state->mode = I_BAD; - z->msg = (char*)"need dictionary"; - z->state->sub.marker = 0; /* can try inflateSync */ - return Z_STREAM_ERROR; - case BLOCKS: - r = zlib_inflate_blocks(z->state->blocks, z, r); - if (f == Z_PACKET_FLUSH && z->avail_in == 0 && z->avail_out != 0) - r = zlib_inflate_packet_flush(z->state->blocks); - if (r == Z_DATA_ERROR) - { - z->state->mode = I_BAD; - z->state->sub.marker = 0; /* can try inflateSync */ - break; - } - if (r == Z_OK) - r = trv; - if (r != Z_STREAM_END) - return r; - r = trv; - zlib_inflate_blocks_reset(z->state->blocks, z, &z->state->sub.check.was); - if (z->state->nowrap) - { - z->state->mode = I_DONE; - break; - } - z->state->mode = CHECK4; - case CHECK4: - NEEDBYTE - z->state->sub.check.need = (uLong)NEXTBYTE << 24; - z->state->mode = CHECK3; - case CHECK3: - NEEDBYTE - z->state->sub.check.need += (uLong)NEXTBYTE << 16; - z->state->mode = CHECK2; - case CHECK2: - NEEDBYTE - z->state->sub.check.need += (uLong)NEXTBYTE << 8; - z->state->mode = CHECK1; - case CHECK1: - NEEDBYTE - z->state->sub.check.need += (uLong)NEXTBYTE; - - if (z->state->sub.check.was != z->state->sub.check.need) - { - z->state->mode = I_BAD; - z->msg = (char*)"incorrect data check"; - z->state->sub.marker = 5; /* can't try inflateSync */ - break; - } - z->state->mode = I_DONE; - case I_DONE: - return Z_STREAM_END; - case I_BAD: - return Z_DATA_ERROR; - default: - return Z_STREAM_ERROR; - } - empty: - if (f != Z_PACKET_FLUSH) - return r; - z->state->mode = I_BAD; - z->msg = (char *)"need more for packet flush"; - z->state->sub.marker = 0; /* can try inflateSync */ - return Z_DATA_ERROR; + struct inflate_state *state = (struct inflate_state *)z->state; + Byte *saved_no = z->next_out; + uInt saved_ao = z->avail_out; + + if (state->mode != TYPE && state->mode != HEAD) + return Z_DATA_ERROR; + + /* Setup some variables to allow misuse of updateWindow */ + z->avail_out = 0; + z->next_out = z->next_in + z->avail_in; + + zlib_updatewindow(z, z->avail_in); + + /* Restore saved variables */ + z->avail_out = saved_ao; + z->next_out = saved_no; + + z->adler = state->check = + UPDATE(state->check, z->next_in, z->avail_in); + + z->total_out += z->avail_in; + 
z->total_in += z->avail_in; + z->next_in += z->avail_in; + state->total += z->avail_in; + z->avail_in = 0; + + return Z_OK; } diff --git a/lib/zlib_inflate/inflate.h b/lib/zlib_inflate/inflate.h new file mode 100644 index 000000000000..df8a6c92052d --- /dev/null +++ b/lib/zlib_inflate/inflate.h @@ -0,0 +1,107 @@ +/* inflate.h -- internal inflate state definition + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* WARNING: this file should *not* be used by applications. It is + part of the implementation of the compression library and is + subject to change. Applications should only use zlib.h. + */ + +/* Possible inflate modes between inflate() calls */ +typedef enum { + HEAD, /* i: waiting for magic header */ + FLAGS, /* i: waiting for method and flags (gzip) */ + TIME, /* i: waiting for modification time (gzip) */ + OS, /* i: waiting for extra flags and operating system (gzip) */ + EXLEN, /* i: waiting for extra length (gzip) */ + EXTRA, /* i: waiting for extra bytes (gzip) */ + NAME, /* i: waiting for end of file name (gzip) */ + COMMENT, /* i: waiting for end of comment (gzip) */ + HCRC, /* i: waiting for header crc (gzip) */ + DICTID, /* i: waiting for dictionary check value */ + DICT, /* waiting for inflateSetDictionary() call */ + TYPE, /* i: waiting for type bits, including last-flag bit */ + TYPEDO, /* i: same, but skip check to exit inflate on new block */ + STORED, /* i: waiting for stored size (length and complement) */ + COPY, /* i/o: waiting for input or output to copy stored block */ + TABLE, /* i: waiting for dynamic block table lengths */ + LENLENS, /* i: waiting for code length code lengths */ + CODELENS, /* i: waiting for length/lit and distance code lengths */ + LEN, /* i: waiting for length/lit code */ + LENEXT, /* i: waiting for length extra bits */ + DIST, /* i: waiting for distance code */ + DISTEXT, /* i: waiting for distance extra bits */ + MATCH, /* o: waiting for output space to copy string */ + LIT, /* o: waiting for output space to write literal */ + CHECK, /* i: waiting for 32-bit check value */ + LENGTH, /* i: waiting for 32-bit length (gzip) */ + DONE, /* finished check, done -- remain here until reset */ + BAD, /* got a data error -- remain here until reset */ + MEM, /* got an inflate() memory error -- remain here until reset */ + SYNC /* looking for synchronization bytes to restart inflate() */ +} inflate_mode; + +/* + State transitions between above modes - + + (most modes can go to the BAD or MEM mode -- not shown for clarity) + + Process header: + HEAD -> (gzip) or (zlib) + (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME + NAME -> COMMENT -> HCRC -> TYPE + (zlib) -> DICTID or TYPE + DICTID -> DICT -> TYPE + Read deflate blocks: + TYPE -> STORED or TABLE or LEN or CHECK + STORED -> COPY -> TYPE + TABLE -> LENLENS -> CODELENS -> LEN + Read deflate codes: + LEN -> LENEXT or LIT or TYPE + LENEXT -> DIST -> DISTEXT -> MATCH -> LEN + LIT -> LEN + Process trailer: + CHECK -> LENGTH -> DONE + */ + +/* state maintained between inflate() calls. Approximately 7K bytes. 
*/ +struct inflate_state { + inflate_mode mode; /* current inflate mode */ + int last; /* true if processing last block */ + int wrap; /* bit 0 true for zlib, bit 1 true for gzip */ + int havedict; /* true if dictionary provided */ + int flags; /* gzip header method and flags (0 if zlib) */ + unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */ + unsigned long check; /* protected copy of check value */ + unsigned long total; /* protected copy of output count */ + /* gz_headerp head; */ /* where to save gzip header information */ + /* sliding window */ + unsigned wbits; /* log base 2 of requested window size */ + unsigned wsize; /* window size or zero if not using window */ + unsigned whave; /* valid bytes in the window */ + unsigned write; /* window write index */ + unsigned char *window; /* allocated sliding window, if needed */ + /* bit accumulator */ + unsigned long hold; /* input bit accumulator */ + unsigned bits; /* number of bits in "in" */ + /* for string and stored block copying */ + unsigned length; /* literal or length of data to copy */ + unsigned offset; /* distance back to copy string from */ + /* for table and code decoding */ + unsigned extra; /* extra bits needed */ + /* fixed and dynamic code tables */ + code const *lencode; /* starting table for length/literal codes */ + code const *distcode; /* starting table for distance codes */ + unsigned lenbits; /* index bits for lencode */ + unsigned distbits; /* index bits for distcode */ + /* dynamic table building */ + unsigned ncode; /* number of code length code lengths */ + unsigned nlen; /* number of length code lengths */ + unsigned ndist; /* number of distance code lengths */ + unsigned have; /* number of code lengths in lens[] */ + code *next; /* next available space in codes[] */ + unsigned short lens[320]; /* temporary storage for code lengths */ + unsigned short work[288]; /* work area for code table building */ + code codes[ENOUGH]; /* space for code tables */ +}; diff --git a/lib/zlib_inflate/inflate_syms.c b/lib/zlib_inflate/inflate_syms.c index ef49738f57ec..2061d4f06765 100644 --- a/lib/zlib_inflate/inflate_syms.c +++ b/lib/zlib_inflate/inflate_syms.c @@ -12,8 +12,7 @@ EXPORT_SYMBOL(zlib_inflate_workspacesize); EXPORT_SYMBOL(zlib_inflate); -EXPORT_SYMBOL(zlib_inflateInit_); -EXPORT_SYMBOL(zlib_inflateInit2_); +EXPORT_SYMBOL(zlib_inflateInit2); EXPORT_SYMBOL(zlib_inflateEnd); EXPORT_SYMBOL(zlib_inflateReset); EXPORT_SYMBOL(zlib_inflateIncomp); diff --git a/lib/zlib_inflate/inflate_sync.c b/lib/zlib_inflate/inflate_sync.c deleted file mode 100644 index 61411ff89d61..000000000000 --- a/lib/zlib_inflate/inflate_sync.c +++ /dev/null @@ -1,152 +0,0 @@ -/* inflate.c -- zlib interface to inflate modules - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include -#include "infblock.h" -#include "infutil.h" - -#if 0 -int zlib_inflateSync( - z_streamp z -) -{ - uInt n; /* number of bytes to look at */ - Byte *p; /* pointer to bytes */ - uInt m; /* number of marker bytes found in a row */ - uLong r, w; /* temporaries to save total_in and total_out */ - - /* set up */ - if (z == NULL || z->state == NULL) - return Z_STREAM_ERROR; - if (z->state->mode != I_BAD) - { - z->state->mode = I_BAD; - z->state->sub.marker = 0; - } - if ((n = z->avail_in) == 0) - return Z_BUF_ERROR; - p = z->next_in; - m = z->state->sub.marker; - - /* search */ - while (n && m < 4) - { - static const Byte mark[4] = {0, 0, 0xff, 0xff}; - if (*p == mark[m]) - m++; - else if 
(*p) - m = 0; - else - m = 4 - m; - p++, n--; - } - - /* restore */ - z->total_in += p - z->next_in; - z->next_in = p; - z->avail_in = n; - z->state->sub.marker = m; - - /* return no joy or set up to restart on a new block */ - if (m != 4) - return Z_DATA_ERROR; - r = z->total_in; w = z->total_out; - zlib_inflateReset(z); - z->total_in = r; z->total_out = w; - z->state->mode = BLOCKS; - return Z_OK; -} -#endif /* 0 */ - - -/* Returns true if inflate is currently at the end of a block generated - * by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP - * implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH - * but removes the length bytes of the resulting empty stored block. When - * decompressing, PPP checks that at the end of input packet, inflate is - * waiting for these length bytes. - */ -#if 0 -int zlib_inflateSyncPoint( - z_streamp z -) -{ - if (z == NULL || z->state == NULL || z->state->blocks == NULL) - return Z_STREAM_ERROR; - return zlib_inflate_blocks_sync_point(z->state->blocks); -} -#endif /* 0 */ - -/* - * This subroutine adds the data at next_in/avail_in to the output history - * without performing any output. The output buffer must be "caught up"; - * i.e. no pending output (hence s->read equals s->write), and the state must - * be BLOCKS (i.e. we should be willing to see the start of a series of - * BLOCKS). On exit, the output will also be caught up, and the checksum - * will have been updated if need be. - */ -static int zlib_inflate_addhistory(inflate_blocks_statef *s, - z_stream *z) -{ - uLong b; /* bit buffer */ /* NOT USED HERE */ - uInt k; /* bits in bit buffer */ /* NOT USED HERE */ - uInt t; /* temporary storage */ - Byte *p; /* input data pointer */ - uInt n; /* bytes available there */ - Byte *q; /* output window write pointer */ - uInt m; /* bytes to end of window or read pointer */ - - if (s->read != s->write) - return Z_STREAM_ERROR; - if (s->mode != TYPE) - return Z_DATA_ERROR; - - /* we're ready to rock */ - LOAD - /* while there is input ready, copy to output buffer, moving - * pointers as needed. - */ - while (n) { - t = n; /* how many to do */ - /* is there room until end of buffer? */ - if (t > m) t = m; - /* update check information */ - if (s->checkfn != NULL) - s->check = (*s->checkfn)(s->check, q, t); - memcpy(q, p, t); - q += t; - p += t; - n -= t; - z->total_out += t; - s->read = q; /* drag read pointer forward */ -/* WWRAP */ /* expand WWRAP macro by hand to handle s->read */ - if (q == s->end) { - s->read = q = s->window; - m = WAVAIL; - } - } - UPDATE - return Z_OK; -} - - -/* - * This subroutine adds the data at next_in/avail_in to the output history - * without performing any output. The output buffer must be "caught up"; - * i.e. no pending output (hence s->read equals s->write), and the state must - * be BLOCKS (i.e. we should be willing to see the start of a series of - * BLOCKS). On exit, the output will also be caught up, and the checksum - * will have been updated if need be. 
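The replacement for this helper is the new zlib_inflateIncomp() shown earlier in inflate.c: it feeds next_in/avail_in through the window-update path without producing any output, so the decompression history and the running check value stay consistent. A caller such as a PPP decompressor uses it when a frame was transmitted stored rather than compressed. A minimal, hypothetical sketch (the function and buffer names are made up, not part of the patch):

#include <linux/zlib.h>

/*
 * Keep the inflate history in sync with a packet that arrived
 * uncompressed.  Assumes the stream was set up with zlib_inflateInit2()
 * and a preallocated workspace, and that inflate is currently between
 * blocks (state TYPE or HEAD), as the new zlib_inflateIncomp() requires.
 */
static int feed_stored_packet(z_stream *strm, unsigned char *pkt,
			      unsigned int len)
{
	strm->next_in = pkt;
	strm->avail_in = len;
	return zlib_inflateIncomp(strm);	/* Z_OK on success */
}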
- */ - -int zlib_inflateIncomp( - z_stream *z - -) -{ - if (z->state->mode != BLOCKS) - return Z_DATA_ERROR; - return zlib_inflate_addhistory(z->state->blocks, z); -} diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c index 874950ec4858..62343c53bf7e 100644 --- a/lib/zlib_inflate/inftrees.c +++ b/lib/zlib_inflate/inftrees.c @@ -1,412 +1,329 @@ /* inftrees.c -- generate Huffman trees for efficient decoding - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h */ #include #include "inftrees.h" -#include "infutil.h" -static const char inflate_copyright[] __attribute_used__ = - " inflate 1.1.3 Copyright 1995-1998 Mark Adler "; +#define MAXBITS 15 + +const char inflate_copyright[] = + " inflate 1.2.3 Copyright 1995-2005 Mark Adler "; /* If you use the zlib library in a product, an acknowledgment is welcome in the documentation of your product. If for some reason you cannot include such an acknowledgment, I would appreciate that you keep this copyright string in the executable of your product. */ -struct internal_state; - -/* simplify the use of the inflate_huft type with some defines */ -#define exop word.what.Exop -#define bits word.what.Bits - - -static int huft_build ( - uInt *, /* code lengths in bits */ - uInt, /* number of codes */ - uInt, /* number of "simple" codes */ - const uInt *, /* list of base values for non-simple codes */ - const uInt *, /* list of extra bits for non-simple codes */ - inflate_huft **, /* result: starting table */ - uInt *, /* maximum lookup bits (returns actual) */ - inflate_huft *, /* space for trees */ - uInt *, /* hufts used in space */ - uInt * ); /* space for values */ - -/* Tables for deflate from PKZIP's appnote.txt. */ -static const uInt cplens[31] = { /* Copy lengths for literal codes 257..285 */ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, - 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; - /* see note #13 above about 258 */ -static const uInt cplext[31] = { /* Extra bits for literal codes 257..285 */ - 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, - 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 112, 112}; /* 112==invalid */ -static const uInt cpdist[30] = { /* Copy offsets for distance codes 0..29 */ - 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, - 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, - 8193, 12289, 16385, 24577}; -static const uInt cpdext[30] = { /* Extra bits for distance codes */ - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, - 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, - 12, 12, 13, 13}; /* - Huffman code decoding is performed using a multi-level table lookup. - The fastest way to decode is to simply build a lookup table whose - size is determined by the longest code. However, the time it takes - to build this table can also be a factor if the data being decoded - is not very long. The most common codes are necessarily the - shortest codes, so those codes dominate the decoding time, and hence - the speed. The idea is you can have a shorter table that decodes the - shorter, more probable codes, and then point to subsidiary tables for - the longer codes. The time it costs to decode the longer codes is - then traded against the time it takes to make longer tables. - - This results of this trade are in the variables lbits and dbits - below. 
lbits is the number of bits the first level table for literal/ - length codes can decode in one step, and dbits is the same thing for - the distance codes. Subsequent tables are also less than or equal to - those sizes. These values may be adjusted either when all of the - codes are shorter than that, in which case the longest code length in - bits is used, or when the shortest code is *longer* than the requested - table size, in which case the length of the shortest code in bits is - used. - - There are two different values for the two tables, since they code a - different number of possibilities each. The literal/length table - codes 286 possible values, or in a flat code, a little over eight - bits. The distance table codes 30 possible values, or a little less - than five bits, flat. The optimum values for speed end up being - about one bit more than those, so lbits is 8+1 and dbits is 5+1. - The optimum values may differ though from machine to machine, and - possibly even between compilers. Your mileage may vary. + Build a set of tables to decode the provided canonical Huffman code. + The code lengths are lens[0..codes-1]. The result starts at *table, + whose indices are 0..2^bits-1. work is a writable array of at least + lens shorts, which is used as a work area. type is the type of code + to be generated, CODES, LENS, or DISTS. On return, zero is success, + -1 is an invalid code, and +1 means that ENOUGH isn't enough. table + on return points to the next available entry's address. bits is the + requested root table index bits, and on return it is the actual root + table index bits. It will differ if the request is greater than the + longest code or if it is less than the shortest code. */ - - -/* If BMAX needs to be larger than 16, then h and x[] should be uLong. */ -#define BMAX 15 /* maximum bit length of any code */ - -static int huft_build( - uInt *b, /* code lengths in bits (all assumed <= BMAX) */ - uInt n, /* number of codes (assumed <= 288) */ - uInt s, /* number of simple-valued codes (0..s-1) */ - const uInt *d, /* list of base values for non-simple codes */ - const uInt *e, /* list of extra bits for non-simple codes */ - inflate_huft **t, /* result: starting table */ - uInt *m, /* maximum lookup bits, returns actual */ - inflate_huft *hp, /* space for trees */ - uInt *hn, /* hufts used in space */ - uInt *v /* working area: values in order of bit length */ -) -/* Given a list of code lengths and a maximum table size, make a set of - tables to decode that set of codes. Return Z_OK on success, Z_BUF_ERROR - if the given code set is incomplete (the tables are still built in this - case), Z_DATA_ERROR if the input is invalid (an over-subscribed set of - lengths), or Z_MEM_ERROR if not enough memory. 
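The over-subscribed/incomplete distinction above is just a Kraft-inequality budget over the length counts, and the new inftrees.c below performs the same walk. Purely as an illustration (not part of the patch), the check can be written on its own as:

/*
 * Illustrative only: classify a set of code lengths.
 * Returns < 0 if over-subscribed, > 0 if incomplete, 0 if complete.
 * lens[] entries are assumed to be in 0..15; zero means "symbol unused".
 */
static int classify_lengths(const unsigned short *lens, unsigned int codes)
{
	unsigned short count[16] = { 0 };
	unsigned int sym, len;
	int left = 1;			/* one code of length zero */

	for (sym = 0; sym < codes; sym++)
		count[lens[sym]]++;
	for (len = 1; len <= 15; len++) {
		left <<= 1;		/* double the code space */
		left -= count[len];	/* spend it on this length */
		if (left < 0)
			return left;	/* over-subscribed, e.g. {1, 1, 1} */
	}
	return left;			/* 0: complete; > 0: incomplete, e.g. {1} */
}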
*/ +int zlib_inflate_table(type, lens, codes, table, bits, work) +codetype type; +unsigned short *lens; +unsigned codes; +code **table; +unsigned *bits; +unsigned short *work; { + unsigned len; /* a code's length in bits */ + unsigned sym; /* index of code symbols */ + unsigned min, max; /* minimum and maximum code lengths */ + unsigned root; /* number of index bits for root table */ + unsigned curr; /* number of index bits for current table */ + unsigned drop; /* code bits to drop for sub-table */ + int left; /* number of prefix codes available */ + unsigned used; /* code entries in table used */ + unsigned huff; /* Huffman code */ + unsigned incr; /* for incrementing code, index */ + unsigned fill; /* index for replicating entries */ + unsigned low; /* low bits for current root entry */ + unsigned mask; /* mask for low root bits */ + code this; /* table entry for duplication */ + code *next; /* next available space in table */ + const unsigned short *base; /* base value table to use */ + const unsigned short *extra; /* extra bits table to use */ + int end; /* use base and extra for symbol > end */ + unsigned short count[MAXBITS+1]; /* number of codes of each length */ + unsigned short offs[MAXBITS+1]; /* offsets in table for each length */ + static const unsigned short lbase[31] = { /* Length codes 257..285 base */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + static const unsigned short lext[31] = { /* Length codes 257..285 extra */ + 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196}; + static const unsigned short dbase[32] = { /* Distance codes 0..29 base */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577, 0, 0}; + static const unsigned short dext[32] = { /* Distance codes 0..29 extra */ + 16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, + 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, + 28, 28, 29, 29, 64, 64}; + + /* + Process a set of code lengths to create a canonical Huffman code. The + code lengths are lens[0..codes-1]. Each length corresponds to the + symbols 0..codes-1. The Huffman code is generated by first sorting the + symbols by length from short to long, and retaining the symbol order + for codes with equal lengths. Then the code starts with all zero bits + for the first code of the shortest length, and the codes are integer + increments for the same length, and zeros are appended as the length + increases. For the deflate format, these bits are stored backwards + from their more natural integer increment ordering, and so when the + decoding tables are built in the large loop below, the integer codes + are incremented backwards. + + This routine assumes, but does not check, that all of the entries in + lens[] are in the range 0..MAXBITS. The caller must assure this. + 1..MAXBITS is interpreted as that code length. zero means that that + symbol does not occur in this code. + + The codes are sorted by computing a count of codes for each length, + creating from that a table of starting indices for each length in the + sorted table, and then entering the symbols in order in the sorted + table. The sorted table is work[], with that space being provided by + the caller. + + The length counts are used for other purposes as well, i.e. 
finding + the minimum and maximum length codes, determining if there are any + codes at all, checking for a valid set of lengths, and looking ahead + at length counts to determine sub-table sizes when building the + decoding tables. + */ + + /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */ + for (len = 0; len <= MAXBITS; len++) + count[len] = 0; + for (sym = 0; sym < codes; sym++) + count[lens[sym]]++; + + /* bound code lengths, force root to be within code lengths */ + root = *bits; + for (max = MAXBITS; max >= 1; max--) + if (count[max] != 0) break; + if (root > max) root = max; + if (max == 0) { /* no symbols to code at all */ + this.op = (unsigned char)64; /* invalid code marker */ + this.bits = (unsigned char)1; + this.val = (unsigned short)0; + *(*table)++ = this; /* make a table to force an error */ + *(*table)++ = this; + *bits = 1; + return 0; /* no symbols, but wait for decoding to report error */ + } + for (min = 1; min <= MAXBITS; min++) + if (count[min] != 0) break; + if (root < min) root = min; + + /* check for an over-subscribed or incomplete set of lengths */ + left = 1; + for (len = 1; len <= MAXBITS; len++) { + left <<= 1; + left -= count[len]; + if (left < 0) return -1; /* over-subscribed */ + } + if (left > 0 && (type == CODES || max != 1)) + return -1; /* incomplete set */ + + /* generate offsets into symbol table for each length for sorting */ + offs[1] = 0; + for (len = 1; len < MAXBITS; len++) + offs[len + 1] = offs[len] + count[len]; + + /* sort symbols by length, by symbol order within each length */ + for (sym = 0; sym < codes; sym++) + if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym; + + /* + Create and fill in decoding tables. In this loop, the table being + filled is at next and has curr index bits. The code being used is huff + with length len. That code is converted to an index by dropping drop + bits off of the bottom. For codes where len is less than drop + curr, + those top drop + curr - len bits are incremented through all values to + fill the table with replicated entries. + + root is the number of index bits for the root table. When len exceeds + root, sub-tables are created pointed to by the root entry with an index + of the low root bits of huff. This is saved in low to check for when a + new sub-table should be started. drop is zero when the root table is + being filled, and drop is root when sub-tables are being filled. + + When a new sub-table is needed, it is necessary to look ahead in the + code lengths to determine what size sub-table is needed. The length + counts are used for this, and so count[] is decremented as codes are + entered in the tables. + + used keeps track of how many table entries have been allocated from the + provided *table space. It is checked when a LENS table is being made + against the space in *table, ENOUGH, minus the maximum space needed by + the worst case distance code, MAXD. This should never happen, but the + sufficiency of ENOUGH has not been proven exhaustively, hence the check. + This assumes that when type == LENS, bits == 9. + + sym increments through all symbols, and the loop terminates when + all codes of length max, i.e. all codes, have been processed. This + routine permits incomplete codes, so another loop after this one fills + in the rest of the decoding tables with invalid code markers. 
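The code assignment described above is the standard canonical-Huffman construction from RFC 1951. As a hedged, stand-alone illustration of just that step (separate from zlib_inflate_table(), which additionally bit-reverses the codes while filling its lookup tables), symbol lengths {2, 1, 3, 3} produce the codes 10, 0, 110 and 111:

/*
 * Illustrative only: assign canonical Huffman codes from code lengths.
 * The shortest code is all zeros, codes of equal length are consecutive
 * integers, and a zero is appended each time the length increases.
 */
static void canonical_codes(const unsigned short *lens, unsigned int codes,
			    unsigned short *out)
{
	unsigned short count[16] = { 0 };
	unsigned short next_code[16];
	unsigned int sym, len, code = 0;

	for (sym = 0; sym < codes; sym++)
		count[lens[sym]]++;
	count[0] = 0;
	for (len = 1; len <= 15; len++) {
		code = (code + count[len - 1]) << 1;
		next_code[len] = code;		/* first code of this length */
	}
	for (sym = 0; sym < codes; sym++)
		if (lens[sym] != 0)
			out[sym] = next_code[lens[sym]]++;
}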
+ */ + + /* set up for code type */ + switch (type) { + case CODES: + base = extra = work; /* dummy value--not used */ + end = 19; + break; + case LENS: + base = lbase; + base -= 257; + extra = lext; + extra -= 257; + end = 256; + break; + default: /* DISTS */ + base = dbase; + extra = dext; + end = -1; + } - uInt a; /* counter for codes of length k */ - uInt c[BMAX+1]; /* bit length count table */ - uInt f; /* i repeats in table every f entries */ - int g; /* maximum code length */ - int h; /* table level */ - register uInt i; /* counter, current code */ - register uInt j; /* counter */ - register int k; /* number of bits in current code */ - int l; /* bits per table (returned in m) */ - uInt mask; /* (1 << w) - 1, to avoid cc -O bug on HP */ - register uInt *p; /* pointer into c[], b[], or v[] */ - inflate_huft *q; /* points to current table */ - struct inflate_huft_s r; /* table entry for structure assignment */ - inflate_huft *u[BMAX]; /* table stack */ - register int w; /* bits before this table == (l * h) */ - uInt x[BMAX+1]; /* bit offsets, then code stack */ - uInt *xp; /* pointer into x */ - int y; /* number of dummy codes added */ - uInt z; /* number of entries in current table */ - - - /* Generate counts for each bit length */ - p = c; -#define C0 *p++ = 0; -#define C2 C0 C0 C0 C0 -#define C4 C2 C2 C2 C2 - C4 /* clear c[]--assume BMAX+1 is 16 */ - p = b; i = n; - do { - c[*p++]++; /* assume all entries <= BMAX */ - } while (--i); - if (c[0] == n) /* null input--all zero length codes */ - { - *t = NULL; - *m = 0; - return Z_OK; - } - - - /* Find minimum and maximum length, bound *m by those */ - l = *m; - for (j = 1; j <= BMAX; j++) - if (c[j]) - break; - k = j; /* minimum code length */ - if ((uInt)l < j) - l = j; - for (i = BMAX; i; i--) - if (c[i]) - break; - g = i; /* maximum code length */ - if ((uInt)l > i) - l = i; - *m = l; - - - /* Adjust last length count to fill out codes, if needed */ - for (y = 1 << j; j < i; j++, y <<= 1) - if ((y -= c[j]) < 0) - return Z_DATA_ERROR; - if ((y -= c[i]) < 0) - return Z_DATA_ERROR; - c[i] += y; - - - /* Generate starting offsets into the value table for each length */ - x[1] = j = 0; - p = c + 1; xp = x + 2; - while (--i) { /* note that i == g from above */ - *xp++ = (j += *p++); - } - - - /* Make a table of values in order of bit lengths */ - p = b; i = 0; - do { - if ((j = *p++) != 0) - v[x[j]++] = i; - } while (++i < n); - n = x[g]; /* set n to length of v */ - - - /* Generate the Huffman codes and for each, make the table entries */ - x[0] = i = 0; /* first Huffman code is zero */ - p = v; /* grab values in bit order */ - h = -1; /* no tables yet--level -1 */ - w = -l; /* bits decoded == (l * h) */ - u[0] = NULL; /* just to keep compilers happy */ - q = NULL; /* ditto */ - z = 0; /* ditto */ - - /* go through the bit lengths (k already is bits in shortest code) */ - for (; k <= g; k++) - { - a = c[k]; - while (a--) - { - /* here i is the Huffman code of length k bits for value *p */ - /* make tables up to required level */ - while (k > w + l) - { - h++; - w += l; /* previous table always l bits */ - - /* compute minimum size table less than or equal to l bits */ - z = g - w; - z = z > (uInt)l ? 
l : z; /* table size upper limit */ - if ((f = 1 << (j = k - w)) > a + 1) /* try a k-w bit table */ - { /* too few codes for k-w bit table */ - f -= a + 1; /* deduct codes from patterns left */ - xp = c + k; - if (j < z) - while (++j < z) /* try smaller tables up to z bits */ - { - if ((f <<= 1) <= *++xp) - break; /* enough codes to use up j bits */ - f -= *xp; /* else deduct codes from patterns */ - } + /* initialize state for loop */ + huff = 0; /* starting code */ + sym = 0; /* starting code symbol */ + len = min; /* starting code length */ + next = *table; /* current table to fill in */ + curr = root; /* current table index bits */ + drop = 0; /* current bits to drop from code for index */ + low = (unsigned)(-1); /* trigger new sub-table when len > root */ + used = 1U << root; /* use root table entries */ + mask = used - 1; /* mask for comparing low */ + + /* check available table space */ + if (type == LENS && used >= ENOUGH - MAXD) + return 1; + + /* process all codes and make table entries */ + for (;;) { + /* create table entry */ + this.bits = (unsigned char)(len - drop); + if ((int)(work[sym]) < end) { + this.op = (unsigned char)0; + this.val = work[sym]; } - z = 1 << j; /* table entries for j-bit table */ - - /* allocate new table */ - if (*hn + z > MANY) /* (note: doesn't matter for fixed) */ - return Z_DATA_ERROR; /* overflow of MANY */ - u[h] = q = hp + *hn; - *hn += z; - - /* connect to last table, if there is one */ - if (h) - { - x[h] = i; /* save pattern for backing up */ - r.bits = (Byte)l; /* bits to dump before this table */ - r.exop = (Byte)j; /* bits in this table */ - j = i >> (w - l); - r.base = (uInt)(q - u[h-1] - j); /* offset to this table */ - u[h-1][j] = r; /* connect to last table */ + else if ((int)(work[sym]) > end) { + this.op = (unsigned char)(extra[work[sym]]); + this.val = base[work[sym]]; + } + else { + this.op = (unsigned char)(32 + 64); /* end of block */ + this.val = 0; } - else - *t = q; /* first table is returned result */ - } - - /* set up table entry in r */ - r.bits = (Byte)(k - w); - if (p >= v + n) - r.exop = 128 + 64; /* out of values--invalid code */ - else if (*p < s) - { - r.exop = (Byte)(*p < 256 ? 0 : 32 + 64); /* 256 is end-of-block */ - r.base = *p++; /* simple code is just the value */ - } - else - { - r.exop = (Byte)(e[*p - s] + 16 + 64);/* non-simple--look up in lists */ - r.base = d[*p++ - s]; - } - - /* fill code-like entries with r */ - f = 1 << (k - w); - for (j = i >> w; j < z; j += f) - q[j] = r; - - /* backwards increment the k-bit code i */ - for (j = 1 << (k - 1); i & j; j >>= 1) - i ^= j; - i ^= j; - - /* backup over finished tables */ - mask = (1 << w) - 1; /* needed on HP, cc -O bug */ - while ((i & mask) != x[h]) - { - h--; /* don't need to update q */ - w -= l; - mask = (1 << w) - 1; - } - } - } + /* replicate for those indices with low len bits equal to huff */ + incr = 1U << (len - drop); + fill = 1U << curr; + min = fill; /* save offset to next table */ + do { + fill -= incr; + next[(huff >> drop) + fill] = this; + } while (fill != 0); + + /* backwards increment the len-bit code huff */ + incr = 1U << (len - 1); + while (huff & incr) + incr >>= 1; + if (incr != 0) { + huff &= incr - 1; + huff += incr; + } + else + huff = 0; - /* Return Z_BUF_ERROR if we were given an incomplete table */ - return y != 0 && g != 1 ? 
Z_BUF_ERROR : Z_OK; -} + /* go to next symbol, update count, len */ + sym++; + if (--(count[len]) == 0) { + if (len == max) break; + len = lens[work[sym]]; + } + /* create new sub-table if needed */ + if (len > root && (huff & mask) != low) { + /* if first time, transition to sub-tables */ + if (drop == 0) + drop = root; + + /* increment past last table */ + next += min; /* here min is 1 << curr */ + + /* determine length of next table */ + curr = len - drop; + left = (int)(1 << curr); + while (curr + drop < max) { + left -= count[curr + drop]; + if (left <= 0) break; + curr++; + left <<= 1; + } -int zlib_inflate_trees_bits( - uInt *c, /* 19 code lengths */ - uInt *bb, /* bits tree desired/actual depth */ - inflate_huft **tb, /* bits tree result */ - inflate_huft *hp, /* space for trees */ - z_streamp z /* for messages */ -) -{ - int r; - uInt hn = 0; /* hufts used in space */ - uInt *v; /* work area for huft_build */ - - v = WS(z)->tree_work_area_1; - r = huft_build(c, 19, 19, NULL, NULL, tb, bb, hp, &hn, v); - if (r == Z_DATA_ERROR) - z->msg = (char*)"oversubscribed dynamic bit lengths tree"; - else if (r == Z_BUF_ERROR || *bb == 0) - { - z->msg = (char*)"incomplete dynamic bit lengths tree"; - r = Z_DATA_ERROR; - } - return r; -} + /* check for enough space */ + used += 1U << curr; + if (type == LENS && used >= ENOUGH - MAXD) + return 1; -int zlib_inflate_trees_dynamic( - uInt nl, /* number of literal/length codes */ - uInt nd, /* number of distance codes */ - uInt *c, /* that many (total) code lengths */ - uInt *bl, /* literal desired/actual bit depth */ - uInt *bd, /* distance desired/actual bit depth */ - inflate_huft **tl, /* literal/length tree result */ - inflate_huft **td, /* distance tree result */ - inflate_huft *hp, /* space for trees */ - z_streamp z /* for messages */ -) -{ - int r; - uInt hn = 0; /* hufts used in space */ - uInt *v; /* work area for huft_build */ - - /* allocate work area */ - v = WS(z)->tree_work_area_2; - - /* build literal/length tree */ - r = huft_build(c, nl, 257, cplens, cplext, tl, bl, hp, &hn, v); - if (r != Z_OK || *bl == 0) - { - if (r == Z_DATA_ERROR) - z->msg = (char*)"oversubscribed literal/length tree"; - else if (r != Z_MEM_ERROR) - { - z->msg = (char*)"incomplete literal/length tree"; - r = Z_DATA_ERROR; - } - return r; - } - - /* build distance tree */ - r = huft_build(c + nl, nd, 0, cpdist, cpdext, td, bd, hp, &hn, v); - if (r != Z_OK || (*bd == 0 && nl > 257)) - { - if (r == Z_DATA_ERROR) - z->msg = (char*)"oversubscribed distance tree"; - else if (r == Z_BUF_ERROR) { -#ifdef PKZIP_BUG_WORKAROUND - r = Z_OK; - } -#else - z->msg = (char*)"incomplete distance tree"; - r = Z_DATA_ERROR; - } - else if (r != Z_MEM_ERROR) - { - z->msg = (char*)"empty distance tree with lengths"; - r = Z_DATA_ERROR; + /* point entry in root table to sub-table */ + low = huff & mask; + (*table)[low].op = (unsigned char)curr; + (*table)[low].bits = (unsigned char)root; + (*table)[low].val = (unsigned short)(next - *table); + } } - return r; -#endif - } - /* done */ - return Z_OK; -} + /* + Fill in rest of table for incomplete codes. This loop is similar to the + loop above in incrementing huff for table indices. It is assumed that + len is equal to curr + drop, so there is no loop needed to increment + through high index bits. When the current sub-table is filled, the loop + drops back to the root table to fill in any remaining entries there. 
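The non-obvious part of the loop below is the "backwards increment": because deflate stores code bits in reverse, the len-bit code huff is stepped through table indices in bit-reversed order. A minimal stand-alone version of that step (illustrative only, not part of the patch) behaves like this: for len == 3, starting from 0, it visits 0, 4, 2, 6, 1, 5, 3, 7 and then wraps back to 0.

/* Illustrative only: advance a len-bit, bit-reversed code by one. */
static unsigned int rev_increment(unsigned int huff, unsigned int len)
{
	unsigned int incr = 1U << (len - 1);	/* top bit == reversed lowest bit */

	while (huff & incr)			/* skip over the "carry" bits */
		incr >>= 1;
	if (incr == 0)
		return 0;			/* wrapped around */
	return (huff & (incr - 1)) + incr;	/* clear the carry bits, set this one */
}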
+ */ + this.op = (unsigned char)64; /* invalid code marker */ + this.bits = (unsigned char)(len - drop); + this.val = (unsigned short)0; + while (huff != 0) { + /* when done with sub-table, drop back to root table */ + if (drop != 0 && (huff & mask) != low) { + drop = 0; + len = root; + next = *table; + this.bits = (unsigned char)len; + } + /* put invalid code marker in table */ + next[huff >> drop] = this; -int zlib_inflate_trees_fixed( - uInt *bl, /* literal desired/actual bit depth */ - uInt *bd, /* distance desired/actual bit depth */ - inflate_huft **tl, /* literal/length tree result */ - inflate_huft **td, /* distance tree result */ - inflate_huft *hp, /* space for trees */ - z_streamp z /* for memory allocation */ -) -{ - int i; /* temporary variable */ - unsigned l[288]; /* length list for huft_build */ - uInt *v; /* work area for huft_build */ - - /* set up literal table */ - for (i = 0; i < 144; i++) - l[i] = 8; - for (; i < 256; i++) - l[i] = 9; - for (; i < 280; i++) - l[i] = 7; - for (; i < 288; i++) /* make a complete, but wrong code set */ - l[i] = 8; - *bl = 9; - v = WS(z)->tree_work_area_1; - if ((i = huft_build(l, 288, 257, cplens, cplext, tl, bl, hp, &i, v)) != 0) - return i; - - /* set up distance table */ - for (i = 0; i < 30; i++) /* make an incomplete code set */ - l[i] = 5; - *bd = 5; - if ((i = huft_build(l, 30, 0, cpdist, cpdext, td, bd, hp, &i, v)) > 1) - return i; - - return Z_OK; + /* backwards increment the len-bit code huff */ + incr = 1U << (len - 1); + while (huff & incr) + incr >>= 1; + if (incr != 0) { + huff &= incr - 1; + huff += incr; + } + else + huff = 0; + } + + /* set return parameters */ + *table += used; + *bits = root; + return 0; } diff --git a/lib/zlib_inflate/inftrees.h b/lib/zlib_inflate/inftrees.h index e37705adc008..5f5219b1240e 100644 --- a/lib/zlib_inflate/inftrees.h +++ b/lib/zlib_inflate/inftrees.h @@ -1,6 +1,6 @@ /* inftrees.h -- header to use inftrees.c - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h */ /* WARNING: this file should *not* be used by applications. It is @@ -8,57 +8,48 @@ subject to change. Applications should only use zlib.h. */ -/* Huffman code lookup table entry--this entry is four bytes for machines - that have 16-bit pointers (e.g. PC's in the small or medium model). */ - -#ifndef _INFTREES_H -#define _INFTREES_H - -typedef struct inflate_huft_s inflate_huft; - -struct inflate_huft_s { - union { - struct { - Byte Exop; /* number of extra bits or operation */ - Byte Bits; /* number of bits in this code or subcode */ - } what; - uInt pad; /* pad structure to a power of 2 (4 bytes for */ - } word; /* 16-bit, 8 bytes for 32-bit int's) */ - uInt base; /* literal, length base, distance base, - or table offset */ -}; +/* Structure for decoding tables. Each entry provides either the + information needed to do the operation requested by the code that + indexed that table entry, or it provides a pointer to another + table that indexes more bits of the code. op indicates whether + the entry is a pointer to another table, a literal, a length or + distance, an end-of-block, or an invalid code. For a table + pointer, the low four bits of op is the number of index bits of + that table. For a length or distance, the low four bits of op + is the number of extra bits to get after the code. 
bits is + the number of bits in this code or part of the code to drop off + of the bit buffer. val is the actual byte to output in the case + of a literal, the base length or distance, or the offset from + the current table to the next table. Each entry is four bytes. */ +typedef struct { + unsigned char op; /* operation, extra bits, table bits */ + unsigned char bits; /* bits in this part of the code */ + unsigned short val; /* offset in table or code value */ +} code; + +/* op values as set by inflate_table(): + 00000000 - literal + 0000tttt - table link, tttt != 0 is the number of table index bits + 0001eeee - length or distance, eeee is the number of extra bits + 01100000 - end of block + 01000000 - invalid code + */ /* Maximum size of dynamic tree. The maximum found in a long but non- - exhaustive search was 1004 huft structures (850 for length/literals - and 154 for distances, the latter actually the result of an - exhaustive search). The actual maximum is not known, but the - value below is more than safe. */ -#define MANY 1440 - -extern int zlib_inflate_trees_bits ( - uInt *, /* 19 code lengths */ - uInt *, /* bits tree desired/actual depth */ - inflate_huft **, /* bits tree result */ - inflate_huft *, /* space for trees */ - z_streamp); /* for messages */ - -extern int zlib_inflate_trees_dynamic ( - uInt, /* number of literal/length codes */ - uInt, /* number of distance codes */ - uInt *, /* that many (total) code lengths */ - uInt *, /* literal desired/actual bit depth */ - uInt *, /* distance desired/actual bit depth */ - inflate_huft **, /* literal/length tree result */ - inflate_huft **, /* distance tree result */ - inflate_huft *, /* space for trees */ - z_streamp); /* for messages */ - -extern int zlib_inflate_trees_fixed ( - uInt *, /* literal desired/actual bit depth */ - uInt *, /* distance desired/actual bit depth */ - inflate_huft **, /* literal/length tree result */ - inflate_huft **, /* distance tree result */ - inflate_huft *, /* space for trees */ - z_streamp); /* for memory allocation */ - -#endif /* _INFTREES_H */ + exhaustive search was 1444 code structures (852 for length/literals + and 592 for distances, the latter actually the result of an + exhaustive search). The true maximum is not known, but the value + below is more than safe. 
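Given the op encoding listed above, a decoder walks a table by masking the low bits of its bit buffer, fetching the entry, and branching on op. A hedged sketch of that dispatch (the helper name is made up; the real decode loops live in inflate.c):

/* Illustrative only: what each kind of table entry means to a decoder. */
static const char *describe_entry(code e)
{
	if (e.op == 0)
		return "literal";		/* e.val is the byte to emit */
	if (e.op & 16)
		return "length/distance";	/* base in e.val, (e.op & 15) extra bits */
	if (!(e.op & 64))
		return "sub-table link";	/* next table at e.val, (e.op & 15) index bits */
	if (e.op & 32)
		return "end of block";
	return "invalid code";
}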
*/ +#define ENOUGH 2048 +#define MAXD 592 + +/* Type of code to build for inftable() */ +typedef enum { + CODES, + LENS, + DISTS +} codetype; + +extern int zlib_inflate_table (codetype type, unsigned short *lens, + unsigned codes, code **table, + unsigned *bits, unsigned short *work); diff --git a/lib/zlib_inflate/infutil.c b/lib/zlib_inflate/infutil.c deleted file mode 100644 index 00202b3438e1..000000000000 --- a/lib/zlib_inflate/infutil.c +++ /dev/null @@ -1,88 +0,0 @@ -/* inflate_util.c -- data and routines common to blocks and codes - * Copyright (C) 1995-1998 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include -#include "infblock.h" -#include "inftrees.h" -#include "infcodes.h" -#include "infutil.h" - -struct inflate_codes_state; - -/* And'ing with mask[n] masks the lower n bits */ -uInt zlib_inflate_mask[17] = { - 0x0000, - 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, - 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff -}; - - -/* copy as much as possible from the sliding window to the output area */ -int zlib_inflate_flush( - inflate_blocks_statef *s, - z_streamp z, - int r -) -{ - uInt n; - Byte *p; - Byte *q; - - /* local copies of source and destination pointers */ - p = z->next_out; - q = s->read; - - /* compute number of bytes to copy as far as end of window */ - n = (uInt)((q <= s->write ? s->write : s->end) - q); - if (n > z->avail_out) n = z->avail_out; - if (n && r == Z_BUF_ERROR) r = Z_OK; - - /* update counters */ - z->avail_out -= n; - z->total_out += n; - - /* update check information */ - if (s->checkfn != NULL) - z->adler = s->check = (*s->checkfn)(s->check, q, n); - - /* copy as far as end of window */ - memcpy(p, q, n); - p += n; - q += n; - - /* see if more to copy at beginning of window */ - if (q == s->end) - { - /* wrap pointers */ - q = s->window; - if (s->write == s->end) - s->write = s->window; - - /* compute bytes to copy */ - n = (uInt)(s->write - q); - if (n > z->avail_out) n = z->avail_out; - if (n && r == Z_BUF_ERROR) r = Z_OK; - - /* update counters */ - z->avail_out -= n; - z->total_out += n; - - /* update check information */ - if (s->checkfn != NULL) - z->adler = s->check = (*s->checkfn)(s->check, q, n); - - /* copy */ - memcpy(p, q, n); - p += n; - q += n; - } - - /* update pointers */ - z->next_out = p; - s->read = q; - - /* done */ - return r; -} diff --git a/lib/zlib_inflate/infutil.h b/lib/zlib_inflate/infutil.h index a15875fc5f72..eb1a9007bd86 100644 --- a/lib/zlib_inflate/infutil.h +++ b/lib/zlib_inflate/infutil.h @@ -11,184 +11,12 @@ #ifndef _INFUTIL_H #define _INFUTIL_H -#include -#include "inftrees.h" -#include "infcodes.h" - -typedef enum { - TYPE, /* get type bits (3, including end bit) */ - LENS, /* get lengths for stored */ - STORED, /* processing stored block */ - TABLE, /* get table lengths */ - BTREE, /* get bit lengths tree for a dynamic block */ - DTREE, /* get length, distance trees for a dynamic block */ - CODES, /* processing fixed or dynamic block */ - DRY, /* output remaining window bytes */ - B_DONE, /* finished last block, done */ - B_BAD} /* got a data error--stuck here */ -inflate_block_mode; - -/* inflate blocks semi-private state */ -struct inflate_blocks_state { - - /* mode */ - inflate_block_mode mode; /* current inflate_block mode */ - - /* mode dependent information */ - union { - uInt left; /* if STORED, bytes left to copy */ - struct { - uInt table; /* table lengths (14 bits) */ - uInt index; /* index into blens (or border) */ - 
uInt *blens; /* bit lengths of codes */ - uInt bb; /* bit length tree depth */ - inflate_huft *tb; /* bit length decoding tree */ - } trees; /* if DTREE, decoding info for trees */ - struct { - inflate_codes_statef - *codes; - } decode; /* if CODES, current state */ - } sub; /* submode */ - uInt last; /* true if this block is the last block */ - - /* mode independent information */ - uInt bitk; /* bits in bit buffer */ - uLong bitb; /* bit buffer */ - inflate_huft *hufts; /* single malloc for tree space */ - Byte *window; /* sliding window */ - Byte *end; /* one byte after sliding window */ - Byte *read; /* window read pointer */ - Byte *write; /* window write pointer */ - check_func checkfn; /* check function */ - uLong check; /* check on output */ - -}; - - -/* defines for inflate input/output */ -/* update pointers and return */ -#define UPDBITS {s->bitb=b;s->bitk=k;} -#define UPDIN {z->avail_in=n;z->total_in+=p-z->next_in;z->next_in=p;} -#define UPDOUT {s->write=q;} -#define UPDATE {UPDBITS UPDIN UPDOUT} -#define LEAVE {UPDATE return zlib_inflate_flush(s,z,r);} -/* get bytes and bits */ -#define LOADIN {p=z->next_in;n=z->avail_in;b=s->bitb;k=s->bitk;} -#define NEEDBYTE {if(n)r=Z_OK;else LEAVE} -#define NEXTBYTE (n--,*p++) -#define NEEDBITS(j) {while(k<(j)){NEEDBYTE;b|=((uLong)NEXTBYTE)<>=(j);k-=(j);} -/* output bytes */ -#define WAVAIL (uInt)(qread?s->read-q-1:s->end-q) -#define LOADOUT {q=s->write;m=(uInt)WAVAIL;} -#define WRAP {if(q==s->end&&s->read!=s->window){q=s->window;m=(uInt)WAVAIL;}} -#define FLUSH {UPDOUT r=zlib_inflate_flush(s,z,r); LOADOUT} -#define NEEDOUT {if(m==0){WRAP if(m==0){FLUSH WRAP if(m==0) LEAVE}}r=Z_OK;} -#define OUTBYTE(a) {*q++=(Byte)(a);m--;} -/* load local pointers */ -#define LOAD {LOADIN LOADOUT} - -/* masks for lower bits (size given to avoid silly warnings with Visual C++) */ -extern uInt zlib_inflate_mask[17]; - -/* copy as much as possible from the sliding window to the output area */ -extern int zlib_inflate_flush ( - inflate_blocks_statef *, - z_streamp , - int); - -/* inflate private state */ -typedef enum { - METHOD, /* waiting for method byte */ - FLAG, /* waiting for flag byte */ - DICT4, /* four dictionary check bytes to go */ - DICT3, /* three dictionary check bytes to go */ - DICT2, /* two dictionary check bytes to go */ - DICT1, /* one dictionary check byte to go */ - DICT0, /* waiting for inflateSetDictionary */ - BLOCKS, /* decompressing blocks */ - CHECK4, /* four check bytes to go */ - CHECK3, /* three check bytes to go */ - CHECK2, /* two check bytes to go */ - CHECK1, /* one check byte to go */ - I_DONE, /* finished check, done */ - I_BAD} /* got an error--stay here */ -inflate_mode; - -struct internal_state { - - /* mode */ - inflate_mode mode; /* current inflate mode */ - - /* mode dependent information */ - union { - uInt method; /* if FLAGS, method byte */ - struct { - uLong was; /* computed check value */ - uLong need; /* stream check value */ - } check; /* if CHECK, check values to compare */ - uInt marker; /* if BAD, inflateSync's marker bytes count */ - } sub; /* submode */ - - /* mode independent information */ - int nowrap; /* flag for no wrapper */ - uInt wbits; /* log2(window size) (8..15, defaults to 15) */ - inflate_blocks_statef - *blocks; /* current inflate_blocks state */ - -}; - -/* inflate codes private state */ -typedef enum { /* waiting for "i:"=input, "o:"=output, "x:"=nothing */ - START, /* x: set up for LEN */ - LEN, /* i: get length/literal/eob next */ - LENEXT, /* i: getting length extra (have base) */ - DIST, 
/* i: get distance next */ - DISTEXT, /* i: getting distance extra */ - COPY, /* o: copying bytes in window, waiting for space */ - LIT, /* o: got literal, waiting for output space */ - WASH, /* o: got eob, possibly still output waiting */ - END, /* x: got eob and all data flushed */ - BADCODE} /* x: got error */ -inflate_codes_mode; - -struct inflate_codes_state { - - /* mode */ - inflate_codes_mode mode; /* current inflate_codes mode */ - - /* mode dependent information */ - uInt len; - union { - struct { - inflate_huft *tree; /* pointer into tree */ - uInt need; /* bits needed */ - } code; /* if LEN or DIST, where in tree */ - uInt lit; /* if LIT, literal */ - struct { - uInt get; /* bits to get for extra */ - uInt dist; /* distance back to copy from */ - } copy; /* if EXT or COPY, where and how much */ - } sub; /* submode */ - - /* mode independent information */ - Byte lbits; /* ltree bits decoded per branch */ - Byte dbits; /* dtree bits decoder per branch */ - inflate_huft *ltree; /* literal/length/eob tree */ - inflate_huft *dtree; /* distance tree */ - -}; +#include /* memory allocation for inflation */ struct inflate_workspace { - inflate_codes_statef working_state; - struct inflate_blocks_state working_blocks_state; - struct internal_state internal_state; - unsigned int tree_work_area_1[19]; - unsigned int tree_work_area_2[288]; - unsigned working_blens[258 + 0x1f + 0x1f]; - inflate_huft working_hufts[MANY]; + struct inflate_state inflate_state; unsigned char working_window[1 << MAX_WBITS]; }; -- cgit v1.2.3 From 4a31e348e3ecaf54c50240109ac4574b180f8840 Mon Sep 17 00:00:00 2001 From: Krzysztof Halasa Date: Thu, 22 Jun 2006 22:20:19 +0200 Subject: [PATCH] WAN: register_hdlc_device() doesn't need dev_alloc_name() David Boggs noticed that register_hdlc_device() no longer needs to call dev_alloc_name() as it's called by register_netdev(). register_hdlc_device() is currently equivalent to register_netdev(). hdlc_setup() is now EXPORTed as per David's request. 
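A hypothetical driver probe, sketched only to show the registration flow after this change (mydrv_probe and priv are made-up names); the interface name prepared by alloc_hdlcdev() is resolved inside register_netdev(), so drivers need no separate dev_alloc_name() call:

#include <linux/hdlc.h>

/*
 * Hypothetical probe sketch (names are made up, not from the patch):
 * alloc_hdlcdev() prepares the net_device via hdlc_setup(), and
 * register_hdlc_device() is now literally register_netdev().
 */
static int mydrv_probe(void *priv)
{
	struct net_device *dev = alloc_hdlcdev(priv);
	int err;

	if (!dev)
		return -ENOMEM;

	/* ... set hardware-specific net_device fields here ... */

	err = register_hdlc_device(dev);	/* == register_netdev(dev) */
	if (err)
		free_netdev(dev);
	return err;
}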
Signed-off-by: Krzysztof Halasa Signed-off-by: Jeff Garzik --- drivers/net/wan/hdlc_generic.c | 24 ++---------------------- include/linux/hdlc.h | 2 +- 2 files changed, 3 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/hdlc_generic.c b/drivers/net/wan/hdlc_generic.c index 46cef8f92133..57f9538b8fb5 100644 --- a/drivers/net/wan/hdlc_generic.c +++ b/drivers/net/wan/hdlc_generic.c @@ -259,7 +259,7 @@ int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } } -static void hdlc_setup(struct net_device *dev) +void hdlc_setup(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); @@ -288,26 +288,6 @@ struct net_device *alloc_hdlcdev(void *priv) return dev; } -int register_hdlc_device(struct net_device *dev) -{ - int result = dev_alloc_name(dev, "hdlc%d"); - if (result < 0) - return result; - - result = register_netdev(dev); - if (result != 0) - return -EIO; - -#if 0 - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); /* no carrier until DCD goes up */ -#endif - - return 0; -} - - - void unregister_hdlc_device(struct net_device *dev) { rtnl_lock(); @@ -326,8 +306,8 @@ EXPORT_SYMBOL(hdlc_open); EXPORT_SYMBOL(hdlc_close); EXPORT_SYMBOL(hdlc_set_carrier); EXPORT_SYMBOL(hdlc_ioctl); +EXPORT_SYMBOL(hdlc_setup); EXPORT_SYMBOL(alloc_hdlcdev); -EXPORT_SYMBOL(register_hdlc_device); EXPORT_SYMBOL(unregister_hdlc_device); static struct packet_type hdlc_packet_type = { diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index df695e9ae327..4513f9e40937 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -188,7 +188,7 @@ int hdlc_x25_ioctl(struct net_device *dev, struct ifreq *ifr); int hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); /* Must be used by hardware driver on module startup/exit */ -int register_hdlc_device(struct net_device *dev); +#define register_hdlc_device(dev) register_netdev(dev) void unregister_hdlc_device(struct net_device *dev); struct net_device *alloc_hdlcdev(void *priv); -- cgit v1.2.3 From 5b057c6b1a25d57edf2b4d1e956e50936480a9ff Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 23 Jun 2006 02:06:41 -0700 Subject: [NET]: Avoid allocating skb in skb_pad First of all it is unnecessary to allocate a new skb in skb_pad since the existing one is not shared. More importantly, our hard_start_xmit interface does not allow a new skb to be allocated since that breaks requeueing. This patch uses pskb_expand_head to expand the existing skb and linearize it if needed. Actually, someone should sift through every instance of skb_pad on a non-linear skb as they do not fit the reasons why this was originally created. Incidentally, this fixes a minor bug when the skb is cloned (tcpdump, TCP, etc.). As it is skb_pad will simply write over a cloned skb. Because of the position of the write it is unlikely to cause problems but still it's best if we don't do it. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- drivers/net/3c527.c | 3 +-- drivers/net/82596.c | 3 +-- drivers/net/a2065.c | 3 +-- drivers/net/ariadne.c | 3 +-- drivers/net/arm/ether1.c | 3 +-- drivers/net/arm/ether3.c | 3 +-- drivers/net/atarilance.c | 3 +-- drivers/net/cassini.c | 3 +-- drivers/net/declance.c | 3 +-- drivers/net/depca.c | 7 ++----- drivers/net/eepro.c | 3 +-- drivers/net/eexpress.c | 3 +-- drivers/net/epic100.c | 7 ++----- drivers/net/eth16i.c | 3 +-- drivers/net/hp100.c | 7 ++----- drivers/net/lance.c | 3 +-- drivers/net/lasi_82596.c | 3 +-- drivers/net/lp486e.c | 3 +-- drivers/net/myri10ge/myri10ge.c | 3 +-- drivers/net/pcmcia/fmvj18x_cs.c | 3 +-- drivers/net/pcmcia/xirc2ps_cs.c | 3 +-- drivers/net/r8169.c | 3 +-- drivers/net/seeq8005.c | 3 +-- drivers/net/sis190.c | 3 +-- drivers/net/sk98lin/skge.c | 2 +- drivers/net/skge.c | 3 +-- drivers/net/smc9194.c | 3 +-- drivers/net/sonic.c | 3 +-- drivers/net/starfire.c | 3 +-- drivers/net/via-rhine.c | 7 ++----- drivers/net/wireless/ray_cs.c | 3 +-- drivers/net/wireless/wavelan_cs.c | 7 ++----- drivers/net/yellowfin.c | 12 +++++------- drivers/net/znet.c | 3 +-- include/linux/skbuff.h | 11 +++++------ net/core/skbuff.c | 36 ++++++++++++++++++++++++++---------- 36 files changed, 74 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c index 1b1cb0026072..157eda573925 100644 --- a/drivers/net/3c527.c +++ b/drivers/net/3c527.c @@ -1031,8 +1031,7 @@ static int mc32_send_packet(struct sk_buff *skb, struct net_device *dev) return 1; } - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { netif_wake_queue(dev); return 0; } diff --git a/drivers/net/82596.c b/drivers/net/82596.c index da0c878dcba8..8a9f7d61b9b1 100644 --- a/drivers/net/82596.c +++ b/drivers/net/82596.c @@ -1070,8 +1070,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->len, (unsigned int)skb->data)); if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c index 79bb56b8dcef..71165ac0257a 100644 --- a/drivers/net/a2065.c +++ b/drivers/net/a2065.c @@ -573,8 +573,7 @@ static int lance_start_xmit (struct sk_buff *skb, struct net_device *dev) if (len < ETH_ZLEN) { len = ETH_ZLEN; - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; } diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c index d1b6b1f794e2..a9bb7a4aff98 100644 --- a/drivers/net/ariadne.c +++ b/drivers/net/ariadne.c @@ -607,8 +607,7 @@ static int ariadne_start_xmit(struct sk_buff *skb, struct net_device *dev) /* FIXME: is the 79C960 new enough to do its own padding right ? 
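The driver conversions in this patch all follow the same pattern, sketched here with made-up names: skb_padto() now pads the passed skb in place and returns an error only if expanding it failed, in which case the skb has already been freed, so the driver simply drops the packet.

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>

/*
 * Sketch of the new calling convention in a hard_start_xmit handler
 * (mydrv_start_xmit is a made-up name).  On failure the skb is gone;
 * on success the same skb pointer is now at least ETH_ZLEN long.
 */
static int mydrv_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (skb->len < ETH_ZLEN) {
		if (skb_padto(skb, ETH_ZLEN))
			return 0;	/* skb was freed; count it as dropped */
		/* same skb pointer, now padded in place */
	}

	/* ... hand the padded skb to the hardware ... */
	return 0;
}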
*/ if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; len = ETH_ZLEN; } diff --git a/drivers/net/arm/ether1.c b/drivers/net/arm/ether1.c index 36475eb2727f..312955d07b28 100644 --- a/drivers/net/arm/ether1.c +++ b/drivers/net/arm/ether1.c @@ -700,8 +700,7 @@ ether1_sendpacket (struct sk_buff *skb, struct net_device *dev) } if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) goto out; } diff --git a/drivers/net/arm/ether3.c b/drivers/net/arm/ether3.c index f1d5b1027ff7..081074180e62 100644 --- a/drivers/net/arm/ether3.c +++ b/drivers/net/arm/ether3.c @@ -518,8 +518,7 @@ ether3_sendpacket(struct sk_buff *skb, struct net_device *dev) length = (length + 1) & ~1; if (length != skb->len) { - skb = skb_padto(skb, length); - if (skb == NULL) + if (skb_padto(skb, length)) goto out; } diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c index 442b2cbeb58a..91783a8008be 100644 --- a/drivers/net/atarilance.c +++ b/drivers/net/atarilance.c @@ -804,8 +804,7 @@ static int lance_start_xmit( struct sk_buff *skb, struct net_device *dev ) ++len; if (len > skb->len) { - skb = skb_padto(skb, len); - if (skb == NULL) + if (skb_padto(skb, len)) return 0; } diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c index 39f36aa05aa8..565a54f1d06a 100644 --- a/drivers/net/cassini.c +++ b/drivers/net/cassini.c @@ -2915,8 +2915,7 @@ static int cas_start_xmit(struct sk_buff *skb, struct net_device *dev) */ static int ring; - skb = skb_padto(skb, cp->min_frame_size); - if (!skb) + if (skb_padto(skb, cp->min_frame_size)) return 0; /* XXX: we need some higher-level QoS hooks to steer packets to diff --git a/drivers/net/declance.c b/drivers/net/declance.c index f130bdab3fd3..d3d958e7ac56 100644 --- a/drivers/net/declance.c +++ b/drivers/net/declance.c @@ -885,8 +885,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev) len = skblen; if (len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; len = ETH_ZLEN; } diff --git a/drivers/net/depca.c b/drivers/net/depca.c index 0941d40f046f..e946c43d3b10 100644 --- a/drivers/net/depca.c +++ b/drivers/net/depca.c @@ -938,11 +938,8 @@ static int depca_start_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->len < 1) goto out; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - goto out; - } + if (skb_padto(skb, ETH_ZLEN)) + goto out; netif_stop_queue(dev); diff --git a/drivers/net/eepro.c b/drivers/net/eepro.c index a806dfe54d23..e70f172699db 100644 --- a/drivers/net/eepro.c +++ b/drivers/net/eepro.c @@ -1154,8 +1154,7 @@ static int eepro_send_packet(struct sk_buff *skb, struct net_device *dev) printk(KERN_DEBUG "%s: entering eepro_send_packet routine.\n", dev->name); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c index 82bd356e4f3a..a74b20715755 100644 --- a/drivers/net/eexpress.c +++ b/drivers/net/eexpress.c @@ -677,8 +677,7 @@ static int eexp_xmit(struct sk_buff *buf, struct net_device *dev) #endif if (buf->len < ETH_ZLEN) { - buf = skb_padto(buf, ETH_ZLEN); - if (buf == NULL) + if (skb_padto(buf, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c index 8d680ce600d7..724d7dc35fa3 100644 --- a/drivers/net/epic100.c +++ 
b/drivers/net/epic100.c @@ -1027,11 +1027,8 @@ static int epic_start_xmit(struct sk_buff *skb, struct net_device *dev) u32 ctrl_word; unsigned long flags; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; /* Caution: the write order is important here, set the field with the "ownership" bit last. */ diff --git a/drivers/net/eth16i.c b/drivers/net/eth16i.c index b67545be2caa..4bf76f86d8e9 100644 --- a/drivers/net/eth16i.c +++ b/drivers/net/eth16i.c @@ -1064,8 +1064,7 @@ static int eth16i_tx(struct sk_buff *skb, struct net_device *dev) unsigned long flags; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index 247c8ca86033..dd1dc32dc98d 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -1487,11 +1487,8 @@ static int hp100_start_xmit_bm(struct sk_buff *skb, struct net_device *dev) if (skb->len <= 0) return 0; - if (skb->len < ETH_ZLEN && lp->chip == HP100_CHIPID_SHASTA) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (lp->chip == HP100_CHIPID_SHASTA && skb_padto(skb, ETH_ZLEN)) + return 0; /* Get Tx ring tail pointer */ if (lp->txrtail->next == lp->txrhead) { diff --git a/drivers/net/lance.c b/drivers/net/lance.c index bb5ad479210b..c1c3452c90ca 100644 --- a/drivers/net/lance.c +++ b/drivers/net/lance.c @@ -968,8 +968,7 @@ static int lance_start_xmit(struct sk_buff *skb, struct net_device *dev) /* The old LANCE chips doesn't automatically pad buffers to min. size. */ if (chip_table[lp->chip_version].flags & LANCE_MUST_PAD) { if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) goto out; lp->tx_ring[entry].length = -ETH_ZLEN; } diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c index 957888de3d7e..1ab09447baa5 100644 --- a/drivers/net/lasi_82596.c +++ b/drivers/net/lasi_82596.c @@ -1083,8 +1083,7 @@ static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->len, skb->data)); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/lp486e.c b/drivers/net/lp486e.c index 94d5ea1ce8bd..bf3f343ae715 100644 --- a/drivers/net/lp486e.c +++ b/drivers/net/lp486e.c @@ -877,8 +877,7 @@ static int i596_start_xmit (struct sk_buff *skb, struct net_device *dev) { length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 5a74f63618bc..b983e1e04348 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1939,8 +1939,7 @@ again: /* pad frames to at least ETH_ZLEN bytes */ if (unlikely(skb->len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { /* The packet is gone, so we must * return 0 */ mgp->stats.tx_dropped += 1; diff --git a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c index 09b11761cdfa..ea93b8f18605 100644 --- a/drivers/net/pcmcia/fmvj18x_cs.c +++ b/drivers/net/pcmcia/fmvj18x_cs.c @@ -831,8 +831,7 @@ static int fjn_start_xmit(struct sk_buff *skb, struct net_device *dev) if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + 
if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/pcmcia/xirc2ps_cs.c b/drivers/net/pcmcia/xirc2ps_cs.c index e80d1e3aec68..9bae77ce1314 100644 --- a/drivers/net/pcmcia/xirc2ps_cs.c +++ b/drivers/net/pcmcia/xirc2ps_cs.c @@ -1374,8 +1374,7 @@ do_start_xmit(struct sk_buff *skb, struct net_device *dev) */ if (pktlen < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; pktlen = ETH_ZLEN; } diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 9945cc6b8d90..985afe0e6273 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -2222,8 +2222,7 @@ static int rtl8169_start_xmit(struct sk_buff *skb, struct net_device *dev) len = skb->len; if (unlikely(len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) + if (skb_padto(skb, ETH_ZLEN)) goto err_update_stats; len = ETH_ZLEN; } diff --git a/drivers/net/seeq8005.c b/drivers/net/seeq8005.c index bcef03feb2fc..efd0f235020f 100644 --- a/drivers/net/seeq8005.c +++ b/drivers/net/seeq8005.c @@ -396,8 +396,7 @@ static int seeq8005_send_packet(struct sk_buff *skb, struct net_device *dev) unsigned char *buf; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c index 31dd3f036fa8..df39f3447655 100644 --- a/drivers/net/sis190.c +++ b/drivers/net/sis190.c @@ -1156,8 +1156,7 @@ static int sis190_start_xmit(struct sk_buff *skb, struct net_device *dev) dma_addr_t mapping; if (unlikely(skb->len < ETH_ZLEN)) { - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) { + if (skb_padto(skb, ETH_ZLEN)) { tp->stats.tx_dropped++; goto out; } diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c index 38a26df4095f..f3efbd177ae7 100644 --- a/drivers/net/sk98lin/skge.c +++ b/drivers/net/sk98lin/skge.c @@ -1525,7 +1525,7 @@ struct sk_buff *pMessage) /* pointer to send-message */ ** This is to resolve faulty padding by the HW with 0xaa bytes. 
*/ if (BytesSend < C_LEN_ETHERNET_MINSIZE) { - if ((pMessage = skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) == NULL) { + if (skb_padto(pMessage, C_LEN_ETHERNET_MINSIZE)) { spin_unlock_irqrestore(&pTxPort->TxDesRingLock, Flags); return 0; } diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 536dd1cf7f79..19a4a16055dc 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -2310,8 +2310,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev) u64 map; unsigned long flags; - skb = skb_padto(skb, ETH_ZLEN); - if (!skb) + if (skb_padto(skb, ETH_ZLEN)) return NETDEV_TX_OK; if (!spin_trylock_irqsave(&skge->tx_lock, flags)) diff --git a/drivers/net/smc9194.c b/drivers/net/smc9194.c index 6cf16f322ad5..8b0321f1976c 100644 --- a/drivers/net/smc9194.c +++ b/drivers/net/smc9194.c @@ -523,8 +523,7 @@ static int smc_wait_to_send_packet( struct sk_buff * skb, struct net_device * de length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) { + if (skb_padto(skb, ETH_ZLEN)) { netif_wake_queue(dev); return 0; } diff --git a/drivers/net/sonic.c b/drivers/net/sonic.c index 90b818a8de6e..cab0dd958492 100644 --- a/drivers/net/sonic.c +++ b/drivers/net/sonic.c @@ -231,8 +231,7 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev) length = skb->len; if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c index 9b7805be21da..c158eedc7813 100644 --- a/drivers/net/starfire.c +++ b/drivers/net/starfire.c @@ -1349,8 +1349,7 @@ static int start_tx(struct sk_buff *skb, struct net_device *dev) #if defined(ZEROCOPY) && defined(HAS_BROKEN_FIRMWARE) if (skb->ip_summed == CHECKSUM_HW) { - skb = skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK); - if (skb == NULL) + if (skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK)) return NETDEV_TX_OK; } #endif /* ZEROCOPY && HAS_BROKEN_FIRMWARE */ diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c index fdc21037f6dc..c80a4f1d5f7a 100644 --- a/drivers/net/via-rhine.c +++ b/drivers/net/via-rhine.c @@ -1284,11 +1284,8 @@ static int rhine_start_tx(struct sk_buff *skb, struct net_device *dev) /* Calculate the next Tx descriptor entry. */ entry = rp->cur_tx % TX_RING_SIZE; - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; rp->tx_skbuff[entry] = skb; diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 879eb427607c..a915fe6c6aa5 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -924,8 +924,7 @@ static int ray_dev_start_xmit(struct sk_buff *skb, struct net_device *dev) if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/drivers/net/wireless/wavelan_cs.c b/drivers/net/wireless/wavelan_cs.c index f7724eb2fa7e..561250f73fd3 100644 --- a/drivers/net/wireless/wavelan_cs.c +++ b/drivers/net/wireless/wavelan_cs.c @@ -3194,11 +3194,8 @@ wavelan_packet_xmit(struct sk_buff * skb, * and we don't have the Ethernet specific requirement of beeing * able to detect collisions, therefore in theory we don't really * need to pad. 
Jean II */ - if (skb->len < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) - return 0; - } + if (skb_padto(skb, ETH_ZLEN)) + return 0; wv_packet_write(dev, skb->data, skb->len); diff --git a/drivers/net/yellowfin.c b/drivers/net/yellowfin.c index fd0f43b7db5b..ecec8e5db786 100644 --- a/drivers/net/yellowfin.c +++ b/drivers/net/yellowfin.c @@ -862,13 +862,11 @@ static int yellowfin_start_xmit(struct sk_buff *skb, struct net_device *dev) /* Fix GX chipset errata. */ if (cacheline_end > 24 || cacheline_end == 0) { len = skb->len + 32 - cacheline_end + 1; - if (len != skb->len) - skb = skb_padto(skb, len); - } - if (skb == NULL) { - yp->tx_skbuff[entry] = NULL; - netif_wake_queue(dev); - return 0; + if (skb_padto(skb, len)) { + yp->tx_skbuff[entry] = NULL; + netif_wake_queue(dev); + return 0; + } } } yp->tx_skbuff[entry] = skb; diff --git a/drivers/net/znet.c b/drivers/net/znet.c index 3ac047bc727d..a7c089df66e6 100644 --- a/drivers/net/znet.c +++ b/drivers/net/znet.c @@ -544,8 +544,7 @@ static int znet_send_packet(struct sk_buff *skb, struct net_device *dev) printk(KERN_DEBUG "%s: ZNet_send_packet.\n", dev->name); if (length < ETH_ZLEN) { - skb = skb_padto(skb, ETH_ZLEN); - if (skb == NULL) + if (skb_padto(skb, ETH_ZLEN)) return 0; length = ETH_ZLEN; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66f8819f9568..f8c7eb79a27f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -345,7 +345,7 @@ extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); -extern struct sk_buff * skb_pad(struct sk_buff *skb, int pad); +extern int skb_pad(struct sk_buff *skb, int pad); #define dev_kfree_skb(a) kfree_skb(a) extern void skb_over_panic(struct sk_buff *skb, int len, void *here); @@ -1122,16 +1122,15 @@ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it - * is untouched. Returns the buffer, which may be a replacement - * for the original, or NULL for out of memory - in which case - * the original buffer is still freed. + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. */ -static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len) +static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; if (likely(size >= len)) - return skb; + return 0; return skb_pad(skb, len-size); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb7210f4005e..fe63d4efbd4d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -781,24 +781,40 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return NULL in out of memory cases. + * May return error in out of memory cases. The skb is freed on error. */ -struct sk_buff *skb_pad(struct sk_buff *skb, int pad) +int skb_pad(struct sk_buff *skb, int pad) { - struct sk_buff *nskb; + int err; + int ntail; /* If the skbuff is non linear tailroom is always zero.. 
*/ - if (skb_tailroom(skb) >= pad) { + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { memset(skb->data+skb->len, 0, pad); - return skb; + return 0; } - - nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); + + ntail = skb->data_len + pad - (skb->end - skb->tail); + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) + goto free_skb; + } + + /* FIXME: The use of this function with non-linear skb's really needs + * to be audited. + */ + err = skb_linearize(skb); + if (unlikely(err)) + goto free_skb; + + memset(skb->data + skb->len, 0, pad); + return 0; + +free_skb: kfree_skb(skb); - if (nskb) - memset(nskb->data+nskb->len, 0, pad); - return nskb; + return err; } /* Trims skb to length len. It can change skb pointers. -- cgit v1.2.3 From 7967168cefdbc63bf332d6b1548eca7cd65ebbcc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:40:14 -0700 Subject: [NET]: Merge TSO/UFO fields in sk_buff Having separate fields in sk_buff for TSO/UFO (tso_size/ufo_size) is not going to scale if we add any more segmentation methods (e.g., DCCP). So let's merge them. They were used to tell the protocol of a packet. This function has been subsumed by the new gso_type field. This is essentially a set of netdev feature bits (shifted by 16 bits) that are required to process a specific skb. As such it's easy to tell whether a given device can process a GSO skb: you just have to and the gso_type field and the netdev's features field. I've made gso_type a conjunction. The idea is that you have a base type (e.g., SKB_GSO_TCPV4) that can be modified further to support new features. For example, if we add a hardware TSO type that supports ECN, they would declare NETIF_F_TSO | NETIF_F_TSO_ECN. All TSO packets with CWR set would have a gso_type of SKB_GSO_TCPV4 | SKB_GSO_TCPV4_ECN while all other TSO packets would be SKB_GSO_TCPV4. This means that only the CWR packets need to be emulated in software. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/8139cp.c | 2 +- drivers/net/bnx2.c | 4 ++-- drivers/net/chelsio/sge.c | 4 ++-- drivers/net/e1000/e1000_main.c | 10 ++++----- drivers/net/forcedeth.c | 4 ++-- drivers/net/ixgb/ixgb_main.c | 4 ++-- drivers/net/loopback.c | 4 ++-- drivers/net/myri10ge/myri10ge.c | 4 ++-- drivers/net/r8169.c | 2 +- drivers/net/s2io.c | 16 +++++++------- drivers/net/sky2.c | 4 ++-- drivers/net/tg3.c | 4 ++-- drivers/net/typhoon.c | 2 +- drivers/s390/net/qeth_eddp.c | 12 +++++------ drivers/s390/net/qeth_main.c | 4 ++-- drivers/s390/net/qeth_tso.h | 2 +- include/linux/netdevice.h | 14 ++++++++++-- include/linux/skbuff.h | 12 ++++++++--- include/net/tcp.h | 4 ++-- net/bridge/br_forward.c | 4 ++-- net/bridge/br_netfilter.c | 2 +- net/core/skbuff.c | 16 ++++++++------ net/ipv4/ip_output.c | 16 ++++++++------ net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 47 ++++++++++++++++++++++++----------------- net/ipv6/ip6_output.c | 7 +++--- 27 files changed, 120 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c index a26077a175ad..0cdc830449d8 100644 --- a/drivers/net/8139cp.c +++ b/drivers/net/8139cp.c @@ -797,7 +797,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev) entry = cp->tx_head; eor = (entry == (CP_TX_RING_SIZE - 1)) ? 
RingEnd : 0; if (dev->features & NETIF_F_TSO) - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb_shinfo(skb)->nr_frags == 0) { struct cp_desc *txd = &cp->tx_ring[entry]; diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 702d546567ad..7635736cc791 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -1640,7 +1640,7 @@ bnx2_tx_int(struct bnx2 *bp) skb = tx_buf->skb; #ifdef BCM_TSO /* partial BD completions possible with TSO packets */ - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { u16 last_idx, last_ring_idx; last_idx = sw_cons + @@ -4428,7 +4428,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16)); } #ifdef BCM_TSO - if ((mss = skb_shinfo(skb)->tso_size) && + if ((mss = skb_shinfo(skb)->gso_size) && (skb->len > (bp->dev->mtu + ETH_HLEN))) { u32 tcp_opt_len, ip_tcp_len; diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index 4391bf4bf573..53efff6da784 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1418,7 +1418,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) struct cpl_tx_pkt *cpl; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { int eth_type; struct cpl_tx_pkt_lso *hdr; @@ -1433,7 +1433,7 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev) hdr->ip_hdr_words = skb->nh.iph->ihl; hdr->tcp_hdr_words = skb->h.th->doff; hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type, - skb_shinfo(skb)->tso_size)); + skb_shinfo(skb)->gso_size)); hdr->len = htonl(skb->len - sizeof(*hdr)); cpl = (struct cpl_tx_pkt *)hdr; sge->stats.tx_lso_pkts++; diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index a373ccb308d8..32b7d444b374 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, uint8_t ipcss, ipcso, tucss, tucso, hdr_len; int err; - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -2402,7 +2402,7 @@ e1000_tso(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (skb->protocol == htons(ETH_P_IP)) { skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; @@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring, * tso gets written back prematurely before the data is fully * DMA'd to the controller */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) { + !skb_shinfo(skb)->gso_size) { tx_ring->last_tx_tso = 0; size -= 4; } @@ -2757,7 +2757,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; /* The controller does a simple calculation to * make sure there is enough room in the FIFO before * initiating the DMA for each buffer. 
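
The capability check described in the changelog above -- ANDing the packet's gso_type (shifted into feature-bit position) against the device's feature mask -- can be sketched as follows. This is an illustrative sketch only, with a hypothetical helper name; the patch itself adds the equivalent helper as netif_needs_gso() in include/linux/netdevice.h further below.

static inline int device_cannot_gso(struct net_device *dev, struct sk_buff *skb)
{
	/* Map the skb's GSO type onto the matching NETIF_F_* feature bits. */
	int required = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT;

	/* Only GSO packets whose required feature bits are not all present
	 * in dev->features need software segmentation.
	 */
	return skb_shinfo(skb)->gso_size &&
	       (dev->features & required) != required;
}
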
The calc is: @@ -2807,7 +2807,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) #ifdef NETIF_F_TSO /* Controller Erratum workaround */ if (!skb->data_len && tx_ring->last_tx_tso && - !skb_shinfo(skb)->tso_size) + !skb_shinfo(skb)->gso_size) count++; #endif diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 191383d461d7..21be4fa071b5 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1495,8 +1495,8 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev) np->tx_skbuff[nr] = skb; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)->tso_size) - tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT); + if (skb_shinfo(skb)->gso_size) + tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT); else #endif tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0); diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index 57006fb8840e..8bb32f946993 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) uint16_t ipcse, tucse, mss; int err; - if(likely(skb_shinfo(skb)->tso_size)) { + if(likely(skb_shinfo(skb)->gso_size)) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -1181,7 +1181,7 @@ ixgb_tso(struct ixgb_adapter *adapter, struct sk_buff *skb) } hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2)); - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; skb->nh.iph->tot_len = 0; skb->nh.iph->check = 0; skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr, diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b79d6e8d3045..43fef7de8cb9 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -74,7 +74,7 @@ static void emulate_large_send_offload(struct sk_buff *skb) struct iphdr *iph = skb->nh.iph; struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4)); unsigned int doffset = (iph->ihl + th->doff) * 4; - unsigned int mtu = skb_shinfo(skb)->tso_size + doffset; + unsigned int mtu = skb_shinfo(skb)->gso_size + doffset; unsigned int offset = 0; u32 seq = ntohl(th->seq); u16 id = ntohs(iph->id); @@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev) #endif #ifdef LOOPBACK_TSO - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { BUG_ON(skb->protocol != htons(ETH_P_IP)); BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP); diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index b983e1e04348..dbdf189436fa 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -1879,7 +1879,7 @@ again: #ifdef NETIF_F_TSO if (skb->len > (dev->mtu + ETH_HLEN)) { - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) max_segments = MYRI10GE_MAX_SEND_DESC_TSO; } @@ -2112,7 +2112,7 @@ abort_linearize: } idx = (idx + 1) & tx->mask; } while (idx != last_idx); - if (skb_shinfo(skb)->tso_size) { + if (skb_shinfo(skb)->gso_size) { printk(KERN_ERR "myri10ge: %s: TSO but wanted to linearize?!?!?\n", mgp->dev->name); diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 985afe0e6273..12d1cb289bb0 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -2172,7 +2172,7 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb, static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev) 
{ if (dev->features & NETIF_F_TSO) { - u32 mss = skb_shinfo(skb)->tso_size; + u32 mss = skb_shinfo(skb)->gso_size; if (mss) return LargeSend | ((mss & MSSMask) << MSSShift); diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c index 11daed495b97..3defe5d4f7d3 100644 --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -3959,8 +3959,8 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Control_1 = 0; txdp->Control_2 = 0; #ifdef NETIF_F_TSO - mss = skb_shinfo(skb)->tso_size; - if (mss) { + mss = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) { txdp->Control_1 |= TXD_TCP_LSO_EN; txdp->Control_1 |= TXD_TCP_LSO_MSS(mss); } @@ -3980,10 +3980,10 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) } frg_len = skb->len - skb->data_len; - if (skb_shinfo(skb)->ufo_size) { + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) { int ufo_size; - ufo_size = skb_shinfo(skb)->ufo_size; + ufo_size = skb_shinfo(skb)->gso_size; ufo_size &= ~7; txdp->Control_1 |= TXD_UFO_EN; txdp->Control_1 |= TXD_UFO_MSS(ufo_size); @@ -4009,7 +4009,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) txdp->Host_Control = (unsigned long) skb; txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; frg_cnt = skb_shinfo(skb)->nr_frags; @@ -4024,12 +4024,12 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) (sp->pdev, frag->page, frag->page_offset, frag->size, PCI_DMA_TODEVICE); txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size); - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) txdp->Control_1 |= TXD_UFO_EN; } txdp->Control_1 |= TXD_GATHER_CODE_LAST; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) frg_cnt++; /* as Txd0 was used for inband header */ tx_fifo = mac_control->tx_FIFO_start[queue]; @@ -4043,7 +4043,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev) if (mss) val64 |= TX_FIFO_SPECIAL_FUNC; #endif - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) val64 |= TX_FIFO_SPECIAL_FUNC; writeq(val64, &tx_fifo->List_Control); diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index fba1e4d4d83d..d3577871be28 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1160,7 +1160,7 @@ static unsigned tx_le_req(const struct sk_buff *skb) count = sizeof(dma_addr_t) / sizeof(u32); count += skb_shinfo(skb)->nr_frags * count; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) ++count; if (skb->ip_summed == CHECKSUM_HW) @@ -1232,7 +1232,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev) } /* Check for TCP Segmentation Offload */ - mss = skb_shinfo(skb)->tso_size; + mss = skb_shinfo(skb)->gso_size; if (mss != 0) { /* just drop the packet if non-linear expansion fails */ if (skb_header_cloned(skb) && diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index b2ddd4522a87..e3e380f90f86 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3780,7 +3780,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 0; if (skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && @@ -3905,7 +3905,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) #if TG3_TSO_SUPPORT != 0 mss = 
0; if (skb->len > (tp->dev->mtu + ETH_HLEN) && - (mss = skb_shinfo(skb)->tso_size) != 0) { + (mss = skb_shinfo(skb)->gso_size) != 0) { int tcp_opt_len, ip_tcp_len; if (skb_header_cloned(skb) && diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index d9258d42090c..e49e8b520c28 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -340,7 +340,7 @@ enum state_values { #endif #if defined(NETIF_F_TSO) -#define skb_tso_size(x) (skb_shinfo(x)->tso_size) +#define skb_tso_size(x) (skb_shinfo(x)->gso_size) #define TSO_NUM_DESCRIPTORS 2 #define TSO_OFFLOAD_ON TYPHOON_OFFLOAD_TCP_SEGMENT #else diff --git a/drivers/s390/net/qeth_eddp.c b/drivers/s390/net/qeth_eddp.c index 0bab60a20309..38aad8321456 100644 --- a/drivers/s390/net/qeth_eddp.c +++ b/drivers/s390/net/qeth_eddp.c @@ -420,7 +420,7 @@ __qeth_eddp_fill_context_tcp(struct qeth_eddp_context *ctx, } tcph = eddp->skb->h.th; while (eddp->skb_offset < eddp->skb->len) { - data_len = min((int)skb_shinfo(eddp->skb)->tso_size, + data_len = min((int)skb_shinfo(eddp->skb)->gso_size, (int)(eddp->skb->len - eddp->skb_offset)); /* prepare qdio hdr */ if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){ @@ -515,20 +515,20 @@ qeth_eddp_calc_num_pages(struct qeth_eddp_context *ctx, struct sk_buff *skb, QETH_DBF_TEXT(trace, 5, "eddpcanp"); /* can we put multiple skbs in one page? */ - skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len); + skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len); if (skbs_per_page > 1){ - ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) / + ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) / skbs_per_page + 1; ctx->elements_per_skb = 1; } else { /* no -> how many elements per skb? */ - ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len + + ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len + PAGE_SIZE) >> PAGE_SHIFT; ctx->num_pages = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } ctx->num_elements = ctx->elements_per_skb * - (skb_shinfo(skb)->tso_segs + 1); + (skb_shinfo(skb)->gso_segs + 1); } static inline struct qeth_eddp_context * diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c index 9e671a48cd2f..56009d768326 100644 --- a/drivers/s390/net/qeth_main.c +++ b/drivers/s390/net/qeth_main.c @@ -4417,7 +4417,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) struct qeth_eddp_context *ctx = NULL; int tx_bytes = skb->len; unsigned short nr_frags = skb_shinfo(skb)->nr_frags; - unsigned short tso_size = skb_shinfo(skb)->tso_size; + unsigned short tso_size = skb_shinfo(skb)->gso_size; int rc; QETH_DBF_TEXT(trace, 6, "sendpkt"); @@ -4453,7 +4453,7 @@ qeth_send_packet(struct qeth_card *card, struct sk_buff *skb) queue = card->qdio.out_qs [qeth_get_priority_queue(card, skb, ipv, cast_type)]; - if (skb_shinfo(skb)->tso_size) + if (skb_shinfo(skb)->gso_size) large_send = card->options.large_send; /*are we able to do TSO ? 
If so ,prepare and send it from here */ diff --git a/drivers/s390/net/qeth_tso.h b/drivers/s390/net/qeth_tso.h index 24ef40ca9562..593f298142c1 100644 --- a/drivers/s390/net/qeth_tso.h +++ b/drivers/s390/net/qeth_tso.h @@ -51,7 +51,7 @@ qeth_tso_fill_header(struct qeth_card *card, struct sk_buff *skb) hdr->ext.hdr_version = 1; hdr->ext.hdr_len = 28; /*insert non-fix values */ - hdr->ext.mss = skb_shinfo(skb)->tso_size; + hdr->ext.mss = skb_shinfo(skb)->gso_size; hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4); hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len - sizeof(struct qeth_hdr_tso)); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cead6be467ed..fa5671307b90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,9 +308,12 @@ struct net_device #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */ #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */ #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ -#define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ #define NETIF_F_LLTX 4096 /* LockLess TX */ -#define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ + + /* Segmentation offload features */ +#define NETIF_F_GSO_SHIFT 16 +#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT) +#define NETIF_F_UFO (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT) #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) #define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) @@ -979,6 +982,13 @@ extern void dev_seq_stop(struct seq_file *seq, void *v); extern void linkwatch_run_queue(void); +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) +{ + int feature = skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT; + return skb_shinfo(skb)->gso_size && + (dev->features & feature) != feature; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8c7eb79a27f..97b0d2d1a6b0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -134,9 +134,10 @@ struct skb_frag_struct { struct skb_shared_info { atomic_t dataref; unsigned short nr_frags; - unsigned short tso_size; - unsigned short tso_segs; - unsigned short ufo_size; + unsigned short gso_size; + /* Warning: this field is not always filled in (UFO)! */ + unsigned short gso_segs; + unsigned short gso_type; unsigned int ip6_frag_id; struct sk_buff *frag_list; skb_frag_t frags[MAX_SKB_FRAGS]; @@ -168,6 +169,11 @@ enum { SKB_FCLONE_CLONE, }; +enum { + SKB_GSO_TCPV4 = 1 << 0, + SKB_GSO_UDPV4 = 1 << 1, +}; + /** * struct sk_buff - socket buffer * @next: Next buffer in list diff --git a/include/net/tcp.h b/include/net/tcp.h index 5f4eb5c79689..b197a9e615c1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -569,13 +569,13 @@ struct tcp_skb_cb { */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_segs; + return skb_shinfo(skb)->gso_segs; } /* This is valid iff tcp_skb_pcount() > 1. 
*/ static inline int tcp_skb_mss(const struct sk_buff *skb) { - return skb_shinfo(skb)->tso_size; + return skb_shinfo(skb)->gso_size; } static inline void tcp_dec_pcount_approx(__u32 *count, diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 0dca027ceb80..8be9f2123e54 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -34,8 +34,8 @@ static inline unsigned packet_length(const struct sk_buff *skb) int br_dev_queue_push_xmit(struct sk_buff *skb) { - /* drop mtu oversized packets except tso */ - if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size) + /* drop mtu oversized packets except gso */ + if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->gso_size) kfree_skb(skb); else { #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3e41f9d6d51c..8298a5179aef 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -761,7 +761,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP) && skb->len > skb->dev->mtu && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + !skb_shinfo(skb)->gso_size) return ip_fragment(skb, br_dev_queue_push_xmit); else return br_dev_queue_push_xmit(skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe63d4efbd4d..368d98578c14 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -172,9 +172,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); shinfo->nr_frags = 0; - shinfo->tso_size = 0; - shinfo->tso_segs = 0; - shinfo->ufo_size = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; shinfo->frag_list = NULL; @@ -238,8 +238,9 @@ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->frag_list = NULL; out: return skb; @@ -528,8 +529,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif skb_copy_secmark(new, old); atomic_set(&new->users, 1); - skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; - skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } /** diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8538aac3d148..7624fd1d8f9f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -210,8 +210,7 @@ static inline int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -362,7 +361,7 @@ packet_routed: } ip_select_ident_more(iph, &rt->u.dst, sk, - (skb_shinfo(skb)->tso_segs ?: 1) - 1); + (skb_shinfo(skb)->gso_segs ?: 1) - 1); /* Add an IP checksum. 
*/ ip_send_check(iph); @@ -744,7 +743,8 @@ static inline int ip_ufo_append_data(struct sock *sk, (length - transhdrlen)); if (!err) { /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; __skb_queue_tail(&sk->sk_write_queue, skb); return 0; @@ -1087,14 +1087,16 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + (rt->u.dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; + } while (size > 0) { int i; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_size) len = size; else { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 74998f250071..062dd1a0d8a8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -571,7 +571,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; if (!copied) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; @@ -818,7 +818,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; from += copy; copied += copy; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e08245bdda3a..94fe5b1f9dcb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1073,7 +1073,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ else pkt_len = (end_seq - TCP_SKB_CB(skb)->seq); - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size)) + if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) break; pcount = tcp_skb_pcount(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 07bb5a2b375e..bdd71db8bf90 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -515,15 +515,17 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned /* Avoid the costly divide in the normal * non-TSO case. 
*/ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; } else { unsigned int factor; factor = skb->len + (mss_now - 1); factor /= mss_now; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = mss_now; + skb_shinfo(skb)->gso_segs = factor; + skb_shinfo(skb)->gso_size = mss_now; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; } } @@ -914,7 +916,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int if (!tso_segs || (tso_segs > 1 && - skb_shinfo(skb)->tso_size != mss_now)) { + tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1724,8 +1726,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; } @@ -1930,8 +1933,9 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1963,8 +1967,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); @@ -2047,8 +2052,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -2152,8 +2158,9 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; @@ -2257,8 +2264,9 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; /* Send it off, this clears delayed acks for us. 
*/ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -2293,8 +2301,9 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d29620f4910e..abb94de33768 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -148,7 +148,7 @@ static int ip6_output2(struct sk_buff *skb) int ip6_output(struct sk_buff *skb) { - if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) || + if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) || dst_allfrag(skb->dst)) return ip6_fragment(skb, ip6_output2); else @@ -833,8 +833,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, struct frag_hdr fhdr; /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - - sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen - + sizeof(struct frag_hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; ipv6_select_ident(skb, &fhdr); skb_shinfo(skb)->ip6_frag_id = fhdr.identification; __skb_queue_tail(&sk->sk_write_queue, skb); -- cgit v1.2.3 From f6a78bfcb141f963187464bac838d46a81c3882a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:57:17 -0700 Subject: [NET]: Add generic segmentation offload This patch adds the infrastructure for generic segmentation offload. The idea is to tap into the potential savings of TSO without hardware support by postponing the allocation of segmented skb's until just before the entry point into the NIC driver. The same structure can be used to support software IPv6 TSO, as well as UFO and segmentation offload for other relevant protocols, e.g., DCCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++- net/core/dev.c | 127 ++++++++++++++++++++++++++++++++++++++++++++-- net/sched/sch_generic.c | 19 +++++-- 3 files changed, 143 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fa5671307b90..b4eae18390cc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -405,6 +405,9 @@ struct net_device struct list_head qdisc_list; unsigned long tx_queue_len; /* Max frames per queue allowed */ + /* Partially transmitted GSO packet. 
*/ + struct sk_buff *gso_skb; + /* ingress path synchronizer */ spinlock_t ingress_lock; struct Qdisc *qdisc_ingress; @@ -539,6 +542,7 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); void *af_packet_priv; struct list_head list; }; @@ -689,7 +693,8 @@ extern int dev_change_name(struct net_device *, char *); extern int dev_set_mtu(struct net_device *, int); extern int dev_set_mac_address(struct net_device *, struct sockaddr *); -extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); +extern int dev_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev); extern void dev_init(void); @@ -963,6 +968,7 @@ extern int netdev_max_backlog; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); extern int skb_checksum_help(struct sk_buff *skb, int inward); +extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg); #ifdef CONFIG_BUG extern void netdev_rx_csum_fault(struct net_device *dev); #else diff --git a/net/core/dev.c b/net/core/dev.c index 29e3888102bc..d293e0f90a0c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -116,6 +116,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct sk_buff *skb) * taps currently in use. */ -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; @@ -1186,6 +1187,40 @@ out: return ret; } +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather is supported on the target. + * + * This function segments the given skb and returns a list of segments. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + int type = skb->protocol; + + BUG_ON(skb_shinfo(skb)->frag_list); + BUG_ON(skb->ip_summed != CHECKSUM_HW); + + skb->mac.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->data; + __skb_pull(skb, skb->mac_len); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + segs = ptype->gso_segment(skb, sg); + break; + } + } + rcu_read_unlock(); + + return segs; +} + +EXPORT_SYMBOL(skb_gso_segment); + /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev) @@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * + * This function segments the given skb and stores the list of segments + * in skb->next. 
+ */ +static int dev_gso_segment(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, dev->features & NETIF_F_SG && + !illegal_highdma(dev, skb)); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; + + return 0; +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + if (likely(!skb->next)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + if (!netif_needs_gso(dev, skb)) + return dev->hard_start_xmit(skb, dev); + + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + } + + do { + struct sk_buff *nskb = skb->next; + int rc; + + skb->next = nskb->next; + nskb->next = NULL; + rc = dev->hard_start_xmit(nskb, dev); + if (unlikely(rc)) { + skb->next = nskb; + return rc; + } + } while (skb->next); + + skb->destructor = DEV_GSO_CB(skb)->destructor; + +out_kfree_skb: + kfree_skb(skb); + return 0; +} + #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ @@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + /* GSO will handle the following emulations directly. */ + if (netif_needs_gso(dev, skb)) + goto gso; + if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && __skb_linearize(skb)) @@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_checksum_help(skb, 0)) goto out_kfree_skb; +gso: spin_lock_prefetch(&dev->queue_lock); /* Disable soft irqs for various locks below. Also @@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - rc = 0; - if (!dev->hard_start_xmit(skb, dev)) { + if (!dev_hard_start_xmit(skb, dev)) { HARD_TX_UNLOCK(dev); goto out; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7aad0121232c..74d4a1dceeec 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -96,8 +96,11 @@ static inline int qdisc_restart(struct net_device *dev) struct sk_buff *skb; /* Dequeue packet */ - if ((skb = q->dequeue(q)) != NULL) { + if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) { unsigned nolock = (dev->features & NETIF_F_LLTX); + + dev->gso_skb = NULL; + /* * When the driver has LLTX set it does its own locking * in start_xmit. No need to add additional overhead by @@ -134,10 +137,8 @@ static inline int qdisc_restart(struct net_device *dev) if (!netif_queue_stopped(dev)) { int ret; - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - ret = dev->hard_start_xmit(skb, dev); + ret = dev_hard_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) { if (!nolock) { netif_tx_unlock(dev); @@ -171,7 +172,10 @@ static inline int qdisc_restart(struct net_device *dev) */ requeue: - q->ops->requeue(skb, q); + if (skb->next) + dev->gso_skb = skb; + else + q->ops->requeue(skb, q); netif_schedule(dev); return 1; } @@ -593,6 +597,11 @@ void dev_deactivate(struct net_device *dev) /* Wait for outstanding qdisc_run calls. 
*/ while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state)) yield(); + + if (dev->gso_skb) { + kfree_skb(dev->gso_skb); + dev->gso_skb = NULL; + } } void dev_init_scheduler(struct net_device *dev) -- cgit v1.2.3 From f4c50d990dcf11a296679dc05de3873783236711 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:02:40 -0700 Subject: [NET]: Add software TSOv4 This patch adds the GSO implementation for IPv4 TCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/protocol.h | 1 + include/net/tcp.h | 2 + net/core/skbuff.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 51 ++++++++++++++++++++ net/ipv4/tcp.c | 62 ++++++++++++++++++++++++ 6 files changed, 243 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 97b0d2d1a6b0..a45bba9b8cbd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1297,6 +1297,7 @@ extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern void skb_release_data(struct sk_buff *skb); +extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/include/net/protocol.h b/include/net/protocol.h index bcaee39bd2ff..3b6dc15c68a5 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -36,6 +36,7 @@ struct net_protocol { int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); int no_policy; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index b197a9e615c1..ca3d38dfc00b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1086,6 +1086,8 @@ extern struct request_sock_ops tcp_request_sock_ops; extern int tcp_v4_destroy_sock(struct sock *sk); +extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg); + #ifdef CONFIG_PROC_FS extern int tcp4_proc_init(void); extern void tcp4_proc_exit(void); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 368d98578c14..8e5044ba3ab6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1842,6 +1842,132 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) EXPORT_SYMBOL_GPL(skb_pull_rcsum); +/** + * skb_segment - Perform protocol segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather can be used for generated segments + * + * This function performs segmentation on the given skb. It returns + * the segment at the given position. It returns NULL if there are + * no more segments to generate, or when an error is encountered. 
+ */ +struct sk_buff *skb_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int doffset = skb->data - skb->mac.raw; + unsigned int offset = doffset; + unsigned int headroom; + unsigned int len; + int nfrags = skb_shinfo(skb)->nr_frags; + int err = -ENOMEM; + int i = 0; + int pos; + + __skb_push(skb, doffset); + headroom = skb_headroom(skb); + pos = skb_headlen(skb); + + do { + struct sk_buff *nskb; + skb_frag_t *frag; + int hsize, nsize; + int k; + int size; + + len = skb->len - offset; + if (len > mss) + len = mss; + + hsize = skb_headlen(skb) - offset; + if (hsize < 0) + hsize = 0; + nsize = hsize + doffset; + if (nsize > len + doffset || !sg) + nsize = len + doffset; + + nskb = alloc_skb(nsize + headroom, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + nskb->dev = skb->dev; + nskb->priority = skb->priority; + nskb->protocol = skb->protocol; + nskb->dst = dst_clone(skb->dst); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + nskb->pkt_type = skb->pkt_type; + nskb->mac_len = skb->mac_len; + + skb_reserve(nskb, headroom); + nskb->mac.raw = nskb->data; + nskb->nh.raw = nskb->data + skb->mac_len; + nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw); + memcpy(skb_put(nskb, doffset), skb->data, doffset); + + if (!sg) { + nskb->csum = skb_copy_and_csum_bits(skb, offset, + skb_put(nskb, len), + len, 0); + continue; + } + + frag = skb_shinfo(nskb)->frags; + k = 0; + + nskb->ip_summed = CHECKSUM_HW; + nskb->csum = skb->csum; + memcpy(skb_put(nskb, hsize), skb->data + offset, hsize); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + *frag = skb_shinfo(skb)->frags[i]; + get_page(frag->page); + size = frag->size; + + if (pos < offset) { + frag->page_offset += offset - pos; + frag->size -= offset - pos; + } + + k++; + + if (pos + size <= offset + len) { + i++; + pos += size; + } else { + frag->size -= pos + size - (offset + len); + break; + } + + frag++; + } + + skb_shinfo(nskb)->nr_frags = k; + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + } while ((offset += len) < skb->len); + + return segs; + +err: + while ((skb = segs)) { + segs = skb->next; + kfree(skb); + } + return ERR_PTR(err); +} + +EXPORT_SYMBOL_GPL(skb_segment); + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0a277453526b..461216b47948 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -68,6 +68,7 @@ */ #include +#include #include #include #include @@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *sk) EXPORT_SYMBOL(inet_sk_rebuild_header); +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct iphdr *iph; + struct net_protocol *ops; + int proto; + int ihl; + int id; + + if (!pskb_may_pull(skb, sizeof(*iph))) + goto out; + + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (ihl < sizeof(*iph)) + goto out; + + if (!pskb_may_pull(skb, ihl)) + goto out; + + skb->h.raw = __skb_pull(skb, ihl); + iph = skb->nh.iph; + id = ntohs(iph->id); + proto = iph->protocol & (MAX_INET_PROTOS - 1); + segs = ERR_PTR(-EPROTONOSUPPORT); + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (ops && ops->gso_segment) + segs = ops->gso_segment(skb, sg); + rcu_read_unlock(); + + if (IS_ERR(segs)) + goto out; + + 
skb = segs; + do { + iph = skb->nh.iph; + iph->id = htons(id++); + iph->tot_len = htons(skb->len - skb->mac_len); + iph->check = 0; + iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + } while ((skb = skb->next)); + +out: + return segs; +} + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1105,6 +1154,7 @@ static struct net_protocol igmp_protocol = { static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, + .gso_segment = tcp_tso_segment, .no_policy = 1, }; @@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void); static struct packet_type ip_packet_type = { .type = __constant_htons(ETH_P_IP), .func = ip_rcv, + .gso_segment = inet_gso_segment, }; static int __init inet_init(void) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 062dd1a0d8a8..0e029c4e2903 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -258,6 +258,7 @@ #include #include #include +#include #include #include @@ -2144,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct tcphdr *th; + unsigned thlen; + unsigned int seq; + unsigned int delta; + unsigned int oldlen; + unsigned int len; + + if (!pskb_may_pull(skb, sizeof(*th))) + goto out; + + th = skb->h.th; + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + if (!pskb_may_pull(skb, thlen)) + goto out; + + oldlen = ~htonl(skb->len); + __skb_pull(skb, thlen); + + segs = skb_segment(skb, sg); + if (IS_ERR(segs)) + goto out; + + len = skb_shinfo(skb)->gso_size; + delta = csum_add(oldlen, htonl(thlen + len)); + + skb = segs; + th = skb->h.th; + seq = ntohl(th->seq); + + do { + th->fin = th->psh = 0; + + if (skb->ip_summed == CHECKSUM_NONE) { + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + + seq += len; + skb = skb->next; + th = skb->h.th; + + th->seq = htonl(seq); + th->cwr = 0; + } while (skb->next); + + if (skb->ip_summed == CHECKSUM_NONE) { + delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw)); + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + +out: + return segs; +} + extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; -- cgit v1.2.3 From 37c3185a02d4b85fbe134bf5204535405dd2c957 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:07:29 -0700 Subject: [NET]: Added GSO toggle This patch adds a generic segmentation offload toggle that can be turned on/off for each net device. For now it only supports in TCPv4. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 2 ++ include/linux/netdevice.h | 1 + include/net/sock.h | 4 ++++ net/bridge/br_if.c | 17 +++++++++++------ net/core/ethtool.c | 29 +++++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index cf2abeca92a0..c6310aef5ab0 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -411,6 +411,8 @@ struct ethtool_ops { #define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ #define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ #define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4eae18390cc..bc747e5d7138 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -308,6 +308,7 @@ struct net_device #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */ #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */ #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ +#define NETIF_F_GSO 2048 /* Enable software GSO. */ #define NETIF_F_LLTX 4096 /* LockLess TX */ /* Segmentation offload features */ diff --git a/include/net/sock.h b/include/net/sock.h index d10dfecb6cbd..a897f05de3b5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1030,9 +1030,13 @@ static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_TSO; if (sk->sk_route_caps & NETIF_F_TSO) { if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) sk->sk_route_caps &= ~NETIF_F_TSO; + else + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; } } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index fdec773f5b52..07956ecf545e 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -376,15 +376,20 @@ void br_features_recompute(struct net_bridge *br) features = br->feature_mask & ~NETIF_F_ALL_CSUM; list_for_each_entry(p, &br->port_list, list) { - if (checksum & NETIF_F_NO_CSUM && - !(p->dev->features & NETIF_F_NO_CSUM)) + unsigned long feature = p->dev->features; + + if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM)) checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM; - if (checksum & NETIF_F_HW_CSUM && - !(p->dev->features & NETIF_F_HW_CSUM)) + if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM)) checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM; - if (!(p->dev->features & NETIF_F_IP_CSUM)) + if (!(feature & NETIF_F_IP_CSUM)) checksum = 0; - features &= p->dev->features; + + if (feature & NETIF_F_GSO) + feature |= NETIF_F_TSO; + feature |= NETIF_F_GSO; + + features &= feature; } br->dev->features = features | checksum | NETIF_F_LLTX; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 33ce7ed6afc6..27ce1683caf5 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -614,6 +614,29 @@ static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_ufo(dev, edata.data); } +static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GGSO }; + + edata.data = dev->features & NETIF_F_GSO; + if (copy_to_user(useraddr, 
&edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_gso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + if (edata.data) + dev->features |= NETIF_F_GSO; + else + dev->features &= ~NETIF_F_GSO; + return 0; +} + static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; @@ -905,6 +928,12 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_SUFO: rc = ethtool_set_ufo(dev, useraddr); break; + case ETHTOOL_GGSO: + rc = ethtool_get_gso(dev, useraddr); + break; + case ETHTOOL_SGSO: + rc = ethtool_set_gso(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From c8a553ad7f0bf943047943a758cf07017819cb3c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 22 Jun 2006 14:28:09 -0700 Subject: [TCP]: Move inclusion of to correct place in The new header shouldn't be included from the !__KERNEL__ portion of tcp.h Signed-off-by: David Woodhouse Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 420a689c3fb4..8ebf497907f8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -18,7 +18,6 @@ #define _LINUX_TCP_H #include -#include #include struct tcphdr { @@ -161,6 +160,7 @@ struct tcp_info #ifdef __KERNEL__ #include +#include #include #include #include -- cgit v1.2.3 From f4b8ea7849544114e9d3d682df4d400180854677 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 22 Jun 2006 16:00:11 -0700 Subject: [NET]: fix net-core kernel-doc Warning(/var/linsrc/linux-2617-g4//include/linux/skbuff.h:304): No description found for parameter 'dma_cookie' Warning(/var/linsrc/linux-2617-g4//include/net/sock.h:1274): No description found for parameter 'copied_early' Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'chan' Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'event' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ include/net/sock.h | 1 + net/core/dev.c | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a45bba9b8cbd..16eef03ce0eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -215,6 +215,8 @@ enum { * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @tc_index: Traffic control index * @tc_verd: traffic control verdict + * @dma_cookie: a cookie to one of several possible DMA operations + * done by skb DMA functions * @secmark: security marking */ diff --git a/include/net/sock.h b/include/net/sock.h index a897f05de3b5..2d8d6adf1616 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1269,6 +1269,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat + * @copied_early: flag indicating whether DMA operations copied this data early * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. 
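For reference, an illustrative kernel-doc block (hypothetical, not taken from the tree) showing the format these warnings check: every parameter of the documented function needs an "@name: description" line, which is exactly what this patch adds for dma_cookie, copied_early, chan and event.

/**
 * example_dma_done - illustrative kernel-doc block for a hypothetical helper
 * @chan: DMA channel the completion arrived on
 * @cookie: cookie identifying the DMA operation that completed
 * @copied_early: nonzero if the payload was already copied by the DMA engine
 *
 * A missing "@name:" line for any parameter makes scripts/kernel-doc print
 * "No description found for parameter 'name'", as in the warnings above.
 */
static void example_dma_done(struct dma_chan *chan, dma_cookie_t cookie,
			     int copied_early);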
diff --git a/net/core/dev.c b/net/core/dev.c index d293e0f90a0c..9b8f0f22c81d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3418,8 +3418,8 @@ static void net_dma_rebalance(void) /** * netdev_dma_event - event callback for the net_dma_client * @client: should always be net_dma_client - * @chan: - * @event: + * @chan: DMA channel for the event + * @event: event type */ static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, enum dma_event event) -- cgit v1.2.3 From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:57 -0700 Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount Extend the get_sb() filesystem operation to take an extra argument that permits the VFS to pass in the target vfsmount that defines the mountpoint. The filesystem is then required to manually set the superblock and root dentry pointers. For most filesystems, this should be done with simple_set_mnt(), which will set the superblock pointer and then set the root dentry to the superblock's s_root (as per the old default behaviour). The get_sb() op now returns an integer as there's now no need to return the superblock pointer. This patch permits a superblock to be implicitly shared amongst several mount points, such as can be done with NFS to avoid potential inode aliasing. In such a case, simple_set_mnt() would not be called, and instead the mnt_root and mnt_sb would be set directly. The patch also makes the following changes: (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount pointer argument and return an integer, so most filesystems have to change very little. (*) If one of the convenience functions is not used, then get_sb() should normally call simple_set_mnt() to instantiate the vfsmount. This will always return 0, and so can be tail-called from get_sb(). (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the dcache upon superblock destruction rather than shrink_dcache_anon(). This is required because the superblock may now have multiple trees that aren't actually bound to s_root, but that still need to be cleaned up. The currently called functions assume that the whole tree is rooted at s_root, and that anonymous dentries are not the roots of trees, which results in dentries being left unculled. However, with the way NFS superblock sharing is currently set to be implemented, these assumptions are violated: the root of the filesystem is simply a dummy dentry and inode (the real inode for '/' may well be inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries with child trees. [*] Anonymous until discovered from another tree. (*) The documentation has been adjusted, including the additional bit of changing ext2_* into foo_* in the documentation.
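As an illustrative sketch only (not part of this patch), a get_sb() instance that drives sget() directly rather than using one of the get_sb_*() helpers would now attach the superblock to the vfsmount itself; the filesystem name foofs and its foofs_fill_super() callback are hypothetical:

static int foofs_get_sb(struct file_system_type *fs_type, int flags,
			const char *dev_name, void *data, struct vfsmount *mnt)
{
	struct super_block *sb;
	int err;

	sb = sget(fs_type, NULL, set_anon_super, NULL);
	if (IS_ERR(sb))
		return PTR_ERR(sb);	/* errors are now returned as plain ints */

	sb->s_flags = flags;
	err = foofs_fill_super(sb, data, 0);	/* 0 => not silent */
	if (err) {
		up_write(&sb->s_umount);
		deactivate_super(sb);
		return err;
	}
	sb->s_flags |= MS_ACTIVE;

	/* Attach the superblock and its root dentry to the mountpoint;
	 * simple_set_mnt() always returns 0, so it can be tail-called. */
	return simple_set_mnt(mnt, sb);
}

Filesystems that do use get_sb_bdev(), get_sb_nodev() or get_sb_single() only need to forward the new mnt argument, as the documentation and per-filesystem conversions below show.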
[akpm@osdl.org: convert ipath_fs, do other stuff] Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 7 +- Documentation/filesystems/porting | 7 +- Documentation/filesystems/vfs.txt | 4 +- arch/ia64/kernel/perfmon.c | 7 +- arch/powerpc/platforms/cell/spufs/inode.c | 6 +- drivers/infiniband/core/uverbs_main.c | 7 +- drivers/infiniband/hw/ipath/ipath_fs.c | 13 ++-- drivers/isdn/capi/capifs.c | 6 +- drivers/misc/ibmasm/ibmasmfs.c | 7 +- drivers/oprofile/oprofilefs.c | 6 +- drivers/usb/core/inode.c | 6 +- drivers/usb/gadget/inode.c | 6 +- fs/9p/vfs_super.c | 21 +++--- fs/adfs/super.c | 7 +- fs/affs/super.c | 7 +- fs/afs/super.c | 24 ++++--- fs/autofs/init.c | 6 +- fs/autofs4/init.c | 6 +- fs/befs/linuxvfs.c | 7 +- fs/bfs/inode.c | 6 +- fs/binfmt_misc.c | 6 +- fs/block_dev.c | 6 +- fs/cifs/cifsfs.c | 10 +-- fs/coda/inode.c | 6 +- fs/configfs/mount.c | 6 +- fs/cramfs/inode.c | 7 +- fs/dcache.c | 40 ----------- fs/debugfs/inode.c | 8 +-- fs/devfs/base.c | 8 +-- fs/devpts/inode.c | 6 +- fs/efs/super.c | 6 +- fs/eventpoll.c | 13 ++-- fs/ext2/super.c | 6 +- fs/ext3/super.c | 6 +- fs/freevxfs/vxfs_super.c | 7 +- fs/fuse/inode.c | 8 +-- fs/hfs/super.c | 7 +- fs/hfsplus/super.c | 8 ++- fs/hostfs/hostfs_kern.c | 8 +-- fs/hpfs/super.c | 7 +- fs/hppfs/hppfs_kern.c | 8 +-- fs/hugetlbfs/inode.c | 6 +- fs/inotify_user.c | 6 +- fs/isofs/inode.c | 7 +- fs/jffs/inode-v23.c | 7 +- fs/jffs2/super.c | 49 ++++++++------ fs/jfs/super.c | 7 +- fs/libfs.c | 12 ++-- fs/minix/inode.c | 7 +- fs/msdos/namei.c | 9 +-- fs/namespace.c | 9 +++ fs/ncpfs/inode.c | 6 +- fs/nfs/inode.c | 96 +++++++++++++++----------- fs/nfsd/nfsctl.c | 6 +- fs/ntfs/super.c | 7 +- fs/ocfs2/dlm/dlmfs.c | 6 +- fs/ocfs2/super.c | 12 ++-- fs/openpromfs/inode.c | 6 +- fs/pipe.c | 9 ++- fs/proc/root.c | 6 +- fs/qnx4/inode.c | 7 +- fs/ramfs/inode.c | 13 ++-- fs/reiserfs/super.c | 9 +-- fs/romfs/inode.c | 7 +- fs/smbfs/inode.c | 6 +- fs/super.c | 109 +++++++++++++++++------------- fs/sysfs/mount.c | 6 +- fs/sysv/super.c | 13 ++-- fs/udf/super.c | 6 +- fs/ufs/super.c | 6 +- fs/vfat/namei.c | 9 +-- fs/xfs/linux-2.6/xfs_super.c | 8 ++- include/linux/dcache.h | 1 - include/linux/fs.h | 25 ++++--- include/linux/ramfs.h | 4 +- ipc/mqueue.c | 8 +-- kernel/cpuset.c | 8 +-- kernel/futex.c | 8 +-- mm/shmem.c | 6 +- net/socket.c | 7 +- net/sunrpc/rpc_pipe.c | 6 +- security/inode.c | 8 +-- security/selinux/selinuxfs.c | 7 +- 83 files changed, 482 insertions(+), 431 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1045da582b9b..3abf08f1b14a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -142,15 +142,16 @@ see also dquot_operations section. --------------------------- file_system_type --------------------------- prototypes: - struct super_block *(*get_sb) (struct file_system_type *, int, - const char *, void *); + struct int (*get_sb) (struct file_system_type *, int, + const char *, void *, struct vfsmount *); void (*kill_sb) (struct super_block *); locking rules: may block BKL get_sb yes yes kill_sb yes yes -->get_sb() returns error or a locked superblock (exclusive on ->s_umount). +->get_sb() returns error or 0 with locked superblock attached to the vfsmount +(exclusive on ->s_umount). ->kill_sb() takes a write-locked superblock, does all shutdown work on it, unlocks and drops the reference. 
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 2f388460cbe7..5531694059ab 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -50,10 +50,11 @@ Turn your foo_read_super() into a function that would return 0 in case of success and negative number in case of error (-EINVAL unless you have more informative error value to report). Call it foo_fill_super(). Now declare -struct super_block foo_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +int foo_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super, + mnt); } (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 3a2e5520c1e3..dd7d0dcedc87 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -113,8 +113,8 @@ members are defined: struct file_system_type { const char *name; int fs_flags; - struct super_block *(*get_sb) (struct file_system_type *, int, - const char *, void *); + struct int (*get_sb) (struct file_system_type *, int, + const char *, void *, struct vfsmount *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 077f21216b65..2359e2809f50 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -595,10 +595,11 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } -static struct super_block * -pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) +static int +pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC); + return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); } static struct file_system_type pfm_fs_type = { diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 1987697b23a0..7b4572805db9 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -436,11 +436,11 @@ spufs_fill_super(struct super_block *sb, void *data, int silent) return spufs_create_root(sb, data); } -static struct super_block * +static int spufs_get_sb(struct file_system_type *fstype, int flags, - const char *name, void *data) + const char *name, void *data, struct vfsmount *mnt) { - return get_sb_single(fstype, flags, data, spufs_fill_super); + return get_sb_single(fstype, flags, data, spufs_fill_super, mnt); } static struct file_system_type spufs_type = { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5ec2d49e9bb6..e57d3c50f75f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -821,11 +821,12 @@ static void ib_uverbs_remove_one(struct ib_device *device) kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); } -static struct super_block *uverbs_event_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct vfsmount *mnt) { return 
get_sb_pseudo(fs_type, "infinibandevent:", NULL, - INFINIBANDEVENTFS_MAGIC); + INFINIBANDEVENTFS_MAGIC, mnt); } static struct file_system_type uverbs_event_fs = { diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index e274120567e1..63de3046aff3 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -542,13 +542,14 @@ bail: return ret; } -static struct super_block *ipathfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int ipathfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) { - ipath_super = get_sb_single(fs_type, flags, data, - ipathfs_fill_super); - return ipath_super; + int ret = get_sb_single(fs_type, flags, data, + ipathfs_fill_super, mnt); + if (ret >= 0) + ipath_super = mnt->mnt_sb; + return ret; } static void ipathfs_kill_super(struct super_block *s) diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index 0a37aded4b54..9ea6bd0ddc35 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -121,10 +121,10 @@ fail: return -ENOMEM; } -static struct super_block *capifs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int capifs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, capifs_fill_super); + return get_sb_single(fs_type, flags, data, capifs_fill_super, mnt); } static struct file_system_type capifs_fs_type = { diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 26a230b6ff80..4a35caff5d02 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -90,10 +90,11 @@ static void ibmasmfs_create_files (struct super_block *sb, struct dentry *root); static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent); -static struct super_block *ibmasmfs_get_super(struct file_system_type *fst, - int flags, const char *name, void *data) +static int ibmasmfs_get_super(struct file_system_type *fst, + int flags, const char *name, void *data, + struct vfsmount *mnt) { - return get_sb_single(fst, flags, data, ibmasmfs_fill_super); + return get_sb_single(fst, flags, data, ibmasmfs_fill_super, mnt); } static struct super_operations ibmasmfs_s_ops = { diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index b62da9b0cbf0..71c2da277d6e 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -272,10 +272,10 @@ static int oprofilefs_fill_super(struct super_block * sb, void * data, int silen } -static struct super_block *oprofilefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int oprofilefs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, oprofilefs_fill_super); + return get_sb_single(fs_type, flags, data, oprofilefs_fill_super, mnt); } diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 3cf945cc5b9a..95f5ad923b0f 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -543,10 +543,10 @@ static void fs_remove_file (struct dentry *dentry) /* --------------------------------------------------------------------- */ -static struct super_block *usb_get_sb(struct file_system_type *fs_type, - int flags, const char 
*dev_name, void *data) +static int usb_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, usbfs_fill_super); + return get_sb_single(fs_type, flags, data, usbfs_fill_super, mnt); } static struct file_system_type usb_fs_type = { diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index aef0722b8f17..3bdc5e3ba234 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -2070,11 +2070,11 @@ enomem0: } /* "mount -t gadgetfs path /dev/gadget" ends up here */ -static struct super_block * +static int gadgetfs_get_sb (struct file_system_type *t, int flags, - const char *path, void *opts) + const char *path, void *opts, struct vfsmount *mnt) { - return get_sb_single (t, flags, opts, gadgetfs_fill_super); + return get_sb_single (t, flags, opts, gadgetfs_fill_super, mnt); } static void diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 61c599b4a1e3..872943004e59 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -99,12 +99,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, * @flags: mount flags * @dev_name: device name that was mounted * @data: mount options + * @mnt: mountpoint record to be instantiated * */ -static struct super_block *v9fs_get_sb(struct file_system_type - *fs_type, int flags, - const char *dev_name, void *data) +static int v9fs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct vfsmount *mnt) { struct super_block *sb = NULL; struct v9fs_fcall *fcall = NULL; @@ -123,17 +124,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) - return ERR_PTR(-ENOMEM); + return -ENOMEM; if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - sb = ERR_PTR(newfid); + retval = newfid; goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - if (IS_ERR(sb)) + if (IS_ERR(sb)) { + retval = PTR_ERR(sb); goto out_close_session; + } v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -184,19 +187,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type goto put_back_sb; } - return sb; + return simple_set_mnt(mnt, sb); out_close_session: v9fs_session_close(v9ses); out_free_session: kfree(v9ses); - return sb; + return retval; put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); deactivate_super(sb); - return ERR_PTR(retval); + return retval; } /** diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 252abda0d200..1b58a9b7f6aa 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -470,10 +470,11 @@ error: return -EINVAL; } -static struct super_block *adfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int adfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, + mnt); } static struct file_system_type adfs_fs_type = { diff --git a/fs/affs/super.c b/fs/affs/super.c index 4d7e5b19e5cd..6a52e7875403 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -524,10 +524,11 @@ affs_statfs(struct super_block *sb, struct kstatfs *buf) return 0; } -static struct super_block 
*affs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int affs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super, + mnt); } static struct file_system_type affs_fs_type = { diff --git a/fs/afs/super.c b/fs/afs/super.c index 53c56e7231ab..82468df0ba54 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -38,9 +38,9 @@ struct afs_mount_params { static void afs_i_init_once(void *foo, kmem_cache_t *cachep, unsigned long flags); -static struct super_block *afs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data); +static int afs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt); static struct inode *afs_alloc_inode(struct super_block *sb); @@ -294,10 +294,11 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent) * get an AFS superblock * - TODO: don't use get_sb_nodev(), but rather call sget() directly */ -static struct super_block *afs_get_sb(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *options) +static int afs_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *options, + struct vfsmount *mnt) { struct afs_mount_params params; struct super_block *sb; @@ -311,7 +312,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type, ret = afscm_start(); if (ret < 0) { _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; } /* parse the options */ @@ -348,18 +349,19 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type, goto error; } sb->s_flags |= MS_ACTIVE; + simple_set_mnt(mnt, sb); afs_put_volume(params.volume); afs_put_cell(params.default_cell); - _leave(" = %p", sb); - return sb; + _leave(" = 0 [%p]", 0, sb); + return 0; error: afs_put_volume(params.volume); afs_put_cell(params.default_cell); afscm_stop(); _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; } /* end afs_get_sb() */ /*****************************************************************************/ diff --git a/fs/autofs/init.c b/fs/autofs/init.c index b977ece69f0c..aca123752406 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -14,10 +14,10 @@ #include #include "autofs_i.h" -static struct super_block *autofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int autofs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, autofs_fill_super); + return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt); } static struct file_system_type autofs_fs_type = { diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index acecec8578ce..5d9193332bef 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -14,10 +14,10 @@ #include #include "autofs_i.h" -static struct super_block *autofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int autofs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, autofs4_fill_super); + return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt); } static struct file_system_type autofs_fs_type = { diff --git 
a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 68ebd10f345d..6ed07a5f10c6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -899,11 +899,12 @@ befs_statfs(struct super_block *sb, struct kstatfs *buf) return 0; } -static struct super_block * +static int befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data) + void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super, + mnt); } static struct file_system_type befs_fs_type = { diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 55a7a78332f8..e7da03f63a5a 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -410,10 +410,10 @@ out: return -EINVAL; } -static struct super_block *bfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt); } static struct file_system_type bfs_fs_type = { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 599f36fd0f67..07a4996cca3f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -739,10 +739,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) return err; } -static struct super_block *bm_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bm_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, bm_fill_super); + return get_sb_single(fs_type, flags, data, bm_fill_super, mnt); } static struct linux_binfmt misc_format = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 44aaba202f78..028d9fb9c2d5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -300,10 +300,10 @@ static struct super_operations bdev_sops = { .clear_inode = bdev_clear_inode, }; -static struct super_block *bd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bd_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); + return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); } static struct file_system_type bd_type = { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c262d8874ce9..08b35801dfed 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -460,9 +460,9 @@ struct super_operations cifs_super_ops = { .remount_fs = cifs_remount, }; -static struct super_block * +static int cifs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { int rc; struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); @@ -470,7 +470,7 @@ cifs_get_sb(struct file_system_type *fs_type, cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); if (IS_ERR(sb)) - return sb; + return PTR_ERR(sb); sb->s_flags = flags; @@ -478,10 +478,10 @@ cifs_get_sb(struct file_system_type *fs_type, if (rc) { up_write(&sb->s_umount); deactivate_super(sb); - return ERR_PTR(rc); + return rc; } sb->s_flags |= MS_ACTIVE; - return sb; + return simple_set_mnt(mnt, sb); } static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov, diff 
--git a/fs/coda/inode.c b/fs/coda/inode.c index ada1a81df6bd..cba70201567d 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -307,10 +307,10 @@ static int coda_statfs(struct super_block *sb, struct kstatfs *buf) /* init_coda: used by filesystems.c to register coda */ -static struct super_block *coda_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int coda_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, coda_fill_super); + return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt); } struct file_system_type coda_fs_type = { diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index f920d30478e5..94dab7bdd851 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -103,10 +103,10 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *configfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int configfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, configfs_fill_super); + return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt); } static struct file_system_type configfs_fs_type = { diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 9efcc3a164e8..37a91a153aa5 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -528,10 +528,11 @@ static struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; -static struct super_block *cramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int cramfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super, + mnt); } static struct file_system_type cramfs_fs_type = { diff --git a/fs/dcache.c b/fs/dcache.c index 59dbc92c2079..313b54b2b8f2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -687,46 +687,6 @@ void shrink_dcache_parent(struct dentry * parent) prune_dcache(found, parent->d_sb); } -/** - * shrink_dcache_anon - further prune the cache - * @head: head of d_hash list of dentries to prune - * - * Prune the dentries that are anonymous - * - * parsing d_hash list does not hlist_for_each_entry_rcu() as it - * done under dcache_lock. - * - */ -void shrink_dcache_anon(struct super_block *sb) -{ - struct hlist_node *lp; - struct hlist_head *head = &sb->s_anon; - int found; - do { - found = 0; - spin_lock(&dcache_lock); - hlist_for_each(lp, head) { - struct dentry *this = hlist_entry(lp, struct dentry, d_hash); - if (!list_empty(&this->d_lru)) { - dentry_stat.nr_unused--; - list_del_init(&this->d_lru); - } - - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache - */ - if (!atomic_read(&this->d_count)) { - list_add_tail(&this->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - found++; - } - } - spin_unlock(&dcache_lock); - prune_dcache(found, sb); - } while(found); -} - /* * Scan `nr' dentries and return the number which remain. 
* diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b55b4ea9a676..440128ebef3b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -111,11 +111,11 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); } -static struct super_block *debug_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int debug_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, debug_fill_super); + return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); } static struct file_system_type debug_fs_type = { diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 52f5059c4f31..51a97f132745 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2549,11 +2549,11 @@ static int devfs_fill_super(struct super_block *sb, void *data, int silent) return -EINVAL; } /* End Function devfs_fill_super */ -static struct super_block *devfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int devfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, devfs_fill_super); + return get_sb_single(fs_type, flags, data, devfs_fill_super, mnt); } static struct file_system_type devfs_fs_type = { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 14c5620b5cab..f7aef5bb584a 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -130,10 +130,10 @@ fail: return -ENOMEM; } -static struct super_block *devpts_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int devpts_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, devpts_fill_super); + return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); } static struct file_system_type devpts_fs_type = { diff --git a/fs/efs/super.c b/fs/efs/super.c index dff623e3ddbf..1ba5e14f879f 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -18,10 +18,10 @@ static int efs_statfs(struct super_block *s, struct kstatfs *buf); static int efs_fill_super(struct super_block *s, void *d, int silent); -static struct super_block *efs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int efs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); } static struct file_system_type efs_fs_type = { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2695337d4d64..08e7e6a555ca 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -268,9 +268,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout); static int eventpollfs_delete_dentry(struct dentry *dentry); static struct inode *ep_eventpoll_inode(void); -static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data); +static int eventpollfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt); /* * This semaphore is used to serialize ep_free() and eventpoll_release_file(). 
@@ -1595,11 +1595,12 @@ eexit_1: } -static struct super_block * +static int eventpollfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) + const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC); + return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC, + mnt); } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7e30bae174ed..a4dfffac5967 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1087,10 +1087,10 @@ static int ext2_statfs (struct super_block * sb, struct kstatfs * buf) return 0; } -static struct super_block *ext2_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ext2_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); } #ifdef CONFIG_QUOTA diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f8a5266ea1ff..657f8e73b62f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2646,10 +2646,10 @@ out: #endif -static struct super_block *ext3_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ext3_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); } static struct file_system_type ext3_fs_type = { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index b44c916d24a1..d76eeaafbde2 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -241,10 +241,11 @@ out: /* * The usual module blurb. 
*/ -static struct super_block *vxfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int vxfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, + mnt); } static struct file_system_type vxfs_fs_type = { diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7627022446b2..c91f0a50aadb 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -569,11 +569,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static struct super_block *fuse_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) +static int fuse_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super); + return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); } static struct file_system_type fuse_fs_type = { diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 1181d116117d..ee5b80a409e8 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -413,10 +413,11 @@ bail: return res; } -static struct super_block *hfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt); } static struct file_system_type hfs_fs_type = { diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7843f792a4b7..0ed8b7e8e87f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -450,10 +450,12 @@ static void hfsplus_destroy_inode(struct inode *inode) #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) -static struct super_block *hfsplus_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfsplus_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, + mnt); } static struct file_system_type hfsplus_fs_type = { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index bf0f8e16e433..04035e08f5c1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -993,11 +993,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) return(err); } -static struct super_block *hostfs_read_sb(struct file_system_type *type, - int flags, const char *dev_name, - void *data) +static int hostfs_read_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common)); + return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); } static struct file_system_type hostfs_type = { diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index d72d8c87c996..3b25cf3e2e65 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -662,10 +662,11 @@ bail0: return -EINVAL; } -static struct super_block *hpfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hpfs_get_sb(struct 
file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, + mnt); } static struct file_system_type hpfs_fs_type = { diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 5e6363be246f..ec43c22bc9c0 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -769,11 +769,11 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) return(err); } -static struct super_block *hppfs_read_super(struct file_system_type *type, - int flags, const char *dev_name, - void *data) +static int hppfs_read_super(struct file_system_type *type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return(get_sb_nodev(type, flags, data, hppfs_fill_super)); + return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); } static struct file_system_type hppfs_type = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3a5b4e923455..4665c26171f7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -723,10 +723,10 @@ void hugetlb_put_quota(struct address_space *mapping) } } -static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hugetlbfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super); + return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); } static struct file_system_type hugetlbfs_fs_type = { diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 9e9931e2badd..f2386442adee 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -672,11 +672,11 @@ out: return ret; } -static struct super_block * +static int inotify_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) + const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA, mnt); } static struct file_system_type inotify_fs_type = { diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 70adbb98bad1..17268da63a49 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1399,10 +1399,11 @@ struct inode *isofs_iget(struct super_block *sb, return inode; } -static struct super_block *isofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int isofs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, + mnt); } static struct file_system_type iso9660_fs_type = { diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 020cc097c539..dd93a091ad67 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -1785,10 +1785,11 @@ static struct super_operations jffs_ops = .remount_fs = jffs_remount, }; -static struct super_block *jffs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int jffs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super, + 
mnt); } static struct file_system_type jffs_fs_type = { diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 9d0521451f59..2378a662c256 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -111,9 +111,10 @@ static int jffs2_sb_set(struct super_block *sb, void *data) return 0; } -static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct mtd_info *mtd) +static int jffs2_get_sb_mtd(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct mtd_info *mtd, + struct vfsmount *mnt) { struct super_block *sb; struct jffs2_sb_info *c; @@ -121,19 +122,20 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, c = kmalloc(sizeof(*c), GFP_KERNEL); if (!c) - return ERR_PTR(-ENOMEM); + return -ENOMEM; memset(c, 0, sizeof(*c)); c->mtd = mtd; sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c); if (IS_ERR(sb)) - goto out_put; + goto out_error; if (sb->s_root) { /* New mountpoint for JFFS2 which is already mounted */ D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n", mtd->index, mtd->name)); + ret = simple_set_mnt(mnt, sb); goto out_put; } @@ -161,44 +163,47 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, /* Failure case... */ up_write(&sb->s_umount); deactivate_super(sb); - return ERR_PTR(ret); + return ret; } sb->s_flags |= MS_ACTIVE; - return sb; + return simple_set_mnt(mnt, sb); +out_error: + ret = PTR_ERR(sb); out_put: kfree(c); put_mtd_device(mtd); - return sb; + return ret; } -static struct super_block *jffs2_get_sb_mtdnr(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, int mtdnr) +static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, int mtdnr, + struct vfsmount *mnt) { struct mtd_info *mtd; mtd = get_mtd_device(NULL, mtdnr); if (!mtd) { D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr)); - return ERR_PTR(-EINVAL); + return -EINVAL; } - return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd); + return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); } -static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int jffs2_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { int err; struct nameidata nd; int mtdnr; if (!dev_name) - return ERR_PTR(-EINVAL); + return -EINVAL; D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name)); @@ -220,7 +225,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, mtd = get_mtd_device(NULL, mtdnr); if (mtd) { if (!strcmp(mtd->name, dev_name+4)) - return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd); + return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); put_mtd_device(mtd); } } @@ -233,7 +238,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, if (!*endptr) { /* It was a valid number */ D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr)); - return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr); + return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt); } } } @@ -247,7 +252,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, err, nd.dentry->d_inode)); if (err) - return ERR_PTR(err); + return err; err = -EINVAL; @@ -269,11 +274,11 @@ static struct 
super_block *jffs2_get_sb(struct file_system_type *fs_type, mtdnr = iminor(nd.dentry->d_inode); path_release(&nd); - return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr); + return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt); out: path_release(&nd); - return ERR_PTR(err); + return err; } static void jffs2_put_super (struct super_block *sb) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index db6f41d6dd60..18a28137b90e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -565,10 +565,11 @@ static void jfs_unlockfs(struct super_block *sb) } } -static struct super_block *jfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int jfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, + mnt); } static int jfs_sync_fs(struct super_block *sb, int wait) diff --git a/fs/libfs.c b/fs/libfs.c index 7145ba7a48d0..7d70efa46da9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -196,9 +196,9 @@ struct inode_operations simple_dir_inode_operations = { * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ -struct super_block * -get_sb_pseudo(struct file_system_type *fs_type, char *name, - struct super_operations *ops, unsigned long magic) +int get_sb_pseudo(struct file_system_type *fs_type, char *name, + struct super_operations *ops, unsigned long magic, + struct vfsmount *mnt) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); static struct super_operations default_ops = {.statfs = simple_statfs}; @@ -207,7 +207,7 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name, struct qstr d_name = {.name = name, .len = strlen(name)}; if (IS_ERR(s)) - return s; + return PTR_ERR(s); s->s_flags = MS_NOUSER; s->s_maxbytes = ~0ULL; @@ -232,12 +232,12 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); Enomem: up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 2dcccf1d1b7f..14f24dfbfe30 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -559,10 +559,11 @@ void minix_truncate(struct inode * inode) V2_minix_truncate(inode); } -static struct super_block *minix_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int minix_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super, + mnt); } static struct file_system_type minix_fs_type = { diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 5b76ccd19e3f..9e44158a7540 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -661,11 +661,12 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *msdos_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int msdos_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return 
get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, + mnt); } static struct file_system_type msdos_fs_type = { diff --git a/fs/namespace.c b/fs/namespace.c index bf478addb852..c13072a5f1ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -86,6 +86,15 @@ struct vfsmount *alloc_vfsmnt(const char *name) return mnt; } +int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +{ + mnt->mnt_sb = sb; + mnt->mnt_root = dget(sb->s_root); + return 0; +} + +EXPORT_SYMBOL(simple_set_mnt); + void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index a1f3e972c6ef..8db033fab3fd 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -957,10 +957,10 @@ out: return result; } -static struct super_block *ncp_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ncp_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, ncp_fill_super); + return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt); } static struct file_system_type ncp_fs_type = { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d0b991a92327..ff645a961bc8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1690,8 +1690,8 @@ static int nfs_compare_super(struct super_block *sb, void *data) return !nfs_compare_fh(&old->fh, &server->fh); } -static struct super_block *nfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) +static int nfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { int error; struct nfs_server *server = NULL; @@ -1699,14 +1699,14 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, struct nfs_fh *root; struct nfs_mount_data *data = raw_data; - s = ERR_PTR(-EINVAL); + error = -EINVAL; if (data == NULL) { dprintk("%s: missing data argument\n", __FUNCTION__); - goto out_err; + goto out_err_noserver; } if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { dprintk("%s: bad mount version\n", __FUNCTION__); - goto out_err; + goto out_err_noserver; } switch (data->version) { case 1: @@ -1718,7 +1718,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, dprintk("%s: mount structure version %d does not support NFSv3\n", __FUNCTION__, data->version); - goto out_err; + goto out_err_noserver; } data->root.size = NFS2_FHSIZE; memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); @@ -1727,24 +1727,24 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, dprintk("%s: mount structure version %d does not support strong security\n", __FUNCTION__, data->version); - goto out_err; + goto out_err_noserver; } case 5: memset(data->context, 0, sizeof(data->context)); } #ifndef CONFIG_NFS_V3 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ - s = ERR_PTR(-EPROTONOSUPPORT); + error = -EPROTONOSUPPORT; if (data->flags & NFS_MOUNT_VER3) { dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); - goto out_err; + goto out_err_noserver; } #endif /* CONFIG_NFS_V3 */ - s = ERR_PTR(-ENOMEM); + error = -ENOMEM; server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) - goto out_err; + goto out_err_noserver; /* Zero out the NFS state stuff */ init_nfsv4_state(server); server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); @@ 
-1754,7 +1754,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, root->size = data->root.size; else root->size = NFS2_FHSIZE; - s = ERR_PTR(-EINVAL); + error = -EINVAL; if (root->size > sizeof(root->data)) { dprintk("%s: invalid root filehandle\n", __FUNCTION__); goto out_err; @@ -1770,15 +1770,20 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, } /* Fire up rpciod if not yet running */ - s = ERR_PTR(rpciod_up()); - if (IS_ERR(s)) { - dprintk("%s: couldn't start rpciod! Error = %ld\n", - __FUNCTION__, PTR_ERR(s)); + error = rpciod_up(); + if (error < 0) { + dprintk("%s: couldn't start rpciod! Error = %d\n", + __FUNCTION__, error); goto out_err; } s = sget(fs_type, nfs_compare_super, nfs_set_super, server); - if (IS_ERR(s) || s->s_root) + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_rpciod; + } + + if (s->s_root) goto out_rpciod_down; s->s_flags = flags; @@ -1787,15 +1792,22 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); + out_rpciod_down: rpciod_down(); + kfree(server); + return simple_set_mnt(mnt, s); + +out_err_rpciod: + rpciod_down(); out_err: kfree(server); - return s; +out_err_noserver: + return error; } static void nfs_kill_super(struct super_block *s) @@ -2032,8 +2044,8 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) return dst; } -static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) +static int nfs4_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { int error; struct nfs_server *server; @@ -2043,16 +2055,16 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, if (data == NULL) { dprintk("%s: missing data argument\n", __FUNCTION__); - return ERR_PTR(-EINVAL); + return -EINVAL; } if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) { dprintk("%s: bad mount version\n", __FUNCTION__); - return ERR_PTR(-EINVAL); + return -EINVAL; } server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) - return ERR_PTR(-ENOMEM); + return -ENOMEM; /* Zero out the NFS state stuff */ init_nfsv4_state(server); server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); @@ -2074,33 +2086,41 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, /* We now require that the mount process passes the remote address */ if (data->host_addrlen != sizeof(server->addr)) { - s = ERR_PTR(-EINVAL); + error = -EINVAL; goto out_free; } if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) { - s = ERR_PTR(-EFAULT); + error = -EFAULT; goto out_free; } if (server->addr.sin_family != AF_INET || server->addr.sin_addr.s_addr == INADDR_ANY) { dprintk("%s: mount program didn't pass remote IP address!\n", __FUNCTION__); - s = ERR_PTR(-EINVAL); + error = -EINVAL; goto out_free; } /* Fire up rpciod if not yet running */ - s = ERR_PTR(rpciod_up()); - if (IS_ERR(s)) { - dprintk("%s: couldn't start rpciod! Error = %ld\n", - __FUNCTION__, PTR_ERR(s)); + error = rpciod_up(); + if (error < 0) { + dprintk("%s: couldn't start rpciod! 
Error = %d\n", + __FUNCTION__, error); goto out_free; } s = sget(fs_type, nfs4_compare_super, nfs_set_super, server); - - if (IS_ERR(s) || s->s_root) + if (IS_ERR(s)) { + error = PTR_ERR(s); goto out_free; + } + + if (s->s_root) { + kfree(server->mnt_path); + kfree(server->hostname); + kfree(server); + return simple_set_mnt(mnt, s); + } s->s_flags = flags; @@ -2108,17 +2128,17 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); out_err: - s = (struct super_block *)p; + error = PTR_ERR(p); out_free: kfree(server->mnt_path); kfree(server->hostname); kfree(server); - return s; + return error; } static void nfs4_kill_super(struct super_block *sb) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3ef017b3b5bd..a1810e6a93e5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -494,10 +494,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) return simple_fill_super(sb, 0x6e667364, nfsd_files); } -static struct super_block *nfsd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int nfsd_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, nfsd_fill_super); + return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); } static struct file_system_type nfsd_fs_type = { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 27833f6df49f..d5d5e969294f 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3093,10 +3093,11 @@ struct kmem_cache *ntfs_index_ctx_cache; /* Driver wide mutex. 
*/ DEFINE_MUTEX(ntfs_lock); -static struct super_block *ntfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ntfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super, + mnt); } static struct file_system_type ntfs_fs_type = { diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7e88e24b3471..7273d9fa6bab 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -574,10 +574,10 @@ static struct inode_operations dlmfs_file_inode_operations = { .getattr = simple_getattr, }; -static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int dlmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super); + return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); } static struct file_system_type dlmfs_fs_type = { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 949b3dac30f1..788b8b50dc4c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -672,12 +672,14 @@ read_super_error: return status; } -static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int ocfs2_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, + mnt); } static struct file_system_type ocfs2_fs_type = { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 0f14276a2e51..464e2bce0203 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -1054,10 +1054,10 @@ out_no_root: return -ENOMEM; } -static struct super_block *openprom_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int openprom_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, openprom_fill_super); + return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); } static struct file_system_type openprom_fs_type = { diff --git a/fs/pipe.c b/fs/pipe.c index 5acd8954aaa0..20352573e025 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -979,12 +979,11 @@ no_files: * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. 
*/ - -static struct super_block * -pipefs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int pipefs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); + return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); } static struct file_system_type pipe_fs_type = { diff --git a/fs/proc/root.c b/fs/proc/root.c index c3fd3611112f..9995356ce73e 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -26,10 +26,10 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc struct proc_dir_entry *proc_sys_root; #endif -static struct super_block *proc_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int proc_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, proc_fill_super); + return get_sb_single(fs_type, flags, data, proc_fill_super, mnt); } static struct file_system_type proc_fs_type = { diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2ecd46f85e9f..e6cca5cd4b44 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -561,10 +561,11 @@ static void destroy_inodecache(void) "qnx4_inode_cache: not all structures were freed\n"); } -static struct super_block *qnx4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int qnx4_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super, + mnt); } static struct file_system_type qnx4_fs_type = { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 14bd2246fb6d..b9677335cc8d 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -185,16 +185,17 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) return 0; } -struct super_block *ramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +int ramfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, ramfs_fill_super); + return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt); } -static struct super_block *rootfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int rootfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); + return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super, + mnt); } static struct file_system_type ramfs_fs_type = { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index cae2abbc0c71..f3ff41d33989 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2249,11 +2249,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, #endif -static struct super_block *get_super_block(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int get_super_block(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, 
reiserfs_fill_super, + mnt); } static int __init init_reiserfs_fs(void) diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 9b9eda7b335c..4d6cd6621062 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -607,10 +607,11 @@ static struct super_operations romfs_ops = { .remount_fs = romfs_remount, }; -static struct super_block *romfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int romfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super, + mnt); } static struct file_system_type romfs_fs_type = { diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index fdeabc0a34f7..4a37c2bbfa3f 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -782,10 +782,10 @@ out: return error; } -static struct super_block *smb_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int smb_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, smb_fill_super); + return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt); } static struct file_system_type smb_fs_type = { diff --git a/fs/super.c b/fs/super.c index 9d5c2add7228..324c2d232f54 100644 --- a/fs/super.c +++ b/fs/super.c @@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb) if (root) { sb->s_root = NULL; shrink_dcache_parent(root); - shrink_dcache_anon(sb); + shrink_dcache_sb(sb); dput(root); fsync_super(sb); lock_super(sb); @@ -676,9 +676,10 @@ static void bdev_uevent(struct block_device *bdev, enum kobject_action action) } } -struct super_block *get_sb_bdev(struct file_system_type *fs_type, +int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) { struct block_device *bdev; struct super_block *s; @@ -686,7 +687,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, bdev = open_bdev_excl(dev_name, flags, fs_type); if (IS_ERR(bdev)) - return (struct super_block *)bdev; + return PTR_ERR(bdev); /* * once the super is inserted into the list by sget, s_umount @@ -697,15 +698,17 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); mutex_unlock(&bdev->bd_mount_mutex); if (IS_ERR(s)) - goto out; + goto error_s; if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { up_write(&s->s_umount); deactivate_super(s); - s = ERR_PTR(-EBUSY); + error = -EBUSY; + goto error_bdev; } - goto out; + + close_bdev_excl(bdev); } else { char b[BDEVNAME_SIZE]; @@ -716,18 +719,21 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - s = ERR_PTR(error); - } else { - s->s_flags |= MS_ACTIVE; - bdev_uevent(bdev, KOBJ_MOUNT); + goto error; } + + s->s_flags |= MS_ACTIVE; + bdev_uevent(bdev, KOBJ_MOUNT); } - return s; + return simple_set_mnt(mnt, s); -out: +error_s: + error = PTR_ERR(s); +error_bdev: close_bdev_excl(bdev); - return s; +error: + return error; } EXPORT_SYMBOL(get_sb_bdev); @@ -744,15 +750,16 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); -struct super_block *get_sb_nodev(struct 
file_system_type *fs_type, +int get_sb_nodev(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); if (IS_ERR(s)) - return s; + return PTR_ERR(s); s->s_flags = flags; @@ -760,10 +767,10 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); } EXPORT_SYMBOL(get_sb_nodev); @@ -773,94 +780,102 @@ static int compare_single(struct super_block *s, void *p) return 1; } -struct super_block *get_sb_single(struct file_system_type *fs_type, +int get_sb_single(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, NULL); if (IS_ERR(s)) - return s; + return PTR_ERR(s); if (!s->s_root) { s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; } do_remount_sb(s, flags, data, 0); - return s; + return simple_set_mnt(mnt, s); } EXPORT_SYMBOL(get_sb_single); struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { - struct file_system_type *type = get_fs_type(fstype); - struct super_block *sb = ERR_PTR(-ENOMEM); struct vfsmount *mnt; - int error; char *secdata = NULL; + int error; if (!type) return ERR_PTR(-ENODEV); + error = -ENOMEM; mnt = alloc_vfsmnt(name); if (!mnt) goto out; if (data) { secdata = alloc_secdata(); - if (!secdata) { - sb = ERR_PTR(-ENOMEM); + if (!secdata) goto out_mnt; - } error = security_sb_copy_data(type, data, secdata); - if (error) { - sb = ERR_PTR(error); + if (error) goto out_free_secdata; - } } - sb = type->get_sb(type, flags, name, data); - if (IS_ERR(sb)) + error = type->get_sb(type, flags, name, data, mnt); + if (error < 0) goto out_free_secdata; - error = security_sb_kern_mount(sb, secdata); + + error = security_sb_kern_mount(mnt->mnt_sb, secdata); if (error) goto out_sb; - mnt->mnt_sb = sb; - mnt->mnt_root = dget(sb->s_root); - mnt->mnt_mountpoint = sb->s_root; + + mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; - up_write(&sb->s_umount); + up_write(&mnt->mnt_sb->s_umount); free_secdata(secdata); - put_filesystem(type); return mnt; out_sb: - up_write(&sb->s_umount); - deactivate_super(sb); - sb = ERR_PTR(error); + dput(mnt->mnt_root); + up_write(&mnt->mnt_sb->s_umount); + deactivate_super(mnt->mnt_sb); out_free_secdata: free_secdata(secdata); out_mnt: free_vfsmnt(mnt); out: + return ERR_PTR(error); +} + +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct vfsmount *mnt; + if (!type) + return ERR_PTR(-ENODEV); + mnt = vfs_kern_mount(type, flags, name, data); put_filesystem(type); - return (struct vfsmount *)sb; + return mnt; } EXPORT_SYMBOL_GPL(do_kern_mount); struct vfsmount *kern_mount(struct file_system_type *type) { - return 
do_kern_mount(type->name, 0, type->name, NULL); + return vfs_kern_mount(type, 0, type->name, NULL); } EXPORT_SYMBOL(kern_mount); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f1117e885bd6..40190c489271 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -66,10 +66,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *sysfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, sysfs_fill_super); + return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); } static struct file_system_type sysfs_fs_type = { diff --git a/fs/sysv/super.c b/fs/sysv/super.c index e92b991e6dda..876639b93321 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -506,16 +506,17 @@ failed: /* Every kernel module contains stuff like this. */ -static struct super_block *sysv_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysv_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, + mnt); } -static struct super_block *v7_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int v7_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); } static struct file_system_type sysv_fs_type = { diff --git a/fs/udf/super.c b/fs/udf/super.c index e45789fe38e8..2250774a831d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -94,10 +94,10 @@ static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct super_block *, struct kstatfs *); /* UDF filesystem type */ -static struct super_block *udf_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int udf_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); } static struct file_system_type udf_fstype = { diff --git a/fs/ufs/super.c b/fs/ufs/super.c index db98a4c71e63..768fb8d9e67a 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1311,10 +1311,10 @@ out: #endif -static struct super_block *ufs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ufs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); } static struct file_system_type ufs_fs_type = { diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index a56cec3be5f0..9a8f48bae956 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -1023,11 +1023,12 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *vfat_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - 
void *data) +static int vfat_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, + mnt); } static struct file_system_type vfat_fs_type = { diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f2a0778536f4..d03c89a36655 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -853,14 +853,16 @@ fail_vfsop: return -error; } -STATIC struct super_block * +STATIC int xfs_fs_get_sb( struct file_system_type *fs_type, int flags, const char *dev_name, - void *data) + void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, + mnt); } STATIC struct super_operations xfs_super_operations = { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 46d0e079735d..0dd1610a94a9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -217,7 +217,6 @@ extern struct dentry * d_alloc_anon(struct inode *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_anon(struct super_block *); extern int d_invalidate(struct dentry *); /* only used at mount-time */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 73c7d6f04b31..3e50dd24af87 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1269,23 +1269,26 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, struct file_system_type { const char *name; int fs_flags; - struct super_block *(*get_sb) (struct file_system_type *, int, - const char *, void *); + int (*get_sb) (struct file_system_type *, int, + const char *, void *, struct vfsmount *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct list_head fs_supers; }; -struct super_block *get_sb_bdev(struct file_system_type *fs_type, +extern int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); -struct super_block *get_sb_single(struct file_system_type *fs_type, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); +extern int get_sb_single(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); -struct super_block *get_sb_nodev(struct file_system_type *fs_type, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); +extern int get_sb_nodev(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); @@ -1296,8 +1299,10 @@ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), void *data); -struct super_block *get_sb_pseudo(struct file_system_type *, char *, - struct super_operations *ops, unsigned long); +extern int get_sb_pseudo(struct file_system_type *, char *, + struct super_operations *ops, unsigned long, + struct vfsmount 
*mnt); +extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); int __put_super(struct super_block *sb); int __put_super_and_need_restart(struct super_block *sb); void unnamed_dev_init(void); diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 78ecfa28b1c2..00b340ba6612 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -2,8 +2,8 @@ #define _LINUX_RAMFS_H struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev); -struct super_block *ramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data); +extern int ramfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt); #ifndef CONFIG_MMU extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 1511714a9585..0a2a24b6ebe4 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -205,11 +205,11 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *mqueue_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int mqueue_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, mqueue_fill_super); + return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt); } static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572b..77f45ffd5ea1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, return 0; } -static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super); + return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); } static struct file_system_type cpuset_fs_type = { diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057b..e1a380c77a5a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, (unsigned long)uaddr2, val2, val3); } -static struct super_block * -futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); } static struct file_system_type futex_fs_type = { diff --git a/mm/shmem.c b/mm/shmem.c index 1e43c8a865ba..7617bb1c6bf7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2233,10 +2233,10 @@ static struct vm_operations_struct shmem_vm_ops = { }; -static struct super_block *shmem_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, shmem_fill_super); + return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } static struct file_system_type tmpfs_fs_type = { diff --git a/net/socket.c 
b/net/socket.c index 02948b622bd2..565f5e8d1191 100644 --- a/net/socket.c +++ b/net/socket.c @@ -335,10 +335,11 @@ static struct super_operations sockfs_ops = { .statfs = simple_statfs, }; -static struct super_block *sockfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sockfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); + return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC, + mnt); } static struct vfsmount *sock_mnt __read_mostly; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index cc673dd8433f..8241fa726803 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -815,11 +815,11 @@ out: return -ENOMEM; } -static struct super_block * +static int rpc_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, rpc_fill_super); + return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt); } static struct file_system_type rpc_pipe_fs_type = { diff --git a/security/inode.c b/security/inode.c index 0f77b0223662..e6fc29ac8564 100644 --- a/security/inode.c +++ b/security/inode.c @@ -135,11 +135,11 @@ static int fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, SECURITYFS_MAGIC, files); } -static struct super_block *get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, fill_super); + return get_sb_single(fs_type, flags, data, fill_super, mnt); } static struct file_system_type fs_type = { diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 2e73d3279f2d..7029bbc9bef8 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1345,10 +1345,11 @@ err: goto out; } -static struct super_block *sel_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sel_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, sel_fill_super); + return get_sb_single(fs_type, flags, data, sel_fill_super, mnt); } static struct file_system_type sel_fs_type = { -- cgit v1.2.3 From 726c334223180e3c0197cc980a432681370d4baf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:58 -0700 Subject: [PATCH] VFS: Permit filesystem to perform statfs with a known root dentry Give the statfs superblock operation a dentry pointer rather than a superblock pointer. This complements the get_sb() patch. That reduced the significance of sb->s_root, allowing NFS to place a fake root there. However, NFS does require a dentry to use as a target for the statfs operation. This permits the root in the vfsmount to be used instead. linux/mount.h has been added where necessary to make allyesconfig build successfully. Interest has also been expressed for use with the FUSE and XFS filesystems. 
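As a rough sketch of what this means for an individual filesystem (the filesystem name, magic number and includes below are illustrative assumptions, not taken from the patch), a ->statfs implementation now receives a dentry and reaches its superblock through dentry->d_sb:

    #include <linux/fs.h>
    #include <linux/statfs.h>
    #include <linux/dcache.h>
    #include <linux/limits.h>

    /* hypothetical filesystem; mirrors the pattern used by the converted callers above */
    #define EXAMPLEFS_MAGIC 0x12345678	/* assumed magic number for illustration */

    static int examplefs_statfs(struct dentry *dentry, struct kstatfs *buf)
    {
    	struct super_block *sb = dentry->d_sb;	/* superblock is now reached via the dentry */

    	buf->f_type    = EXAMPLEFS_MAGIC;
    	buf->f_bsize   = sb->s_blocksize;
    	buf->f_namelen = NAME_MAX;
    	return 0;
    }

Callers that previously passed a superblock pass a dentry instead, typically one they already hold or the mount's root, as in the hunks below: vfs_statfs(nd.dentry, &kbuf), vfs_statfs(file->f_dentry, &tmp), or vfs_statfs(mnt->mnt_root, &s).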
Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- arch/alpha/kernel/osf_sys.c | 2 +- arch/mips/kernel/sysirix.c | 12 ++++++------ arch/parisc/hpux/sys_hpux.c | 10 +++++----- arch/sparc64/solaris/fs.c | 4 ++-- fs/adfs/super.c | 8 ++++---- fs/affs/super.c | 5 +++-- fs/befs/linuxvfs.c | 5 +++-- fs/bfs/inode.c | 3 ++- fs/cifs/cifsfs.c | 3 ++- fs/coda/inode.c | 6 +++--- fs/coda/upcall.c | 4 ++-- fs/compat.c | 8 ++++---- fs/cramfs/inode.c | 4 +++- fs/efs/super.c | 6 +++--- fs/ext2/super.c | 5 +++-- fs/ext3/super.c | 5 +++-- fs/fat/inode.c | 8 ++++---- fs/freevxfs/vxfs_super.c | 13 +++++++------ fs/fuse/inode.c | 3 ++- fs/hfs/super.c | 4 +++- fs/hfsplus/super.c | 4 +++- fs/hostfs/hostfs_kern.c | 4 ++-- fs/hpfs/super.c | 3 ++- fs/hppfs/hppfs_kern.c | 2 +- fs/hugetlbfs/inode.c | 4 ++-- fs/isofs/inode.c | 6 ++++-- fs/jffs/inode-v23.c | 4 ++-- fs/jffs2/fs.c | 4 ++-- fs/jffs2/os-linux.h | 2 +- fs/jfs/super.c | 4 ++-- fs/libfs.c | 4 ++-- fs/minix/inode.c | 10 +++++----- fs/ncpfs/inode.c | 5 +++-- fs/nfs/inode.c | 5 +++-- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/ntfs/super.c | 7 ++++--- fs/ocfs2/super.c | 10 +++++----- fs/open.c | 26 +++++++++++++------------- fs/qnx4/inode.c | 6 ++++-- fs/reiserfs/super.c | 8 ++++---- fs/romfs/inode.c | 4 ++-- fs/smbfs/inode.c | 6 +++--- fs/smbfs/proc.c | 4 ++-- fs/smbfs/proto.h | 2 +- fs/super.c | 2 +- fs/sysv/inode.c | 3 ++- fs/udf/super.c | 6 ++++-- fs/ufs/super.c | 3 ++- fs/xfs/linux-2.6/xfs_super.c | 4 ++-- include/linux/coda_psdev.h | 2 +- include/linux/fs.h | 6 +++--- include/linux/mount.h | 5 +++++ include/linux/security.h | 14 +++++++------- kernel/acct.c | 2 +- mm/shmem.c | 4 ++-- security/dummy.c | 2 +- security/selinux/hooks.c | 6 +++--- 60 files changed, 175 insertions(+), 144 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 3abf08f1b14a..d31efbbdfe50 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -99,7 +99,7 @@ prototypes: int (*sync_fs)(struct super_block *sb, int wait); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); - int (*statfs) (struct super_block *, struct kstatfs *); + int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index dd7d0dcedc87..9d3aed628bc1 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -211,7 +211,7 @@ struct super_operations { int (*sync_fs)(struct super_block *sb, int wait); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); - int (*statfs) (struct super_block *, struct kstatfs *); + int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 31afe3d91ac6..e15dcf4f3dcd 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -244,7 +244,7 @@ do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer, unsigned long bufsiz) { struct kstatfs linux_stat; - int error = 
vfs_statfs(dentry->d_inode->i_sb, &linux_stat); + int error = vfs_statfs(dentry, &linux_stat); if (!error) error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); return error; diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 5407b784cd01..19e1ef43eb4b 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -694,7 +694,7 @@ asmlinkage int irix_statfs(const char __user *path, if (error) goto out; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(nd.dentry, &kbuf); if (error) goto dput_and_out; @@ -732,7 +732,7 @@ asmlinkage int irix_fstatfs(unsigned int fd, struct irix_statfs __user *buf) goto out; } - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(file->f_dentry, &kbuf); if (error) goto out_f; @@ -1360,7 +1360,7 @@ asmlinkage int irix_statvfs(char __user *fname, struct irix_statvfs __user *buf) error = user_path_walk(fname, &nd); if (error) goto out; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(nd.dentry, &kbuf); if (error) goto dput_and_out; @@ -1406,7 +1406,7 @@ asmlinkage int irix_fstatvfs(int fd, struct irix_statvfs __user *buf) error = -EBADF; goto out; } - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(file->f_dentry, &kbuf); if (error) goto out_f; @@ -1611,7 +1611,7 @@ asmlinkage int irix_statvfs64(char __user *fname, struct irix_statvfs64 __user * error = user_path_walk(fname, &nd); if (error) goto out; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(nd.dentry, &kbuf); if (error) goto dput_and_out; @@ -1658,7 +1658,7 @@ asmlinkage int irix_fstatvfs64(int fd, struct irix_statvfs __user *buf) error = -EBADF; goto out; } - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(file->f_dentry, &kbuf); if (error) goto out_f; diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index 05273ccced0e..cb69727027ae 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -145,7 +145,7 @@ static int hpux_ustat(dev_t dev, struct hpux_ustat __user *ubuf) s = user_get_super(dev); if (s == NULL) goto out; - err = vfs_statfs(s, &sbuf); + err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) goto out; @@ -186,12 +186,12 @@ struct hpux_statfs { int16_t f_pad; }; -static int vfs_statfs_hpux(struct super_block *sb, struct hpux_statfs *buf) +static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(sb, &st); + retval = vfs_statfs(dentry, &st); if (retval) return retval; @@ -219,7 +219,7 @@ asmlinkage long hpux_statfs(const char __user *path, error = user_path_walk(path, &nd); if (!error) { struct hpux_statfs tmp; - error = vfs_statfs_hpux(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_hpux(nd.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -237,7 +237,7 @@ asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf) file = fget(fd); if (!file) goto out; - error = vfs_statfs_hpux(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_hpux(file->f_dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c index 4885ca6cbc77..0f0eb6aa1c40 100644 --- a/arch/sparc64/solaris/fs.c +++ b/arch/sparc64/solaris/fs.c @@ -356,7 +356,7 @@ static int report_statvfs(struct vfsmount *mnt, struct inode 
*inode, u32 buf) int error; struct sol_statvfs __user *ss = A(buf); - error = vfs_statfs(mnt->mnt_sb, &s); + error = vfs_statfs(mnt->mnt_root, &s); if (!error) { const char *p = mnt->mnt_sb->s_type->name; int i = 0; @@ -392,7 +392,7 @@ static int report_statvfs64(struct vfsmount *mnt, struct inode *inode, u32 buf) int error; struct sol_statvfs64 __user *ss = A(buf); - error = vfs_statfs(mnt->mnt_sb, &s); + error = vfs_statfs(mnt->mnt_root, &s); if (!error) { const char *p = mnt->mnt_sb->s_type->name; int i = 0; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 1b58a9b7f6aa..ba1c88af49fe 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -196,17 +196,17 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data) return parse_options(sb, data); } -static int adfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb); buf->f_type = ADFS_SUPER_MAGIC; buf->f_namelen = asb->s_namelen; - buf->f_bsize = sb->s_blocksize; + buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_blocks = asb->s_size; buf->f_files = asb->s_ids_per_zone * asb->s_map_size; buf->f_bavail = - buf->f_bfree = adfs_map_free(sb); + buf->f_bfree = adfs_map_free(dentry->d_sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; return 0; diff --git a/fs/affs/super.c b/fs/affs/super.c index 6a52e7875403..8765cba35bb9 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -18,7 +18,7 @@ extern struct timezone sys_tz; -static int affs_statfs(struct super_block *sb, struct kstatfs *buf); +static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); static void @@ -508,8 +508,9 @@ affs_remount(struct super_block *sb, int *flags, char *data) } static int -affs_statfs(struct super_block *sb, struct kstatfs *buf) +affs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; int free; pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 6ed07a5f10c6..08201fab26cd 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -49,7 +49,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static void befs_put_super(struct super_block *); static int befs_remount(struct super_block *, int *, char *); -static int befs_statfs(struct super_block *, struct kstatfs *); +static int befs_statfs(struct dentry *, struct kstatfs *); static int parse_options(char *, befs_mount_options *); static const struct super_operations befs_sops = { @@ -880,8 +880,9 @@ befs_remount(struct super_block *sb, int *flags, char *data) } static int -befs_statfs(struct super_block *sb, struct kstatfs *buf) +befs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; befs_debug(sb, "---> befs_statfs()"); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index e7da03f63a5a..cf74f3d4d966 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -203,8 +203,9 @@ static void bfs_put_super(struct super_block *s) s->s_fs_info = NULL; } -static int bfs_statfs(struct super_block *s, struct kstatfs *buf) +static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *s = dentry->d_sb; struct bfs_sb_info *info = BFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); 
buf->f_type = BFS_MAGIC; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 08b35801dfed..7520f4687158 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -166,8 +166,9 @@ cifs_put_super(struct super_block *sb) } static int -cifs_statfs(struct super_block *sb, struct kstatfs *buf) +cifs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; int xid; int rc = -EOPNOTSUPP; struct cifs_sb_info *cifs_sb; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cba70201567d..87f1dc8aa24b 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -36,7 +36,7 @@ /* VFS super_block ops */ static void coda_clear_inode(struct inode *); static void coda_put_super(struct super_block *); -static int coda_statfs(struct super_block *sb, struct kstatfs *buf); +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); static kmem_cache_t * coda_inode_cachep; @@ -278,13 +278,13 @@ struct inode_operations coda_file_inode_operations = { .setattr = coda_setattr, }; -static int coda_statfs(struct super_block *sb, struct kstatfs *buf) +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) { int error; lock_kernel(); - error = venus_statfs(sb, buf); + error = venus_statfs(dentry, buf); unlock_kernel(); diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 1bae99650a91..b040eba13a7d 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -611,7 +611,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, return error; } -int venus_statfs(struct super_block *sb, struct kstatfs *sfs) +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) { union inputArgs *inp; union outputArgs *outp; @@ -620,7 +620,7 @@ int venus_statfs(struct super_block *sb, struct kstatfs *sfs) insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs)); UPARG(CODA_STATFS); - error = coda_upcall(coda_sbp(sb), insize, &outsize, inp); + error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp); if (!error) { sfs->f_blocks = outp->coda_statfs.stat.f_blocks; diff --git a/fs/compat.c b/fs/compat.c index b1f64786a613..7e7e5bc4f3cf 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -197,7 +197,7 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs error = user_path_walk(path, &nd); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(nd.dentry, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); path_release(&nd); @@ -215,7 +215,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user file = fget(fd); if (!file) goto out; - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(file->f_dentry, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); fput(file); @@ -265,7 +265,7 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s error = user_path_walk(path, &nd); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(nd.dentry, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); path_release(&nd); @@ -286,7 +286,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c file = fget(fd); if (!file) goto out; - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(file->f_dentry, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); fput(file); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 37a91a153aa5..8a9d5d3b3262 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ 
-322,8 +322,10 @@ out: return -EINVAL; } -static int cramfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_blocks = CRAMFS_SB(sb)->blocks; diff --git a/fs/efs/super.c b/fs/efs/super.c index 1ba5e14f879f..8ac2462ae5dd 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -15,7 +15,7 @@ #include #include -static int efs_statfs(struct super_block *s, struct kstatfs *buf); +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); static int efs_fill_super(struct super_block *s, void *d, int silent); static int efs_get_sb(struct file_system_type *fs_type, @@ -322,8 +322,8 @@ out_no_fs: return -EINVAL; } -static int efs_statfs(struct super_block *s, struct kstatfs *buf) { - struct efs_sb_info *sb = SUPER_INFO(s); +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index a4dfffac5967..a6c4d6e02324 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -39,7 +39,7 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es); static int ext2_remount (struct super_block * sb, int * flags, char * data); -static int ext2_statfs (struct super_block * sb, struct kstatfs * buf); +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); void ext2_error (struct super_block * sb, const char * function, const char * fmt, ...) @@ -1038,8 +1038,9 @@ restore_opts: return err; } -static int ext2_statfs (struct super_block * sb, struct kstatfs * buf) +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) { + struct super_block *sb = dentry->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); unsigned long overhead; int i; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 657f8e73b62f..e08b6439617c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -58,7 +58,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait); static const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]); static int ext3_remount (struct super_block * sb, int * flags, char * data); -static int ext3_statfs (struct super_block * sb, struct kstatfs * buf); +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); static void ext3_unlockfs(struct super_block *sb); static void ext3_write_super (struct super_block * sb); static void ext3_write_super_lockfs(struct super_block *sb); @@ -2318,8 +2318,9 @@ restore_opts: return err; } -static int ext3_statfs (struct super_block * sb, struct kstatfs * buf) +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) { + struct super_block *sb = dentry->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; unsigned long overhead; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c1ce284f8a94..7c35d582ec10 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -539,18 +539,18 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) return 0; } -static int fat_statfs(struct super_block *sb, struct kstatfs *buf) +static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); /* If the count of free cluster is still unknown, counts it here. 
*/ if (sbi->free_clusters == -1) { - int err = fat_count_free_clusters(sb); + int err = fat_count_free_clusters(dentry->d_sb); if (err) return err; } - buf->f_type = sb->s_magic; + buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index d76eeaafbde2..b74b791fc23b 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "vxfs.h" #include "vxfs_extern.h" @@ -55,7 +56,7 @@ MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */ static void vxfs_put_super(struct super_block *); -static int vxfs_statfs(struct super_block *, struct kstatfs *); +static int vxfs_statfs(struct dentry *, struct kstatfs *); static int vxfs_remount(struct super_block *, int *, char *); static struct super_operations vxfs_super_ops = { @@ -90,12 +91,12 @@ vxfs_put_super(struct super_block *sbp) /** * vxfs_statfs - get filesystem information - * @sbp: VFS superblock + * @dentry: VFS dentry to locate superblock * @bufp: output buffer * * Description: * vxfs_statfs fills the statfs buffer @bufp with information - * about the filesystem described by @sbp. + * about the filesystem described by @dentry. * * Returns: * Zero. @@ -107,12 +108,12 @@ vxfs_put_super(struct super_block *sbp) * This is everything but complete... */ static int -vxfs_statfs(struct super_block *sbp, struct kstatfs *bufp) +vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) { - struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); bufp->f_type = VXFS_SUPER_MAGIC; - bufp->f_bsize = sbp->s_blocksize; + bufp->f_bsize = dentry->d_sb->s_blocksize; bufp->f_blocks = infp->vsi_raw->vs_dsize; bufp->f_bfree = infp->vsi_raw->vs_free; bufp->f_bavail = 0; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c91f0a50aadb..a13c0f529058 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -236,8 +236,9 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr /* fsid is left zero */ } -static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) +static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_req *req; struct fuse_statfs_out outarg; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index ee5b80a409e8..d9227bf14e86 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -80,8 +80,10 @@ static void hfs_put_super(struct super_block *sb) * * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. 
*/ -static int hfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = HFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 0ed8b7e8e87f..0a92fa2336a2 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -212,8 +212,10 @@ static void hfsplus_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf) +static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 04035e08f5c1..8e0d37743e7c 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -239,7 +239,7 @@ static int read_inode(struct inode *ino) return(err); } -int hostfs_statfs(struct super_block *sb, struct kstatfs *sf) +int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) { /* do_statfs uses struct statfs64 internally, but the linux kernel * struct statfs still has 32-bit versions for most of these fields, @@ -252,7 +252,7 @@ int hostfs_statfs(struct super_block *sb, struct kstatfs *sf) long long f_files; long long f_ffree; - err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename, + err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), &sf->f_namelen, sf->f_spare); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 3b25cf3e2e65..f798480a363f 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -135,8 +135,9 @@ static unsigned count_bitmaps(struct super_block *s) return count; } -static int hpfs_statfs(struct super_block *s, struct kstatfs *buf) +static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); lock_kernel(); diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index ec43c22bc9c0..3a9bdf58166f 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -616,7 +616,7 @@ static const struct file_operations hppfs_dir_fops = { .fsync = hppfs_fsync, }; -static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf) +static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) { sf->f_blocks = 0; sf->f_bfree = 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4665c26171f7..678fc72c3646 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -467,9 +467,9 @@ static int hugetlbfs_set_page_dirty(struct page *page) return 0; } -static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = HPAGE_SIZE; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 17268da63a49..3f9c8ba1fa1f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -56,7 +56,7 @@ static void isofs_put_super(struct super_block *sb) } static void isofs_read_inode(struct inode *); -static int isofs_statfs (struct super_block *, struct kstatfs *); +static int 
isofs_statfs (struct dentry *, struct kstatfs *); static kmem_cache_t *isofs_inode_cachep; @@ -901,8 +901,10 @@ out_freesbi: return -EINVAL; } -static int isofs_statfs (struct super_block *sb, struct kstatfs *buf) +static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (ISOFS_SB(sb)->s_nzones diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index dd93a091ad67..9e46ea6da752 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -377,9 +377,9 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode, /* Get statistics of the file system. */ static int -jffs_statfs(struct super_block *sb, struct kstatfs *buf) +jffs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct jffs_control *c = (struct jffs_control *) sb->s_fs_info; + struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info; struct jffs_fmcontrol *fmc; lock_kernel(); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 7b6c24b14f85..2900ec3ec3af 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -192,9 +192,9 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) return rc; } -int jffs2_statfs(struct super_block *sb, struct kstatfs *buf) +int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb); unsigned long avail; buf->f_type = JFFS2_SUPER_MAGIC; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index cd4021bcb944..6b5223565405 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -175,7 +175,7 @@ void jffs2_clear_inode (struct inode *); void jffs2_dirty_inode(struct inode *inode); struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri); -int jffs2_statfs (struct super_block *, struct kstatfs *); +int jffs2_statfs (struct dentry *, struct kstatfs *); void jffs2_write_super (struct super_block *); int jffs2_remount_fs (struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 18a28137b90e..73d2aba084c6 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -139,9 +139,9 @@ static void jfs_destroy_inode(struct inode *inode) kmem_cache_free(jfs_inode_cachep, ji); } -static int jfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); s64 maxinodes; struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; diff --git a/fs/libfs.c b/fs/libfs.c index 7d70efa46da9..1b1156381787 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,9 +20,9 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -int simple_statfs(struct super_block *sb, struct kstatfs *buf) +int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { - buf->f_type = sb->s_magic; + buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; return 0; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 14f24dfbfe30..a6fb509b7341 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -19,7 +19,7 @@ static void minix_read_inode(struct inode * inode); static int minix_write_inode(struct inode * inode, int wait); -static int minix_statfs(struct super_block *sb, struct kstatfs *buf); +static int minix_statfs(struct dentry 
*dentry, struct kstatfs *buf); static int minix_remount (struct super_block * sb, int * flags, char * data); static void minix_delete_inode(struct inode *inode) @@ -296,11 +296,11 @@ out_bad_sb: return -EINVAL; } -static int minix_statfs(struct super_block *sb, struct kstatfs *buf) +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct minix_sb_info *sbi = minix_sb(sb); - buf->f_type = sb->s_magic; - buf->f_bsize = sb->s_blocksize; + struct minix_sb_info *sbi = minix_sb(dentry->d_sb); + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; buf->f_bfree = minix_count_free_blocks(sbi); buf->f_bavail = buf->f_bfree; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 8db033fab3fd..90d2ea28f333 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -39,7 +39,7 @@ static void ncp_delete_inode(struct inode *); static void ncp_put_super(struct super_block *); -static int ncp_statfs(struct super_block *, struct kstatfs *); +static int ncp_statfs(struct dentry *, struct kstatfs *); static kmem_cache_t * ncp_inode_cachep; @@ -724,13 +724,14 @@ static void ncp_put_super(struct super_block *sb) kfree(server); } -static int ncp_statfs(struct super_block *sb, struct kstatfs *buf) +static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) { struct dentry* d; struct inode* i; struct ncp_inode_info* ni; struct ncp_server* s; struct ncp_volume_info vi; + struct super_block *sb = dentry->d_sb; int err; __u8 dh; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ff645a961bc8..937fbfc381bb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -65,7 +65,7 @@ static int nfs_write_inode(struct inode *,int); static void nfs_delete_inode(struct inode *); static void nfs_clear_inode(struct inode *); static void nfs_umount_begin(struct super_block *); -static int nfs_statfs(struct super_block *, struct kstatfs *); +static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); static int nfs_show_stats(struct seq_file *, struct vfsmount *); static void nfs_zap_acl_cache(struct inode *); @@ -534,8 +534,9 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) } static int -nfs_statfs(struct super_block *sb, struct kstatfs *buf) +nfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct nfs_server *server = NFS_SB(sb); unsigned char blockbits; unsigned long blockres; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index de3998f15f10..5446a0861d1d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1310,7 +1310,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { - status = vfs_statfs(dentry->d_inode->i_sb, &statfs); + status = vfs_statfs(dentry, &statfs); if (status) goto out_nfserr; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1d65f13f458c..245eaa1fb59b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1737,7 +1737,7 @@ int nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) { int err = fh_verify(rqstp, fhp, 0, MAY_NOP); - if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat)) + if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 
d5d5e969294f..0e14acea3f8b 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2601,10 +2601,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, /** * ntfs_statfs - return information about mounted NTFS volume - * @sb: super block of mounted volume + * @dentry: dentry from mounted volume * @sfs: statfs structure in which to return the information * - * Return information about the mounted NTFS volume @sb in the statfs structure + * Return information about the mounted NTFS volume @dentry in the statfs structure * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is * called). We interpret the values to be correct of the moment in time at * which we are called. Most values are variable otherwise and this isn't just @@ -2617,8 +2617,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, * * Return 0 on success or -errno on error. */ -static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs) +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) { + struct super_block *sb = dentry->d_sb; s64 size; ntfs_volume *vol = NTFS_SB(sb); ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 788b8b50dc4c..cdf73393f094 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -100,7 +100,7 @@ static int ocfs2_initialize_mem_caches(void); static void ocfs2_free_mem_caches(void); static void ocfs2_delete_osb(struct ocfs2_super *osb); -static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf); +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); static int ocfs2_sync_fs(struct super_block *sb, int wait); @@ -857,7 +857,7 @@ static void ocfs2_put_super(struct super_block *sb) mlog_exit_void(); } -static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ocfs2_super *osb; u32 numbits, freebits; @@ -866,9 +866,9 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) struct buffer_head *bh = NULL; struct inode *inode = NULL; - mlog_entry("(%p, %p)\n", sb, buf); + mlog_entry("(%p, %p)\n", dentry->d_sb, buf); - osb = OCFS2_SB(sb); + osb = OCFS2_SB(dentry->d_sb); inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, @@ -891,7 +891,7 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); buf->f_type = OCFS2_SUPER_MAGIC; - buf->f_bsize = sb->s_blocksize; + buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_namelen = OCFS2_MAX_FILENAME_LEN; buf->f_blocks = ((sector_t) numbits) * (osb->s_clustersize >> osb->sb->s_blocksize_bits); diff --git a/fs/open.c b/fs/open.c index 4f178acd4c09..a37ff861108f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -31,18 +31,18 @@ #include -int vfs_statfs(struct super_block *sb, struct kstatfs *buf) +int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { int retval = -ENODEV; - if (sb) { + if (dentry) { retval = -ENOSYS; - if (sb->s_op->statfs) { + if (dentry->d_sb->s_op->statfs) { memset(buf, 0, sizeof(*buf)); - retval = security_sb_statfs(sb); + retval = security_sb_statfs(dentry); if (retval) return retval; - retval = sb->s_op->statfs(sb, buf); + retval = dentry->d_sb->s_op->statfs(dentry, buf); if (retval == 0 && buf->f_frsize == 0) buf->f_frsize = buf->f_bsize; } @@ -52,12 +52,12 @@ int vfs_statfs(struct super_block *sb, struct kstatfs *buf) EXPORT_SYMBOL(vfs_statfs); -static int vfs_statfs_native(struct super_block *sb, struct statfs *buf) 
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(sb, &st); + retval = vfs_statfs(dentry, &st); if (retval) return retval; @@ -95,12 +95,12 @@ static int vfs_statfs_native(struct super_block *sb, struct statfs *buf) return 0; } -static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf) +static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(sb, &st); + retval = vfs_statfs(dentry, &st); if (retval) return retval; @@ -130,7 +130,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) error = user_path_walk(path, &nd); if (!error) { struct statfs tmp; - error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_native(nd.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -149,7 +149,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 error = user_path_walk(path, &nd); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs64(nd.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -168,7 +168,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf) file = fget(fd); if (!file) goto out; - error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_native(file->f_dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -189,7 +189,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user file = fget(fd); if (!file) goto out; - error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs64(file->f_dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index e6cca5cd4b44..2f24c46f72a1 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -128,7 +128,7 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_destroy_inode(struct inode *inode); static void qnx4_read_inode(struct inode *); static int qnx4_remount(struct super_block *sb, int *flags, char *data); -static int qnx4_statfs(struct super_block *, struct kstatfs *); +static int qnx4_statfs(struct dentry *, struct kstatfs *); static struct super_operations qnx4_sops = { @@ -282,8 +282,10 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock ) return block; } -static int qnx4_statfs(struct super_block *sb, struct kstatfs *buf) +static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + lock_kernel(); buf->f_type = sb->s_magic; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f3ff41d33989..00f1321e9209 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -60,7 +60,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) } static int reiserfs_remount(struct super_block *s, int *flags, char *data); -static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf); +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); static int reiserfs_sync_fs(struct super_block *s, int wait) { @@ -1938,15 +1938,15 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return errval; } -static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf) 
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb); buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); buf->f_bfree = sb_free_blocks(rs); buf->f_bavail = buf->f_bfree; buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; - buf->f_bsize = s->s_blocksize; + buf->f_bsize = dentry->d_sb->s_blocksize; /* changed to accommodate gcc folks. */ buf->f_type = REISERFS_SUPER_MAGIC; return 0; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 4d6cd6621062..283fbc6b8eea 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -179,12 +179,12 @@ outnobh: /* That's simple too. */ static int -romfs_statfs(struct super_block *sb, struct kstatfs *buf) +romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { buf->f_type = ROMFS_MAGIC; buf->f_bsize = ROMBSIZE; buf->f_bfree = buf->f_bavail = buf->f_ffree; - buf->f_blocks = (romfs_maxsize(sb)+ROMBSIZE-1)>>ROMBSBITS; + buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS; buf->f_namelen = ROMFS_MAXFN; return 0; } diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 4a37c2bbfa3f..506ff87c1d4b 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -48,7 +48,7 @@ static void smb_delete_inode(struct inode *); static void smb_put_super(struct super_block *); -static int smb_statfs(struct super_block *, struct kstatfs *); +static int smb_statfs(struct dentry *, struct kstatfs *); static int smb_show_options(struct seq_file *, struct vfsmount *); static kmem_cache_t *smb_inode_cachep; @@ -641,13 +641,13 @@ out_no_server: } static int -smb_statfs(struct super_block *sb, struct kstatfs *buf) +smb_statfs(struct dentry *dentry, struct kstatfs *buf) { int result; lock_kernel(); - result = smb_proc_dskattr(sb, buf); + result = smb_proc_dskattr(dentry, buf); unlock_kernel(); diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index b1b878b81730..c3495059889d 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -3226,9 +3226,9 @@ smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr) } int -smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr) +smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr) { - struct smb_sb_info *server = SMB_SB(sb); + struct smb_sb_info *server = SMB_SB(dentry->d_sb); int result; char *p; long unit; diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h index 47664597e6b1..972ed7dad388 100644 --- a/fs/smbfs/proto.h +++ b/fs/smbfs/proto.h @@ -29,7 +29,7 @@ extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr); extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr); extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor); extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr); -extern int smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr); +extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr); extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len); extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath); extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry); diff --git a/fs/super.c b/fs/super.c index 324c2d232f54..057b5325b7ef 100644 --- a/fs/super.c +++ b/fs/super.c @@ -486,7 +486,7 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) s = 
user_get_super(new_decode_dev(dev)); if (s == NULL) goto out; - err = vfs_statfs(s, &sbuf); + err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) goto out; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3ff89cc5833a..58b2d22142ba 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -85,8 +85,9 @@ static void sysv_put_super(struct super_block *sb) kfree(sbi); } -static int sysv_statfs(struct super_block *sb, struct kstatfs *buf) +static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); buf->f_type = sb->s_magic; diff --git a/fs/udf/super.c b/fs/udf/super.c index 2250774a831d..44fe2cb0bbb2 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -91,7 +91,7 @@ static void udf_load_partdesc(struct super_block *, struct buffer_head *); static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); -static int udf_statfs(struct super_block *, struct kstatfs *); +static int udf_statfs(struct dentry *, struct kstatfs *); /* UDF filesystem type */ static int udf_get_sb(struct file_system_type *fs_type, @@ -1779,8 +1779,10 @@ udf_put_super(struct super_block *sb) * Written, tested, and released. */ static int -udf_statfs(struct super_block *sb, struct kstatfs *buf) +udf_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = UDF_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = UDF_SB_PARTLEN(sb, UDF_SB_PARTITION(sb)); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 768fb8d9e67a..fe5ab2aa2899 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1113,8 +1113,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return 0; } -static int ufs_statfs (struct super_block *sb, struct kstatfs *buf) +static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block * usb; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index d03c89a36655..4fb0fc65af34 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -684,10 +684,10 @@ xfs_fs_sync_super( STATIC int xfs_fs_statfs( - struct super_block *sb, + struct dentry *dentry, struct kstatfs *statp) { - return -bhv_vfs_statvfs(vfs_from_sb(sb), statp, NULL); + return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, NULL); } STATIC int diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index d539262a8f89..98f6c52c152b 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -70,7 +70,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, unsigned int cmd, struct PioctlData *data); int coda_downcall(int opcode, union outputArgs *out, struct super_block *sb); int venus_fsync(struct super_block *sb, struct CodaFid *fid); -int venus_statfs(struct super_block *sb, struct kstatfs *sfs); +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); /* messages between coda filesystem in kernel and Venus */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e50dd24af87..c823a3815e24 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1096,7 +1096,7 @@ struct super_operations { int (*sync_fs)(struct super_block *sb, int wait); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); - int (*statfs) 
(struct super_block *, struct kstatfs *); + int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); @@ -1325,7 +1325,7 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, struct vfsmount *); -extern int vfs_statfs(struct super_block *, struct kstatfs *); +extern int vfs_statfs(struct dentry *, struct kstatfs *); /* /sys/fs */ extern struct subsystem fs_subsys; @@ -1746,7 +1746,7 @@ extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, void *, filldir_t); extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int simple_statfs(struct super_block *, struct kstatfs *); +extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); diff --git a/include/linux/mount.h b/include/linux/mount.h index b7472ae91fa4..60718f12caa9 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -17,6 +17,11 @@ #include #include +struct super_block; +struct vfsmount; +struct dentry; +struct namespace; + #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 #define MNT_NOEXEC 0x04 diff --git a/include/linux/security.h b/include/linux/security.h index 47722d355532..383c320fc834 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -171,9 +171,9 @@ struct swap_info_struct; * Deallocate and clear the sb->s_security field. * @sb contains the super_block structure to be modified. * @sb_statfs: - * Check permission before obtaining filesystem statistics for the @sb - * filesystem. - * @sb contains the super_block structure for the filesystem. + * Check permission before obtaining filesystem statistics for the @mnt + * mountpoint. + * @dentry is a handle on the superblock for the filesystem. * Return 0 if permission is granted. 
* @sb_mount: * Check permission before an object specified by @dev_name is mounted on @@ -1127,7 +1127,7 @@ struct security_operations { int (*sb_copy_data)(struct file_system_type *type, void *orig, void *copy); int (*sb_kern_mount) (struct super_block *sb, void *data); - int (*sb_statfs) (struct super_block * sb); + int (*sb_statfs) (struct dentry *dentry); int (*sb_mount) (char *dev_name, struct nameidata * nd, char *type, unsigned long flags, void *data); int (*sb_check_sb) (struct vfsmount * mnt, struct nameidata * nd); @@ -1450,9 +1450,9 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data) return security_ops->sb_kern_mount (sb, data); } -static inline int security_sb_statfs (struct super_block *sb) +static inline int security_sb_statfs (struct dentry *dentry) { - return security_ops->sb_statfs (sb); + return security_ops->sb_statfs (dentry); } static inline int security_sb_mount (char *dev_name, struct nameidata *nd, @@ -2162,7 +2162,7 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data) return 0; } -static inline int security_sb_statfs (struct super_block *sb) +static inline int security_sb_statfs (struct dentry *dentry) { return 0; } diff --git a/kernel/acct.c b/kernel/acct.c index b327f4d20104..6802020e0ceb 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; diff --git a/mm/shmem.c b/mm/shmem.c index 7617bb1c6bf7..10020d8b4073 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1654,9 +1654,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, return desc.error; } -static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) +static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; diff --git a/security/dummy.c b/security/dummy.c index 6de4a4a5eb13..c98d553984ec 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -191,7 +191,7 @@ static int dummy_sb_kern_mount (struct super_block *sb, void *data) return 0; } -static int dummy_sb_statfs (struct super_block *sb) +static int dummy_sb_statfs (struct dentry *dentry) { return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 524915dfda64..093efba4d9b6 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1903,13 +1903,13 @@ static int selinux_sb_kern_mount(struct super_block *sb, void *data) return superblock_has_perm(current, sb, FILESYSTEM__MOUNT, &ad); } -static int selinux_sb_statfs(struct super_block *sb) +static int selinux_sb_statfs(struct dentry *dentry) { struct avc_audit_data ad; AVC_AUDIT_DATA_INIT(&ad,FS); - ad.u.fs.dentry = sb->s_root; - return superblock_has_perm(current, sb, FILESYSTEM__GETATTR, &ad); + ad.u.fs.dentry = dentry->d_sb->s_root; + return superblock_has_perm(current, dentry->d_sb, FILESYSTEM__GETATTR, &ad); } static int selinux_mount(char * dev_name, -- cgit v1.2.3 From cb2b95e1c6b56e3d2369d3a5f4bc97f4fa180683 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 23 Jun 2006 02:03:01 -0700 Subject: [PATCH] zone handle unaligned zone boundaries The buddy allocator has a requirement that boundaries between contiguous zones 
occur aligned with the MAX_ORDER ranges. Where they do not, we will incorrectly merge pages across zone boundaries. This can lead to pages from the wrong zone being handed out. Originally the buddy allocator would check that buddies were in the same zone by referencing the zone start and end page frame numbers. This was removed as it became very expensive and the buddy allocator already made the assumption that zone boundaries were aligned. It is clear that not all configurations and architectures are honouring this alignment requirement. Therefore it seems safest to reintroduce support for non-aligned zone boundaries. This patch introduces a new check when considering whether a page is a buddy: it compares the zone_table index for the two pages and refuses to merge them where the indices do not match. The zone_table index is unique for each node/zone combination when FLATMEM/DISCONTIGMEM is enabled and for each section/zone combination when SPARSEMEM is enabled (a SPARSEMEM section is at least MAX_ORDER in size). Signed-off-by: Andy Whitcroft Cc: Dave Hansen Cc: Mel Gorman Cc: Yasunori Goto Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++-- mm/page_alloc.c | 17 +++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e2fa375e478e..697c6bf248c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -465,10 +465,13 @@ static inline unsigned long page_zonenum(struct page *page) struct zone; extern struct zone *zone_table[]; +static inline int page_zone_id(struct page *page) +{ + return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK; +} static inline struct zone *page_zone(struct page *page) { - return zone_table[(page->flags >> ZONETABLE_PGSHIFT) & - ZONETABLE_MASK]; + return zone_table[page_zone_id(page)]; } static inline unsigned long page_to_nid(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 253a450c400d..fd631c2536a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && * (b) the buddy is in the buddy system && - * (c) a page and its buddy have the same order. + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we use PG_buddy. * Setting, clearing, and testing PG_buddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ -static inline int page_is_buddy(struct page *page, int order) +static inline int page_is_buddy(struct page *page, struct page *buddy, + int order) { #ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (!pfn_valid(page_to_pfn(buddy))) return 0; #endif - if (PageBuddy(page) && page_order(page) == order) { - BUG_ON(page_count(page) != 0); + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + if (PageBuddy(buddy) && page_order(buddy) == order) { + BUG_ON(page_count(buddy) != 0); return 1; } return 0; @@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page, struct page *buddy; buddy = __page_find_buddy(page, page_idx, order); - if (!page_is_buddy(buddy, order)) + if (!page_is_buddy(page, buddy, order)) break; /* Move the buddy up one level. 
*/ list_del(&buddy->lru); -- cgit v1.2.3 From f886ed443fedb109e2062988bf120a531f0ec80a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Jun 2006 02:03:06 -0700 Subject: [PATCH] PG_uncached is ia64 only As Nick points out, only ia64 uses PG_uncached. So we can push it up into the higher bits of the lower half of page->flags and make room for another flag on 32-bit machines. Cc: "Luck, Tony" Cc: Jesse Barnes Cc: Jes Sorensen Cc: Nick Piggin Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d276a4e2f825..0c076d58c676 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -7,6 +7,8 @@ #include #include +#include + #include /* @@ -88,7 +90,17 @@ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_buddy 19 /* Page is free, on buddy lists */ -#define PG_uncached 20 /* Page has been mapped as uncached */ + +#if (BITS_PER_LONG > 32) +/* + * 64-bit-only flags build down from bit 31 + * + * 32 bit -------------------------------| FIELDS | FLAGS | + * 64 bit | FIELDS | ?????? FLAGS | + * 63 32 0 + */ +#define PG_uncached 31 /* Page has been mapped as uncached */ +#endif /* * Global page accounting. One instance per CPU. Only unsigned longs are -- cgit v1.2.3 From 02b694dea473ad3db1e2d1b14c1fef8fbd92e5e6 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:08 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: change name of wait_table_size() This is just to rename from wait_table_size() to wait_table_hash_nr_entries(). Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- mm/page_alloc.c | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9742e3c16222..652673ea92f1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -197,7 +197,7 @@ struct zone { /* * wait_table -- the array holding the hash table - * wait_table_size -- the size of the hash table array + * wait_table_hash_nr_entries -- the size of the hash table array * wait_table_bits -- wait_table_size == (1 << wait_table_bits) * * The purpose of all these is to keep track of the people @@ -220,7 +220,7 @@ struct zone { * free_area_init_core() performs the initialization of them. */ wait_queue_head_t * wait_table; - unsigned long wait_table_size; + unsigned long wait_table_hash_nr_entries; unsigned long wait_table_bits; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd631c2536a5..27320a0542d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1727,7 +1727,7 @@ void __init build_all_zonelists(void) */ #define PAGES_PER_WAITQUEUE 256 -static inline unsigned long wait_table_size(unsigned long pages) +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { unsigned long size = 1; @@ -2019,13 +2019,15 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) * The per-page waitqueue mechanism uses hashed waitqueues * per zone. 
*/ - zone->wait_table_size = wait_table_size(zone_size_pages); - zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + zone->wait_table_hash_nr_entries = + wait_table_hash_nr_entries(zone_size_pages); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_hash_nr_entries); zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size + alloc_bootmem_node(pgdat, zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t)); - for(i = 0; i < zone->wait_table_size; ++i) + for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); } -- cgit v1.2.3 From 86356ab147669bd3bcb2149fd9561d1280835c24 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:09 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: change to meminit for build_zonelist Change definitions of some functions and data from __init to __meminit. These functions and data can be used after bootup by this patch to be used for hot-add codes. Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 4 ++-- mm/page_alloc.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index da2d107fe2cf..22866fa2d960 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -91,8 +91,8 @@ static inline void *alloc_remap(int nid, unsigned long size) } #endif -extern unsigned long __initdata nr_kernel_pages; -extern unsigned long __initdata nr_all_pages; +extern unsigned long nr_kernel_pages; +extern unsigned long nr_all_pages; extern void *__init alloc_large_system_hash(const char *tablename, unsigned long bucketsize, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27320a0542d3..5ae75bead4df 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; -unsigned long __initdata nr_kernel_pages; -unsigned long __initdata nr_all_pages; +unsigned long __meminitdata nr_kernel_pages; +unsigned long __meminitdata nr_all_pages; #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -1517,7 +1517,7 @@ void show_free_areas(void) * * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, +static int __meminit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int nr_zones, int zone_type) { struct zone *zone; @@ -1553,7 +1553,7 @@ static inline int highest_zone(int zone_bits) #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; +static int __meminitdata node_load[MAX_NUMNODES]; /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1568,7 +1568,7 @@ static int __initdata node_load[MAX_NUMNODES]; * on them otherwise. * It returns -1 if no node is found. 
*/ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; @@ -1614,7 +1614,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; @@ -1666,7 +1666,7 @@ static void __init build_zonelists(pg_data_t *pgdat) #else /* CONFIG_NUMA */ -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -2071,7 +2071,7 @@ static __meminit void init_currently_empty_zone(struct zone *zone, * - mark all memory queues empty * - clear the memory bitmaps */ -static void __init free_area_init_core(struct pglist_data *pgdat, +static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long j; @@ -2159,7 +2159,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __init free_area_init_node(int nid, struct pglist_data *pgdat, +void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { -- cgit v1.2.3 From 718127cc3170454f4aa274fdd2f1e01574fecd66 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:10 -0700 Subject: [PATCH] wait_table and zonelist initializing for memory hotadd: add return code for init_currently_empty_zone When add_zone() is called against an empty (not yet populated) zone, we have to initialize the zone, which was not initialized at boot time. But init_currently_empty_zone() may fail due to the allocation of the wait table, so this patch catches its error code. Changes against wait_table are in the next patch. 
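To make the new calling convention concrete, here is a minimal, self-contained C sketch of the error-propagation pattern this patch describes, under simplified assumptions: add_zone() mirrors the kernel's __add_zone(), init_currently_empty_zone() models the now-fallible initializer, and the wait-table allocation and the main() driver are user-space stand-ins added only for illustration, not kernel code.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct zone {
        unsigned long zone_start_pfn;
        unsigned long spanned_pages;
        void *wait_table;            /* stand-in for the per-zone wait table */
};

/* Models the change: zone setup now reports failure instead of assuming success. */
static int init_currently_empty_zone(struct zone *zone,
                                     unsigned long zone_start_pfn,
                                     unsigned long size)
{
        zone->wait_table = calloc(size, sizeof(long));
        if (!zone->wait_table)
                return -ENOMEM;      /* the wait table allocation can fail at hot-add time */
        zone->zone_start_pfn = zone_start_pfn;
        zone->spanned_pages = size;
        return 0;
}

/* Mirrors __add_zone(): check the result and pass errors up to the caller. */
static int add_zone(struct zone *zone, unsigned long phys_start_pfn,
                    unsigned long nr_pages)
{
        if (!zone->spanned_pages) {  /* zone was not populated at boot */
                int ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
                if (ret < 0)
                        return ret;
        }
        /* memmap_init_zone()/zonetable_add() would follow here in the kernel. */
        return 0;
}

int main(void)
{
        struct zone z = { 0 };
        int ret = add_zone(&z, 0x10000, 4096);

        if (ret)
                fprintf(stderr, "zone init failed: %d\n", ret);
        else
                printf("zone spans %lu pages from pfn %lu\n",
                       z.spanned_pages, z.zone_start_pfn);
        free(z.wait_table);
        return ret ? 1 : 0;
}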
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 +++ mm/memory_hotplug.c | 15 +++++++++++++-- mm/page_alloc.c | 11 ++++++++--- 3 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 652673ea92f1..e82fc1a52cd0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -333,6 +333,9 @@ void wakeup_kswapd(struct zone *zone, int order); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); +extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, + unsigned long size); + #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); #else diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70df5c0d957e..71da5c98c9c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,7 +26,7 @@ extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, unsigned long size); -static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) +static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; @@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) int zone_type; zone_type = zone - pgdat->node_zones; + if (!populated_zone(zone)) { + int ret = 0; + ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); + if (ret < 0) + return ret; + } memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); + return 0; } extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, @@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) if (ret < 0) return ret; - __add_zone(zone, phys_start_pfn); + ret = __add_zone(zone, phys_start_pfn); + + if (ret < 0) + return ret; + return register_new_memory(__pfn_to_section(phys_start_pfn)); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ae75bead4df..4bc66f6b7718 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2050,8 +2050,9 @@ static __meminit void zone_pcp_init(struct zone *zone) zone->name, zone->present_pages, batch); } -static __meminit void init_currently_empty_zone(struct zone *zone, - unsigned long zone_start_pfn, unsigned long size) +__meminit int init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; @@ -2063,6 +2064,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone, memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); zone_init_free_lists(pgdat, zone, zone->spanned_pages); + + return 0; } /* @@ -2077,6 +2080,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + int ret; pgdat_resize_init(pgdat); pgdat->nr_zones = 0; @@ -2118,7 +2122,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, continue; zonetable_add(zone, nid, j, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); + BUG_ON(ret); zone_start_pfn += size; } } -- cgit v1.2.3 From fadd8fbd153c12963f8fe3c9ef7f8967f286f98b Mon Sep 17 
00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 23 Jun 2006 02:03:13 -0700 Subject: [PATCH] support for panic at OOM This patch adds panic_on_oom sysctl under sys.vm. When sysctl vm.panic_on_oom = 1, the kernel panics instead of killing rogue processes. And if vm.panic_on_oom is 0, the kernel will do oom_kill() in the same way as it does today. Of course, the default value is 0 and only root can modify it. In general, the oom_killer works well and kills rogue processes, so the whole system can survive. But there are environments where a panic is preferable to killing some processes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 13 +++++++++++++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 9 +++++++++ mm/oom_kill.c | 3 +++ 4 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index a46c10fcddfc..2dc246af4885 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm: - drop-caches - zone_reclaim_mode - zone_reclaim_interval +- panic_on_oom ============================================================== @@ -178,3 +179,15 @@ Time is set in seconds and set by default to 30 seconds. Reduce the interval if undesired off node allocations occur. However, too frequent scans will have a negative impact onoff node allocation performance. +============================================================= + +panic_on_oom + +This enables or disables panic on out-of-memory feature. If this is set to 1, +the kernel panics when out-of-memory happens. If this is set to 0, the kernel +will kill some rogue process, called oom_killer. Usually, oom_killer can kill +rogue processes and system will survive. If you want to panic the system +rather than killing rogue processes, set this to 1. + +The default value is 0. 
+ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index cee944dbdcd4..c7132029af0f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -186,6 +186,7 @@ enum VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ + VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0d656e61621d..072ac446810a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern int sysctl_panic_on_oom; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; @@ -701,6 +702,14 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = VM_PANIC_ON_OOM, + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 042e6436c3ee..f9bb3cf32384 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -22,6 +22,7 @@ #include #include +int sysctl_panic_on_oom; /* #define DEBUG */ /** @@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) break; case CONSTRAINT_NONE: + if (sysctl_panic_on_oom) + panic("out of memory. panic_on_oom is selected\n"); retry: /* * Rambo mode: Shoot down a process and hope it solves whatever -- cgit v1.2.3 From e8f03d02080b25f53cd6bba8dc3a297803f18c01 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Fri, 23 Jun 2006 02:03:14 -0700 Subject: [PATCH] reserve space for swap label Reserve space in the swap disk header for a LABEL and UUID to be specified. This has been possible with util-linux-2.12b (via e2fsprogs 1.36 libblkid), and is used by at least FC3 and later. The kernel doesn't really care about this, but the space shouldn't accidentally be used by something else either. Also make the on-disk structures be fixed-size types, instead of "int", though I don't know of any architecture in use where an "int" isn't the same size as a "__u32" (all current kernel arches have it as "unsigned int"). Signed-off-by: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index aca9bfae208f..cd28ad206dae 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -48,12 +48,14 @@ union swap_header { char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ } magic; struct { - char bootbits[1024]; /* Space for disklabel etc. */ - unsigned int version; - unsigned int last_page; - unsigned int nr_badpages; - unsigned int padding[125]; - unsigned int badpages[1]; + char bootbits[1024]; /* Space for disklabel etc. 
*/ + __u32 version; + __u32 last_page; + __u32 nr_badpages; + unsigned char sws_uuid[16]; + unsigned char sws_volume[16]; + __u32 padding[117]; + __u32 badpages[1]; } info; }; -- cgit v1.2.3 From a43a8c39bbb493c9e93f6764b350de2e33e18e92 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Fri, 23 Jun 2006 02:03:15 -0700 Subject: [PATCH] tightening hugetlb strict accounting Current hugetlb strict accounting for shared mappings always assumes the mapping starts at file offset zero and reserves pages between zero and the size of the file. This assumption often reserves (or locks down) a lot more pages than necessary if the application maps at a non-zero file offset. libhugetlbfs is one example that requires proper reservation for shared mappings that start at a non-zero offset. This patch extends the reservation and hugetlb strict accounting to support any arbitrary pair of (offset, len), resulting in a much more robust and accurate scheme. More importantly, it won't lock down any hugetlb pages outside the file mapping. Signed-off-by: Ken Chen Acked-by: Adam Litke Cc: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 21 ++-- include/linux/hugetlb.h | 8 +- mm/hugetlb.c | 282 +++++++++++++++++++++++++++--------------------- 3 files changed, 173 insertions(+), 138 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 678fc72c3646..e6410d8edd0e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; @@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; - if (vma->vm_flags & VM_MAYSHARE) - if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0) - goto out; + if (vma->vm_flags & VM_MAYSHARE && + hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), + len >> HPAGE_SHIFT)) + goto out; ret = 0; hugetlb_prefault_arch_hook(vma->vm_mm); @@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) const pgoff_t start = lstart >> HPAGE_SHIFT; struct pagevec pvec; pgoff_t next; - int i; + int i, freed = 0; - hugetlb_truncate_reservation(HUGETLBFS_I(inode), - lstart >> HPAGE_SHIFT); - if (!mapping->nrpages) - return; pagevec_init(&pvec, 0); next = start; while (1) { @@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) truncate_huge_page(page); unlock_page(page); hugetlb_put_quota(mapping); + freed++; } huge_pagevec_release(&pvec); } BUG_ON(!lstart && mapping->nrpages); + hugetlb_unreserve_pages(inode, start, freed); } static void hugetlbfs_delete_inode(struct inode *inode) @@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + INIT_LIST_HEAD(&inode->i_mapping->private_list); info = HUGETLBFS_I(inode); mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL); switch (mode & S_IFMT) { @@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } 
- p->prereserved_hpages = 0; return &p->vfs_inode; } @@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size) goto out_file; error = -ENOMEM; - if (hugetlb_extend_reservation(HUGETLBFS_I(inode), - size >> HPAGE_SHIFT) != 0) + if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4c5e610fe442..c25a38d8f600 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -23,6 +23,8 @@ int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access); +int hugetlb_reserve_pages(struct inode *inode, long from, long to); +void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); extern unsigned long max_huge_pages; extern const unsigned long hugetlb_zero, hugetlb_infinity; @@ -139,8 +141,6 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { struct shared_policy policy; - /* Protected by the (global) hugetlb_lock */ - unsigned long prereserved_hpages; struct inode vfs_inode; }; @@ -157,10 +157,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_zero_setup(size_t); -int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, - unsigned long atleast_hpages); -void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, - unsigned long atmost_hpages); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 832f676ca038..df499973255f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,7 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; @@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void) static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; - int use_reserve = 0; - unsigned long idx; spin_lock(&hugetlb_lock); - - if (vma->vm_flags & VM_MAYSHARE) { - - /* idx = radix tree index, i.e. offset into file in - * HPAGE_SIZE units */ - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - - /* The hugetlbfs specific inode info stores the number - * of "guaranteed available" (huge) pages. That is, - * the first 'prereserved_hpages' pages of the inode - * are either already instantiated, or have been - * pre-reserved (by hugetlb_reserve_for_inode()). Here - * we're in the process of instantiating the page, so - * we use this to determine whether to draw from the - * pre-reserved pool or the truly free pool. 
*/ - if (idx < HUGETLBFS_I(inode)->prereserved_hpages) - use_reserve = 1; - } - - if (!use_reserve) { - if (free_huge_pages <= reserved_huge_pages) - goto fail; - } else { - BUG_ON(reserved_huge_pages == 0); - reserved_huge_pages--; - } + if (vma->vm_flags & VM_MAYSHARE) + resv_huge_pages--; + else if (free_huge_pages <= resv_huge_pages) + goto fail; page = dequeue_huge_page(vma, addr); if (!page) @@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); return page; - fail: - WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ +fail: spin_unlock(&hugetlb_lock); return NULL; } -/* hugetlb_extend_reservation() - * - * Ensure that at least 'atleast' hugepages are, and will remain, - * available to instantiate the first 'atleast' pages of the given - * inode. If the inode doesn't already have this many pages reserved - * or instantiated, set aside some hugepages in the reserved pool to - * satisfy later faults (or fail now if there aren't enough, rather - * than getting the SIGBUS later). - */ -int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, - unsigned long atleast) -{ - struct inode *inode = &info->vfs_inode; - unsigned long change_in_reserve = 0; - int ret = 0; - - spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); - - if (info->prereserved_hpages >= atleast) - goto out; - - /* Because we always call this on shared mappings, none of the - * pages beyond info->prereserved_hpages can have been - * instantiated, so we need to reserve all of them now. */ - change_in_reserve = atleast - info->prereserved_hpages; - - if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { - ret = -ENOMEM; - goto out; - } - - reserved_huge_pages += change_in_reserve; - info->prereserved_hpages = atleast; - - out: - read_unlock_irq(&inode->i_mapping->tree_lock); - spin_unlock(&hugetlb_lock); - - return ret; -} - -/* hugetlb_truncate_reservation() - * - * This returns pages reserved for the given inode to the general free - * hugepage pool. If the inode has any pages prereserved, but not - * instantiated, beyond offset (atmost << HPAGE_SIZE), then release - * them. - */ -void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, - unsigned long atmost) -{ - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long idx; - unsigned long change_in_reserve = 0; - struct page *page; - - spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); - - if (info->prereserved_hpages <= atmost) - goto out; - - /* Count pages which were reserved, but not instantiated, and - * which we can now release. 
*/ - for (idx = atmost; idx < info->prereserved_hpages; idx++) { - page = radix_tree_lookup(&mapping->page_tree, idx); - if (!page) - /* Pages which are already instantiated can't - * be unreserved (and in fact have already - * been removed from the reserved pool) */ - change_in_reserve++; - } - - BUG_ON(reserved_huge_pages < change_in_reserve); - reserved_huge_pages -= change_in_reserve; - info->prereserved_hpages = atmost; - - out: - read_unlock_irq(&inode->i_mapping->tree_lock); - spin_unlock(&hugetlb_lock); -} - static int __init hugetlb_init(void) { unsigned long i; @@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count) return nr_huge_pages; spin_lock(&hugetlb_lock); - count = max(count, reserved_huge_pages); + count = max(count, resv_huge_pages); try_to_free_low(count); while (count < nr_huge_pages) { struct page *page = dequeue_huge_page(NULL, 0); @@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, - reserved_huge_pages, + resv_huge_pages, HPAGE_SIZE/1024); } @@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarentee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (nrg == 0) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. 
*/ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + +static int hugetlb_acct_memory(long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + if ((delta + resv_huge_pages) <= free_huge_pages) { + resv_huge_pages += delta; + ret = 0; + } + spin_unlock(&hugetlb_lock); + return ret; +} + +int hugetlb_reserve_pages(struct inode *inode, long from, long to) +{ + long ret, chg; + + chg = region_chg(&inode->i_mapping->private_list, from, to); + if (chg < 0) + return chg; + ret = hugetlb_acct_memory(chg); + if (ret < 0) + return ret; + region_add(&inode->i_mapping->private_list, from, to); + return 0; +} + +void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +{ + long chg = region_truncate(&inode->i_mapping->private_list, offset); + hugetlb_acct_memory(freed - chg); +} -- cgit v1.2.3 From 762834e8bf46bf41ce9034d062a7c1f8563175f3 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Fri, 23 Jun 2006 02:03:19 -0700 Subject: [PATCH] Unify pxm_to_node() and node_to_pxm() Consolidate the various arch-specific implementations of pxm_to_node() and node_to_pxm() into a single generic version. 
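As an illustration of what the consolidated helpers provide, the following is a hedged, user-space C sketch of the PXM to node translation; the array sizes, the -1 "unmapped" convention, and the first-free-node allocation policy in acpi_map_pxm_to_node() are assumptions made for this example, not the actual drivers/acpi/numa.c source.

#include <stdio.h>

#define MAX_PXM_DOMAINS 256
#define MAX_NUMNODES     64

static int pxm_to_node_map[MAX_PXM_DOMAINS];
static int node_to_pxm_map[MAX_NUMNODES];
static int nr_online_nodes;

static void numa_map_init(void)
{
        /* -1 means "no mapping recorded yet" in both directions. */
        for (int i = 0; i < MAX_PXM_DOMAINS; i++)
                pxm_to_node_map[i] = -1;
        for (int i = 0; i < MAX_NUMNODES; i++)
                node_to_pxm_map[i] = -1;
}

static int pxm_to_node(int pxm)
{
        if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
                return -1;
        return pxm_to_node_map[pxm];
}

static int node_to_pxm(int node)
{
        if (node < 0 || node >= MAX_NUMNODES)
                return -1;
        return node_to_pxm_map[node];
}

/* Hand out the next free logical node id the first time a proximity domain is seen. */
static int acpi_map_pxm_to_node(int pxm)
{
        int node;

        if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
                return -1;
        node = pxm_to_node(pxm);
        if (node >= 0)
                return node;
        if (nr_online_nodes >= MAX_NUMNODES)
                return -1;
        node = nr_online_nodes++;
        pxm_to_node_map[pxm] = node;
        node_to_pxm_map[node] = pxm;
        return node;
}

int main(void)
{
        numa_map_init();
        printf("pxm 3 -> node %d\n", acpi_map_pxm_to_node(3));
        printf("pxm 7 -> node %d\n", acpi_map_pxm_to_node(7));
        printf("node 1 -> pxm %d\n", node_to_pxm(1));
        return 0;
}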
Signed-off-by: Yasunori Goto Cc: "Luck, Tony" Cc: Andi Kleen Cc: Dave Hansen Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 6 ++++++ arch/i386/kernel/srat.c | 19 ++-------------- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/kernel/acpi.c | 24 +++++++-------------- arch/ia64/pci/pci.c | 2 +- arch/ia64/sn/kernel/setup.c | 4 ++-- arch/x86_64/mm/srat.c | 33 +--------------------------- drivers/acpi/Kconfig | 2 +- drivers/acpi/numa.c | 48 +++++++++++++++++++++++++++++++++++++++++ include/acpi/acpi_numa.h | 23 ++++++++++++++++++++ include/asm-x86_64/numa.h | 1 - include/linux/acpi.h | 9 ++++++++ 12 files changed, 102 insertions(+), 71 deletions(-) create mode 100644 include/acpi/acpi_numa.h (limited to 'include/linux') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 8dfa3054f10f..15d23da2455f 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -173,6 +173,12 @@ config ACPI_SRAT bool default y depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) + select ACPI_NUMA + +config HAVE_ARCH_PARSE_SRAT + bool + default y + depends on ACPI_SRAT config X86_SUMMIT_NUMA bool diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c index 52b3ed5d2cb5..989c85255dbe 100644 --- a/arch/i386/kernel/srat.c +++ b/arch/i386/kernel/srat.c @@ -39,7 +39,6 @@ #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ /* bitmap length; _PXM is at most 255 */ #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ @@ -213,19 +212,11 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c node_end_pfn[nid] = memory_chunk->end_pfn; } -static u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */ - -int pxm_to_node(int pxm) -{ - return pxm_to_nid_map[pxm]; -} - /* Parse the ACPI Static Resource Affinity Table */ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) { u8 *start, *end, *p; int i, j, nid; - u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */ start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ p = start; @@ -235,10 +226,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); memset(zholes_size, 0, sizeof(zholes_size)); - /* -1 in these maps means not available */ - memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); - memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); - num_memory_chunks = 0; while (p < end) { switch (*p) { @@ -278,9 +265,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) nodes_clear(node_online_map); for (i = 0; i < MAX_PXM_DOMAINS; i++) { if (BMAP_TEST(pxm_bitmap, i)) { - nid = num_online_nodes(); - pxm_to_nid_map[i] = nid; - nid_to_pxm_map[nid] = i; + int nid = acpi_map_pxm_to_node(i); node_set_online(nid); } } @@ -288,7 +273,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) /* set cnode id in memory chunk structure */ for (i = 0; i < num_memory_chunks; i++) - node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm]; + node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); printk("pxm bitmap: "); for (i = 0; i < sizeof(pxm_bitmap); i++) { diff --git a/arch/ia64/hp/common/sba_iommu.c 
b/arch/ia64/hp/common/sba_iommu.c index bdccd0b1eb60..3ce443e6c016 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1958,7 +1958,7 @@ sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle) if (pxm < 0) return; - node = pxm_to_nid_map[pxm]; + node = pxm_to_node(pxm); if (node >= MAX_NUMNODES || !node_online(node)) return; diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 58c93a30348c..d1c52cf67882 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -415,9 +415,6 @@ static int __initdata srat_num_cpus; /* number of cpus */ static u32 __devinitdata pxm_flag[PXM_FLAG_LEN]; #define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag)) #define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag)) -/* maps to convert between proximity domain and logical node ID */ -int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS]; -int __initdata nid_to_pxm_map[MAX_NUMNODES]; static struct acpi_table_slit __initdata *slit_table; static int get_processor_proximity_domain(struct acpi_table_processor_affinity *pa) @@ -533,22 +530,17 @@ void __init acpi_numa_arch_fixup(void) * MCD - This can probably be dropped now. No need for pxm ID to node ID * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES. */ - /* calculate total number of nodes in system from PXM bitmap */ - memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); - memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); nodes_clear(node_online_map); for (i = 0; i < MAX_PXM_DOMAINS; i++) { if (pxm_bit_test(i)) { - int nid = num_online_nodes(); - pxm_to_nid_map[i] = nid; - nid_to_pxm_map[nid] = i; + int nid = acpi_map_pxm_to_node(i); node_set_online(nid); } } /* set logical node id in memory chunk structure */ for (i = 0; i < num_node_memblks; i++) - node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid]; + node_memblk[i].nid = pxm_to_node(node_memblk[i].nid); /* assign memory bank numbers for each chunk on each node */ for_each_online_node(i) { @@ -562,7 +554,7 @@ void __init acpi_numa_arch_fixup(void) /* set logical node id in cpu structure */ for (i = 0; i < srat_num_cpus; i++) - node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid]; + node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid); printk(KERN_INFO "Number of logical nodes in system = %d\n", num_online_nodes()); @@ -575,11 +567,11 @@ void __init acpi_numa_arch_fixup(void) for (i = 0; i < slit_table->localities; i++) { if (!pxm_bit_test(i)) continue; - node_from = pxm_to_nid_map[i]; + node_from = pxm_to_node(i); for (j = 0; j < slit_table->localities; j++) { if (!pxm_bit_test(j)) continue; - node_to = pxm_to_nid_map[j]; + node_to = pxm_to_node(j); node_distance(node_from, node_to) = slit_table->entry[i * slit_table->localities + j]; } @@ -785,9 +777,9 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, long physid) /* * Assuming that the container driver would have set the proximity - * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag + * domain and would have initialized pxm_to_node(pxm_id) && pxm_flag */ - node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_nid_map[pxm_id]; + node_cpuid[cpu].nid = (pxm_id < 0) ? 
0 : pxm_to_node(pxm_id); node_cpuid[cpu].phys_id = physid; #endif @@ -966,7 +958,7 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret) if (pxm < 0) return AE_OK; - node = pxm_to_nid_map[pxm]; + node = pxm_to_node(pxm); if (node >= MAX_NUMNODES || !node_online(node) || cpus_empty(node_to_cpumask(node))) diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index ab829a22f8a4..cf7751b99d1c 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -352,7 +352,7 @@ pci_acpi_scan_root(struct acpi_device *device, int domain, int bus) pxm = acpi_get_pxm(controller->acpi_handle); #ifdef CONFIG_NUMA if (pxm >= 0) - controller->node = pxm_to_nid_map[pxm]; + controller->node = pxm_to_node(pxm); #endif acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_window, diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 30988dfbddff..93577abae36d 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -139,7 +139,7 @@ static int __init pxm_to_nasid(int pxm) int i; int nid; - nid = pxm_to_nid_map[pxm]; + nid = pxm_to_node(pxm); for (i = 0; i < num_node_memblks; i++) { if (node_memblk[i].nid == nid) { return NASID_GET(node_memblk[i].start_paddr); @@ -704,7 +704,7 @@ void __init build_cnode_tables(void) * cnode == node for all C & M bricks. */ for_each_online_node(node) { - nasid = pxm_to_nasid(nid_to_pxm_map[node]); + nasid = pxm_to_nasid(node_to_pxm(node)); sn_cnodeid_to_nasid[node] = nasid; physical_node_map[nasid] = node; } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 474df22c6ed2..502fce65e96a 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -30,7 +30,6 @@ static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; -static nodemask_t nodes_found __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES] __initdata; static int found_add_area __initdata; @@ -38,33 +37,14 @@ int hotadd_percent __initdata = 0; #ifndef RESERVE_HOTADD #define hotadd_percent 0 /* Ignore all settings */ #endif -static u8 pxm2node[256] = { [0 ... 255] = 0xff }; /* Too small nodes confuse the VM badly. Usually they result from BIOS bugs. 
*/ #define NODE_MIN_SIZE (4*1024*1024) -static int node_to_pxm(int n); - -int pxm_to_node(int pxm) -{ - if ((unsigned)pxm >= 256) - return -1; - /* Extend 0xff to (int)-1 */ - return (signed char)pxm2node[pxm]; -} - static __init int setup_node(int pxm) { - unsigned node = pxm2node[pxm]; - if (node == 0xff) { - if (nodes_weight(nodes_found) >= MAX_NUMNODES) - return -1; - node = first_unset_node(nodes_found); - node_set(node, nodes_found); - pxm2node[pxm] = node; - } - return pxm2node[pxm]; + return acpi_map_pxm_to_node(pxm); } static __init int conflicting_nodes(unsigned long start, unsigned long end) @@ -440,17 +420,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return 0; } -static int node_to_pxm(int n) -{ - int i; - if (pxm2node[n] == n) - return n; - for (i = 0; i < 256; i++) - if (pxm2node[i] == n) - return i; - return 0; -} - void __init srat_reserve_add_area(int nodeid) { if (found_add_area && nodes_add[nodeid].end) { diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index c24652d31bf9..230c53852231 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -162,7 +162,7 @@ config ACPI_THERMAL config ACPI_NUMA bool "NUMA support" depends on NUMA - depends on (IA64 || X86_64) + depends on (X86 || IA64) default y if IA64_GENERIC || IA64_SGI_SN2 config ACPI_ASUS diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 64b98e82feb7..e2c1a16078c9 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -36,12 +36,60 @@ #define _COMPONENT ACPI_NUMA ACPI_MODULE_NAME("numa") +static nodemask_t nodes_found_map = NODE_MASK_NONE; +#define PXM_INVAL -1 +#define NID_INVAL -1 + +/* maps to convert between proximity domain and logical node ID */ +int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS] + = { [0 ... MAX_PXM_DOMAINS - 1] = NID_INVAL }; +int __cpuinitdata node_to_pxm_map[MAX_NUMNODES] + = { [0 ... 
MAX_NUMNODES - 1] = PXM_INVAL }; + extern int __init acpi_table_parse_madt_family(enum acpi_table_id id, unsigned long madt_size, int entry_id, acpi_madt_entry_handler handler, unsigned int max_entries); +int __cpuinit pxm_to_node(int pxm) +{ + if (pxm < 0) + return NID_INVAL; + return pxm_to_node_map[pxm]; +} + +int __cpuinit node_to_pxm(int node) +{ + if (node < 0) + return PXM_INVAL; + return node_to_pxm_map[node]; +} + +int __cpuinit acpi_map_pxm_to_node(int pxm) +{ + int node = pxm_to_node_map[pxm]; + + if (node < 0){ + if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) + return NID_INVAL; + node = first_unset_node(nodes_found_map); + pxm_to_node_map[pxm] = node; + node_to_pxm_map[node] = pxm; + node_set(node, nodes_found_map); + } + + return node; +} + +void __cpuinit acpi_unmap_pxm_to_node(int node) +{ + int pxm = node_to_pxm_map[node]; + pxm_to_node_map[pxm] = NID_INVAL; + node_to_pxm_map[node] = PXM_INVAL; + node_clear(node, nodes_found_map); +} + void __init acpi_table_print_srat_entry(acpi_table_entry_header * header) { diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h new file mode 100644 index 000000000000..1049f2a0a6db --- /dev/null +++ b/include/acpi/acpi_numa.h @@ -0,0 +1,23 @@ +#ifndef __ACPI_NUMA_H +#define __ACPI_NUMA_H + +#ifdef CONFIG_ACPI_NUMA +#include + +/* Proximity bitmap length */ +#if MAX_NUMNODES > 256 +#define MAX_PXM_DOMAINS MAX_NUMNODES +#else +#define MAX_PXM_DOMAINS (256) /* Old pxm spec is defined 8 bit */ +#endif + +extern int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]; +extern int __cpuinitdata node_to_pxm_map[MAX_NUMNODES]; + +extern int __cpuinit pxm_to_node(int); +extern int __cpuinit node_to_pxm(int); +extern int __cpuinit acpi_map_pxm_to_node(int); +extern void __cpuinit acpi_unmap_pxm_to_node(int); + +#endif /* CONFIG_ACPI_NUMA */ +#endif /* __ACP_NUMA_H */ diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h index 1cc92fe02503..933ff11ece15 100644 --- a/include/asm-x86_64/numa.h +++ b/include/asm-x86_64/numa.h @@ -8,7 +8,6 @@ struct bootnode { }; extern int compute_hash_shift(struct bootnode *nodes, int numnodes); -extern int pxm_to_node(int nid); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1cf0b91d05bd..90d6df1551ed 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -407,10 +408,18 @@ void acpi_table_print_madt_entry (acpi_table_entry_header *madt); void acpi_table_print_srat_entry (acpi_table_entry_header *srat); /* the following four functions are architecture-dependent */ +#ifdef CONFIG_HAVE_ARCH_PARSE_SRAT +#define NR_NODE_MEMBLKS MAX_NUMNODES +#define acpi_numa_slit_init(slit) do {} while (0) +#define acpi_numa_processor_affinity_init(pa) do {} while (0) +#define acpi_numa_memory_affinity_init(ma) do {} while (0) +#define acpi_numa_arch_fixup() do {} while (0) +#else void acpi_numa_slit_init (struct acpi_table_slit *slit); void acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa); void acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma); void acpi_numa_arch_fixup(void); +#endif #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ -- cgit v1.2.3 From 833423143c3a7c6545e409d65febd0d92deb351b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 23 Jun 2006 02:03:20 -0700 Subject: [PATCH] mm: introduce remap_vmalloc_range() Add remap_vmalloc_range, vmalloc_user, and vmalloc_32_user so that 
drivers can have a nice interface for remapping vmalloc memory. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 8 ++++ mm/vmalloc.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 128 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1d5577b2b752..f6024ab4eff0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -4,10 +4,13 @@ #include #include /* pgprot_t */ +struct vm_area_struct; + /* bits in vm_struct->flags */ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ #define VM_ALLOC 0x00000002 /* vmalloc() */ #define VM_MAP 0x00000004 /* vmap()ed pages */ +#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -32,9 +35,11 @@ struct vm_struct { * Highlevel APIs for driver use */ extern void *vmalloc(unsigned long size); +extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); +extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); @@ -45,6 +50,9 @@ extern void vfree(void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); extern void vunmap(void *addr); + +extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff); /* * Lowlevel-APIs (not for driver use!) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c0504f1e34eb..35f8553f893a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -256,6 +256,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); } +/* Caller must hold vmlist_lock */ +static struct vm_struct *__find_vm_area(void *addr) +{ + struct vm_struct *tmp; + + for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { + if (tmp->addr == addr) + break; + } + + return tmp; +} + /* Caller must hold vmlist_lock */ struct vm_struct *__remove_vm_area(void *addr) { @@ -498,10 +511,32 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); } EXPORT_SYMBOL(vmalloc); +/** + * vmalloc_user - allocate virtually contiguous memory which has + * been zeroed so it can be mapped to userspace without + * leaking data. 
+ * + * @size: allocation size + */ +void *vmalloc_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + /** * vmalloc_node - allocate memory on a specific node * @@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); } EXPORT_SYMBOL(vmalloc_node); @@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size) } EXPORT_SYMBOL(vmalloc_32); +/** + * vmalloc_32_user - allocate virtually contiguous memory (32bit + * addressable) which is zeroed so it can be + * mapped to userspace without leaking data. + * + * @size: allocation size + */ +void *vmalloc_32_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + + return ret; +} +EXPORT_SYMBOL(vmalloc_32_user); + long vread(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; @@ -630,3 +687,64 @@ finished: read_unlock(&vmlist_lock); return buf - buf_start; } + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * @returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + struct vm_struct *area; + unsigned long uaddr = vma->vm_start; + unsigned long usize = vma->vm_end - vma->vm_start; + int ret; + + if ((PAGE_SIZE-1) & (unsigned long)addr) + return -EINVAL; + + read_lock(&vmlist_lock); + area = __find_vm_area(addr); + if (!area) + goto out_einval_locked; + + if (!(area->flags & VM_USERMAP)) + goto out_einval_locked; + + if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + goto out_einval_locked; + read_unlock(&vmlist_lock); + + addr += pgoff << PAGE_SHIFT; + do { + struct page *page = vmalloc_to_page(addr); + ret = vm_insert_page(vma, uaddr, page); + if (ret) + return ret; + + uaddr += PAGE_SIZE; + addr += PAGE_SIZE; + usize -= PAGE_SIZE; + } while (usize > 0); + + /* Prevent "things" like memory migration? VM_flags need a cleanup... */ + vma->vm_flags |= VM_RESERVED; + + return ret; + +out_einval_locked: + read_unlock(&vmlist_lock); + return -EINVAL; +} +EXPORT_SYMBOL(remap_vmalloc_range); + -- cgit v1.2.3 From 929f97276bcf7f4a95272ed08a85339b98ba210d Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Fri, 23 Jun 2006 02:03:21 -0700 Subject: [PATCH] change gen_pool allocator to not touch managed memory Modify the gen_pool allocator (lib/genalloc.c) to utilize a bitmap scheme instead of the buddy scheme. The purpose of this change is to eliminate the touching of the actual memory being allocated. Since the change modifies the interface, a change to the uncached allocator (arch/ia64/kernel/uncached.c) is also required. 
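To make the interface change concrete: a pool no longer grows itself through a get_new_chunk() callback. The caller creates the pool, explicitly adds chunks of memory it has already prepared, and then allocates and frees by size; the allocator's bookkeeping lives in a per-chunk bitmap rather than in the managed memory itself. A rough usage sketch against the new prototypes in include/linux/genalloc.h (the example_* names are hypothetical; the real caller is the ia64 uncached allocator below):

    #include <linux/genalloc.h>

    static struct gen_pool *example_pool;

    static int example_pool_init(unsigned long chunk_addr, size_t chunk_size, int nid)
    {
        /* one bitmap bit per PAGE_SIZE bytes of managed memory */
        example_pool = gen_pool_create(PAGE_SHIFT, nid);
        if (example_pool == NULL)
            return -1;

        /* hand over the memory; gen_pool_add() never touches it */
        return gen_pool_add(example_pool, chunk_addr, chunk_size, nid);
    }

    static unsigned long example_alloc_page(void)
    {
        return gen_pool_alloc(example_pool, PAGE_SIZE);     /* 0 on failure */
    }

    static void example_free_page(unsigned long addr)
    {
        gen_pool_free(example_pool, addr, PAGE_SIZE);
    }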
Both Andrey Volkov and Jes Sorenson have expressed a desire that the gen_pool allocator not write to the memory being managed. See the following: http://marc.theaimsgroup.com/?l=linux-kernel&m=113518602713125&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113533568827916&w=2 Signed-off-by: Dean Nelson Cc: Andrey Volkov Acked-by: Jes Sorensen Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/uncached.c | 200 +++++++++++++++--------------- arch/ia64/sn/kernel/sn2/cache.c | 15 ++- include/linux/genalloc.h | 35 +++--- lib/genalloc.c | 263 ++++++++++++++++++---------------------- 4 files changed, 252 insertions(+), 261 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index fcd2bad0286f..5f03b9e524dd 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2001-2005 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001-2006 Silicon Graphics, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License @@ -29,15 +29,8 @@ #include #include -#define DEBUG 0 -#if DEBUG -#define dprintk printk -#else -#define dprintk(x...) do { } while (0) -#endif - -void __init efi_memmap_walk_uc (efi_freemem_callback_t callback); +extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *); #define MAX_UNCACHED_GRANULES 5 static int allocated_granules; @@ -60,6 +53,7 @@ static void uncached_ipi_visibility(void *data) static void uncached_ipi_mc_drain(void *data) { int status; + status = ia64_pal_mc_drain(); if (status) printk(KERN_WARNING "ia64_pal_mc_drain() failed with %i on " @@ -67,30 +61,35 @@ static void uncached_ipi_mc_drain(void *data) } -static unsigned long -uncached_get_new_chunk(struct gen_pool *poolp) +/* + * Add a new chunk of uncached memory pages to the specified pool. + * + * @pool: pool to add new chunk of uncached memory to + * @nid: node id of node to allocate memory from, or -1 + * + * This is accomplished by first allocating a granule of cached memory pages + * and then converting them to uncached memory pages. + */ +static int uncached_add_chunk(struct gen_pool *pool, int nid) { struct page *page; - void *tmp; int status, i; - unsigned long addr, node; + unsigned long c_addr, uc_addr; if (allocated_granules >= MAX_UNCACHED_GRANULES) - return 0; + return -1; + + /* attempt to allocate a granule's worth of cached memory pages */ - node = poolp->private; - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, + page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO, IA64_GRANULE_SHIFT-PAGE_SHIFT); + if (!page) + return -1; - dprintk(KERN_INFO "get_new_chunk page %p, addr %lx\n", - page, (unsigned long)(page-vmem_map) << PAGE_SHIFT); + /* convert the memory pages from cached to uncached */ - /* - * Do magic if no mem on local node! 
XXX - */ - if (!page) - return 0; - tmp = page_address(page); + c_addr = (unsigned long)page_address(page); + uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET; /* * There's a small race here where it's possible for someone to @@ -100,76 +99,90 @@ uncached_get_new_chunk(struct gen_pool *poolp) for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++) SetPageUncached(&page[i]); - flush_tlb_kernel_range(tmp, tmp + IA64_GRANULE_SIZE); + flush_tlb_kernel_range(uc_addr, uc_adddr + IA64_GRANULE_SIZE); status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); - - dprintk(KERN_INFO "pal_prefetch_visibility() returns %i on cpu %i\n", - status, raw_smp_processor_id()); - if (!status) { status = smp_call_function(uncached_ipi_visibility, NULL, 0, 1); if (status) - printk(KERN_WARNING "smp_call_function failed for " - "uncached_ipi_visibility! (%i)\n", status); + goto failed; } + preempt_disable(); + if (ia64_platform_is("sn2")) - sn_flush_all_caches((unsigned long)tmp, IA64_GRANULE_SIZE); + sn_flush_all_caches(uc_addr, IA64_GRANULE_SIZE); else - flush_icache_range((unsigned long)tmp, - (unsigned long)tmp+IA64_GRANULE_SIZE); + flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE); + + /* flush the just introduced uncached translation from the TLB */ + local_flush_tlb_all(); + + preempt_enable(); ia64_pal_mc_drain(); status = smp_call_function(uncached_ipi_mc_drain, NULL, 0, 1); if (status) - printk(KERN_WARNING "smp_call_function failed for " - "uncached_ipi_mc_drain! (%i)\n", status); + goto failed; - addr = (unsigned long)tmp - PAGE_OFFSET + __IA64_UNCACHED_OFFSET; + /* + * The chunk of memory pages has been converted to uncached so now we + * can add it to the pool. + */ + status = gen_pool_add(pool, uc_addr, IA64_GRANULE_SIZE, nid); + if (status) + goto failed; allocated_granules++; - return addr; + return 0; + + /* failed to convert or add the chunk so give it back to the kernel */ +failed: + for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++) + ClearPageUncached(&page[i]); + + free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT); + return -1; } /* * uncached_alloc_page * + * @starting_nid: node id of node to start with, or -1 + * * Allocate 1 uncached page. Allocates on the requested node. If no * uncached pages are available on the requested node, roundrobin starting - * with higher nodes. + * with the next higher node. */ -unsigned long -uncached_alloc_page(int nid) +unsigned long uncached_alloc_page(int starting_nid) { - unsigned long maddr; + unsigned long uc_addr; + struct gen_pool *pool; + int nid; - maddr = gen_pool_alloc(uncached_pool[nid], PAGE_SIZE); + if (unlikely(starting_nid >= MAX_NUMNODES)) + return 0; - dprintk(KERN_DEBUG "uncached_alloc_page returns %lx on node %i\n", - maddr, nid); + if (starting_nid < 0) + starting_nid = numa_node_id(); + nid = starting_nid; - /* - * If no memory is availble on our local node, try the - * remaining nodes in the system. 
- */ - if (!maddr) { - int i; - - for (i = MAX_NUMNODES - 1; i >= 0; i--) { - if (i == nid || !node_online(i)) - continue; - maddr = gen_pool_alloc(uncached_pool[i], PAGE_SIZE); - dprintk(KERN_DEBUG "uncached_alloc_page alternate search " - "returns %lx on node %i\n", maddr, i); - if (maddr) { - break; - } - } - } + do { + if (!node_online(nid)) + continue; + pool = uncached_pool[nid]; + if (pool == NULL) + continue; + do { + uc_addr = gen_pool_alloc(pool, PAGE_SIZE); + if (uc_addr != 0) + return uc_addr; + } while (uncached_add_chunk(pool, nid) == 0); + + } while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid); - return maddr; + return 0; } EXPORT_SYMBOL(uncached_alloc_page); @@ -177,21 +190,22 @@ EXPORT_SYMBOL(uncached_alloc_page); /* * uncached_free_page * + * @uc_addr: uncached address of page to free + * * Free a single uncached page. */ -void -uncached_free_page(unsigned long maddr) +void uncached_free_page(unsigned long uc_addr) { - int node; - - node = paddr_to_nid(maddr - __IA64_UNCACHED_OFFSET); + int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET); + struct gen_pool *pool = uncached_pool[nid]; - dprintk(KERN_DEBUG "uncached_free_page(%lx) on node %i\n", maddr, node); + if (unlikely(pool == NULL)) + return; - if ((maddr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET) - panic("uncached_free_page invalid address %lx\n", maddr); + if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET) + panic("uncached_free_page invalid address %lx\n", uc_addr); - gen_pool_free(uncached_pool[node], maddr, PAGE_SIZE); + gen_pool_free(pool, uc_addr, PAGE_SIZE); } EXPORT_SYMBOL(uncached_free_page); @@ -199,43 +213,39 @@ EXPORT_SYMBOL(uncached_free_page); /* * uncached_build_memmap, * + * @uc_start: uncached starting address of a chunk of uncached memory + * @uc_end: uncached ending address of a chunk of uncached memory + * @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc()) + * * Called at boot time to build a map of pages that can be used for * memory special operations. 
*/ -static int __init -uncached_build_memmap(unsigned long start, unsigned long end, void *arg) +static int __init uncached_build_memmap(unsigned long uc_start, + unsigned long uc_end, void *arg) { - long length = end - start; - int node; - - dprintk(KERN_ERR "uncached_build_memmap(%lx %lx)\n", start, end); + int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET); + struct gen_pool *pool = uncached_pool[nid]; + size_t size = uc_end - uc_start; touch_softlockup_watchdog(); - memset((char *)start, 0, length); - node = paddr_to_nid(start - __IA64_UNCACHED_OFFSET); - - for (; start < end ; start += PAGE_SIZE) { - dprintk(KERN_INFO "sticking %lx into the pool!\n", start); - gen_pool_free(uncached_pool[node], start, PAGE_SIZE); + if (pool != NULL) { + memset((char *)uc_start, 0, size); + (void) gen_pool_add(pool, uc_start, size, nid); } - return 0; } -static int __init uncached_init(void) { - int i; +static int __init uncached_init(void) +{ + int nid; - for (i = 0; i < MAX_NUMNODES; i++) { - if (!node_online(i)) - continue; - uncached_pool[i] = gen_pool_create(0, IA64_GRANULE_SHIFT, - &uncached_get_new_chunk, i); + for_each_online_node(nid) { + uncached_pool[nid] = gen_pool_create(PAGE_SHIFT, nid); } - efi_memmap_walk_uc(uncached_build_memmap); - + efi_memmap_walk_uc(uncached_build_memmap, NULL); return 0; } diff --git a/arch/ia64/sn/kernel/sn2/cache.c b/arch/ia64/sn/kernel/sn2/cache.c index bc3cfa17cd0f..2862cb33026d 100644 --- a/arch/ia64/sn/kernel/sn2/cache.c +++ b/arch/ia64/sn/kernel/sn2/cache.c @@ -3,11 +3,12 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 2001-2003 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2001-2003, 2006 Silicon Graphics, Inc. All rights reserved. * */ #include #include +#include /** * sn_flush_all_caches - flush a range of address from all caches (incl. L4) @@ -17,18 +18,24 @@ * Flush a range of addresses from all caches including L4. * All addresses fully or partially contained within * @flush_addr to @flush_addr + @bytes are flushed - * from the all caches. + * from all caches. */ void sn_flush_all_caches(long flush_addr, long bytes) { - flush_icache_range(flush_addr, flush_addr+bytes); + unsigned long addr = flush_addr; + + /* SHub1 requires a cached address */ + if (is_shub1() && (addr & RGN_BITS) == RGN_BASE(RGN_UNCACHED)) + addr = (addr - RGN_BASE(RGN_UNCACHED)) + RGN_BASE(RGN_KERNEL); + + flush_icache_range(addr, addr + bytes); /* * The last call may have returned before the caches * were actually flushed, so we call it again to make * sure. */ - flush_icache_range(flush_addr, flush_addr+bytes); + flush_icache_range(addr, addr + bytes); mb(); } EXPORT_SYMBOL(sn_flush_all_caches); diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 7fd0576a4454..690c42803d2e 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -4,37 +4,32 @@ * Uses for this includes on-device special memory, uncached memory * etc. * - * This code is based on the buddy allocator found in the sym53c8xx_2 - * driver, adapted for general purpose use. - * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ -#include -#define ALLOC_MIN_SHIFT 5 /* 32 bytes minimum */ /* - * Link between free memory chunks of a given size. + * General purpose special memory pool descriptor. 
*/ -struct gen_pool_link { - struct gen_pool_link *next; +struct gen_pool { + rwlock_t lock; + struct list_head chunks; /* list of chunks in this pool */ + int min_alloc_order; /* minimum allocation order */ }; /* - * Memory pool descriptor. + * General purpose special memory pool chunk descriptor. */ -struct gen_pool { +struct gen_pool_chunk { spinlock_t lock; - unsigned long (*get_new_chunk)(struct gen_pool *); - struct gen_pool *next; - struct gen_pool_link *h; - unsigned long private; - int max_chunk_shift; + struct list_head next_chunk; /* next chunk in pool */ + unsigned long start_addr; /* starting address of memory chunk */ + unsigned long end_addr; /* ending address of memory chunk */ + unsigned long bits[0]; /* bitmap for allocating memory chunk */ }; -unsigned long gen_pool_alloc(struct gen_pool *poolp, int size); -void gen_pool_free(struct gen_pool *mp, unsigned long ptr, int size); -struct gen_pool *gen_pool_create(int nr_chunks, int max_chunk_shift, - unsigned long (*fp)(struct gen_pool *), - unsigned long data); +extern struct gen_pool *gen_pool_create(int, int); +extern int gen_pool_add(struct gen_pool *, unsigned long, size_t, int); +extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); +extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); diff --git a/lib/genalloc.c b/lib/genalloc.c index 9ce0a6a3b85a..71338b48e889 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -4,10 +4,6 @@ * Uses for this includes on-device special memory, uncached memory * etc. * - * This code is based on the buddy allocator found in the sym53c8xx_2 - * driver Copyright (C) 1999-2001 Gerard Roudier , - * and adapted for general purpose use. - * * Copyright 2005 (C) Jes Sorensen * * This source code is licensed under the GNU General Public License, @@ -15,172 +11,155 @@ */ #include -#include -#include -#include -#include -#include -#include -#include #include -#include - -struct gen_pool *gen_pool_create(int nr_chunks, int max_chunk_shift, - unsigned long (*fp)(struct gen_pool *), - unsigned long data) +/* + * Create a new special memory pool. + * + * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents + * @nid: node id of the node the pool structure should be allocated on, or -1 + */ +struct gen_pool *gen_pool_create(int min_alloc_order, int nid) { - struct gen_pool *poolp; - unsigned long tmp; - int i; - - /* - * This is really an arbitrary limit, +10 is enough for - * IA64_GRANULE_SHIFT, aka 16MB. If anyone needs a large limit - * this can be increased without problems. 
- */ - if ((max_chunk_shift > (PAGE_SHIFT + 10)) || - ((max_chunk_shift < ALLOC_MIN_SHIFT) && max_chunk_shift)) - return NULL; - - if (!max_chunk_shift) - max_chunk_shift = PAGE_SHIFT; - - poolp = kmalloc(sizeof(struct gen_pool), GFP_KERNEL); - if (!poolp) - return NULL; - memset(poolp, 0, sizeof(struct gen_pool)); - poolp->h = kmalloc(sizeof(struct gen_pool_link) * - (max_chunk_shift - ALLOC_MIN_SHIFT + 1), - GFP_KERNEL); - if (!poolp->h) { - printk(KERN_WARNING "gen_pool_alloc() failed to allocate\n"); - kfree(poolp); - return NULL; - } - memset(poolp->h, 0, sizeof(struct gen_pool_link) * - (max_chunk_shift - ALLOC_MIN_SHIFT + 1)); - - spin_lock_init(&poolp->lock); - poolp->get_new_chunk = fp; - poolp->max_chunk_shift = max_chunk_shift; - poolp->private = data; - - for (i = 0; i < nr_chunks; i++) { - tmp = poolp->get_new_chunk(poolp); - printk(KERN_INFO "allocated %lx\n", tmp); - if (!tmp) - break; - gen_pool_free(poolp, tmp, (1 << poolp->max_chunk_shift)); - } + struct gen_pool *pool; - return poolp; + pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); + if (pool != NULL) { + rwlock_init(&pool->lock); + INIT_LIST_HEAD(&pool->chunks); + pool->min_alloc_order = min_alloc_order; + } + return pool; } EXPORT_SYMBOL(gen_pool_create); /* - * Simple power of two buddy-like generic allocator. - * Provides naturally aligned memory chunks. + * Add a new chunk of memory to the specified pool. + * + * @pool: pool to add new memory chunk to + * @addr: starting address of memory chunk to add to pool + * @size: size in bytes of the memory chunk to add to pool + * @nid: node id of the node the chunk structure and bitmap should be + * allocated on, or -1 */ -unsigned long gen_pool_alloc(struct gen_pool *poolp, int size) +int gen_pool_add(struct gen_pool *pool, unsigned long addr, size_t size, + int nid) { - int j, i, s, max_chunk_size; - unsigned long a, flags; - struct gen_pool_link *h = poolp->h; + struct gen_pool_chunk *chunk; + int nbits = size >> pool->min_alloc_order; + int nbytes = sizeof(struct gen_pool_chunk) + + (nbits + BITS_PER_BYTE - 1) / BITS_PER_BYTE; - max_chunk_size = 1 << poolp->max_chunk_shift; + chunk = kmalloc_node(nbytes, GFP_KERNEL, nid); + if (unlikely(chunk == NULL)) + return -1; - if (size > max_chunk_size) - return 0; + memset(chunk, 0, nbytes); + spin_lock_init(&chunk->lock); + chunk->start_addr = addr; + chunk->end_addr = addr + size; - size = max(size, 1 << ALLOC_MIN_SHIFT); - i = fls(size - 1); - s = 1 << i; - j = i -= ALLOC_MIN_SHIFT; - - spin_lock_irqsave(&poolp->lock, flags); - while (!h[j].next) { - if (s == max_chunk_size) { - struct gen_pool_link *ptr; - spin_unlock_irqrestore(&poolp->lock, flags); - ptr = (struct gen_pool_link *)poolp->get_new_chunk(poolp); - spin_lock_irqsave(&poolp->lock, flags); - h[j].next = ptr; - if (h[j].next) - h[j].next->next = NULL; - break; - } - j++; - s <<= 1; - } - a = (unsigned long) h[j].next; - if (a) { - h[j].next = h[j].next->next; - /* - * This should be split into a seperate function doing - * the chunk split in order to support custom - * handling memory not physically accessible by host - */ - while (j > i) { - j -= 1; - s >>= 1; - h[j].next = (struct gen_pool_link *) (a + s); - h[j].next->next = NULL; - } - } - spin_unlock_irqrestore(&poolp->lock, flags); - return a; + write_lock(&pool->lock); + list_add(&chunk->next_chunk, &pool->chunks); + write_unlock(&pool->lock); + + return 0; } -EXPORT_SYMBOL(gen_pool_alloc); +EXPORT_SYMBOL(gen_pool_add); /* - * Counter-part of the generic allocator. 
+ * Allocate the requested number of bytes from the specified pool. + * Uses a first-fit algorithm. + * + * @pool: pool to allocate from + * @size: number of bytes to allocate from the pool */ -void gen_pool_free(struct gen_pool *poolp, unsigned long ptr, int size) +unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) { - struct gen_pool_link *q; - struct gen_pool_link *h = poolp->h; - unsigned long a, b, flags; - int i, s, max_chunk_size; - - max_chunk_size = 1 << poolp->max_chunk_shift; + struct list_head *_chunk; + struct gen_pool_chunk *chunk; + unsigned long addr, flags; + int order = pool->min_alloc_order; + int nbits, bit, start_bit, end_bit; - if (size > max_chunk_size) - return; - - size = max(size, 1 << ALLOC_MIN_SHIFT); - i = fls(size - 1); - s = 1 << i; - i -= ALLOC_MIN_SHIFT; - - a = ptr; + if (size == 0) + return 0; - spin_lock_irqsave(&poolp->lock, flags); - while (1) { - if (s == max_chunk_size) { - ((struct gen_pool_link *)a)->next = h[i].next; - h[i].next = (struct gen_pool_link *)a; - break; + nbits = (size + (1UL << order) - 1) >> order; + + read_lock(&pool->lock); + list_for_each(_chunk, &pool->chunks) { + chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); + + end_bit = (chunk->end_addr - chunk->start_addr) >> order; + end_bit -= nbits + 1; + + spin_lock_irqsave(&chunk->lock, flags); + bit = -1; + while (bit + 1 < end_bit) { + bit = find_next_zero_bit(chunk->bits, end_bit, bit + 1); + if (bit >= end_bit) + break; + + start_bit = bit; + if (nbits > 1) { + bit = find_next_bit(chunk->bits, bit + nbits, + bit + 1); + if (bit - start_bit < nbits) + continue; + } + + addr = chunk->start_addr + + ((unsigned long)start_bit << order); + while (nbits--) + __set_bit(start_bit++, &chunk->bits); + spin_unlock_irqrestore(&chunk->lock, flags); + read_unlock(&pool->lock); + return addr; } - b = a ^ s; - q = &h[i]; + spin_unlock_irqrestore(&chunk->lock, flags); + } + read_unlock(&pool->lock); + return 0; +} +EXPORT_SYMBOL(gen_pool_alloc); - while (q->next && q->next != (struct gen_pool_link *)b) - q = q->next; - if (!q->next) { - ((struct gen_pool_link *)a)->next = h[i].next; - h[i].next = (struct gen_pool_link *)a; +/* + * Free the specified memory back to the specified pool. 
+ * + * @pool: pool to free to + * @addr: starting address of memory to free back to pool + * @size: size in bytes of memory to free + */ +void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) +{ + struct list_head *_chunk; + struct gen_pool_chunk *chunk; + unsigned long flags; + int order = pool->min_alloc_order; + int bit, nbits; + + nbits = (size + (1UL << order) - 1) >> order; + + read_lock(&pool->lock); + list_for_each(_chunk, &pool->chunks) { + chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); + + if (addr >= chunk->start_addr && addr < chunk->end_addr) { + BUG_ON(addr + size > chunk->end_addr); + spin_lock_irqsave(&chunk->lock, flags); + bit = (addr - chunk->start_addr) >> order; + while (nbits--) + __clear_bit(bit++, &chunk->bits); + spin_unlock_irqrestore(&chunk->lock, flags); break; } - q->next = q->next->next; - a = a & b; - s <<= 1; - i++; } - spin_unlock_irqrestore(&poolp->lock, flags); + BUG_ON(nbits > 0); + read_unlock(&pool->lock); } EXPORT_SYMBOL(gen_pool_free); -- cgit v1.2.3 From 612d6c19db2fd0dc97b0fa370613ecd4a305ffc3 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 23 Jun 2006 02:03:22 -0700 Subject: [PATCH] radix-tree: direct data The ability to have height 0 radix trees (a direct pointer to the data item rather than going through a full node->slot) quietly disappeared with old-2.6-bkcvs commit ffee171812d51652f9ba284302d9e5c5cc14bdfd. On 64-bit machines this causes nearly 600 bytes to be used for every <= 4K file in pagecache. Re-introduce this feature, root tags stored in spare ->gfp_mask bits. Simplify radix_tree_delete's complex tag clearing arrangement (which would become even more complex) by just falling back to tag clearing functions (the pagecache radix-tree never uses this path anyway, so the icache savings will mean it's actually a speedup). On my 4GB G5, this saves 8MB RAM per kernel kernel source+object tree in pagecache. Pagecache lookup, insertion, and removal speed for small files will also be improved. This makes RCU radix tree harder, but it's worth it. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 5 +- lib/radix-tree.c | 192 ++++++++++++++++++++++++++------------------- 2 files changed, 114 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index dd83cca28001..9158a68140c9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -23,6 +23,9 @@ #include #include +#define RADIX_TREE_MAX_TAGS 2 + +/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; gfp_t gfp_mask; @@ -45,8 +48,6 @@ do { \ (root)->rnode = NULL; \ } while (0) -#define RADIX_TREE_MAX_TAGS 2 - int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7097bb239e40..35a2d93b3528 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -74,6 +74,11 @@ struct radix_tree_preload { }; DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline gfp_t root_gfp_mask(struct radix_tree_root *root) +{ + return root->gfp_mask & __GFP_BITS_MASK; +} + /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. 
@@ -82,9 +87,10 @@ static struct radix_tree_node * radix_tree_node_alloc(struct radix_tree_root *root) { struct radix_tree_node *ret; + gfp_t gfp_mask = root_gfp_mask(root); - ret = kmem_cache_alloc(radix_tree_node_cachep, root->gfp_mask); - if (ret == NULL && !(root->gfp_mask & __GFP_WAIT)) { + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); + if (ret == NULL && !(gfp_mask & __GFP_WAIT)) { struct radix_tree_preload *rtp; rtp = &__get_cpu_var(radix_tree_preloads); @@ -152,6 +158,27 @@ static inline int tag_get(struct radix_tree_node *node, unsigned int tag, return test_bit(offset, node->tags[tag]); } +static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask |= (1 << (tag + __GFP_BITS_SHIFT)); +} + + +static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag) +{ + root->gfp_mask &= ~(1 << (tag + __GFP_BITS_SHIFT)); +} + +static inline void root_tag_clear_all(struct radix_tree_root *root) +{ + root->gfp_mask &= __GFP_BITS_MASK; +} + +static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag) +{ + return root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT)); +} + /* * Returns 1 if any slot in the node has this tag set. * Otherwise returns 0. @@ -182,7 +209,6 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) { struct radix_tree_node *node; unsigned int height; - char tags[RADIX_TREE_MAX_TAGS]; int tag; /* Figure out what the height should be. */ @@ -195,16 +221,6 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) goto out; } - /* - * Prepare the tag status of the top-level node for propagation - * into the newly-pushed top-level node(s) - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - tags[tag] = 0; - if (any_tag_set(root->rnode, tag)) - tags[tag] = 1; - } - do { if (!(node = radix_tree_node_alloc(root))) return -ENOMEM; @@ -214,7 +230,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tags[tag]) + if (root_tag_get(root, tag)) tag_set(node, tag, 0); } @@ -243,8 +259,7 @@ int radix_tree_insert(struct radix_tree_root *root, int error; /* Make sure the tree is high enough. */ - if ((!index && !root->rnode) || - index > radix_tree_maxindex(root->height)) { + if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); if (error) return error; @@ -255,7 +270,7 @@ int radix_tree_insert(struct radix_tree_root *root, shift = (height-1) * RADIX_TREE_MAP_SHIFT; offset = 0; /* uninitialised var warning */ - do { + while (height > 0) { if (slot == NULL) { /* Have to add a child node. 
*/ if (!(slot = radix_tree_node_alloc(root))) @@ -273,16 +288,21 @@ int radix_tree_insert(struct radix_tree_root *root, slot = node->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; height--; - } while (height > 0); + } if (slot != NULL) return -EEXIST; - BUG_ON(!node); - node->count++; - node->slots[offset] = item; - BUG_ON(tag_get(node, 0, offset)); - BUG_ON(tag_get(node, 1, offset)); + if (node) { + node->count++; + node->slots[offset] = item; + BUG_ON(tag_get(node, 0, offset)); + BUG_ON(tag_get(node, 1, offset)); + } else { + root->rnode = item; + BUG_ON(root_tag_get(root, 0)); + BUG_ON(root_tag_get(root, 1)); + } return 0; } @@ -295,9 +315,13 @@ static inline void **__lookup_slot(struct radix_tree_root *root, struct radix_tree_node **slot; height = root->height; + if (index > radix_tree_maxindex(height)) return NULL; + if (height == 0 && root->rnode) + return (void **)&root->rnode; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; slot = &root->rnode; @@ -368,8 +392,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root, if (index > radix_tree_maxindex(height)) return NULL; - shift = (height - 1) * RADIX_TREE_MAP_SHIFT; slot = root->rnode; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; while (height > 0) { int offset; @@ -383,6 +407,10 @@ void *radix_tree_tag_set(struct radix_tree_root *root, height--; } + /* set the root's tag bit */ + if (slot && !root_tag_get(root, tag)) + root_tag_set(root, tag); + return slot; } EXPORT_SYMBOL(radix_tree_tag_set); @@ -405,9 +433,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; - struct radix_tree_node *slot; + struct radix_tree_node *slot = NULL; unsigned int height, shift; - void *ret = NULL; height = root->height; if (index > radix_tree_maxindex(height)) @@ -432,20 +459,24 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, height--; } - ret = slot; - if (ret == NULL) + if (slot == NULL) goto out; - do { + while (pathp->node) { if (!tag_get(pathp->node, tag, pathp->offset)) goto out; tag_clear(pathp->node, tag, pathp->offset); if (any_tag_set(pathp->node, tag)) goto out; pathp--; - } while (pathp->node); + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); + out: - return ret; + return slot; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -458,9 +489,8 @@ EXPORT_SYMBOL(radix_tree_tag_clear); * * Return values: * - * 0: tag not present - * 1: tag present, set - * -1: tag present, unset + * 0: tag not present or not set + * 1: tag set */ int radix_tree_tag_get(struct radix_tree_root *root, unsigned long index, unsigned int tag) @@ -473,6 +503,13 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (index > radix_tree_maxindex(height)) return 0; + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + + if (height == 0) + return 1; + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; slot = root->rnode; @@ -494,7 +531,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, int ret = tag_get(slot, tag, offset); BUG_ON(ret && saw_unset_tag); - return ret ? 
1 : -1; + return ret; } slot = slot->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; @@ -514,8 +551,11 @@ __lookup(struct radix_tree_root *root, void **results, unsigned long index, unsigned long i; height = root->height; - if (height == 0) + if (height == 0) { + if (root->rnode && index == 0) + results[nr_found++] = root->rnode; goto out; + } shift = (height-1) * RADIX_TREE_MAP_SHIFT; slot = root->rnode; @@ -603,10 +643,16 @@ __lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, unsigned int height = root->height; struct radix_tree_node *slot; + if (height == 0) { + if (root->rnode && index == 0) + results[nr_found++] = root->rnode; + goto out; + } + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; slot = root->rnode; - while (height > 0) { + do { unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; for ( ; i < RADIX_TREE_MAP_SIZE; i++) { @@ -637,7 +683,7 @@ __lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, } shift -= RADIX_TREE_MAP_SHIFT; slot = slot->slots[i]; - } + } while (height > 0); out: *next_index = index; return nr_found; @@ -665,6 +711,10 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, unsigned long cur_index = first_index; unsigned int ret = 0; + /* check the root's tag bit */ + if (!root_tag_get(root, tag)) + return 0; + while (ret < max_items) { unsigned int nr_found; unsigned long next_index; /* Index of next search */ @@ -689,7 +739,7 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag); static inline void radix_tree_shrink(struct radix_tree_root *root) { /* try to shrink tree height */ - while (root->height > 1 && + while (root->height > 0 && root->rnode->count == 1 && root->rnode->slots[0]) { struct radix_tree_node *to_free = root->rnode; @@ -717,12 +767,8 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; - struct radix_tree_path *orig_pathp; - struct radix_tree_node *slot; + struct radix_tree_node *slot = NULL; unsigned int height, shift; - void *ret = NULL; - char tags[RADIX_TREE_MAX_TAGS]; - int nr_cleared_tags; int tag; int offset; @@ -730,11 +776,17 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) if (index > radix_tree_maxindex(height)) goto out; + slot = root->rnode; + if (height == 0 && root->rnode) { + root_tag_clear_all(root); + root->rnode = NULL; + goto out; + } + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; - slot = root->rnode; - for ( ; height > 0; height--) { + do { if (slot == NULL) goto out; @@ -744,44 +796,22 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) pathp->node = slot; slot = slot->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; - } + height--; + } while (height > 0); - ret = slot; - if (ret == NULL) + if (slot == NULL) goto out; - orig_pathp = pathp; - /* * Clear all tags associated with the just-deleted item */ - nr_cleared_tags = 0; for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - tags[tag] = 1; - if (tag_get(pathp->node, tag, pathp->offset)) { - tag_clear(pathp->node, tag, pathp->offset); - if (!any_tag_set(pathp->node, tag)) { - tags[tag] = 0; - nr_cleared_tags++; - } - } - } - - for (pathp--; nr_cleared_tags && pathp->node; pathp--) { - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tags[tag]) - continue; - - tag_clear(pathp->node, tag, pathp->offset); - if (any_tag_set(pathp->node, tag)) { - tags[tag] = 1; - 
nr_cleared_tags--; - } - } + if (tag_get(pathp->node, tag, pathp->offset)) + radix_tree_tag_clear(root, index, tag); } /* Now free the nodes we do not need anymore */ - for (pathp = orig_pathp; pathp->node; pathp--) { + while (pathp->node) { pathp->node->slots[pathp->offset] = NULL; pathp->node->count--; @@ -793,11 +823,15 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) /* Node with zero slots in use so free it */ radix_tree_node_free(pathp->node); + + pathp--; } - root->rnode = NULL; + root_tag_clear_all(root); root->height = 0; + root->rnode = NULL; + out: - return ret; + return slot; } EXPORT_SYMBOL(radix_tree_delete); @@ -808,11 +842,7 @@ EXPORT_SYMBOL(radix_tree_delete); */ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) { - struct radix_tree_node *rnode; - rnode = root->rnode; - if (!rnode) - return 0; - return any_tag_set(rnode, tag); + return root_tag_get(root, tag); } EXPORT_SYMBOL(radix_tree_tagged); -- cgit v1.2.3 From 111ebb6e6f7bd7de6d722c5848e95621f43700d9 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 23 Jun 2006 02:03:26 -0700 Subject: [PATCH] writeback: fix range handling When a writeback_control's `start' and `end' fields are used to indicate a one-byte-range starting at file offset zero, the required values of .start=0,.end=0 mean that the ->writepages() implementation has no way of telling that it is being asked to perform a range request. Because we're currently overloading (start == 0 && end == 0) to mean "this is not a write-a-range request". To make all this sane, the patch changes range of writeback_control. So caller does: If it is calling ->writepages() to write pages, it sets range (range_start/end or range_cyclic) always. And if range_cyclic is true, ->writepages() thinks the range is cyclic, otherwise it just uses range_start and range_end. This patch does, - Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h -1 is usually ok for range_end (type is long long). But, if someone did, range_end += val; range_end is "val - 1" u64val = range_end >> bits; u64val is "~(0ULL)" or something, they are wrong. So, this adds LLONG_MAX to avoid nasty things, and uses LLONG_MAX for range_end. - All callers of ->writepages() sets range_start/end or range_cyclic. - Fix updates of ->writeback_index. It seems already bit strange. If it starts at 0 and ended by check of nr_to_write, this last index may reduce chance to scan end of file. So, this updates ->writeback_index only if range_cyclic is true or whole-file is scanned. Signed-off-by: OGAWA Hirofumi Cc: Nathan Scott Cc: Anton Altaparmakov Cc: Steven French Cc: "Vladimir V. 
Saveliev" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/file.c | 24 +++++++++++------------- fs/fs-writeback.c | 4 ++++ fs/mpage.c | 22 ++++++++++------------ fs/sync.c | 2 +- include/linux/kernel.h | 3 +++ include/linux/writeback.h | 5 +++-- mm/filemap.c | 6 +++--- mm/page-writeback.c | 3 +++ mm/vmscan.c | 2 ++ 9 files changed, 40 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e2b4ce1dad66..487ea8b3baaa 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1079,9 +1079,9 @@ static int cifs_writepages(struct address_space *mapping, unsigned int bytes_written; struct cifs_sb_info *cifs_sb; int done = 0; - pgoff_t end = -1; + pgoff_t end; pgoff_t index; - int is_range = 0; + int range_whole = 0; struct kvec iov[32]; int len; int n_iov = 0; @@ -1122,16 +1122,14 @@ static int cifs_writepages(struct address_space *mapping, xid = GetXid(); pagevec_init(&pvec, 0); - if (wbc->sync_mode == WB_SYNC_NONE) + if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ - else { - index = 0; - scanned = 1; - } - if (wbc->start || wbc->end) { - index = wbc->start >> PAGE_CACHE_SHIFT; - end = wbc->end >> PAGE_CACHE_SHIFT; - is_range = 1; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } retry: @@ -1167,7 +1165,7 @@ retry: break; } - if (unlikely(is_range) && (page->index > end)) { + if (!wbc->range_cyclic && page->index > end) { done = 1; unlock_page(page); break; @@ -1271,7 +1269,7 @@ retry: index = 0; goto retry; } - if (!is_range) + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; FreeXid(xid); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f3fbe2d030f4..6db95cf3aaa2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -461,6 +461,8 @@ void sync_inodes_sb(struct super_block *sb, int wait) { struct writeback_control wbc = { .sync_mode = wait ? 
WB_SYNC_ALL : WB_SYNC_HOLD, + .range_start = 0, + .range_end = LLONG_MAX, }; unsigned long nr_dirty = read_page_state(nr_dirty); unsigned long nr_unstable = read_page_state(nr_unstable); @@ -559,6 +561,8 @@ int write_inode_now(struct inode *inode, int sync) struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = WB_SYNC_ALL, + .range_start = 0, + .range_end = LLONG_MAX, }; if (!mapping_cap_writeback_dirty(inode->i_mapping)) diff --git a/fs/mpage.c b/fs/mpage.c index 9bf2eb30e6f4..1e4598247d0b 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -707,9 +707,9 @@ mpage_writepages(struct address_space *mapping, struct pagevec pvec; int nr_pages; pgoff_t index; - pgoff_t end = -1; /* Inclusive */ + pgoff_t end; /* Inclusive */ int scanned = 0; - int is_range = 0; + int range_whole = 0; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -721,16 +721,14 @@ mpage_writepages(struct address_space *mapping, writepage = mapping->a_ops->writepage; pagevec_init(&pvec, 0); - if (wbc->sync_mode == WB_SYNC_NONE) { + if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ + end = -1; } else { - index = 0; /* whole-file sweep */ - scanned = 1; - } - if (wbc->start || wbc->end) { - index = wbc->start >> PAGE_CACHE_SHIFT; - end = wbc->end >> PAGE_CACHE_SHIFT; - is_range = 1; + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } retry: @@ -759,7 +757,7 @@ retry: continue; } - if (unlikely(is_range) && page->index > end) { + if (!wbc->range_cyclic && page->index > end) { done = 1; unlock_page(page); continue; @@ -810,7 +808,7 @@ retry: index = 0; goto retry; } - if (!is_range) + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; if (bio) mpage_bio_submit(WRITE, bio); diff --git a/fs/sync.c b/fs/sync.c index aab5ffe77e9f..955aef04da28 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -100,7 +100,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, } if (nbytes == 0) - endbyte = -1; + endbyte = LLONG_MAX; else endbyte--; /* inclusive */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f4fc576ed4c4..25fccd859fbf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -24,6 +24,9 @@ extern const char linux_banner[]; #define LONG_MAX ((long)(~0UL>>1)) #define LONG_MIN (-LONG_MAX - 1) #define ULONG_MAX (~0UL) +#define LLONG_MAX ((long long)(~0ULL>>1)) +#define LLONG_MIN (-LLONG_MAX - 1) +#define ULLONG_MAX (~0ULL) #define STACK_MAGIC 0xdeadbeef diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 56f92fcbe94a..9e38b566d0e7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -50,14 +50,15 @@ struct writeback_control { * a hint that the filesystem need only write out the pages inside that * byterange. The byte at `end' is included in the writeout request. 
*/ - loff_t start; - loff_t end; + loff_t range_start; + loff_t range_end; unsigned nonblocking:1; /* Don't get stuck on request queues */ unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned for_writepages:1; /* This is a writepages() call */ + unsigned range_cyclic:1; /* range_start is cyclic */ }; /* diff --git a/mm/filemap.c b/mm/filemap.c index fd57442186cb..3342067ca436 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -190,8 +190,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = mapping->nrpages * 2, - .start = start, - .end = end, + .range_start = start, + .range_end = end, }; if (!mapping_cap_writeback_dirty(mapping)) @@ -204,7 +204,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { - return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); + return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 75d7f48b79bb..8ccf6f1b1473 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping) .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, + .range_cyclic = 1, }; get_dirty_limits(&wbs, &background_thresh, @@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages) .older_than_this = NULL, .nr_to_write = 0, .nonblocking = 1, + .range_cyclic = 1, }; for ( ; ; ) { @@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg) .nr_to_write = 0, .nonblocking = 1, .for_kupdate = 1, + .range_cyclic = 1, }; sync_supers(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 46be8a02280e..bc5d4f43036c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -339,6 +339,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping) struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = SWAP_CLUSTER_MAX, + .range_start = 0, + .range_end = LLONG_MAX, .nonblocking = 1, .for_reclaim = 1, }; -- cgit v1.2.3 From e7340f73307abed9283d0a07570d06e228c205dd Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:29 -0700 Subject: [PATCH] page migration cleanup: remove useless definitions Remove the export for migrate_page_remove_references() and migrate_page_copy() that are unlikely to be used directly by filesystems implementing migration. The export was useful when buffer_migrate_page() lived in fs/buffer.c but it has now been moved to migrate.c in the migration reorg. 
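
Returning briefly to the writeback patch above (the conversion of wbc->start/end into range_start/range_end plus the new range_cyclic bit): the convention it establishes is that data-integrity callers describe an explicit byte range, with 0..LLONG_MAX meaning the whole file, while background writeback sets range_cyclic and resumes from mapping->writeback_index. The sketch below is illustrative only and not part of any patch here; the function name is invented, while the fields, LLONG_MAX, mapping_cap_writeback_dirty() and do_writepages() all appear in the hunks above.

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>

/*
 * Illustrative sketch: a whole-file, data-integrity writeback caller
 * under the new range convention.
 */
static int example_sync_whole_file(struct address_space *mapping)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.nr_to_write	= LONG_MAX,
		.range_start	= 0,
		.range_end	= LLONG_MAX,	/* inclusive: no upper bound */
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	return do_writepages(mapping, &wbc);
}

Using LLONG_MAX rather than -1 keeps the "no upper bound" sentinel positive in the signed loff_t fields, which is also why sys_sync_file_range() above switches its endbyte default from -1 to LLONG_MAX.
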
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 -- mm/migrate.c | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6789c4940c9c..e8d3b08cc354 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,6 @@ extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct page *, struct page *); -extern void migrate_page_copy(struct page *, struct page *); -extern int migrate_page_remove_references(struct page *, struct page *, int); extern int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); extern int migrate_pages_to(struct list_head *pagelist, diff --git a/mm/migrate.c b/mm/migrate.c index 49e71ddb6792..be3f141e53a4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -169,7 +169,7 @@ retry: * Remove references for a page and establish the new page with the correct * basic settings to be able to stop accesses to the page. */ -int migrate_page_remove_references(struct page *newpage, +static int migrate_page_remove_references(struct page *newpage, struct page *page, int nr_refs) { struct address_space *mapping = page_mapping(page); @@ -246,12 +246,11 @@ int migrate_page_remove_references(struct page *newpage, return 0; } -EXPORT_SYMBOL(migrate_page_remove_references); /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +static void migrate_page_copy(struct page *newpage, struct page *page) { copy_highpage(newpage, page); @@ -286,7 +285,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } -EXPORT_SYMBOL(migrate_page_copy); /************************************************************ * Migration functions -- cgit v1.2.3 From 2d1db3b1170db4e8bf0531dd636742269c2cf579 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:33 -0700 Subject: [PATCH] page migration cleanup: pass "mapping" to migration functions Change handling of address spaces. Pass a pointer to the address space in which the page is migrated to all migration function. This avoids repeatedly having to retrieve the address space pointer from the page and checking it for validity. The old page mapping will change once migration has gone to a certain step, so it is less confusing to have the pointer always available. Move the setting of the mapping and index for the new page into migrate_pages(). 
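
To make the new calling convention concrete, here is a hedged sketch, not taken from the patch, of a filesystem-side hook once ->migratepage() receives the address_space explicitly instead of re-deriving it from page->mapping. The example_* names are invented and CONFIG_MIGRATION=y is assumed; migrate_page() is the generic helper whose new prototype appears in the migrate.h hunk above.

#include <linux/fs.h>
#include <linux/migrate.h>

/*
 * Illustrative only: delegate to the generic migration helper after any
 * filesystem-private bookkeeping for the old page.
 */
static int example_migratepage(struct address_space *mapping,
			       struct page *newpage, struct page *page)
{
	/* private fixups (e.g. per-inode counters) would go here */
	return migrate_page(mapping, newpage, page);
}

static struct address_space_operations example_aops = {
	/* .readpage, .writepage, ... */
	.migratepage	= example_migratepage,
};

A filesystem whose pages carry buffer heads would more likely point .migratepage at buffer_migrate_page(), which the fs.h hunk above re-declares with the same three-argument signature.
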
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 +++-- include/linux/migrate.h | 6 +++-- mm/migrate.c | 70 ++++++++++++++++++++++++------------------------- 3 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c823a3815e24..e917403f4d58 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -376,7 +376,8 @@ struct address_space_operations { struct page* (*get_xip_page)(struct address_space *, sector_t, int); /* migrate the contents of a page to the specified target */ - int (*migratepage) (struct page *, struct page *); + int (*migratepage) (struct address_space *, + struct page *, struct page *); }; struct backing_dev_info; @@ -1772,7 +1773,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *, size_t, loff_t *, const void *, size_t); #ifdef CONFIG_MIGRATION -extern int buffer_migrate_page(struct page *, struct page *); +extern int buffer_migrate_page(struct address_space *, + struct page *, struct page *); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e8d3b08cc354..287c47b5e5df 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -6,12 +6,14 @@ #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); -extern int migrate_page(struct page *, struct page *); +extern int migrate_page(struct address_space *, + struct page *, struct page *); extern int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); extern int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest); -extern int fail_migrate_page(struct page *, struct page *); +extern int fail_migrate_page(struct address_space *, + struct page *, struct page *); extern int migrate_prep(void); diff --git a/mm/migrate.c b/mm/migrate.c index 8095c607a494..f65e69d94527 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -173,15 +173,11 @@ retry: * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate set. */ -static int migrate_page_move_mapping(struct page *newpage, - struct page *page) +static int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page_mapping(page); struct page **radix_pointer; - if (!mapping) - return -EAGAIN; - write_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( @@ -197,15 +193,8 @@ static int migrate_page_move_mapping(struct page *newpage, /* * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. */ get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); @@ -262,7 +251,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) ***********************************************************/ /* Always fail migration. 
Used for mappings that are not movable */ -int fail_migrate_page(struct page *newpage, struct page *page) +int fail_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { return -EIO; } @@ -274,13 +264,14 @@ EXPORT_SYMBOL(fail_migrate_page); * * Pages are locked upon entry and exit. */ -int migrate_page(struct page *newpage, struct page *page) +int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; @@ -305,21 +296,18 @@ EXPORT_SYMBOL(migrate_page); * if the underlying filesystem guarantees that no other references to "page" * exist. */ -int buffer_migrate_page(struct page *newpage, struct page *page) +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page->mapping; struct buffer_head *bh, *head; int rc; - if (!mapping) - return -EAGAIN; - if (!page_has_buffers(page)) - return migrate_page(newpage, page); + return migrate_page(mapping, newpage, page); head = page_buffers(page); - rc = migrate_page_move_mapping(newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; @@ -448,9 +436,6 @@ redo: goto next; } - newpage = lru_to_page(to); - lock_page(newpage); - /* * Establish swap ptes for anonymous pages or destroy pte * maps for files. @@ -473,11 +458,18 @@ redo: rc = -EPERM; if (try_to_unmap(page, 1) == SWAP_FAIL) /* A vma has VM_LOCKED set -> permanent failure */ - goto unlock_both; + goto unlock_page; rc = -EAGAIN; if (page_mapped(page)) - goto unlock_both; + goto unlock_page; + + newpage = lru_to_page(to); + lock_page(newpage); + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + /* * Pages are properly locked and writeback is complete. * Try to migrate the page. @@ -494,7 +486,8 @@ redo: * own migration function. This is the most common * path for page migration. */ - rc = mapping->a_ops->migratepage(newpage, page); + rc = mapping->a_ops->migratepage(mapping, + newpage, page); goto unlock_both; } @@ -524,7 +517,7 @@ redo: */ if (!page_has_buffers(page) || try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); + rc = migrate_page(mapping, newpage, page); goto unlock_both; } @@ -553,12 +546,17 @@ unlock_page: unlock_page(page); next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; + if (rc) { + if (newpage) + newpage->mapping = NULL; + + if (rc == -EAGAIN) + retry++; + else { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } } else { if (newpage) { /* Successful migration. Return page to LRU */ -- cgit v1.2.3 From 0697212a411c1dae03c27845f2de2f3adb32c331 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:35 -0700 Subject: [PATCH] Swapless page migration: add R/W migration entries Implement read/write migration ptes We take the upper two swapfiles for the two types of migration ptes and define a series of macros in swapops.h. The VM is modified to handle the migration entries. migration entries can only be encountered when the page they are pointing to is locked. This limits the number of places one has to fix. We also check in copy_pte_range and in mprotect_pte_range() for migration ptes. 
We check for migration ptes in do_swap_cache and call a function that will then wait on the page lock. This allows us to effectively stop all accesses to apge. Migration entries are created by try_to_unmap if called for migration and removed by local functions in migrate.c From: Hugh Dickins Several times while testing swapless page migration (I've no NUMA, just hacking it up to migrate recklessly while running load), I've hit the BUG_ON(!PageLocked(p)) in migration_entry_to_page. This comes from an orphaned migration entry, unrelated to the current correctly locked migration, but hit by remove_anon_migration_ptes as it checks an address in each vma of the anon_vma list. Such an orphan may be left behind if an earlier migration raced with fork: copy_one_pte can duplicate a migration entry from parent to child, after remove_anon_migration_ptes has checked the child vma, but before it has removed it from the parent vma. (If the process were later to fault on this orphaned entry, it would hit the same BUG from migration_entry_wait.) This could be fixed by locking anon_vma in copy_one_pte, but we'd rather not. There's no such problem with file pages, because vma_prio_tree_add adds child vma after parent vma, and the page table locking at each end is enough to serialize. Follow that example with anon_vma: add new vmas to the tail instead of the head. (There's no corresponding problem when inserting migration entries, because a missed pte will leave the page count and mapcount high, which is allowed for. And there's no corresponding problem when migrating via swap, because a leftover swap entry will be correctly faulted. But the swapless method has no refcounting of its entries.) From: Ingo Molnar pte_unmap_unlock() takes the pte pointer as an argument. From: Hugh Dickins Several times while testing swapless page migration, gcc has tried to exec a pointer instead of a string: smells like COW mappings are not being properly write-protected on fork. The protection in copy_one_pte looks very convincing, until at last you realize that the second arg to make_migration_entry is a boolean "write", and SWP_MIGRATION_READ is 30. Anyway, it's better done like in change_pte_range, using is_write_migration_entry and make_migration_entry_read. From: Hugh Dickins Remove unnecessary obfuscation from sys_swapon's range check on swap type, which blew up causing memory corruption once swapless migration made MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT. Signed-off-by: Hugh Dickins Acked-by: Martin Schwidefsky Signed-off-by: Hugh Dickins Signed-off-by: Christoph Lameter Signed-off-by: Ingo Molnar From: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++ include/linux/swapops.h | 53 ++++++++++++++++++++ mm/memory.c | 18 ++++++- mm/migrate.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++- mm/mprotect.c | 23 +++++++-- mm/rmap.c | 38 ++++++++------ mm/swapfile.c | 20 +++----- 7 files changed, 255 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index cd28ad206dae..7cee73ef4f15 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -28,7 +28,14 @@ static inline int current_is_kswapd(void) * the type/offset into the pte as 5/27 as well. 
*/ #define MAX_SWAPFILES_SHIFT 5 +#ifndef CONFIG_MIGRATION #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) +#else +/* Use last two entries for page migration swap entries */ +#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) +#define SWP_MIGRATION_READ MAX_SWAPFILES +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) +#endif /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 87b9d14c710d..ec639aa3a1d3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -67,3 +67,56 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } + +#ifdef CONFIG_MIGRATION +static inline swp_entry_t make_migration_entry(struct page *page, int write) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, + page_to_pfn(page)); +} + +static inline int is_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ || + swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline struct page *migration_entry_to_page(swp_entry_t entry) +{ + struct page *p = pfn_to_page(swp_offset(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + BUG_ON(!PageLocked(p)); + return p; +} + +static inline void make_migration_entry_read(swp_entry_t *entry) +{ + *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); +} + +extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +#else + +#define make_migration_entry(page, write) swp_entry(0, 0) +#define is_migration_entry(swp) 0 +#define migration_entry_to_page(swp) NULL +static inline void make_migration_entry_read(swp_entry_t *entryp) { } +static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { } +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return 0; +} + +#endif + diff --git a/mm/memory.c b/mm/memory.c index 7e3683fd4f3c..11673c5d2c20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. 
+ */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -1879,6 +1891,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); diff --git a/mm/migrate.c b/mm/migrate.c index 5a340f4ca212..0a011e421bb4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #include "internal.h" @@ -119,6 +119,132 @@ int putback_lru_pages(struct list_head *l) return count; } +static inline int is_swap_pte(pte_t pte) +{ + return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); +} + +/* + * Restore a potential migration pte to a working pte entry + */ +static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, + struct page *old, struct page *new) +{ + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + return; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + + if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) + goto out; + + inc_mm_counter(mm, anon_rss); + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + set_pte_at(mm, addr, ptep, pte); + page_add_anon_rmap(new, vma, addr); +out: + pte_unmap_unlock(ptep, ptl); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + * + * Must hold mmap_sem lock on at least one of the vmas containing + * the page so that the anon_vma cannot vanish. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; + + mapping = (unsigned long)new->mapping; + + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; + + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + */ + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, page_address_in_vma(new, vma), + old, new); + + spin_unlock(&anon_vma->lock); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). 
+ */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + page = migration_entry_to_page(entry); + + get_page(page); + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + /* * swapout a single page * page is locked upon entry, unlocked on exit diff --git a/mm/mprotect.c b/mm/mprotect.c index 5faf01ad3ef8..14f93e62270f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -19,7 +19,8 @@ #include #include #include - +#include +#include #include #include #include @@ -28,12 +29,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { - pte_t *pte; + pte_t *pte, oldpte; spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { - if (pte_present(*pte)) { + oldpte = *pte; + if (pte_present(oldpte)) { pte_t ptent; /* Avoid an SMP race with hardware updated dirty/clean @@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); set_pte_at(mm, addr, pte, ptent); lazy_mmu_prot_update(ptent); +#ifdef CONFIG_MIGRATION + } else if (!pte_file(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + + if (is_write_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + set_pte_at(mm, addr, pte, + swp_entry_to_pte(entry)); + } +#endif } + } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); } diff --git a/mm/rmap.c b/mm/rmap.c index 10806b7af40c..3b8ce86daa3a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); allocated = NULL; } spin_unlock(&mm->page_table_lock); @@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma) struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) { - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); } } @@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma) if (anon_vma) { spin_lock(&anon_vma->lock); - list_add(&vma->anon_vma_node, &anon_vma->head); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); validate_anon_vma(vma); spin_unlock(&anon_vma->lock); } @@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - BUG_ON(!PageSwapCache(page)); - swap_duplicate(entry); - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... 
+ */ + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + } else { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + BUG_ON(!migration); + entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); diff --git a/mm/swapfile.c b/mm/swapfile.c index 47a6812f5f8c..e3b1362372c2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry) struct swap_info_struct * p; struct page *page = NULL; + if (is_migration_entry(entry)) + return; + p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { @@ -1400,19 +1403,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) if (!(p->flags & SWP_USED)) break; error = -EPERM; - /* - * Test if adding another swap device is possible. There are - * two limiting factors: 1) the number of bits for the swap - * type swp_entry_t definition and 2) the number of bits for - * the swap type in the swap ptes as defined by the different - * architectures. To honor both limitations a swap entry - * with swap offset 0 and swap type ~0UL is created, encoded - * to a swap pte, decoded to a swp_entry_t again and finally - * the swap type part is extracted. This will mask all bits - * from the initial ~0UL that can't be encoded in either the - * swp_entry_t or the architecture definition of a swap pte. - */ - if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { + if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); goto out; } @@ -1702,6 +1693,9 @@ int swap_duplicate(swp_entry_t entry) unsigned long offset, type; int result = 0; + if (is_migration_entry(entry)) + return 1; + type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; -- cgit v1.2.3 From d75a0fcda2cfc71b50e16dc89e0c32c57d427e85 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:36 -0700 Subject: [PATCH] Swapless page migration: rip out swap based logic Rip the page migration logic out. Remove all code that has to do with swapping during page migration. This also guts the ability to migrate pages to swap. No one used that so lets let it go for good. Page migration should be a bit broken after this patch. 
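
With the swap-based fallback gone, everything now rests on the R/W migration entry patch above. As a pocket illustration of that encoding (the top two swap types become SWP_MIGRATION_READ and SWP_MIGRATION_WRITE, and the pfn rides in the swap offset), here is a non-authoritative round-trip sketch that uses only the swapops.h helpers added above; it assumes CONFIG_MIGRATION=y and a page that stays locked for the whole lifetime of the entry.

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * Illustrative round trip, not kernel code: encode a locked page as a
 * migration entry, then recover the page and its writability.
 */
static void example_migration_entry_round_trip(struct page *page, int writable)
{
	swp_entry_t entry;
	pte_t pte;

	entry = make_migration_entry(page, writable);	/* type = SWP_MIGRATION_*, offset = pfn */
	pte = swp_entry_to_pte(entry);			/* this is what lands in the page table */

	/* ... later, e.g. from a fault or from change_pte_range() ... */
	entry = pte_to_swp_entry(pte);
	if (is_migration_entry(entry)) {
		struct page *target = migration_entry_to_page(entry);

		BUG_ON(target != page);		/* only valid while the page is locked */
		if (is_write_migration_entry(entry))
			make_migration_entry_read(&entry);	/* downgrade, as mprotect does */
	}
}

To generic code such a pte looks like any other swap pte; the difference is that do_swap_page() recognizes it and simply waits on the page lock via migration_entry_wait() instead of starting swap-in.
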
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - mm/migrate.c | 75 +++------------------------------------------------- mm/rmap.c | 38 -------------------------- mm/swapfile.c | 9 ------- 4 files changed, 3 insertions(+), 120 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2d4c81a220db..bf97b0900014 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -91,7 +91,6 @@ static inline void page_dup_rmap(struct page *page) */ int page_referenced(struct page *, int is_locked); int try_to_unmap(struct page *, int ignore_refs); -void remove_from_swap(struct page *page); /* * Called from mm/filemap_xip.c to unmap empty zero page diff --git a/mm/migrate.c b/mm/migrate.c index 0a011e421bb4..81721a061d50 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -70,10 +70,6 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) */ int migrate_prep(void) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return -ENODEV; - /* * Clear the LRU lists so pages can be isolated. * Note that pages may be moved off the LRU after we have @@ -245,52 +241,6 @@ out: pte_unmap_unlock(ptep, ptl); } -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} - /* * Replace the page in the mapping. * @@ -517,8 +467,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Two lists are passed to this function. The first list * contains the pages isolated from the LRU to be migrated. * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. + * can be moved to. * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty @@ -574,29 +523,11 @@ redo: * Only wait on writeback if we have already done a pass where * we we may have triggered writeouts for lots of pages. */ - if (pass > 0) { + if (pass > 0) wait_on_page_writeback(page); - } else { + else if (PageWriteback(page)) goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. 
- */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } /* * Establish swap ptes for anonymous pages or destroy pte diff --git a/mm/rmap.c b/mm/rmap.c index 3b8ce86daa3a..a53a10b93ecf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -205,44 +205,6 @@ out: return anon_vma; } -#ifdef CONFIG_MIGRATION -/* - * Remove an anonymous page from swap replacing the swap pte's - * through real pte's pointing to valid pages and then releasing - * the page from the swap cache. - * - * Must hold page lock on page and mmap_sem of one vma that contains - * the page. - */ -void remove_from_swap(struct page *page) -{ - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; - - if (!PageSwapCache(page)) - return; - - mapping = (unsigned long)page->mapping; - - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. - */ - anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_vma_swap(vma, page); - - spin_unlock(&anon_vma->lock); - delete_from_swap_cache(page); -} -EXPORT_SYMBOL(remove_from_swap); -#endif - /* * At what user virtual address is page expected in vma? */ diff --git a/mm/swapfile.c b/mm/swapfile.c index e3b1362372c2..fbceed67a075 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -618,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm, return 0; } -#ifdef CONFIG_MIGRATION -int remove_vma_swap(struct vm_area_struct *vma, struct page *page) -{ - swp_entry_t entry = { .val = page_private(page) }; - - return unuse_vma(vma, entry, page); -} -#endif - /* * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. -- cgit v1.2.3 From 04e62a29bf157ce1edd168f2b71b533c80d13628 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:38 -0700 Subject: [PATCH] More page migration: use migration entries for file pages This implements the use of migration entries to preserve ptes of file backed pages during migration. Processes can therefore be migrated back and forth without loosing their connection to pagecache pages. Note that we implement the migration entries only for linear mappings. Nonlinear mappings still require the unmapping of the ptes for migration. And another writepage() ugliness shows up. writepage() can drop the page lock. Therefore we have to remove migration ptes before calling writepages() in order to avoid having migration entries point to unlocked pages. 
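
The last point above (writepage() may drop the page lock) fixes the ordering: migration ptes must be removed before a dirty page is handed to the filesystem. The sketch below is a compressed, non-authoritative view of that rule; the real implementation is writeout() in the migrate.c hunks that follow, which builds its own one-page WB_SYNC_NONE writeback_control, re-locks the page afterwards and returns -EAGAIN so the caller retries. remove_migration_ptes() here stands in for the static helper of the same name in migrate.c, so treat this as pseudocode in C rather than something callable from another file.

#include <linux/fs.h>
#include <linux/writeback.h>

static int example_flush_before_migrate(struct address_space *mapping,
					struct page *page,
					struct writeback_control *wbc)
{
	/*
	 * A migration entry may only exist while its target page is
	 * locked, and ->writepage() is allowed to unlock the page, so
	 * the ptes must point back at "page" before writeback starts.
	 */
	remove_migration_ptes(page, page);

	if (!clear_page_dirty_for_io(page))
		return -EAGAIN;			/* someone else already wrote it */

	return mapping->a_ops->writepage(page, wbc);	/* may drop the page lock */
}
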
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 15 ------ mm/migrate.c | 127 ++++++++++++++++++++++++++++++++++++++++----------- mm/rmap.c | 11 +++++ mm/vmscan.c | 14 +++++- 4 files changed, 124 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7cee73ef4f15..1cf234e8df55 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -186,20 +186,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - -extern pageout_t pageout(struct page *page, struct address_space *mapping); - #ifdef CONFIG_NUMA extern int zone_reclaim_mode; extern int zone_reclaim_interval; @@ -259,7 +245,6 @@ extern int remove_exclusive_swap_page(struct page *); struct backing_dev_info; extern spinlock_t swap_lock; -extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); /* linux/mm/thrash.c */ extern struct mm_struct * swap_token_mm; diff --git a/mm/migrate.c b/mm/migrate.c index 96b9546e69e0..b5000d463893 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" @@ -123,7 +124,7 @@ static inline int is_swap_pte(pte_t pte) /* * Restore a potential migration pte to a working pte entry */ -static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, +static void remove_migration_pte(struct vm_area_struct *vma, struct page *old, struct page *new) { struct mm_struct *mm = vma->vm_mm; @@ -133,6 +134,10 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; + unsigned long addr = page_address_in_vma(new, vma); + + if (addr == -EFAULT) + return; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) @@ -169,19 +174,47 @@ static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); set_pte_at(mm, addr, ptep, pte); - page_add_anon_rmap(new, vma, addr); + + if (PageAnon(new)) + page_add_anon_rmap(new, vma, addr); + else + page_add_file_rmap(new); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, pte); + lazy_mmu_prot_update(pte); + out: pte_unmap_unlock(ptep, ptl); } /* - * Get rid of all migration entries and replace them by - * references to the indicated page. - * + * Note that remove_file_migration_ptes will only work on regular mappings, + * Nonlinear mappings do not use migration entries. 
+ */ +static void remove_file_migration_ptes(struct page *old, struct page *new) +{ + struct vm_area_struct *vma; + struct address_space *mapping = page_mapping(new); + struct prio_tree_iter iter; + pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!mapping) + return; + + spin_lock(&mapping->i_mmap_lock); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) + remove_migration_pte(vma, old, new); + + spin_unlock(&mapping->i_mmap_lock); +} + +/* * Must hold mmap_sem lock on at least one of the vmas containing * the page so that the anon_vma cannot vanish. */ -static void remove_migration_ptes(struct page *old, struct page *new) +static void remove_anon_migration_ptes(struct page *old, struct page *new) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -199,12 +232,23 @@ static void remove_migration_ptes(struct page *old, struct page *new) spin_lock(&anon_vma->lock); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_migration_pte(vma, page_address_in_vma(new, vma), - old, new); + remove_migration_pte(vma, old, new); spin_unlock(&anon_vma->lock); } +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + if (PageAnon(new)) + remove_anon_migration_ptes(old, new); + else + remove_file_migration_ptes(old, new); +} + /* * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. @@ -424,30 +468,59 @@ int buffer_migrate_page(struct address_space *mapping, } EXPORT_SYMBOL(buffer_migrate_page); -static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) +/* + * Writeback a page to clean the dirty state + */ +static int writeout(struct address_space *mapping, struct page *page) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 1 + }; + int rc; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + return -EINVAL; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + return -EAGAIN; + /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. + * A dirty page may imply that the underlying filesystem has + * the page on some queue. So the page must be clean for + * migration. Writeout may mean we loose the lock and the + * page state is no longer what we checked for earlier. + * At this point we know that the migration attempt cannot + * be successful. */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - return -EAGAIN; + remove_migration_ptes(page, page); - case PAGE_SUCCESS: - /* Relock since we lost the lock */ - lock_page(page); - /* Must retry since page state may have changed */ - return -EAGAIN; + rc = mapping->a_ops->writepage(page, &wbc); + if (rc < 0) + /* I/O Error writing */ + return -EIO; - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } + if (rc != AOP_WRITEPAGE_ACTIVATE) + /* unlocked. Relock */ + lock_page(page); + + return -EAGAIN; +} + +/* + * Default handling if a filesystem does not provide a migration function. 
+ */ +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + if (PageDirty(page)) + return writeout(mapping, page); /* * Buffers may be managed in a filesystem specific way. diff --git a/mm/rmap.c b/mm/rmap.c index 05d6d73a692d..882a85826bb2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -596,6 +596,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, anon_rss); +#ifdef CONFIG_MIGRATION } else { /* * Store the pfn of the page in a special migration @@ -604,12 +605,22 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ BUG_ON(!migration); entry = make_migration_entry(page, pte_write(pteval)); +#endif } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); } else +#ifdef CONFIG_MIGRATION + if (migration) { + /* Establish migration entry for a file page */ + swp_entry_t entry; + entry = make_migration_entry(page, pte_write(pteval)); + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + } else +#endif dec_mm_counter(mm, file_rss); + page_remove_rmap(page); page_cache_release(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index bc5d4f43036c..71a02e295037 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -290,11 +290,23 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + /* * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write -- cgit v1.2.3 From 30c253e6da655d73eb8bfe2adca9b8f4d82fb81e Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 23 Jun 2006 02:03:41 -0700 Subject: [PATCH] sparsemem: record nid during memory present Record the node id as we mark sections for instantiation. Use this nid during instantiation to direct allocations. Signed-off-by: Andy Whitcroft Cc: Mike Kravetz Cc: Dave Hansen Cc: Mel Gorman Cc: Bob Picco Cc: Jack Steiner Cc: Yasunori Goto Cc: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +++++ mm/sparse.c | 22 ++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e82fc1a52cd0..d6120fa69116 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -509,6 +509,10 @@ struct mem_section { * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * + * Additionally during early boot we encode node id of + * the location of the section here to guide allocation. + * (see sparse.c::memory_present()) + * * Making it a UL at least makes someone do a cast * before using it wrong. 
*/ @@ -548,6 +552,7 @@ extern int __section_nr(struct mem_section* ms); #define SECTION_HAS_MEM_MAP (1UL<<1) #define SECTION_MAP_LAST_BIT (1UL<<2) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) +#define SECTION_NID_SHIFT 2 static inline struct page *__section_mem_map_addr(struct mem_section *section) { diff --git a/mm/sparse.c b/mm/sparse.c index 100040c0dfb6..e0a3fe48aa37 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms) return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } +/* + * During early boot, before section_mem_map is used for an actual + * mem_map, we use section_mem_map to store the section's NUMA + * node. This keeps us from having to use another data structure. The + * node information is cleared just before we store the real mem_map. + */ +static inline unsigned long sparse_encode_early_nid(int nid) +{ + return (nid << SECTION_NID_SHIFT); +} + +static inline int sparse_early_nid(struct mem_section *section) +{ + return (section->section_mem_map >> SECTION_NID_SHIFT); +} + /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { @@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) ms = __nr_to_section(section); if (!ms->section_mem_map) - ms->section_mem_map = SECTION_MARKED_PRESENT; + ms->section_mem_map = sparse_encode_early_nid(nid) | + SECTION_MARKED_PRESENT; } } @@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms, if (!valid_section(ms)) return -EINVAL; + ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); return 1; @@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms, static struct page *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; - int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); struct mem_section *ms = __nr_to_section(pnum); + int nid = sparse_early_nid(ms); map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) -- cgit v1.2.3 From bd96b9eb7cfd6ab24ba244360a09980a720874d2 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 23 Jun 2006 02:03:42 -0700 Subject: [PATCH] mm: fix swap unused warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_SWAP is not defined we get: mm/vmscan.c: In function ‘remove_mapping’: mm/vmscan.c:387: warning: unused variable ‘swap’ Convert defines in swap.h into blank inline functions to fix this warning and be consistent. 
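
The mechanism behind the fix is worth spelling out, since the same pattern recurs all over the tree: a macro stub that discards its argument leaves the caller's local (here the swap variable in remove_mapping()) textually unreferenced after preprocessing, whereas an empty inline function still consumes it and compiles away to nothing. A minimal illustration with invented names:

/*
 * With the macro form the argument vanishes during preprocessing, so
 * gcc warns about the unused local; the empty inline "uses" it.
 */
#define free_thing_macro(t)	/*NOTHING*/

static inline void free_thing_inline(int t)
{
}

static void example_macro_version(void)
{
	int thing = 42;

	free_thing_macro(thing);	/* warning: unused variable 'thing' */
}

static void example_inline_version(void)
{
	int thing = 42;

	free_thing_inline(thing);	/* no warning, and still no generated code */
}

The inline stubs also restore type checking of the swp_entry_t and struct page arguments, which the bare macros never provided.
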
Signed-off-by: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 64 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1cf234e8df55..f1a827a972e0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,18 +282,60 @@ static inline void disable_swap_token(void) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr), 0); -#define show_swap_cache_info() /*NOTHING*/ -#define free_swap_and_cache(swp) /*NOTHING*/ -#define swap_duplicate(swp) /*NOTHING*/ -#define swap_free(swp) /*NOTHING*/ -#define read_swap_cache_async(swp,vma,addr) NULL -#define lookup_swap_cache(swp) NULL -#define valid_swaphandles(swp, off) 0 +static inline void show_swap_cache_info(void) +{ +} + +static inline void free_swap_and_cache(swp_entry_t swp) +{ +} + +static inline int swap_duplicate(swp_entry_t swp) +{ + return 0; +} + +static inline void swap_free(swp_entry_t swp) +{ +} + +static inline struct page *read_swap_cache_async(swp_entry_t swp, + struct vm_area_struct *vma, unsigned long addr) +{ + return NULL; +} + +static inline struct page *lookup_swap_cache(swp_entry_t swp) +{ + return NULL; +} + +static inline int valid_swaphandles(swp_entry_t entry, unsigned long *offset) +{ + return 0; +} + #define can_share_swap_page(p) (page_mapcount(p) == 1) -#define move_to_swap_cache(p, swp) 1 -#define move_from_swap_cache(p, i, m) 1 -#define __delete_from_swap_cache(p) /*NOTHING*/ -#define delete_from_swap_cache(p) /*NOTHING*/ + +static inline int move_to_swap_cache(struct page *page, swp_entry_t entry) +{ + return 1; +} + +static inline int move_from_swap_cache(struct page *page, unsigned long index, + struct address_space *mapping) +{ + return 1; +} + +static inline void __delete_from_swap_cache(struct page *page) +{ +} + +static inline void delete_from_swap_cache(struct page *page) +{ +} + #define swap_token_default_timeout 0 static inline int remove_exclusive_swap_page(struct page *p) -- cgit v1.2.3 From 9637a5efd4fbe36164c5ce7f6a0ee68b2bf22b7f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:03:43 -0700 Subject: [PATCH] add page_mkwrite() vm_operations method Add a new VMA operation to notify a filesystem or other driver about the MMU generating a fault because userspace attempted to write to a page mapped through a read-only PTE. This facility permits the filesystem or driver to: (*) Implement storage allocation/reservation on attempted write, and so to deal with problems such as ENOSPC more gracefully (perhaps by generating SIGBUS). (*) Delay making the page writable until the contents have been written to a backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS. It permits the filesystem to have some guarantee about the state of the cache. (*) Account and limit number of dirty pages. This is one piece of the puzzle needed to make shared writable mapping work safely in FUSE. Needed by cachefs (Or is it cachefiles? Or fscache? ). At least four other groups have stated an interest in it or a desire to use the functionality it provides: FUSE, OCFS2, NTFS and JFFS2. Also, things like EXT3 really ought to use it to deal with the case of shared-writable mmap encountering ENOSPC before we permit the page to be dirtied. 
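
To make the new hook concrete before the changelog continues with the follow-up note below: a hedged sketch of a minimal user. Everything prefixed example_ is invented; the only real interface is the page_mkwrite member added to vm_operations_struct in the mm.h hunk further down, whose contract per this patch is that a negative return makes the write fault deliver SIGBUS.

#include <linux/mm.h>
#include <linux/fs.h>

/*
 * Hypothetical helper: reserve backing store for one page of the file,
 * returning 0 on success or a negative errno (e.g. -ENOSPC) on failure.
 */
static int example_reserve_block(struct inode *inode, pgoff_t index)
{
	return 0;
}

/*
 * Called by the VM before a read-only pte in a shared mapping is made
 * writable; only reached for file-backed shared mappings.
 */
static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	struct inode *inode = vma->vm_file->f_dentry->d_inode;
	int err = example_reserve_block(inode, page->index);

	if (err < 0)
		return err;	/* do_wp_page() turns this into VM_FAULT_SIGBUS */

	return 0;
}

static struct vm_operations_struct example_vm_ops = {
	.nopage		= filemap_nopage,	/* the usual page-cache nopage handler */
	.page_mkwrite	= example_page_mkwrite,
};

The mmap() and mprotect() hunks below complete the picture: when a vma has a page_mkwrite handler, VM_SHARED is masked out of the protection_map lookup, so the first write to each page genuinely faults and reaches the handler.
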
From: Peter Zijlstra get_user_pages(.write=1, .force=1) can generate COW hits on read-only shared mappings, this patch traps those as mkpage_write candidates and fails to handle them the old way. Signed-off-by: David Howells Cc: Miklos Szeredi Cc: Joel Becker Cc: Mark Fasheh Cc: Anton Altaparmakov Cc: David Woodhouse Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +++ mm/memory.c | 100 ++++++++++++++++++++++++++++++++++++++++------------- mm/mmap.c | 12 +++++-- mm/mprotect.c | 11 ++++-- 4 files changed, 99 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 697c6bf248c2..3b09444121d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -199,6 +199,10 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); + + /* notification that a previously read-only page is about to become + * writable, if an error is returned it will cause a SIGBUS */ + int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); #ifdef CONFIG_NUMA int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); struct mempolicy *(*get_policy)(struct vm_area_struct *vma, diff --git a/mm/memory.c b/mm/memory.c index 11673c5d2c20..247b5c312b9b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1457,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int ret = VM_FAULT_MINOR; + int reuse, ret = VM_FAULT_MINOR; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) goto gotten; - if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_set_access_flags(vma, address, page_table, entry, 1); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); - ret |= VM_FAULT_WRITE; - goto unlock; + if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == + (VM_SHARED|VM_WRITE))) { + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + /* + * Notify the address space that the page is about to + * become writable so that it can prohibit this or wait + * for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can + * sleep if it needs to. + */ + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + + if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) + goto unwritable_page; + + page_cache_release(old_page); + + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. 
+ */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) + goto unlock; } + + reuse = 1; + } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { + reuse = can_share_swap_page(old_page); + unlock_page(old_page); + } else { + reuse = 0; + } + + if (reuse) { + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_set_access_flags(vma, address, page_table, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + ret |= VM_FAULT_WRITE; + goto unlock; } /* @@ -1535,6 +1570,10 @@ oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; + +unwritable_page: + page_cache_release(old_page); + return VM_FAULT_SIGBUS; } /* @@ -2083,18 +2122,31 @@ retry: /* * Should we do an early C-O-W break? */ - if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page *page; + if (write_access) { + if (!(vma->vm_flags & VM_SHARED)) { + struct page *page; - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - new_page = page; - anon = 1; + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!page) + goto oom; + copy_user_highpage(page, new_page, address); + page_cache_release(new_page); + new_page = page; + anon = 1; + + } else { + /* if the page will be shareable, see if the backing + * address space wants to know that the page is about + * to become writable */ + if (vma->vm_ops->page_mkwrite && + vma->vm_ops->page_mkwrite(vma, new_page) < 0 + ) { + page_cache_release(new_page); + return VM_FAULT_SIGBUS; + } + } } page_table = pte_offset_map_lock(mm, pmd, address, &ptl); diff --git a/mm/mmap.c b/mm/mmap.c index e6ee12344b13..6446c6134b04 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1065,7 +1065,8 @@ munmap_back: vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; - vma->vm_page_prot = protection_map[vm_flags & 0x0f]; + vma->vm_page_prot = protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; vma->vm_pgoff = pgoff; if (file) { @@ -1089,6 +1090,12 @@ munmap_back: goto free_vma; } + /* Don't make the VMA automatically writable if it's shared, but the + * backer wishes to know when pages are first written to */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + vma->vm_page_prot = + protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; + /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) * that memory reservation must be checked; but that reservation @@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_end = addr + len; vma->vm_pgoff = pgoff; vma->vm_flags = flags; - vma->vm_page_prot = protection_map[flags & 0x0f]; + vma->vm_page_prot = protection_map[flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; vma_link(mm, vma, prev, rb_link, rb_parent); out: mm->total_vm += len >> PAGE_SHIFT; diff --git a/mm/mprotect.c b/mm/mprotect.c index 14f93e62270f..638edabaff71 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -123,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; unsigned long charged = 0; + unsigned int mask; pgprot_t newprot; pgoff_t pgoff; int error; @@ 
-149,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } } - newprot = protection_map[newflags & 0xf]; - /* * First try to merge with previous and/or next vma. */ @@ -177,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } success: + /* Don't make the VMA automatically writable if it's shared, but the + * backer wishes to know when pages are first written to */ + mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; + if (vma->vm_ops && vma->vm_ops->page_mkwrite) + mask &= ~VM_SHARED; + + newprot = protection_map[newflags & mask]; + /* * vm_flags and vm_page_prot are protected by the mmap_sem * held in write mode. -- cgit v1.2.3 From bd1e22b8e0a90f9a91e4c27db14ca15773659bf7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Jun 2006 02:03:47 -0700 Subject: [PATCH] initialise total_memory() earlier Initialise total_memory earlier in boot. Because if for some reason we run page reclaim early in boot, we don't want total_memory to be zero when we use it as a divisor. And rename total_memory to vm_total_pages to avoid naming clashes with architectures. Cc: Yasunori Goto Cc: KAMEZAWA Hiroyuki Cc: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/page_alloc.c | 6 +++--- mm/vmscan.c | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f1a827a972e0..dc3f3aa0c83e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -185,6 +185,7 @@ extern unsigned long try_to_free_pages(struct zone **, gfp_t); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); +extern long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5af33186a25f..71a0b2a23f5b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1725,9 +1725,9 @@ void __meminit build_all_zonelists(void) stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); /* cpuset refresh routine should be here */ } - - printk("Built %i zonelists\n", num_online_nodes()); - + vm_total_pages = nr_free_pagecache_pages(); + printk("Built %i zonelists. Total pages: %ld\n", + num_online_nodes(), vm_total_pages); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 71a02e295037..72babac71dea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -110,7 +110,7 @@ struct shrinker { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -static long total_memory; +long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -743,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * how much memory * is mapped. */ - mapped_ratio = (sc->nr_mapped * 100) / total_memory; + mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages; /* * Now decide how much we really want to unmap some pages. 
The @@ -1482,7 +1482,6 @@ static int __init kswapd_init(void) pgdat->kswapd = find_task_by_pid(pid); read_unlock(&tasklist_lock); } - total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; } -- cgit v1.2.3 From 800590f523bf3bde9fa6c8e4d6763e4bf6a2c8ec Mon Sep 17 00:00:00 2001 From: Paul Drynoff Date: Fri, 23 Jun 2006 02:03:48 -0700 Subject: [PATCH] slab: kmalloc, kzalloc comments cleanup and fix - Move comments for kmalloc to right place, currently it near __do_kmalloc - Comments for kzalloc - More detailed comments for kmalloc - Appearance of "kmalloc" and "kzalloc" man pages after "make mandocs" [rdunlap@xenotime.net: simplification] Signed-off-by: Paul Drynoff Acked-by: Randy Dunlap Cc: Pekka Enberg Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DocBook/kernel-api.tmpl | 1 + include/linux/slab.h | 50 +++++++++++++++++++++++++++++++++++ mm/slab.c | 20 ++------------ 3 files changed, 53 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index ca02e04a906c..6dab3dd36995 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -117,6 +117,7 @@ X!Ilib/string.c Memory Management in Linux The Slab Cache +!Iinclude/linux/slab.h !Emm/slab.c User Space Memory Access diff --git a/include/linux/slab.h b/include/linux/slab.h index 9dc93163e065..45ad55b70d1c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -86,6 +86,51 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*); __kmalloc_track_caller(size, flags, __builtin_return_address(0)) #endif +/** + * kmalloc - allocate memory + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * kmalloc is the normal method of allocating memory + * in the kernel. + * + * The @flags argument may be one of: + * + * %GFP_USER - Allocate memory on behalf of user. May sleep. + * + * %GFP_KERNEL - Allocate normal kernel ram. May sleep. + * + * %GFP_ATOMIC - Allocation will not sleep. + * For example, use this inside interrupt handlers. + * + * %GFP_HIGHUSER - Allocate pages from high memory. + * + * %GFP_NOIO - Do not do any I/O at all while trying to get memory. + * + * %GFP_NOFS - Do not make any fs calls while trying to get memory. + * + * Also it is possible to set different flags by OR'ing + * in one or more of the following additional @flags: + * + * %__GFP_COLD - Request cache-cold pages instead of + * trying to return cache-warm pages. + * + * %__GFP_DMA - Request memory from the DMA-capable zone. + * + * %__GFP_HIGH - This allocation has high priority and may use emergency pools. + * + * %__GFP_HIGHMEM - Allocated memory may be from highmem. + * + * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail + * (think twice before using). + * + * %__GFP_NORETRY - If memory is not immediately available, + * then give up at once. + * + * %__GFP_NOWARN - If allocation fails, don't issue any warnings. + * + * %__GFP_REPEAT - If allocation fails initially, try once more before failing. + */ static inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { @@ -111,6 +156,11 @@ found: extern void *__kzalloc(size_t, gfp_t); +/** + * kzalloc - allocate memory. The memory is set to zero. + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kmalloc). 
+ */ static inline void *kzalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { diff --git a/mm/slab.c b/mm/slab.c index 664c3a10acf2..98ac20bc0de9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3277,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node); #endif /** - * kmalloc - allocate memory + * __do_kmalloc - allocate memory * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate. + * @flags: the type of memory to allocate (see kmalloc). * @caller: function caller for debug tracking of the caller - * - * kmalloc is the normal method of allocating memory - * in the kernel. - * - * The @flags argument may be one of: - * - * %GFP_USER - Allocate memory on behalf of user. May sleep. - * - * %GFP_KERNEL - Allocate normal kernel ram. May sleep. - * - * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. - * - * Additionally, the %GFP_DMA flag may be set to indicate the memory - * must be suitable for DMA. This can mean different things on different - * platforms. For example, on i386, it means that the memory must come - * from the first 16MB. */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) -- cgit v1.2.3 From aaa994b300a172afafab47938804836b923e5ef7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:52 -0700 Subject: [PATCH] page migration: handle freeing of pages in migrate_pages() Do not leave pages on the lists passed to migrate_pages(). Seems that we will not need any postprocessing of pages. This will simplify the handling of pages by the callers of migrate_pages(). Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 7 +++---- mm/mempolicy.c | 8 +------- mm/migrate.c | 48 +++++++++++++++++++++++------------------------- 3 files changed, 27 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 287c47b5e5df..83af25949fa9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -8,8 +8,7 @@ extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); -extern int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed); +extern int migrate_pages(struct list_head *l, struct list_head *t); extern int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest); extern int fail_migrate_page(struct address_space *, @@ -22,8 +21,8 @@ extern int migrate_prep(void); static inline int isolate_lru_page(struct page *p, struct list_head *list) { return -ENOSYS; } static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed) { return -ENOSYS; } +static inline int migrate_pages(struct list_head *l, struct list_head *t) + { return -ENOSYS; } static inline int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8778f58880c4..244f3f130e4a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -603,11 +603,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) check_range(mm, 
mm->mmap->vm_start, TASK_SIZE, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (!list_empty(&pagelist)) { + if (!list_empty(&pagelist)) err = migrate_pages_to(&pagelist, NULL, dest); - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - } return err; } @@ -773,9 +770,6 @@ long do_mbind(unsigned long start, unsigned long len, err = -EIO; } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - up_write(&mm->mmap_sem); mpol_free(new); return err; diff --git a/mm/migrate.c b/mm/migrate.c index 09038163bfec..d3a1810a4c9f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -624,6 +624,15 @@ unlock: unlock_page(page); ret: if (rc != -EAGAIN) { + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); + move_to_lru(page); + list_del(&newpage->lru); move_to_lru(newpage); } @@ -640,12 +649,12 @@ ret: * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. + * or no retryable pages exist anymore. All pages will be + * retruned to the LRU or freed. * - * Return: Number of pages not migrated when "to" ran empty. + * Return: Number of pages not migrated. */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) +int migrate_pages(struct list_head *from, struct list_head *to) { int retry = 1; int nr_failed = 0; @@ -675,11 +684,9 @@ int migrate_pages(struct list_head *from, struct list_head *to, retry++; break; case 0: - list_move(&page->lru, moved); break; default: /* Permanent failure */ - list_move(&page->lru, failed); nr_failed++; break; } @@ -689,6 +696,7 @@ int migrate_pages(struct list_head *from, struct list_head *to, if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + putback_lru_pages(from); return nr_failed + retry; } @@ -702,11 +710,10 @@ int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); int err = 0; unsigned long offset = 0; int nr_pages; + int nr_failed = 0; struct page *page; struct list_head *p; @@ -740,26 +747,17 @@ redo: if (nr_pages > MIGRATE_CHUNK_SIZE) break; } - err = migrate_pages(pagelist, &newlist, &moved, &failed); + err = migrate_pages(pagelist, &newlist); - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); + if (err >= 0) { + nr_failed += err; + if (list_empty(&newlist) && !list_empty(pagelist)) + goto redo; } - list_splice(&failed, pagelist); - if (err < 0) - return err; +out: /* Calculate number of leftover pages */ - nr_pages = 0; list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + nr_failed++; + return nr_failed; } -- cgit v1.2.3 From 95a402c3847cc16f4ba03013cd01404fa0f14c2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:53 -0700 Subject: [PATCH] page migration: use allocator function for migrate_pages() Instead of passing a list of new pages, pass a function to allocate a new page. This allows the correct placement of MPOL_INTERLEAVE pages during page migration. It also further simplifies the callers of migrate pages. 
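To illustrate the new calling convention, here is a minimal sketch mirroring the new_node_page() helper this patch adds to mm/mempolicy.c ('dest' is the target node, as in migrate_to_node()):

static struct page *new_node_page(struct page *page, unsigned long node)
{
	return alloc_pages_node(node, GFP_HIGHUSER, 0);
}

	/* caller, after isolating pages onto &pagelist: */
	err = migrate_pages(&pagelist, new_node_page, dest);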
migrate_pages() becomes similar to migrate_pages_to() so drop migrate_pages_to(). The batching of new page allocations becomes unnecessary. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 11 ++--- mm/mempolicy.c | 23 +++++++++- mm/migrate.c | 115 ++++++++++++++---------------------------------- 3 files changed, 59 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 83af25949fa9..5b95d6568dc4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,14 +3,15 @@ #include +typedef struct page *new_page_t(struct page *, unsigned long private); + #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); -extern int migrate_pages(struct list_head *l, struct list_head *t); -extern int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest); +extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long); + extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -21,8 +22,8 @@ extern int migrate_prep(void); static inline int isolate_lru_page(struct page *p, struct list_head *list) { return -ENOSYS; } static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t) - { return -ENOSYS; } +static inline int migrate_pages(struct list_head *l, new_page_t x, + unsigned long private) { return -ENOSYS; } static inline int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest) { return 0; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 244f3f130e4a..f432642e9e66 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -587,6 +588,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } +static struct page *new_node_page(struct page *page, unsigned long node) +{ + return alloc_pages_node(node, GFP_HIGHUSER, 0); +} + /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. 
@@ -604,7 +610,8 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) - err = migrate_pages_to(&pagelist, NULL, dest); + err = migrate_pages(&pagelist, new_node_page, dest); + return err; } @@ -691,6 +698,12 @@ int do_migrate_pages(struct mm_struct *mm, } +static struct page *new_vma_page(struct page *page, unsigned long private) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + + return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); +} #else static void migrate_page_add(struct page *page, struct list_head *pagelist, @@ -703,6 +716,11 @@ int do_migrate_pages(struct mm_struct *mm, { return -ENOSYS; } + +static struct page *new_vma_page(struct page *page, unsigned long private) +{ + return NULL; +} #endif long do_mbind(unsigned long start, unsigned long len, @@ -764,7 +782,8 @@ long do_mbind(unsigned long start, unsigned long len, err = mbind_range(vma, start, end, new); if (!list_empty(&pagelist)) - nr_failed = migrate_pages_to(&pagelist, vma, -1); + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; diff --git a/mm/migrate.c b/mm/migrate.c index d3a1810a4c9f..251a8d158257 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -28,9 +28,6 @@ #include "internal.h" -/* The maximum number of pages to take off the LRU for migration */ -#define MIGRATE_CHUNK_SIZE 256 - #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* @@ -587,18 +584,23 @@ static int move_to_new_page(struct page *newpage, struct page *page) * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(struct page *newpage, struct page *page, int force) +static int unmap_and_move(new_page_t get_new_page, unsigned long private, + struct page *page, int force) { int rc = 0; + struct page *newpage = get_new_page(page, private); + + if (!newpage) + return -ENOMEM; if (page_count(page) == 1) /* page was freed from under us. So we are done. */ - goto ret; + goto move_newpage; rc = -EAGAIN; if (TestSetPageLocked(page)) { if (!force) - goto ret; + goto move_newpage; lock_page(page); } @@ -622,7 +624,7 @@ static int unmap_and_move(struct page *newpage, struct page *page, int force) remove_migration_ptes(page, page); unlock: unlock_page(page); -ret: + if (rc != -EAGAIN) { /* * A page that has been migrated has all references @@ -632,29 +634,33 @@ ret: */ list_del(&page->lru); move_to_lru(page); - - list_del(&newpage->lru); - move_to_lru(newpage); } + +move_newpage: + /* + * Move the new page to the LRU. If migration was not successful + * then this will free the page. + */ + move_to_lru(newpage); return rc; } /* * migrate_pages * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the isolated pages - * can be moved to. + * The function takes one list of pages to migrate and a function + * that determines from the page to be migrated and the private data + * the target of the move and allocates the page. * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty * or no retryable pages exist anymore. All pages will be * retruned to the LRU or freed. * - * Return: Number of pages not migrated. + * Return: Number of pages not migrated or error code. 
*/ -int migrate_pages(struct list_head *from, struct list_head *to) +int migrate_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private) { int retry = 1; int nr_failed = 0; @@ -671,15 +677,14 @@ int migrate_pages(struct list_head *from, struct list_head *to) retry = 0; list_for_each_entry_safe(page, page2, from, lru) { - - if (list_empty(to)) - break; - cond_resched(); - rc = unmap_and_move(lru_to_page(to), page, pass > 2); + rc = unmap_and_move(get_new_page, private, + page, pass > 2); switch(rc) { + case -ENOMEM: + goto out; case -EAGAIN: retry++; break; @@ -692,72 +697,16 @@ int migrate_pages(struct list_head *from, struct list_head *to) } } } - + rc = 0; +out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; putback_lru_pages(from); - return nr_failed + retry; -} -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - int err = 0; - unsigned long offset = 0; - int nr_pages; - int nr_failed = 0; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist); - - if (err >= 0) { - nr_failed += err; - if (list_empty(&newlist) && !list_empty(pagelist)) - goto redo; - } -out: + if (rc) + return rc; - /* Calculate number of leftover pages */ - list_for_each(p, pagelist) - nr_failed++; - return nr_failed; + return nr_failed + retry; } + -- cgit v1.2.3 From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:55 -0700 Subject: [PATCH] page migration: sys_move_pages(): support moving of individual pages move_pages() is used to move individual pages of a process. The function can be used to determine the location of pages and to move them onto the desired node. move_pages() returns status information for each page. long move_pages(pid, number_of_pages_to_move, addresses_of_pages[], nodes[] or NULL, status[], flags); The addresses of pages is an array of void * pointing to the pages to be moved. The nodes array contains the node numbers that the pages should be moved to. If a NULL is passed instead of an array then no pages are moved but the status array is updated. The status request may be used to determine the page state before issuing another move_pages() to move pages. The status array will contain the state of all individual page migration attempts when the function terminates. The status array is only valid if move_pages() completed successfullly. Possible page states in status[]: 0..MAX_NUMNODES The page is now on the indicated node. 
-ENOENT Page is not present -EACCES Page is mapped by multiple processes and can only be moved if MPOL_MF_MOVE_ALL is specified. -EPERM The page has been mlocked by a process/driver and cannot be moved. -EBUSY Page is busy and cannot be moved. Try again later. -EFAULT Invalid address (no VMA or zero page). -ENOMEM Unable to allocate memory on target node. -EIO Unable to write back page. The page must be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the moving of dirty pages. -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. The flags parameter indicates what types of pages to move: MPOL_MF_MOVE Move pages that are only mapped by the process. MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes. Requires sufficient capabilities. Possible return codes from move_pages() -ENOENT No pages found that would require moving. All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes. -EINVAL Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt to migrate pages in a kernel thread. -EPERM MPOL_MF_MOVE_ALL specified without sufficient priviledges. or an attempt to move a process belonging to another user. -EACCES One of the target nodes is not allowed by the current cpuset. -ENODEV One of the target nodes is not online. -ESRCH Process does not exist. -E2BIG Too many pages to move. -ENOMEM Not enough memory to allocate control array. -EFAULT Parameters could not be accessed. A test program for move_pages() may be found with the patches on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3 From: Christoph Lameter Detailed results for sys_move_pages() Pass a pointer to an integer to get_new_page() that may be used to indicate where the completion status of a migration operation should be placed. This allows sys_move_pags() to report back exactly what happened to each page. Wish there would be a better way to do this. Looks a bit hacky. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_migration | 29 ++--- arch/ia64/kernel/entry.S | 2 +- include/asm-ia64/unistd.h | 2 +- include/linux/migrate.h | 2 +- include/linux/syscalls.h | 5 + kernel/sys_ni.c | 1 + mm/mempolicy.c | 4 +- mm/migrate.c | 268 +++++++++++++++++++++++++++++++++++++++- 8 files changed, 288 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0a5d5fb18854..99f89aa10169 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package. Manual migration is useful if for example the scheduler has relocated a process to a processor on a distant node. A batch scheduler or an administrator may detect the situation and move the pages of the process -nearer to the new processor. At some point in the future we may have -some mechanism in the scheduler that will automatically move the pages. +nearer to the new processor. The kernel itself does only provide +manual page migration support. 
Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +A NUMA profiler may f.e. obtain a log showing frequent off node +accesses and may use the result to move pages to more advantageous +locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to @@ -62,22 +67,14 @@ A. In kernel use of migrate_pages() It also prevents the swapper or other scans to encounter the page. -2. Generate a list of newly allocates pages. These pages will contain the - contents of the pages from the first list after page migration is - complete. +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. 3. The migrate_pages() function is called which attempts - to do the migration. It returns the moved pages in the - list specified as the third parameter and the failed - migrations in the fourth parameter. When the function - returns the first list will contain the pages that could still be retried. - -4. The leftover pages of various types are returned - to the LRU using putback_to_lru_pages() or otherwise - disposed of. The pages will still have the refcount as - increased by isolate_lru_pages() if putback_to_lru_pages() is not - used! The kernel may want to handle the various cases of failures in - different ways. + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. B. How migrate_pages() works ---------------------------- diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index bcb80ca5cf40..32c999f58d12 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1584,7 +1584,7 @@ sys_call_table: data8 sys_keyctl data8 sys_ioprio_set data8 sys_ioprio_get // 1275 - data8 sys_ni_syscall + data8 sys_move_pages data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 632f2eedf72c..bb0eb727dcd0 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -265,7 +265,7 @@ #define __NR_keyctl 1273 #define __NR_ioprio_set 1274 #define __NR_ioprio_get 1275 -/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */ +#define __NR_move_pages 1276 #define __NR_inotify_init 1277 #define __NR_inotify_add_watch 1278 #define __NR_inotify_rm_watch 1279 diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 5b95d6568dc4..5dba23a1c0d0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,7 +3,7 @@ #include -typedef struct page *new_page_t(struct page *, unsigned long private); +typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index bd67a4413df7..7e3f23490918 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, const unsigned long __user *to); +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + 
const int __user *nodes, + int __user *status, + int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f1..597229749dec 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f432642e9e66..05b84acf0bb3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } -static struct page *new_node_page(struct page *page, unsigned long node) +static struct page *new_node_page(struct page *page, unsigned long node, int **x) { return alloc_pages_node(node, GFP_HIGHUSER, 0); } @@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm, } -static struct page *new_vma_page(struct page *page, unsigned long private) +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { struct vm_area_struct *vma = (struct vm_area_struct *)private; diff --git a/mm/migrate.c b/mm/migrate.c index 251a8d158257..033a12f4c949 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) } /* - * migrate_prep() needs to be called after we have compiled the list of pages - * to be migrated using isolate_lru_page() but before we begin a series of calls - * to migrate_pages(). + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). */ int migrate_prep(void) { @@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *page, int force) { int rc = 0; - struct page *newpage = get_new_page(page, private); + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -642,6 +644,12 @@ move_newpage: * then this will free the page. */ move_to_lru(newpage); + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } return rc; } @@ -710,3 +718,255 @@ out: return nr_failed + retry; } +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; + +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; + + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; + + if (pm->node == MAX_NUMNODES) + return NULL; + + *result = &pm->status; + + return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); +} + +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. 
+ */ +static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); + + down_read(&mm->mmap_sem); + + /* + * Build a list of pages to migrate + */ + migrate_prep(); + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; + + /* + * A valid page pointer that will not match any of the + * pages that will be moved. + */ + pp->page = ZERO_PAGE(0); + + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pp->addr, FOLL_GET); + err = -ENOENT; + if (!page) + goto set_status; + + if (PageReserved(page)) /* Check for zero page */ + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) + /* + * Node already in the right place + */ + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page, &pagelist); +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; + } + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm); + else + err = -ENOENT; + + up_read(&mm->mmap_sem); + return err; +} + +/* + * Determine the nodes of a list of pages. The addr in the pm array + * must have been set to the virtual address of which we want to determine + * the node number. + */ +static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +{ + down_read(&mm->mmap_sem); + + for ( ; pm->node != MAX_NUMNODES; pm++) { + struct vm_area_struct *vma; + struct page *page; + int err; + + err = -EFAULT; + vma = find_vma(mm, pm->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pm->addr, 0); + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + pm->status = err; + } + + up_read(&mm->mmap_sem); + return 0; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + int err = 0; + int i; + struct task_struct *task; + nodemask_t task_nodes; + struct mm_struct *mm; + struct page_to_node *pm = NULL; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. 
+ */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out2; + } + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out2; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out2; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out; + + err = -ENODEV; + if (!node_online(node)) + goto out; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out; + + pm[i].node = node; + } + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + if (nodes) + err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); + else + err = do_pages_stat(mm, pm); + + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out: + vfree(pm); +out2: + mmput(mm); + return err; +} +#endif + -- cgit v1.2.3 From 1b2db9fb7adc4d67d9ce7d16ce79c41ee84730fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:56 -0700 Subject: [PATCH] sys_move_pages: 32bit support (i386, x86_64) sys_move_pages() support for 32bit (i386 plus x86_64 compat layer) Add support for move_pages() on i386 and also add the compat functions necessary to run 32 bit binaries on x86_64. Add compat_sys_move_pages to the x86_64 32bit binary layer. Note that it is not up to date so I added the missing pieces. Not sure if this is done the right way. 
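For illustration only (not part of this patch), a user-space program can invoke the new syscall directly once the architecture's unistd.h defines __NR_move_pages (317 on i386 with this patch). The node number below is an arbitrary example; MPOL_MF_MOVE is taken from include/linux/mempolicy.h, and pid 0 means the calling process, as in the sys_move_pages() code above:

#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_MF_MOVE	(1 << 1)	/* as in include/linux/mempolicy.h */

/* Move one page backing 'buf' to 'node'; per-page status lands in status[0]. */
static long move_one_page(void *buf, int node)
{
	void *pages[1] = { buf };
	int nodes[1]   = { node };
	int status[1]  = { 0 };

	return syscall(__NR_move_pages, 0, 1UL, pages, nodes, status,
		       MPOL_MF_MOVE);
}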
[akpm@osdl.org: compile fix] Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/syscall_table.S | 1 + arch/x86_64/ia32/ia32entry.S | 1 + include/asm-i386/unistd.h | 3 ++- include/linux/syscalls.h | 5 +++++ kernel/compat.c | 23 +++++++++++++++++++++++ kernel/sys_ni.c | 1 + 6 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index af56987f69b0..dd63d4775398 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -316,3 +316,4 @@ ENTRY(sys_call_table) .long sys_sync_file_range .long sys_tee /* 315 */ .long sys_vmsplice + .long sys_move_pages diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5a92fed2d1d5..4ec594ab1a98 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -696,4 +696,5 @@ ia32_sys_call_table: .quad sys_sync_file_range .quad sys_tee .quad compat_sys_vmsplice + .quad compat_sys_move_pages ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index de2ccc149e34..fc1c8ddae149 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -322,10 +322,11 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 +#define __NR_move_pages 317 #ifdef __KERNEL__ -#define NR_syscalls 317 +#define NR_syscalls 318 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7e3f23490918..e42738c69166 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -521,6 +521,11 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags); +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, + void __user *pages, + const int __user *nodes, + int __user *status, + int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d8..ccea93e28954 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -934,3 +935,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) return ret; } + +#ifdef CONFIG_NUMA +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, + void __user *pages32, + const int __user *nodes, + int __user *status, + int flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, (compat_uptr_t *)(pages32 + i)) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 597229749dec..6991bece67e8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -133,3 +133,4 @@ cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); -- cgit v1.2.3 From 9216dfad4fc97ab639ef0885efc713f3d7a20d5b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:57 -0700 Subject: [PATCH] move_pages: fix 32 -> 64 bit compat function The definition of the third parameter is a pointer to an array of virtual 
addresses which give us some trouble. The existing code calculated the wrong address in the array since I used void to avoid having to specify a type. I now use the correct type "compat_uptr_t __user *" in the definition of the function in kernel/compat.c. However, I used __u32 in syscalls.h. Would have to include compat.h there in order to provide the same definition which would generate an ugly include situation. On both ia64 and x86_64 compat_uptr_t is u32. So this works although parameter declarations differ. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 2 +- kernel/compat.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e42738c69166..33785b79d548 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -522,7 +522,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, int __user *status, int flags); asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, - void __user *pages, + __u32 __user *pages, const int __user *nodes, int __user *status, int flags); diff --git a/kernel/compat.c b/kernel/compat.c index ccea93e28954..2f672332430f 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -938,7 +938,7 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) #ifdef CONFIG_NUMA asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - void __user *pages32, + compat_uptr_t __user *pages32, const int __user *nodes, int __user *status, int flags) @@ -950,7 +950,7 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { compat_uptr_t p; - if (get_user(p, (compat_uptr_t *)(pages32 + i)) || + if (get_user(p, pages32 + i) || put_user(compat_ptr(p), pages + i)) return -EFAULT; } -- cgit v1.2.3 From 03e68060636e05989ea94bcb671ab633948f328c Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 23 Jun 2006 02:03:58 -0700 Subject: [PATCH] lsm: add task_setioprio hook Implement an LSM hook for setting a task's IO priority, similar to the hook for setting a tasks's nice value. A previous version of this LSM hook was included in an older version of multiadm by Jan Engelhardt, although I don't recall it being submitted upstream. Also included is the corresponding SELinux hook, which re-uses the setsched permission in the proccess class. 
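As a hypothetical illustration (not code from this patch) of a policy a security module could place behind the new hook, one might confine the real-time I/O class to privileged tasks; IOPRIO_PRIO_CLASS() and IOPRIO_CLASS_RT are assumed from include/linux/ioprio.h:

static int example_task_setioprio(struct task_struct *p, int ioprio)
{
	/* hypothetical policy: only privileged callers get the RT class */
	if (IOPRIO_PRIO_CLASS(ioprio) == IOPRIO_CLASS_RT &&
	    !capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}

Such a module would then set .task_setioprio = example_task_setioprio in its struct security_operations.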
Signed-off-by: James Morris Acked-by: Stephen Smalley Cc: Jan Engelhardt Cc: Chris Wright Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioprio.c | 6 ++++++ include/linux/security.h | 16 ++++++++++++++++ security/dummy.c | 6 ++++++ security/selinux/hooks.c | 6 ++++++ 4 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/fs/ioprio.c b/fs/ioprio.c index ca77008146c0..7fa76ed53c10 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -24,15 +24,21 @@ #include #include #include +#include static int set_task_ioprio(struct task_struct *task, int ioprio) { + int err; struct io_context *ioc; if (task->uid != current->euid && task->uid != current->uid && !capable(CAP_SYS_NICE)) return -EPERM; + err = security_task_setioprio(task, ioprio); + if (err) + return err; + task_lock(task); task->ioprio = ioprio; diff --git a/include/linux/security.h b/include/linux/security.h index 383c320fc834..65b32a0c6207 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -577,6 +577,11 @@ struct swap_info_struct; * @p contains the task_struct of process. * @nice contains the new nice value. * Return 0 if permission is granted. + * @task_setioprio + * Check permission before setting the ioprio value of @p to @ioprio. + * @p contains the task_struct of process. + * @ioprio contains the new ioprio value + * Return 0 if permission is granted. * @task_setrlimit: * Check permission before setting the resource limits of the current * process for @resource to @new_rlim. The old resource limit values can @@ -1210,6 +1215,7 @@ struct security_operations { int (*task_getsid) (struct task_struct * p); int (*task_setgroups) (struct group_info *group_info); int (*task_setnice) (struct task_struct * p, int nice); + int (*task_setioprio) (struct task_struct * p, int ioprio); int (*task_setrlimit) (unsigned int resource, struct rlimit * new_rlim); int (*task_setscheduler) (struct task_struct * p, int policy, struct sched_param * lp); @@ -1836,6 +1842,11 @@ static inline int security_task_setnice (struct task_struct *p, int nice) return security_ops->task_setnice (p, nice); } +static inline int security_task_setioprio (struct task_struct *p, int ioprio) +{ + return security_ops->task_setioprio (p, ioprio); +} + static inline int security_task_setrlimit (unsigned int resource, struct rlimit *new_rlim) { @@ -2478,6 +2489,11 @@ static inline int security_task_setnice (struct task_struct *p, int nice) return 0; } +static inline int security_task_setioprio (struct task_struct *p, int ioprio) +{ + return 0; +} + static inline int security_task_setrlimit (unsigned int resource, struct rlimit *new_rlim) { diff --git a/security/dummy.c b/security/dummy.c index c98d553984ec..879a98523b1b 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -516,6 +516,11 @@ static int dummy_task_setnice (struct task_struct *p, int nice) return 0; } +static int dummy_task_setioprio (struct task_struct *p, int ioprio) +{ + return 0; +} + static int dummy_task_setrlimit (unsigned int resource, struct rlimit *new_rlim) { return 0; @@ -972,6 +977,7 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, task_getsid); set_to_dummy_if_null(ops, task_setgroups); set_to_dummy_if_null(ops, task_setnice); + set_to_dummy_if_null(ops, task_setioprio); set_to_dummy_if_null(ops, task_setrlimit); set_to_dummy_if_null(ops, task_setscheduler); set_to_dummy_if_null(ops, task_getscheduler); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 
093efba4d9b6..9dcf298921d4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2645,6 +2645,11 @@ static int selinux_task_setnice(struct task_struct *p, int nice) return task_has_perm(current,p, PROCESS__SETSCHED); } +static int selinux_task_setioprio(struct task_struct *p, int ioprio) +{ + return task_has_perm(current, p, PROCESS__SETSCHED); +} + static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim) { struct rlimit *old_rlim = current->signal->rlim + resource; @@ -4383,6 +4388,7 @@ static struct security_operations selinux_ops = { .task_getsid = selinux_task_getsid, .task_setgroups = selinux_task_setgroups, .task_setnice = selinux_task_setnice, + .task_setioprio = selinux_task_setioprio, .task_setrlimit = selinux_task_setrlimit, .task_setscheduler = selinux_task_setscheduler, .task_getscheduler = selinux_task_getscheduler, -- cgit v1.2.3 From 35601547baf92d984b6e59cf3583649da04baea5 Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 23 Jun 2006 02:04:01 -0700 Subject: [PATCH] SELinux: add task_movememory hook This patch adds new security hook, task_movememory, to be called when memory owened by a task is to be moved (e.g. when migrating pages to a this hook is identical to the setscheduler implementation, but a separate hook introduced to allow this check to be specialized in the future if necessary. Since the last posting, the hook has been renamed following feedback from Christoph Lameter. Signed-off-by: David Quigley Acked-by: Stephen Smalley Signed-off-by: James Morris Cc: Christoph Lameter Cc: Andi Kleen Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/security.h | 15 +++++++++++++++ security/dummy.c | 6 ++++++ security/selinux/hooks.c | 6 ++++++ 3 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 65b32a0c6207..d2c17bd91a29 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -601,6 +601,10 @@ struct swap_info_struct; * @p. * @p contains the task_struct for process. * Return 0 if permission is granted. + * @task_movememory + * Check permission before moving memory owned by process @p. + * @p contains the task_struct for process. + * Return 0 if permission is granted. * @task_kill: * Check permission before sending signal @sig to @p. @info can be NULL, * the constant 1, or a pointer to a siginfo structure. 
If @info is 1 or @@ -1220,6 +1224,7 @@ struct security_operations { int (*task_setscheduler) (struct task_struct * p, int policy, struct sched_param * lp); int (*task_getscheduler) (struct task_struct * p); + int (*task_movememory) (struct task_struct * p); int (*task_kill) (struct task_struct * p, struct siginfo * info, int sig); int (*task_wait) (struct task_struct * p); @@ -1865,6 +1870,11 @@ static inline int security_task_getscheduler (struct task_struct *p) return security_ops->task_getscheduler (p); } +static inline int security_task_movememory (struct task_struct *p) +{ + return security_ops->task_movememory (p); +} + static inline int security_task_kill (struct task_struct *p, struct siginfo *info, int sig) { @@ -2512,6 +2522,11 @@ static inline int security_task_getscheduler (struct task_struct *p) return 0; } +static inline int security_task_movememory (struct task_struct *p) +{ + return 0; +} + static inline int security_task_kill (struct task_struct *p, struct siginfo *info, int sig) { diff --git a/security/dummy.c b/security/dummy.c index 879a98523b1b..c3c5493581e2 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -537,6 +537,11 @@ static int dummy_task_getscheduler (struct task_struct *p) return 0; } +static int dummy_task_movememory (struct task_struct *p) +{ + return 0; +} + static int dummy_task_wait (struct task_struct *p) { return 0; @@ -981,6 +986,7 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, task_setrlimit); set_to_dummy_if_null(ops, task_setscheduler); set_to_dummy_if_null(ops, task_getscheduler); + set_to_dummy_if_null(ops, task_movememory); set_to_dummy_if_null(ops, task_wait); set_to_dummy_if_null(ops, task_kill); set_to_dummy_if_null(ops, task_prctl); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 9dcf298921d4..79c16e31c884 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2679,6 +2679,11 @@ static int selinux_task_getscheduler(struct task_struct *p) return task_has_perm(current, p, PROCESS__GETSCHED); } +static int selinux_task_movememory(struct task_struct *p) +{ + return task_has_perm(current, p, PROCESS__SETSCHED); +} + static int selinux_task_kill(struct task_struct *p, struct siginfo *info, int sig) { u32 perm; @@ -4392,6 +4397,7 @@ static struct security_operations selinux_ops = { .task_setrlimit = selinux_task_setrlimit, .task_setscheduler = selinux_task_setscheduler, .task_getscheduler = selinux_task_getscheduler, + .task_movememory = selinux_task_movememory, .task_kill = selinux_task_kill, .task_wait = selinux_task_wait, .task_prctl = selinux_task_prctl, -- cgit v1.2.3 From c22ce143d15eb288543fe9873e1c5ac1c01b69a1 Mon Sep 17 00:00:00 2001 From: Hiro Yoshioka Date: Fri, 23 Jun 2006 02:04:16 -0700 Subject: [PATCH] x86: cache pollution aware __copy_from_user_ll() Use the x86 cache-bypassing copy instructions for copy_from_user(). 
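In a simplified sketch (the real dispatch added by this patch is __copy_from_user_ll_nocache() in arch/i386/lib/usercopy.c), large copies on SSE2-capable CPUs take a movnti-based path that bypasses the cache, while everything else falls back to the ordinary cached copy:

static unsigned long copy_from_user_nocache_sketch(void *to,
			const void __user *from, unsigned long n)
{
	if (n > 64 && cpu_has_xmm2)
		/* non-temporal stores: do not pollute the cache */
		return __copy_user_zeroing_intel_nocache(to, from, n);

	/* small copies or no SSE2: ordinary cached path */
	return __copy_from_user_ll(to, from, n);
}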
Some performance data are Total of GLOBAL_POWER_EVENTS (CPU cycle samples) 2.6.12.4.orig 1921587 2.6.12.4.nt 1599424 1599424/1921587=83.23% (16.77% reduction) BSQ_CACHE_REFERENCE (L3 cache miss) 2.6.12.4.orig 57427 2.6.12.4.nt 20858 20858/57427=36.32% (63.7% reduction) L3 cache miss reduction of __copy_from_user_ll samples % 37408 65.1412 vmlinux __copy_from_user_ll 23 0.1103 vmlinux __copy_user_zeroing_intel_nocache 23/37408=0.061% (99.94% reduction) Top 5 of 2.6.12.4.nt Counted GLOBAL_POWER_EVENTS events (time during which processor is not stopped) with a unit mask of 0x01 (mandatory) count 100000 samples % app name symbol name 128392 8.0274 vmlinux __copy_user_zeroing_intel_nocache 64206 4.0143 vmlinux journal_add_journal_head 59746 3.7355 vmlinux do_get_write_access 47674 2.9807 vmlinux journal_put_journal_head 46021 2.8774 vmlinux journal_dirty_metadata pattern9-0-cpu4-0-09011728/summary.out Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x3f (multiple flags) count 3000 samples % app name symbol name 69755 4.2861 vmlinux __copy_user_zeroing_intel_nocache 55685 3.4215 vmlinux journal_add_journal_head 52371 3.2179 vmlinux __find_get_block 45504 2.7960 vmlinux journal_put_journal_head 36005 2.2123 vmlinux journal_stop pattern9-0-cpu4-0-09011744/summary.out Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x200 (read 3rd level cache miss) count 3000 samples % app name symbol name 1147 5.4994 vmlinux journal_add_journal_head 881 4.2240 vmlinux journal_dirty_data 872 4.1809 vmlinux blk_rq_map_sg 734 3.5192 vmlinux journal_commit_transaction 617 2.9582 vmlinux radix_tree_delete pattern9-0-cpu4-0-09011731/summary.out iozone results are original 2.6.12.4 CPU time = 207.768 sec cache aware CPU time = 184.783 sec (three times run) 184.783/207.768=88.94% (11.06% reduction) original: pattern9-0-cpu4-0-08191720/iozone.out: CPU Utilization: Wall time 45.997 CPU time 64.527 CPU utilization 140.28 % pattern9-0-cpu4-0-08191741/iozone.out: CPU Utilization: Wall time 46.878 CPU time 71.933 CPU utilization 153.45 % pattern9-0-cpu4-0-08191743/iozone.out: CPU Utilization: Wall time 45.152 CPU time 71.308 CPU utilization 157.93 % cache awre: pattern9-0-cpu4-0-09011728/iozone.out: CPU Utilization: Wall time 44.842 CPU time 62.465 CPU utilization 139.30 % pattern9-0-cpu4-0-09011731/iozone.out: CPU Utilization: Wall time 44.718 CPU time 59.273 CPU utilization 132.55 % pattern9-0-cpu4-0-09011744/iozone.out: CPU Utilization: Wall time 44.367 CPU time 63.045 CPU utilization 142.10 % Signed-off-by: Hiro Yoshioka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/lib/usercopy.c | 137 ++++++++++++++++++++++++++++++++++++++++++--- include/asm-i386/uaccess.h | 33 +++++++++++ include/linux/uaccess.h | 22 ++++++++ mm/filemap.c | 4 +- mm/filemap.h | 6 +- 5 files changed, 189 insertions(+), 13 deletions(-) create mode 100644 include/linux/uaccess.h (limited to 'include/linux') diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c index 4cf981d70f45..6979297ce278 100644 --- a/arch/i386/lib/usercopy.c +++ b/arch/i386/lib/usercopy.c @@ -425,15 +425,121 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) : "eax", "edx", "memory"); return size; } + +/* + * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware. 
+ * hyoshiok@miraclelinux.com + */ + +static unsigned long __copy_user_zeroing_intel_nocache(void *to, + const void __user *from, unsigned long size) +{ + int d0, d1; + + __asm__ __volatile__( + " .align 2,0x90\n" + "0: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" + " jbe 2f\n" + "1: movl 64(%4), %%eax\n" + " .align 2,0x90\n" + "2: movl 0(%4), %%eax\n" + "21: movl 4(%4), %%edx\n" + " movnti %%eax, 0(%3)\n" + " movnti %%edx, 4(%3)\n" + "3: movl 8(%4), %%eax\n" + "31: movl 12(%4),%%edx\n" + " movnti %%eax, 8(%3)\n" + " movnti %%edx, 12(%3)\n" + "4: movl 16(%4), %%eax\n" + "41: movl 20(%4), %%edx\n" + " movnti %%eax, 16(%3)\n" + " movnti %%edx, 20(%3)\n" + "10: movl 24(%4), %%eax\n" + "51: movl 28(%4), %%edx\n" + " movnti %%eax, 24(%3)\n" + " movnti %%edx, 28(%3)\n" + "11: movl 32(%4), %%eax\n" + "61: movl 36(%4), %%edx\n" + " movnti %%eax, 32(%3)\n" + " movnti %%edx, 36(%3)\n" + "12: movl 40(%4), %%eax\n" + "71: movl 44(%4), %%edx\n" + " movnti %%eax, 40(%3)\n" + " movnti %%edx, 44(%3)\n" + "13: movl 48(%4), %%eax\n" + "81: movl 52(%4), %%edx\n" + " movnti %%eax, 48(%3)\n" + " movnti %%edx, 52(%3)\n" + "14: movl 56(%4), %%eax\n" + "91: movl 60(%4), %%edx\n" + " movnti %%eax, 56(%3)\n" + " movnti %%edx, 60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" + " cmpl $63, %0\n" + " ja 0b\n" + " sfence \n" + "5: movl %0, %%eax\n" + " shrl $2, %0\n" + " andl $3, %%eax\n" + " cld\n" + "6: rep; movsl\n" + " movl %%eax,%0\n" + "7: rep; movsb\n" + "8:\n" + ".section .fixup,\"ax\"\n" + "9: lea 0(%%eax,%0,4),%0\n" + "16: pushl %0\n" + " pushl %%eax\n" + " xorl %%eax,%%eax\n" + " rep; stosb\n" + " popl %%eax\n" + " popl %0\n" + " jmp 8b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 0b,16b\n" + " .long 1b,16b\n" + " .long 2b,16b\n" + " .long 21b,16b\n" + " .long 3b,16b\n" + " .long 31b,16b\n" + " .long 4b,16b\n" + " .long 41b,16b\n" + " .long 10b,16b\n" + " .long 51b,16b\n" + " .long 11b,16b\n" + " .long 61b,16b\n" + " .long 12b,16b\n" + " .long 71b,16b\n" + " .long 13b,16b\n" + " .long 81b,16b\n" + " .long 14b,16b\n" + " .long 91b,16b\n" + " .long 6b,9b\n" + " .long 7b,16b\n" + ".previous" + : "=&c"(size), "=&D" (d0), "=&S" (d1) + : "1"(to), "2"(from), "0"(size) + : "eax", "edx", "memory"); + return size; +} + #else + /* * Leave these declared but undefined. They should not be any references to * them */ -unsigned long -__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size); -unsigned long -__copy_user_intel(void __user *to, const void *from, unsigned long size); +unsigned long __copy_user_zeroing_intel(void *to, const void __user *from, + unsigned long size); +unsigned long __copy_user_intel(void __user *to, const void *from, + unsigned long size); +unsigned long __copy_user_zeroing_intel_nocache(void *to, + const void __user *from, unsigned long size); #endif /* CONFIG_X86_INTEL_USERCOPY */ /* Generic arbitrary sized copy. 
*/ @@ -515,8 +621,8 @@ do { \ : "memory"); \ } while (0) - -unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) +unsigned long __copy_to_user_ll(void __user *to, const void *from, + unsigned long n) { BUG_ON((long) n < 0); #ifndef CONFIG_X86_WP_WORKS_OK @@ -576,8 +682,8 @@ survive: } EXPORT_SYMBOL(__copy_to_user_ll); -unsigned long -__copy_from_user_ll(void *to, const void __user *from, unsigned long n) +unsigned long __copy_from_user_ll(void *to, const void __user *from, + unsigned long n) { BUG_ON((long)n < 0); if (movsl_is_ok(to, from, n)) @@ -588,6 +694,21 @@ __copy_from_user_ll(void *to, const void __user *from, unsigned long n) } EXPORT_SYMBOL(__copy_from_user_ll); +unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, + unsigned long n) +{ + BUG_ON((long)n < 0); +#ifdef CONFIG_X86_INTEL_USERCOPY + if ( n > 64 && cpu_has_xmm2) + n = __copy_user_zeroing_intel_nocache(to, from, n); + else + __copy_user_zeroing(to, from, n); +#else + __copy_user_zeroing(to, from, n); +#endif + return n; +} + /** * copy_to_user: - Copy a block of data into user space. * @to: Destination address, in user space. diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h index 1ec65523ea5e..82af28a943ab 100644 --- a/include/asm-i386/uaccess.h +++ b/include/asm-i386/uaccess.h @@ -390,6 +390,8 @@ unsigned long __must_check __copy_to_user_ll(void __user *to, const void *from, unsigned long n); unsigned long __must_check __copy_from_user_ll(void *to, const void __user *from, unsigned long n); +unsigned long __must_check __copy_from_user_ll_nocache(void *to, + const void __user *from, unsigned long n); /* * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault @@ -478,12 +480,43 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) return __copy_from_user_ll(to, from, n); } +#define ARCH_HAS_NOCACHE_UACCESS + +static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to, + const void __user *from, unsigned long n) +{ + if (__builtin_constant_p(n)) { + unsigned long ret; + + switch (n) { + case 1: + __get_user_size(*(u8 *)to, from, 1, ret, 1); + return ret; + case 2: + __get_user_size(*(u16 *)to, from, 2, ret, 2); + return ret; + case 4: + __get_user_size(*(u32 *)to, from, 4, ret, 4); + return ret; + } + } + return __copy_from_user_ll_nocache(to, from, n); +} + static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_sleep(); return __copy_from_user_inatomic(to, from, n); } + +static __always_inline unsigned long +__copy_from_user_nocache(void *to, const void __user *from, unsigned long n) +{ + might_sleep(); + return __copy_from_user_inatomic_nocache(to, from, n); +} + unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n); unsigned long __must_check copy_from_user(void *to, diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h new file mode 100644 index 000000000000..391e7ed1eb3f --- /dev/null +++ b/include/linux/uaccess.h @@ -0,0 +1,22 @@ +#ifndef __LINUX_UACCESS_H__ +#define __LINUX_UACCESS_H__ + +#include + +#ifndef ARCH_HAS_NOCACHE_UACCESS + +static inline unsigned long __copy_from_user_inatomic_nocache(void *to, + const void __user *from, unsigned long n) +{ + return __copy_from_user_inatomic(to, from, n); +} + +static inline unsigned long __copy_from_user_nocache(void *to, + const void __user *from, unsigned long n) +{ + return __copy_from_user(to, from, n); +} + +#endif /* 
ARCH_HAS_NOCACHE_UACCESS */ + +#endif /* __LINUX_UACCESS_H__ */ diff --git a/mm/filemap.c b/mm/filemap.c index 368678c2d531..807a463fd5ed 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,6 @@ */ #include /* for generic_osync_inode */ -#include #include static ssize_t @@ -1902,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr, int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); + left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; diff --git a/mm/filemap.h b/mm/filemap.h index 13793ba0ce17..5683cde22055 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include size_t __filemap_copy_from_user_iovec(char *vaddr, @@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset, int left; kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); kunmap_atomic(kaddr, KM_USER0); if (left != 0) { /* Do it the slow way */ kaddr = kmap(page); - left = __copy_from_user(kaddr + offset, buf, bytes); + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); kunmap(page); } return bytes - left; -- cgit v1.2.3 From 1b61b910e99059abdd54c93aa70e84e076e33d16 Mon Sep 17 00:00:00 2001 From: Zhang Yanmin Date: Fri, 23 Jun 2006 02:04:22 -0700 Subject: [PATCH] x86: kernel irq balance doesn't work On i386, kernel irq balance doesn't work. 1) In function do_irq_balance, after kernel finds the min_loaded cpu but before calling set_pending_irq to really pin the selected_irq to the target cpu, kernel does a cpus_and with irq_affinity[selected_irq]. Later on, when the irq is acked, kernel would calls move_native_irq=>desc->handler->set_affinity to change the irq affinity. However, every function pointed by hw_interrupt_type->set_affinity(unsigned int irq, cpumask_t cpumask) always changes irq_affinity[irq] to cpumask. Next time when recalling do_irq_balance, it has to do cpu_ands again with irq_affinity[selected_irq], but irq_affinity[selected_irq] already becomes one cpu selected by the first irq balance. 2) Function balance_irq in file arch/i386/kernel/io_apic.c has the same issue. [akpm@osdl.org: cleanups] Signed-off-by: Zhang Yanmin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 31 ++++++++++++++++++++----------- include/linux/irq.h | 8 ++++++++ kernel/irq/proc.c | 3 +++ 3 files changed, 31 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index d70f2ade5cde..a62df3e764c5 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -267,7 +267,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include /* kmalloc() */ # include /* time_after() */ -# ifdef CONFIG_BALANCED_IRQ_DEBUG +#ifdef CONFIG_BALANCED_IRQ_DEBUG # define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) # define Dprintk(x...) do { TDprintk(x); } while (0) # else @@ -275,10 +275,15 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # define Dprintk(x...) 
# endif - #define IRQBALANCE_CHECK_ARCH -999 -static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; -static int physical_balance = 0; +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) +#define BALANCED_IRQ_MORE_DELTA (HZ/10) +#define BALANCED_IRQ_LESS_DELTA (HZ) + +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; +static int physical_balance __read_mostly; +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; static struct irq_cpu_info { unsigned long * last_irq; @@ -297,12 +302,14 @@ static struct irq_cpu_info { #define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) +static cpumask_t balance_irq_affinity[NR_IRQS] = { + [0 ... NR_IRQS-1] = CPU_MASK_ALL +}; -static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) +{ + balance_irq_affinity[irq] = mask; +} static unsigned long move(int curr_cpu, cpumask_t allowed_mask, unsigned long now, int direction) @@ -340,7 +347,7 @@ static inline void balance_irq(int cpu, int irq) if (irqbalance_disabled) return; - cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]); + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); new_cpu = move(cpu, allowed_mask, now, 1); if (cpu != new_cpu) { set_pending_irq(irq, cpumask_of_cpu(new_cpu)); @@ -529,7 +536,9 @@ tryanotherirq: } } - cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); + cpus_and(allowed_mask, + cpu_online_map, + balance_irq_affinity[selected_irq]); target_cpu_mask = cpumask_of_cpu(min_loaded); cpus_and(tmp, target_cpu_mask, allowed_mask); diff --git a/include/linux/irq.h b/include/linux/irq.h index 42c9cd562860..e8a07e75e4fb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -164,6 +164,14 @@ static inline void set_irq_info(int irq, cpumask_t mask) #endif // CONFIG_SMP +#ifdef CONFIG_IRQBALANCE +extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask); +#else +static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) +{ +} +#endif + extern int no_irq_affinity; extern int noirqdebug_setup(char *str); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce0..afacd6f585fa 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); + /* * Save these away for later use. Re-progam when the * interrupt is pending @@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) #else void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } -- cgit v1.2.3 From ce4ab0012b32c1a4a1d6e934aeb73bf3151c48d9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jun 2006 02:04:44 -0700 Subject: [PATCH] swsusp: add architecture special saveable pages support 1. Add architecture specific pages save/restore support. Next two patches will use this to save/restore 'ACPI NVS' pages. 2. Allow reserved pages 'nosave'. This could avoid save/restore BIOS reserved pages. Signed-off-by: Shaohua Li Cc: Pavel Machek Cc: "Rafael J. 
Wysocki" Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 1 + kernel/power/power.h | 4 ++ kernel/power/snapshot.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/power/swsusp.c | 18 ++------ 4 files changed, 118 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 96e31aa64cc7..e82cb10fb3ea 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -71,6 +71,7 @@ struct saved_context; void __save_processor_state(struct saved_context *ctxt); void __restore_processor_state(struct saved_context *ctxt); unsigned long get_safe_page(gfp_t gfp_mask); +int swsusp_add_arch_pages(unsigned long start, unsigned long end); /* * XXX: We try to keep some more pages free so that I/O operations succeed diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f21767..c81f0ed3eeba 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -105,6 +105,10 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +extern unsigned int count_special_pages(void); +extern int save_special_mem(void); +extern int restore_special_mem(void); + extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b78..7f511d89c667 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,6 +39,88 @@ static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; static unsigned long *buffer; +struct arch_saveable_page { + unsigned long start; + unsigned long end; + char *data; + struct arch_saveable_page *next; +}; +static struct arch_saveable_page *arch_pages; + +int swsusp_add_arch_pages(unsigned long start, unsigned long end) +{ + struct arch_saveable_page *tmp; + + while (start < end) { + tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + tmp->start = start; + tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT; + if (tmp->end > end) + tmp->end = end; + tmp->next = arch_pages; + start = tmp->end; + arch_pages = tmp; + } + return 0; +} + +static unsigned int count_arch_pages(void) +{ + unsigned int count = 0; + struct arch_saveable_page *tmp = arch_pages; + while (tmp) { + count++; + tmp = tmp->next; + } + return count; +} + +static int save_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = arch_pages; + int offset; + + pr_debug("swsusp: Saving arch specific memory"); + while (tmp) { + tmp->data = (char *)__get_free_page(GFP_ATOMIC); + if (!tmp->data) + return -ENOMEM; + offset = tmp->start - (tmp->start & PAGE_MASK); + /* arch pages might haven't a 'struct page' */ + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(tmp->data + offset, kaddr + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + + tmp = tmp->next; + } + return 0; +} + +static int restore_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = arch_pages; + int offset; + + while (tmp) { + if (!tmp->data) + continue; + offset = tmp->start - (tmp->start & PAGE_MASK); + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(kaddr + offset, tmp->data + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + free_page((long)tmp->data); + tmp->data = NULL; + tmp 
= tmp->next; + } + return 0; +} + #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void) { @@ -150,8 +232,35 @@ int restore_highmem(void) } return 0; } +#else +static unsigned int count_highmem_pages(void) {return 0;} +static int save_highmem(void) {return 0;} +static int restore_highmem(void) {return 0;} #endif +unsigned int count_special_pages(void) +{ + return count_arch_pages() + count_highmem_pages(); +} + +int save_special_mem(void) +{ + int ret; + ret = save_arch_mem(); + if (!ret) + ret = save_highmem(); + return ret; +} + +int restore_special_mem(void) +{ + int ret; + ret = restore_arch_mem(); + if (!ret) + ret = restore_highmem(); + return ret; +} + static int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; @@ -177,7 +286,6 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; if (PageReserved(page) && pfn_is_nosave(pfn)) diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f9238faf76e4..78b6e71b0813 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -62,16 +62,6 @@ unsigned long image_size = 500 * 1024 * 1024; int in_suspend __nosavedata = 0; -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); -int save_highmem(void); -int restore_highmem(void); -#else -static int save_highmem(void) { return 0; } -static int restore_highmem(void) { return 0; } -static unsigned int count_highmem_pages(void) { return 0; } -#endif - /** * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. @@ -192,7 +182,7 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... 
"); do { - size = 2 * count_highmem_pages(); + size = 2 * count_special_pages(); size += size / 50 + count_data_pages(); size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + PAGES_FOR_IO; @@ -234,7 +224,7 @@ int swsusp_suspend(void) goto Enable_irqs; } - if ((error = save_highmem())) { + if ((error = save_special_mem())) { printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); goto Restore_highmem; } @@ -245,7 +235,7 @@ int swsusp_suspend(void) /* Restore control flow magically appears here */ restore_processor_state(); Restore_highmem: - restore_highmem(); + restore_special_mem(); device_power_up(); Enable_irqs: local_irq_enable(); @@ -271,7 +261,7 @@ int swsusp_resume(void) */ swsusp_free(); restore_processor_state(); - restore_highmem(); + restore_special_mem(); touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); -- cgit v1.2.3 From 98317f1271e7fd472983b013c76df6cc15fbef22 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Fri, 23 Jun 2006 02:04:54 -0700 Subject: [PATCH] m68k: Remove some unused definitions in zorro.h These definitions have long been superseded by asm-offsets.h Signed-off-by: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zorro.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zorro.h b/include/linux/zorro.h index 2f135cf6eef1..913bfc226dda 100644 --- a/include/linux/zorro.h +++ b/include/linux/zorro.h @@ -11,8 +11,6 @@ #ifndef _LINUX_ZORRO_H #define _LINUX_ZORRO_H -#ifndef __ASSEMBLY__ - #include @@ -112,45 +110,6 @@ struct ConfigDev { __u32 cd_Unused[4]; /* for whatever the driver wants */ } __attribute__ ((packed)); -#else /* __ASSEMBLY__ */ - -LN_Succ = 0 -LN_Pred = LN_Succ+4 -LN_Type = LN_Pred+4 -LN_Pri = LN_Type+1 -LN_Name = LN_Pri+1 -LN_sizeof = LN_Name+4 - -ER_Type = 0 -ER_Product = ER_Type+1 -ER_Flags = ER_Product+1 -ER_Reserved03 = ER_Flags+1 -ER_Manufacturer = ER_Reserved03+1 -ER_SerialNumber = ER_Manufacturer+2 -ER_InitDiagVec = ER_SerialNumber+4 -ER_Reserved0c = ER_InitDiagVec+2 -ER_Reserved0d = ER_Reserved0c+1 -ER_Reserved0e = ER_Reserved0d+1 -ER_Reserved0f = ER_Reserved0e+1 -ER_sizeof = ER_Reserved0f+1 - -CD_Node = 0 -CD_Flags = CD_Node+LN_sizeof -CD_Pad = CD_Flags+1 -CD_Rom = CD_Pad+1 -CD_BoardAddr = CD_Rom+ER_sizeof -CD_BoardSize = CD_BoardAddr+4 -CD_SlotAddr = CD_BoardSize+4 -CD_SlotSize = CD_SlotAddr+2 -CD_Driver = CD_SlotSize+2 -CD_NextCD = CD_Driver+4 -CD_Unused = CD_NextCD+4 -CD_sizeof = CD_Unused+(4*4) - -#endif /* __ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ - #define ZORRO_NUM_AUTO 16 #ifdef __KERNEL__ @@ -290,7 +249,6 @@ extern DECLARE_BITMAP(zorro_unused_z2ram, 128); #define Z2RAM_CHUNKSHIFT (16) -#endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_ZORRO_H */ -- cgit v1.2.3 From c330dda908b5a46469a997eea90b66f2f9f02b34 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Jun 2006 02:05:07 -0700 Subject: [PATCH] Add a sysfs file to determine if a kexec kernel is loaded Create two files in /sys/kernel, kexec_loaded and kexec_crash_loaded. Each file contains a simple boolean value indicating whether the relevant kernel has been loaded into memory. The motivation for this is geared around support. Signed-off-by: Jeff Moyer Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 1 + kernel/kexec.c | 6 +++--- kernel/ksysfs.c | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index cfb3410e32b1..6427949ddf99 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -106,6 +106,7 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); extern struct kimage *kexec_image; +extern struct kimage *kexec_crash_image; #define KEXEC_ON_CRASH 0x00000001 #define KEXEC_ARCH_MASK 0xffff0000 diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0e..58f0f382597c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; /* * A home grown binary mutex. * Nothing can wait so this mutex is safe to use * in interrupt context :) */ -static int kexec_lock = 0; +static int kexec_lock; asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67b..9e28478a17a5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); +#endif /* CONFIG_KEXEC */ + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -55,6 +70,10 @@ static struct attribute * kernel_attrs[] = { #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, +#endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, #endif NULL }; -- cgit v1.2.3 From 090d2b185d8680fc26a2eaf4245d4171dcf4baf1 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 23 Jun 2006 02:05:08 -0700 Subject: [PATCH] read_mapping_page for address space Add read_mapping_page() which is used for callers that pass mapping->a_ops->readpage as the filler for read_cache_page. This removes some duplication from filesystem code. 
Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/dir.c | 4 +--- fs/afs/mntpt.c | 11 ++--------- fs/cramfs/inode.c | 4 +--- fs/ext2/dir.c | 3 +-- fs/freevxfs/vxfs_subr.c | 3 +-- fs/hfs/bnode.c | 2 +- fs/hfs/btree.c | 2 +- fs/hfsplus/bitmap.c | 15 +++++++-------- fs/hfsplus/bnode.c | 2 +- fs/hfsplus/btree.c | 2 +- fs/jfs/jfs_metapage.c | 5 ++--- fs/minix/dir.c | 3 +-- fs/namei.c | 3 +-- fs/ntfs/aops.h | 3 +-- fs/ntfs/attrib.c | 6 ++---- fs/ntfs/file.c | 3 +-- fs/ocfs2/symlink.c | 3 +-- fs/partitions/check.c | 4 ++-- fs/reiserfs/xattr.c | 3 +-- fs/sysv/dir.c | 3 +-- include/linux/pagemap.h | 7 +++++++ mm/swapfile.c | 3 +-- 22 files changed, 38 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index a6dff6a4f204..2fc99877cb0d 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -185,9 +185,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index) _enter("{%lu},%lu", dir->i_ino, index); - page = read_cache_page(dir->i_mapping,index, - (filler_t *) dir->i_mapping->a_ops->readpage, - NULL); + page = read_mapping_page(dir->i_mapping, index, NULL); if (!IS_ERR(page)) { wait_on_page_locked(page); kmap(page); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 4e6eeb59b83c..b5cf9e1205ad 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -63,7 +63,6 @@ unsigned long afs_mntpt_expiry_timeout = 20; int afs_mntpt_check_symlink(struct afs_vnode *vnode) { struct page *page; - filler_t *filler; size_t size; char *buf; int ret; @@ -71,10 +70,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode) _enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique); /* read the contents of the symlink into the pagecache */ - filler = (filler_t *) AFS_VNODE_TO_I(vnode)->i_mapping->a_ops->readpage; - - page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, - filler, NULL); + page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; @@ -160,7 +156,6 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) struct page *page = NULL; size_t size; char *buf, *devname = NULL, *options = NULL; - filler_t *filler; int ret; kenter("{%s}", mntpt->d_name.name); @@ -182,9 +177,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) goto error; /* read the contents of the AFS special symlink */ - filler = (filler_t *)mntpt->d_inode->i_mapping->a_ops->readpage; - - page = read_cache_page(mntpt->d_inode->i_mapping, 0, filler, NULL); + page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); goto error; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 8a9d5d3b3262..c45d73860803 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -181,9 +181,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i struct page *page = NULL; if (blocknr + i < devsize) { - page = read_cache_page(mapping, blocknr + i, - (filler_t *)mapping->a_ops->readpage, - NULL); + page = read_mapping_page(mapping, blocknr + i, NULL); /* synchronous error? 
*/ if (IS_ERR(page)) page = NULL; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index d672aa9f4061..3c1c9aaaca6b 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -159,8 +159,7 @@ fail: static struct page * ext2_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_cache_page(mapping, n, - (filler_t*)mapping->a_ops->readpage, NULL); + struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { wait_on_page_locked(page); kmap(page); diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index 50aae77651b2..c1be118fc067 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -71,8 +71,7 @@ vxfs_get_page(struct address_space *mapping, u_long n) { struct page * pp; - pp = read_cache_page(mapping, n, - (filler_t*)mapping->a_ops->readpage, NULL); + pp = read_mapping_page(mapping, n, NULL); if (!IS_ERR(pp)) { wait_on_page_locked(pp); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 1e44dcfe49c4..13231dd5ce66 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -280,7 +280,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) block = off >> PAGE_CACHE_SHIFT; node->page_offset = off & ~PAGE_CACHE_MASK; for (i = 0; i < tree->pages_per_bnode; i++) { - page = read_cache_page(mapping, block++, (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, block++, NULL); if (IS_ERR(page)) goto fail; if (PageError(page)) { diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index d20131ce4b95..400357994319 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -59,7 +59,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke unlock_new_inode(tree->inode); mapping = tree->inode->i_mapping; - page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) goto free_tree; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index 9fb51632303c..d128a25b74d2 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -31,8 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; - page = read_cache_page(mapping, offset / PAGE_CACHE_BITS, - (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); pptr = kmap(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; @@ -72,8 +71,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma offset += PAGE_CACHE_BITS; if (offset >= size) break; - page = read_cache_page(mapping, offset / PAGE_CACHE_BITS, - (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); curr = pptr = kmap(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; @@ -119,8 +118,8 @@ found: set_page_dirty(page); kunmap(page); offset += PAGE_CACHE_BITS; - page = read_cache_page(mapping, offset / PAGE_CACHE_BITS, - (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, + NULL); pptr = kmap(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; @@ -167,7 +166,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = 
HFSPLUS_SB(sb).alloc_file->i_mapping; pnr = offset / PAGE_CACHE_BITS; - page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, pnr, NULL); pptr = kmap(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; end = pptr + PAGE_CACHE_BITS / 32; @@ -199,7 +198,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) break; set_page_dirty(page); kunmap(page); - page = read_cache_page(mapping, ++pnr, (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, ++pnr, NULL); pptr = kmap(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 746abc9ecf70..77bf434da679 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -440,7 +440,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) block = off >> PAGE_CACHE_SHIFT; node->page_offset = off & ~PAGE_CACHE_MASK; for (i = 0; i < tree->pages_per_bnode; block++, i++) { - page = read_cache_page(mapping, block, (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, block, NULL); if (IS_ERR(page)) goto fail; if (PageError(page)) { diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index effa8991999c..cfc852fdd1b5 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -38,7 +38,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) goto free_tree; mapping = tree->inode->i_mapping; - page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) goto free_tree; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 2b220dd6b4e7..7f6e88039700 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -632,10 +632,9 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, } SetPageUptodate(page); } else { - page = read_cache_page(mapping, page_index, - (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, page_index, NULL); if (IS_ERR(page) || !PageUptodate(page)) { - jfs_err("read_cache_page failed!"); + jfs_err("read_mapping_page failed!"); return NULL; } lock_page(page); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 69224d1fe043..2b0a389d1987 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -60,8 +60,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) static struct page * dir_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_cache_page(mapping, n, - (filler_t*)mapping->a_ops->readpage, NULL); + struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { wait_on_page_locked(page); kmap(page); diff --git a/fs/namei.c b/fs/namei.c index 184fe4acf824..bb4a3e40e432 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2577,8 +2577,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) { struct page * page; struct address_space *mapping = dentry->d_inode->i_mapping; - page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, - NULL); + page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) goto sync_fail; wait_on_page_locked(page); diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h index 3b74e66ca2ff..325ce261a107 100644 --- a/fs/ntfs/aops.h +++ b/fs/ntfs/aops.h @@ -86,8 +86,7 @@ static inline void ntfs_unmap_page(struct page *page) static inline struct page *ntfs_map_page(struct address_space *mapping, unsigned long index) 
{ - struct page *page = read_cache_page(mapping, index, - (filler_t*)mapping->a_ops->readpage, NULL); + struct page *page = read_mapping_page(mapping, index, NULL); if (!IS_ERR(page)) { wait_on_page_locked(page); diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 1663f5c3c6aa..6708e1d68a9e 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -2529,8 +2529,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) end >>= PAGE_CACHE_SHIFT; /* If there is a first partial page, need to do it the slow way. */ if (start_ofs) { - page = read_cache_page(mapping, idx, - (filler_t*)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, idx, NULL); if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to read first partial " "page (sync error, index 0x%lx).", idx); @@ -2600,8 +2599,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) } /* If there is a last partial page, need to do it the slow way. */ if (end_ofs) { - page = read_cache_page(mapping, idx, - (filler_t*)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, idx, NULL); if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to read last partial page " "(sync error, index 0x%lx).", idx); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 36e1e136bb0c..88292f9e4b9b 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -231,8 +231,7 @@ do_non_resident_extend: * Read the page. If the page is not present, this will zero * the uninitialized regions for us. */ - page = read_cache_page(mapping, index, - (filler_t*)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, index, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto init_err_out; diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index f6986bd79e75..0c8a1294ec96 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -64,8 +64,7 @@ static char *ocfs2_page_getlink(struct dentry * dentry, { struct page * page; struct address_space *mapping = dentry->d_inode->i_mapping; - page = read_cache_page(mapping, 0, - (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) goto sync_fail; wait_on_page_locked(page); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 8851b81e7c5a..cd885b23cb5c 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -499,8 +499,8 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) struct address_space *mapping = bdev->bd_inode->i_mapping; struct page *page; - page = read_cache_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), - (filler_t *)mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + NULL); if (!IS_ERR(page)) { wait_on_page_locked(page); if (!PageUptodate(page)) diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index ffb79c48c5bf..39fedaa88a0c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -452,8 +452,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n) /* We can deadlock if we try to free dentries, and an unlink/rmdir has just occured - GFP_NOFS avoids this */ mapping_set_gfp_mask(mapping, GFP_NOFS); - page = read_cache_page(mapping, n, - (filler_t *) mapping->a_ops->readpage, NULL); + page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { wait_on_page_locked(page); kmap(page); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index d7074341ee87..f2bef962d309 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -53,8 +53,7 @@ static int 
dir_commit_chunk(struct page *page, unsigned from, unsigned to) static struct page * dir_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_cache_page(mapping, n, - (filler_t*)mapping->a_ops->readpage, NULL); + struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { wait_on_page_locked(page); kmap(page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7a1af574dedf..1245df7141aa 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -99,6 +99,13 @@ extern struct page * read_cache_page(struct address_space *mapping, extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); +static inline struct page *read_mapping_page(struct address_space *mapping, + unsigned long index, void *data) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + return read_cache_page(mapping, index, filler, data); +} + int add_to_page_cache(struct page *page, struct address_space *mapping, unsigned long index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, diff --git a/mm/swapfile.c b/mm/swapfile.c index f2824c3c31b4..cc367f7e75d8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1477,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = -EINVAL; goto bad_swap; } - page = read_cache_page(mapping, 0, - (filler_t *)mapping->a_ops->readpage, swap_file); + page = read_mapping_page(mapping, 0, swap_file); if (IS_ERR(page)) { error = PTR_ERR(page); goto bad_swap; -- cgit v1.2.3 From 75e1fcc0b18df0a65ab113198e9dc0e98999a08c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 23 Jun 2006 02:05:12 -0700 Subject: [PATCH] vfs: add lock owner argument to flush operation Pass the POSIX lock owner ID to the flush operation. This is useful for filesystems which don't want to store any locking state in inode->i_flock but want to handle locking/unlocking POSIX locks internally. FUSE is one such filesystem but I think it possible that some network filesystems would need this also. Also add a flag to indicate that a POSIX locking request was generated by close(), so filesystems using the above feature won't send an extra locking request in this case. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 3 +-- drivers/input/evdev.c | 2 +- drivers/scsi/osst.c | 2 +- drivers/scsi/st.c | 2 +- fs/cifs/cifsfs.h | 2 +- fs/cifs/file.c | 2 +- fs/coda/file.c | 2 +- fs/fuse/file.c | 2 +- fs/locks.c | 2 +- fs/nfs/file.c | 4 ++-- fs/open.c | 2 +- include/linux/coda_linux.h | 2 +- include/linux/fs.h | 3 ++- ipc/mqueue.c | 2 +- 14 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 2359e2809f50..6d7bc8ff7b3a 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -532,7 +532,6 @@ static ctl_table pfm_sysctl_root[] = { static struct ctl_table_header *pfm_sysctl_header; static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); -static int pfm_flush(struct file *filp); #define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) #define pfm_get_cpu_data(a,b) per_cpu(a, b) @@ -1774,7 +1773,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) * When caller is self-monitoring, the context is unloaded. 
*/ static int -pfm_flush(struct file *filp) +pfm_flush(struct file *filp, fl_owner_t id) { pfm_context_t *ctx; struct task_struct *task; diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index ba325f16d077..5f561fce32d8 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -82,7 +82,7 @@ static int evdev_fasync(int fd, struct file *file, int on) return retval < 0 ? retval : 0; } -static int evdev_flush(struct file * file) +static int evdev_flush(struct file * file, fl_owner_t id) { struct evdev_list *list = file->private_data; if (!list->evdev->exist) return -ENODEV; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index ce0ba3a174f9..4a2fed350d4e 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -4724,7 +4724,7 @@ err_out: /* Flush the tape buffer before close */ -static int os_scsi_tape_flush(struct file * filp) +static int os_scsi_tape_flush(struct file * filp, fl_owner_t id) { int result = 0, result2; struct osst_tape * STp = filp->private_data; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index ad87d73f88ee..1272dd249af3 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -1193,7 +1193,7 @@ static int st_open(struct inode *inode, struct file *filp) /* Flush the tape buffer before close */ -static int st_flush(struct file *filp) +static int st_flush(struct file *filp, fl_owner_t id) { int result = 0, result2; unsigned char cmd[MAX_COMMAND_SIZE]; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c98755dca868..d56c0577c710 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -74,7 +74,7 @@ extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t * poffset); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, struct dentry *, int); -extern int cifs_flush(struct file *); +extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 487ea8b3baaa..b4a18c1cab0a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1417,7 +1417,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) * As file closes, flush all cached write data for this inode checking * for write behind errors. 
*/ -int cifs_flush(struct file *file) +int cifs_flush(struct file *file, fl_owner_t id) { struct inode * inode = file->f_dentry->d_inode; int rc = 0; diff --git a/fs/coda/file.c b/fs/coda/file.c index 7c2642431fa5..cc66c681bd11 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -164,7 +164,7 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) return 0; } -int coda_flush(struct file *coda_file) +int coda_flush(struct file *coda_file, fl_owner_t id) { unsigned short flags = coda_file->f_flags & ~O_EXCL; unsigned short coda_flags = coda_flags_to_cflags(flags); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fc342cf7c2cc..087f3b734f40 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -169,7 +169,7 @@ static int fuse_release(struct inode *inode, struct file *file) return fuse_release_common(inode, file, 0); } -static int fuse_flush(struct file *file) +static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); diff --git a/fs/locks.c b/fs/locks.c index e588e1c265f7..f8a634ac1121 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1907,7 +1907,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) return; lock.fl_type = F_UNLCK; - lock.fl_flags = FL_POSIX; + lock.fl_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; lock.fl_owner = owner; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index fade02c15e6e..fa05c027ea11 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -43,7 +43,7 @@ static int nfs_file_mmap(struct file *, struct vm_area_struct *); static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *); static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t); static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t); -static int nfs_file_flush(struct file *); +static int nfs_file_flush(struct file *, fl_owner_t id); static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); @@ -188,7 +188,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) * */ static int -nfs_file_flush(struct file *file) +nfs_file_flush(struct file *file, fl_owner_t id) { struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; struct inode *inode = file->f_dentry->d_inode; diff --git a/fs/open.c b/fs/open.c index a37ff861108f..5fb16e5267dc 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1152,7 +1152,7 @@ int filp_close(struct file *filp, fl_owner_t id) } if (filp->f_op && filp->f_op->flush) - retval = filp->f_op->flush(filp); + retval = filp->f_op->flush(filp, id); dnotify_flush(filp, id); locks_remove_posix(filp, id); diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index b3ecf8f71d97..7b5c5df5cb69 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -36,7 +36,7 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); -int coda_flush(struct file *f); +int coda_flush(struct file *f, fl_owner_t id); int coda_release(struct inode *i, struct file *f); int coda_permission(struct inode *inode, int mask, struct nameidata *nd); int coda_revalidate_inode(struct dentry *); diff --git a/include/linux/fs.h b/include/linux/fs.h index e917403f4d58..56d8bf0d0a77 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ 
-683,6 +683,7 @@ extern spinlock_t files_lock; #define FL_FLOCK 2 #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_LEASE 32 /* lease held on this file */ +#define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ /* @@ -1025,7 +1026,7 @@ struct file_operations { long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); - int (*flush) (struct file *); + int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, struct dentry *, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 0a2a24b6ebe4..02e6f6798972 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -359,7 +359,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, return count; } -static int mqueue_flush_file(struct file *filp) +static int mqueue_flush_file(struct file *filp, fl_owner_t id) { struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); -- cgit v1.2.3 From b0904e147f7cbe4be3b4dae49ddccd627bb66f16 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 23 Jun 2006 02:05:13 -0700 Subject: [PATCH] fs/locks.c: make posix_locks_deadlock() static We can now make posix_locks_deadlock() static. Signed-off-by: Adrian Bunk Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 4 +--- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index f8a634ac1121..1ad29c9b6252 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -703,7 +703,7 @@ EXPORT_SYMBOL(posix_test_lock); * from a broken NFS client. But broken NFS clients have a lot more to * worry about than proper deadlock detection anyway... --okir */ -int posix_locks_deadlock(struct file_lock *caller_fl, +static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { struct list_head *tmp; @@ -722,8 +722,6 @@ next_task: return 0; } -EXPORT_SYMBOL(posix_locks_deadlock); - /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks * at the head of the list, but that's secret knowledge known only to * flock_lock_file and posix_lock_file. diff --git a/include/linux/fs.h b/include/linux/fs.h index 56d8bf0d0a77..dba4cbd157ee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -776,7 +776,6 @@ extern int posix_lock_file_conf(struct file *, struct file_lock *, struct file_l extern int posix_lock_file(struct file *, struct file_lock *); extern int posix_lock_file_wait(struct file *, struct file_lock *); extern int posix_unblock_lock(struct file *, struct file_lock *); -extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); extern int __break_lease(struct inode *inode, unsigned int flags); extern void lease_get_mtime(struct inode *, struct timespec *time); -- cgit v1.2.3 From 8d27e9084b372441dc8c9cf361a965ee58032767 Mon Sep 17 00:00:00 2001 From: Xose Vazquez Perez Date: Fri, 23 Jun 2006 02:05:13 -0700 Subject: [PATCH] module.h: updated comments with a new license "Dual MIT/GPL" is also accepted (kernel/module.c), so updated comments. 
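For illustration, a module opting for the newly documented string would declare
it as below (a minimal sketch, not part of this patch; the MODULE_LICENSE()
macro itself is unchanged):

	#include <linux/module.h>

	MODULE_LICENSE("Dual MIT/GPL");	/* recognised by kernel/module.c as
					 * GPL-compatible, so loading such a
					 * module does not taint the kernel */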
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/module.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index c2d89e037af0..2d366098eab5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -105,6 +105,8 @@ extern struct module __this_module; * "GPL and additional rights" [GNU Public License v2 rights and more] * "Dual BSD/GPL" [GNU Public License v2 * or BSD license choice] + * "Dual MIT/GPL" [GNU Public License v2 + * or MIT license choice] * "Dual MPL/GPL" [GNU Public License v2 * or Mozilla license choice] * -- cgit v1.2.3 From 260ea1013283d8acbb451459ed1ca560c1445c20 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 23 Jun 2006 02:05:18 -0700 Subject: [PATCH] ptrace: document the locking rules After a lot of reading the code and thinking about how it behaves I have managed to figure out what the current ptrace locking rules are. The current code is in much better that it appears at first glance. The troublesome code paths are actually the code paths that violate the current rules. ptrace uses simple exclusive access as it's locking. You can only touch task->ptrace if the task is stopped and you are the ptracer, or if the task is running and are the task itself. Very simple, very easy to maintain. It just needs to be documented so people know not to touch ptrace from elsewhere. Currently we do have a few pieces of code that are in violation of this rule. Particularly the core dump code, and ptrace_attach. But so far the code looks fixable. Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 4 ++++ include/linux/sched.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 0d36750fc0f1..ee918bc6e18c 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -51,6 +51,10 @@ #ifdef __KERNEL__ /* * Ptrace flags + * + * The owner ship rules for task->ptrace which holds the ptrace + * flags is simple. When a task is running it owns it's task->ptrace + * flags. When the a task is stopped the ptracer owns task->ptrace. */ #define PT_PTRACED 0x00000001 diff --git a/include/linux/sched.h b/include/linux/sched.h index 267f15257040..a9d23c7d1b25 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1225,7 +1225,7 @@ static inline int thread_group_empty(task_t *p) (thread_group_leader(p) && !thread_group_empty(p)) /* - * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset. * -- cgit v1.2.3 From 0216bfcffe424a5473daa4da47440881b36c1f41 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 23 Jun 2006 02:05:41 -0700 Subject: [PATCH] percpu counter data type changes to suppport more than 2**31 ext3 free blocks counter The percpu counter data type are changed in this set of patches to support more users like ext3 who need more than 32 bit to store the free blocks total in the filesystem. - Generic perpcu counters data type changes. The size of the global counter and local counter were explictly specified using s64 and s32. 
The global counter is changed from long to s64, while the local counter is changed from long to s32, so we could avoid doing 64 bit update in most cases. - Users of the percpu counters are updated to make use of the new percpu_counter_init() routine now taking an additional parameter to allow users to pass the initial value of the global counter. Signed-off-by: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 25 +++++++++++++------------ fs/ext3/super.c | 36 +++++++++++++++++++----------------- fs/file_table.c | 2 +- include/linux/percpu_counter.h | 38 ++++++++++++++++++++------------------ lib/percpu_counter.c | 10 +++++----- 5 files changed, 58 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index a6c4d6e02324..ee4ba759581e 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -834,9 +834,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) printk ("EXT2-fs: not enough memory\n"); goto failed_mount; } - percpu_counter_init(&sbi->s_freeblocks_counter); - percpu_counter_init(&sbi->s_freeinodes_counter); - percpu_counter_init(&sbi->s_dirs_counter); bgl_lock_init(&sbi->s_blockgroup_lock); sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts), GFP_KERNEL); @@ -863,6 +860,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + + percpu_counter_init(&sbi->s_freeblocks_counter, + ext2_count_free_blocks(sb)); + percpu_counter_init(&sbi->s_freeinodes_counter, + ext2_count_free_inodes(sb)); + percpu_counter_init(&sbi->s_dirs_counter, + ext2_count_dirs(sb)); /* * set up enough so that it can read an inode */ @@ -874,24 +878,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) { iput(root); printk(KERN_ERR "EXT2-fs: get root inode failed\n"); - goto failed_mount2; + goto failed_mount3; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { dput(sb->s_root); sb->s_root = NULL; printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n"); - goto failed_mount2; + goto failed_mount3; } if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) ext2_warning(sb, __FUNCTION__, "mounting ext3 filesystem as ext2"); ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); - percpu_counter_mod(&sbi->s_freeblocks_counter, - ext2_count_free_blocks(sb)); - percpu_counter_mod(&sbi->s_freeinodes_counter, - ext2_count_free_inodes(sb)); - percpu_counter_mod(&sbi->s_dirs_counter, - ext2_count_dirs(sb)); return 0; cantfind_ext2: @@ -899,7 +897,10 @@ cantfind_ext2: printk("VFS: Can't find an ext2 filesystem on dev %s.\n", sb->s_id); goto failed_mount; - +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 1a198b3985c9..a60cc6ec130f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1580,9 +1580,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - percpu_counter_init(&sbi->s_freeblocks_counter); - percpu_counter_init(&sbi->s_freeinodes_counter); - percpu_counter_init(&sbi->s_dirs_counter); bgl_lock_init(&sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -1602,6 
+1599,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + + percpu_counter_init(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + percpu_counter_init(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + percpu_counter_init(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + /* per fileystem reservation list head & lock */ spin_lock_init(&sbi->s_rsv_window_lock); sbi->s_rsv_window_root = RB_ROOT; @@ -1640,16 +1645,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (!test_opt(sb, NOLOAD) && EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext3_load_journal(sb, es, journal_devnum)) - goto failed_mount2; + goto failed_mount3; } else if (journal_inum) { if (ext3_create_journal(sb, es, journal_inum)) - goto failed_mount2; + goto failed_mount3; } else { if (!silent) printk (KERN_ERR "ext3: No journal on filesystem on %s\n", sb->s_id); - goto failed_mount2; + goto failed_mount3; } /* We have now updated the journal if required, so we can @@ -1672,7 +1677,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { printk(KERN_ERR "EXT3-fs: Journal does not support " "requested data journaling mode\n"); - goto failed_mount3; + goto failed_mount4; } default: break; @@ -1695,13 +1700,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (!sb->s_root) { printk(KERN_ERR "EXT3-fs: get root inode failed\n"); iput(root); - goto failed_mount3; + goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { dput(sb->s_root); sb->s_root = NULL; printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n"); - goto failed_mount3; + goto failed_mount4; } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); @@ -1724,13 +1729,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": "writeback"); - percpu_counter_mod(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); - percpu_counter_mod(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); - percpu_counter_mod(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); - lock_kernel(); return 0; @@ -1740,8 +1738,12 @@ cantfind_ext3: sb->s_id); goto failed_mount; -failed_mount3: +failed_mount4: journal_destroy(sbi->s_journal); +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); diff --git a/fs/file_table.c b/fs/file_table.c index bcea1998b4de..506d5307108d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -300,5 +300,5 @@ void __init files_init(unsigned long mempages) if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); - percpu_counter_init(&nr_files); + percpu_counter_init(&nr_files, 0); } diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 66b5de404f22..f5aa593ccf32 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -10,13 +10,14 @@ #include #include #include +#include #ifdef CONFIG_SMP struct percpu_counter { spinlock_t lock; - long count; - long *counters; + s64 count; + s32 *counters; }; #if NR_CPUS >= 16 @@ -25,11 +26,11 @@ struct percpu_counter { #define FBC_BATCH (NR_CPUS*4) #endif -static inline void percpu_counter_init(struct percpu_counter *fbc) +static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount) { spin_lock_init(&fbc->lock); - fbc->count = 0; - fbc->counters = alloc_percpu(long); + fbc->count = amount; + fbc->counters = alloc_percpu(s32); } static inline void percpu_counter_destroy(struct percpu_counter *fbc) @@ -37,10 +38,10 @@ static inline void percpu_counter_destroy(struct percpu_counter *fbc) free_percpu(fbc->counters); } -void percpu_counter_mod(struct percpu_counter *fbc, long amount); -long percpu_counter_sum(struct percpu_counter *fbc); +void percpu_counter_mod(struct percpu_counter *fbc, s32 amount); +s64 percpu_counter_sum(struct percpu_counter *fbc); -static inline long percpu_counter_read(struct percpu_counter *fbc) +static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } @@ -48,13 +49,14 @@ static inline long percpu_counter_read(struct percpu_counter *fbc) /* * It is possible for the percpu_counter_read() to return a small negative * number for some counter which should never be negative. 
+ * */ -static inline long percpu_counter_read_positive(struct percpu_counter *fbc) +static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { - long ret = fbc->count; + s64 ret = fbc->count; barrier(); /* Prevent reloads of fbc->count */ - if (ret > 0) + if (ret >= 0) return ret; return 1; } @@ -62,12 +64,12 @@ static inline long percpu_counter_read_positive(struct percpu_counter *fbc) #else struct percpu_counter { - long count; + s64 count; }; -static inline void percpu_counter_init(struct percpu_counter *fbc) +static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount) { - fbc->count = 0; + fbc->count = amount; } static inline void percpu_counter_destroy(struct percpu_counter *fbc) @@ -75,24 +77,24 @@ static inline void percpu_counter_destroy(struct percpu_counter *fbc) } static inline void -percpu_counter_mod(struct percpu_counter *fbc, long amount) +percpu_counter_mod(struct percpu_counter *fbc, s32 amount) { preempt_disable(); fbc->count += amount; preempt_enable(); } -static inline long percpu_counter_read(struct percpu_counter *fbc) +static inline s64 percpu_counter_read(struct percpu_counter *fbc) { return fbc->count; } -static inline long percpu_counter_read_positive(struct percpu_counter *fbc) +static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc) { return fbc->count; } -static inline long percpu_counter_sum(struct percpu_counter *fbc) +static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { return percpu_counter_read_positive(fbc); } diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 7a87003f8e8f..850449080e1c 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -5,10 +5,10 @@ #include #include -void percpu_counter_mod(struct percpu_counter *fbc, long amount) +void percpu_counter_mod(struct percpu_counter *fbc, s32 amount) { long count; - long *pcount; + s32 *pcount; int cpu = get_cpu(); pcount = per_cpu_ptr(fbc->counters, cpu); @@ -29,15 +29,15 @@ EXPORT_SYMBOL(percpu_counter_mod); * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ -long percpu_counter_sum(struct percpu_counter *fbc) +s64 percpu_counter_sum(struct percpu_counter *fbc) { - long ret; + s64 ret; int cpu; spin_lock(&fbc->lock); ret = fbc->count; for_each_possible_cpu(cpu) { - long *pcount = per_cpu_ptr(fbc->counters, cpu); + s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } spin_unlock(&fbc->lock); -- cgit v1.2.3 From 368a5fa1f28589e6b54588a139ea872d5b4b1914 Mon Sep 17 00:00:00 2001 From: Hua Zhong Date: Fri, 23 Jun 2006 02:05:42 -0700 Subject: [PATCH] remove unlikely() in might_sleep_if() The likely() profiling tools show that __alloc_page() causes a lot of misses: ! 132 119193 __alloc_pages():mm/page_alloc.c@937 Because most __alloc_page() calls are not atomic. 
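For illustration, the call site behind that profile hit looks roughly like this
(a paraphrased sketch of __alloc_pages(), not an exact quote of mm/page_alloc.c):

	const gfp_t wait = gfp_mask & __GFP_WAIT;

	might_sleep_if(wait);	/* wait is set for the vast majority of
				 * allocations, so the unlikely() hint inside
				 * might_sleep_if() marked the common case as
				 * the cold path, which is what the likely()
				 * profiling numbers above show */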
Signed-off-by: Hua Zhong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 25fccd859fbf..8c21aaa248b4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -78,7 +78,7 @@ extern int cond_resched(void); # define might_sleep() do { might_resched(); } while (0) #endif -#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0) +#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) #define abs(x) ({ \ int __x = (x); \ -- cgit v1.2.3 From 1d31a4ea8cf7a2afc7299d1d3d8732ca54a5934e Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Fri, 23 Jun 2006 02:05:42 -0700 Subject: [PATCH] Process Events - Header Cleanup Move connector header include to precisely where it's needed. Remove unused time.h header file as well. This was leftover from previous iterations of the process events patches. Signed-off-by: Matt Helsley Cc: Guillaume Thouvenin Cc: Nguyen Anh Quynh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/connector/cn_proc.c | 1 + include/linux/cn_proc.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 4b4d7db1ff7b..498aa37bca22 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index 1417de935057..1e3459a14e20 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -26,8 +26,6 @@ #define CN_PROC_H #include -#include -#include /* * Userspace sends this enum to register with the kernel that it is listening -- cgit v1.2.3 From 3fa2164d03fb7af17fcfe4f8800dd754fbd99ff7 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Fri, 23 Jun 2006 02:05:44 -0700 Subject: [PATCH] Process Events: License Change Change the license on the process event structure passed between kernel and userspace. Signed-off-by: Matt Helsley Acked-by: Guillaume Thouvenin Acked-by: Nguyen Anh Quynh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cn_proc.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index 1e3459a14e20..dbb7769009be 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -3,23 +3,16 @@ * * Copyright (C) Matt Helsley, IBM Corp. 2005 * Based on cn_fork.h by Nguyen Anh Quynh and Guillaume Thouvenin - * Original copyright notice follows: * Copyright (C) 2005 Nguyen Anh Quynh * Copyright (C) 2005 Guillaume Thouvenin * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #ifndef CN_PROC_H -- cgit v1.2.3 From 481fad483487ea967fe20bbc9e565d787f7bf20f Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 23 Jun 2006 02:05:44 -0700 Subject: [PATCH] strstrip() API Add a new strstrip() function to lib/string.c for removing leading and trailing whitespace from a string. Cc: Michael Holzheu Acked-by: Ingo Oeser Acked-by: Joern Engel Cc: Corey Minyard Signed-off-by: Pekka Enberg Acked-by: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + lib/string.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index c61306da8c52..e4c755860316 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -56,6 +56,7 @@ extern char * strnchr(const char *, size_t, int); #ifndef __HAVE_ARCH_STRRCHR extern char * strrchr(const char *,int); #endif +extern char * strstrip(char *); #ifndef __HAVE_ARCH_STRSTR extern char * strstr(const char *,const char *); #endif diff --git a/lib/string.c b/lib/string.c index 064f6315b1c3..63077267367e 100644 --- a/lib/string.c +++ b/lib/string.c @@ -301,6 +301,36 @@ char *strnchr(const char *s, size_t count, int c) EXPORT_SYMBOL(strnchr); #endif +/** + * strstrip - Removes leading and trailing whitespace from @s. + * @s: The string to be stripped. + * + * Note that the first trailing whitespace is replaced with a %NUL-terminator + * in the given string @s. Returns a pointer to the first non-whitespace + * character in @s. + */ +char *strstrip(char *s) +{ + size_t size; + char *end; + + size = strlen(s); + + if (!size) + return s; + + end = s + size - 1; + while (end != s && isspace(*end)) + end--; + *(end + 1) = '\0'; + + while (*s && isspace(*s)) + s++; + + return s; +} +EXPORT_SYMBOL(strstrip); + #ifndef __HAVE_ARCH_STRLEN /** * strlen - Find the length of a string -- cgit v1.2.3 From d83015b8f62ee3fcd338f6f009051ed57f77a531 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jun 2006 02:05:51 -0700 Subject: [PATCH] Make RCU API inaccessible to non-GPL Linux kernel modules Remove synchronize_kernel() (deprecated 2-APR-2005 in http://lkml.org/lkml/2005/4/3/11) and makes the RCU API inaccessible to non-GPL Linux kernel modules (as was announced more than one year ago in http://lkml.org/lkml/2005/4/3/8). Tested on x86 and ppc64. Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/whatisRCU.txt | 1 - Documentation/feature-removal-schedule.txt | 15 --------------- include/linux/rcupdate.h | 3 +-- kernel/rcupdate.c | 13 ++----------- 4 files changed, 3 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 07cb93b82ba9..6e459420ee9f 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -790,7 +790,6 @@ RCU pointer update: RCU grace period: - synchronize_kernel (deprecated) synchronize_net synchronize_sched synchronize_rcu diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f7293297f326..027285d0c26c 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -33,21 +33,6 @@ Who: Adrian Bunk --------------------------- -What: RCU API moves to EXPORT_SYMBOL_GPL -When: April 2006 -Files: include/linux/rcupdate.h, kernel/rcupdate.c -Why: Outside of Linux, the only implementations of anything even - vaguely resembling RCU that I am aware of are in DYNIX/ptx, - VM/XA, Tornado, and K42. I do not expect anyone to port binary - drivers or kernel modules from any of these, since the first two - are owned by IBM and the last two are open-source research OSes. - So these will move to GPL after a grace period to allow - people, who might be using implementations that I am not aware - of, to adjust to this upcoming change. -Who: Paul E. McKenney - ---------------------------- - What: raw1394: requests of type RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN When: November 2006 Why: Deprecated in favour of the new ioctl-based rawiso interface, which is diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 970284f571a6..6312758393b6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -246,7 +246,7 @@ extern int rcu_needs_cpu(int cpu); * softirq handlers will have completed, since in some kernels, these * handlers can run in process context, and can block. * - * This primitive provides the guarantees made by the (deprecated) + * This primitive provides the guarantees made by the (now removed) * synchronize_kernel() API. In contrast, synchronize_rcu() only * guarantees that rcu_read_lock() sections will have completed. * In "classic RCU", these two guarantees happen to be one and @@ -264,7 +264,6 @@ extern void FASTCALL(call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))); extern void FASTCALL(call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head))); -extern __deprecated_for_modules void synchronize_kernel(void); extern void synchronize_rcu(void); void synchronize_idle(void); extern void rcu_barrier(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bbb..20e9710fc21c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -612,14 +612,6 @@ void synchronize_rcu(void) wait_for_completion(&rcu.completion); } -/* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. - */ -void synchronize_kernel(void) -{ - synchronize_rcu(); -} - module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); @@ -627,7 +619,6 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. 
*/ -EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ -- cgit v1.2.3 From f5befceb5cfecba49fdf61f8e0eb4d453200eac9 Mon Sep 17 00:00:00 2001 From: Brent Casavant Date: Fri, 23 Jun 2006 02:05:52 -0700 Subject: [PATCH] SGI IOC4: Detect IO card variant There are three different IO cards which an SGI IOC4 controller may find itself on. One of these variants does not bring out the IDE and serial signals, so we need to disable attaching the corresponding IOC4 subdrivers to such cards. Cleans up message clutter emitted during device probing. Signed-off-by: Brent Casavant Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/ide/pci/sgiioc4.c | 6 +++++ drivers/serial/ioc4_serial.c | 9 +++++++ drivers/sn/ioc4.c | 64 ++++++++++++++++++++++++++++++++++++++++---- include/linux/ioc4.h | 5 ++++ 4 files changed, 79 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c index 27c9eb989a9a..e125032bb403 100644 --- a/drivers/ide/pci/sgiioc4.c +++ b/drivers/ide/pci/sgiioc4.c @@ -723,6 +723,12 @@ static ide_pci_device_t sgiioc4_chipsets[] __devinitdata = { int ioc4_ide_attach_one(struct ioc4_driver_data *idd) { + /* PCI-RT does not bring out IDE connection. + * Do not attach to this particular IOC4. + */ + if (idd->idd_variant == IOC4_VARIANT_PCI_RT) + return 0; + return pci_init_sgiioc4(idd->idd_pdev, &sgiioc4_chipsets[idd->idd_pci_id->driver_data]); } diff --git a/drivers/serial/ioc4_serial.c b/drivers/serial/ioc4_serial.c index c620209d7b9a..717e47bbd784 100644 --- a/drivers/serial/ioc4_serial.c +++ b/drivers/serial/ioc4_serial.c @@ -2646,7 +2646,10 @@ static int ioc4_serial_remove_one(struct ioc4_driver_data *idd) struct ioc4_port *port; struct ioc4_soft *soft; + /* If serial driver did not attach, don't try to detach */ control = idd->idd_serial_data; + if (!control) + return 0; for (port_num = 0; port_num < IOC4_NUM_SERIAL_PORTS; port_num++) { for (port_type = UART_PORT_MIN; @@ -2778,6 +2781,12 @@ ioc4_serial_attach_one(struct ioc4_driver_data *idd) DPRINT_CONFIG(("%s (0x%p, 0x%p)\n", __FUNCTION__, idd->idd_pdev, idd->idd_pci_id)); + /* PCI-RT does not bring out serial connections. + * Do not attach to this particular IOC4. + */ + if (idd->idd_variant == IOC4_VARIANT_PCI_RT) + return 0; + /* request serial registers */ tmp_addr1 = idd->idd_bar0 + IOC4_SERIAL_OFFSET; diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c index cdeff909403e..8256a97eb508 100644 --- a/drivers/sn/ioc4.c +++ b/drivers/sn/ioc4.c @@ -160,9 +160,6 @@ ioc4_clock_calibrate(struct ioc4_driver_data *idd) writel(0, &idd->idd_misc_regs->int_out.raw); mmiowb(); - printk(KERN_INFO - "%s: Calibrating PCI bus speed " - "for pci_dev %s ... ", __FUNCTION__, pci_name(idd->idd_pdev)); /* Set up square wave */ int_out.raw = 0; int_out.fields.count = IOC4_CALIBRATE_COUNT; @@ -206,11 +203,16 @@ ioc4_clock_calibrate(struct ioc4_driver_data *idd) /* Bounds check the result. */ if (period > IOC4_CALIBRATE_LOW_LIMIT || period < IOC4_CALIBRATE_HIGH_LIMIT) { - printk("failed. Assuming PCI clock ticks are %d ns.\n", + printk(KERN_INFO + "IOC4 %s: Clock calibration failed. Assuming" + "PCI clock is %d ns.\n", + pci_name(idd->idd_pdev), IOC4_CALIBRATE_DEFAULT / IOC4_EXTINT_COUNT_DIVISOR); period = IOC4_CALIBRATE_DEFAULT; } else { - printk("succeeded. 
PCI clock ticks are %ld ns.\n", + printk(KERN_DEBUG + "IOC4 %s: PCI clock is %ld ns.\n", + pci_name(idd->idd_pdev), period / IOC4_EXTINT_COUNT_DIVISOR); } @@ -222,6 +224,51 @@ ioc4_clock_calibrate(struct ioc4_driver_data *idd) idd->count_period = period; } +/* There are three variants of IOC4 cards: IO9, IO10, and PCI-RT. + * Each brings out different combinations of IOC4 signals, thus. + * the IOC4 subdrivers need to know to which we're attached. + * + * We look for the presence of a SCSI (IO9) or SATA (IO10) controller + * on the same PCI bus at slot number 3 to differentiate IO9 from IO10. + * If neither is present, it's a PCI-RT. + */ +static unsigned int +ioc4_variant(struct ioc4_driver_data *idd) +{ + struct pci_dev *pdev = NULL; + int found = 0; + + /* IO9: Look for a QLogic ISP 12160 at the same bus and slot 3. */ + do { + pdev = pci_get_device(PCI_VENDOR_ID_QLOGIC, + PCI_DEVICE_ID_QLOGIC_ISP12160, pdev); + if (pdev && + idd->idd_pdev->bus->number == pdev->bus->number && + 3 == PCI_SLOT(pdev->devfn)) + found = 1; + pci_dev_put(pdev); + } while (pdev && !found); + if (NULL != pdev) + return IOC4_VARIANT_IO9; + + /* IO10: Look for a Vitesse VSC 7174 at the same bus and slot 3. */ + pdev = NULL; + do { + pdev = pci_get_device(PCI_VENDOR_ID_VITESSE, + PCI_DEVICE_ID_VITESSE_VSC7174, pdev); + if (pdev && + idd->idd_pdev->bus->number == pdev->bus->number && + 3 == PCI_SLOT(pdev->devfn)) + found = 1; + pci_dev_put(pdev); + } while (pdev && !found); + if (NULL != pdev) + return IOC4_VARIANT_IO10; + + /* PCI-RT: No SCSI/SATA controller will be present */ + return IOC4_VARIANT_PCI_RT; +} + /* Adds a new instance of an IOC4 card */ static int ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) @@ -286,6 +333,13 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) /* Failsafe portion of per-IOC4 initialization */ + /* Detect card variant */ + idd->idd_variant = ioc4_variant(idd); + printk(KERN_INFO "IOC4 %s: %s card detected.\n", pci_name(pdev), + idd->idd_variant == IOC4_VARIANT_IO9 ? "IO9" : + idd->idd_variant == IOC4_VARIANT_PCI_RT ? "PCI-RT" : + idd->idd_variant == IOC4_VARIANT_IO10 ? 
"IO10" : "unknown"); + /* Initialize IOC4 */ pci_read_config_dword(idd->idd_pdev, PCI_COMMAND, &pcmd); pci_write_config_dword(idd->idd_pdev, PCI_COMMAND, diff --git a/include/linux/ioc4.h b/include/linux/ioc4.h index 3dd18b785ebd..de73a3289cc2 100644 --- a/include/linux/ioc4.h +++ b/include/linux/ioc4.h @@ -147,6 +147,10 @@ struct ioc4_misc_regs { #define IOC4_GPCR_EDGE_6 0x40 #define IOC4_GPCR_EDGE_7 0x80 +#define IOC4_VARIANT_IO9 0x0900 +#define IOC4_VARIANT_PCI_RT 0x0901 +#define IOC4_VARIANT_IO10 0x1000 + /* One of these per IOC4 */ struct ioc4_driver_data { struct list_head idd_list; @@ -156,6 +160,7 @@ struct ioc4_driver_data { struct __iomem ioc4_misc_regs *idd_misc_regs; unsigned long count_period; void *idd_serial_data; + unsigned int idd_variant; }; /* One per submodule */ -- cgit v1.2.3 From 54e73770357142e297c916c7865f5fca7499f69c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Jun 2006 02:05:54 -0700 Subject: [PATCH] list: introduce list_replace() helper list_replace() is similar to list_replace_rcu(), but unlike list_replace_rcu() it could be used when list_empty(old) == 1 doesn't use barriers Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 76f05718342c..a02642e4710a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -197,12 +197,35 @@ static inline void list_del_rcu(struct list_head *entry) entry->prev = LIST_POISON2; } +/** + * list_replace - replace old entry by new one + * @old : the element to be replaced + * @new : the new element to insert + * Note: if 'old' was empty, it will be overwritten. + */ +static inline void list_replace(struct list_head *old, + struct list_head *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +static inline void list_replace_init(struct list_head *old, + struct list_head *new) +{ + list_replace(old, new); + INIT_LIST_HEAD(old); +} + /* * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The old entry will be replaced with the new entry atomically. + * Note: 'old' should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) -- cgit v1.2.3 From 908dcecda1d18803b5823f30e6c47d2882dc0cf1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Jun 2006 02:06:00 -0700 Subject: [PATCH] adjust handle_IRR_event() return type Correct the return type of handle_IRQ_event() (inconsistency noticed during Xen development), and remove redundant declarations. The return type adjustment required breaking out the definition of irqreturn_t into a separate header, in order to satisfy current include order dependencies. Signed-off-by: Jan Beulich Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: William Lee Irwin III Cc: "David S. 
Miller" Cc: Miles Bader Cc: Geert Uytterhoeven Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/irq.h | 4 ---- include/asm-arm/irq.h | 4 ---- include/asm-arm26/irq.h | 4 ---- include/asm-h8300/irq.h | 4 ---- include/asm-m68k/irq.h | 4 ---- include/asm-m68knommu/irq.h | 4 ---- include/asm-s390/irq.h | 4 ---- include/asm-sparc/irq.h | 4 ---- include/asm-v850/irq.h | 2 -- include/linux/interrupt.h | 21 +-------------------- include/linux/irq.h | 3 ++- include/linux/irqreturn.h | 25 +++++++++++++++++++++++++ kernel/irq/handle.c | 5 +++-- 13 files changed, 31 insertions(+), 57 deletions(-) create mode 100644 include/linux/irqreturn.h (limited to 'include/linux') diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h index f6de033718a0..917b9fe372cf 100644 --- a/include/asm-alpha/irq.h +++ b/include/asm-alpha/irq.h @@ -92,8 +92,4 @@ extern void enable_irq(unsigned int); struct pt_regs; extern void (*perf_irq)(unsigned long, struct pt_regs *); -struct irqaction; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - - #endif /* _ALPHA_IRQ_H */ diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h index 60b5105c9c93..66e67e60bc56 100644 --- a/include/asm-arm/irq.h +++ b/include/asm-arm/irq.h @@ -47,10 +47,6 @@ void disable_irq_wake(unsigned int irq); void enable_irq_wake(unsigned int irq); int setup_irq(unsigned int, struct irqaction *); -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - extern void migrate_irqs(void); #endif diff --git a/include/asm-arm26/irq.h b/include/asm-arm26/irq.h index 06bd5a543d13..9aaac87efba9 100644 --- a/include/asm-arm26/irq.h +++ b/include/asm-arm26/irq.h @@ -44,9 +44,5 @@ extern void enable_irq(unsigned int); int set_irq_type(unsigned int irq, unsigned int type); -int setup_irq(unsigned int, struct irqaction *); -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h index 73065f5bda0e..42a3ac424a9e 100644 --- a/include/asm-h8300/irq.h +++ b/include/asm-h8300/irq.h @@ -63,8 +63,4 @@ extern void enable_irq(unsigned int); extern void disable_irq(unsigned int); #define disable_irq_nosync(x) disable_irq(x) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* _H8300_IRQ_H_ */ diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h index b4f48b2a6a57..9727ca9d9f26 100644 --- a/include/asm-m68k/irq.h +++ b/include/asm-m68k/irq.h @@ -130,8 +130,4 @@ extern volatile unsigned int num_spurious; */ extern irq_node_t *new_irq_node(void); -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* _M68K_IRQ_H_ */ diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h index 2b408842a30e..c5247516fcfe 100644 --- a/include/asm-m68knommu/irq.h +++ b/include/asm-m68knommu/irq.h @@ -87,8 +87,4 @@ extern void (*mach_disable_irq)(unsigned int); #define disable_irq(x) do { } while (0) #define disable_irq_nosync(x) disable_irq(x) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* _M68K_IRQ_H_ */ diff --git a/include/asm-s390/irq.h b/include/asm-s390/irq.h index 916a1aa0b073..bd1a721f7aa2 100644 --- a/include/asm-s390/irq.h +++ b/include/asm-s390/irq.h @@ -21,10 +21,6 @@ enum interruption_class { #define 
touch_nmi_watchdog() do { } while(0) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* __KERNEL__ */ #endif diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h index f2d64537e29d..3141ddfea97d 100644 --- a/include/asm-sparc/irq.h +++ b/include/asm-sparc/irq.h @@ -181,8 +181,4 @@ extern struct sun4m_intregs *sun4m_interrupts; #define SUN4M_INT_SBUS(x) (1 << (x+7)) #define SUN4M_INT_VME(x) (1 << (x)) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h index 44431152b36d..1bf096db8f4c 100644 --- a/include/asm-v850/irq.h +++ b/include/asm-v850/irq.h @@ -62,8 +62,6 @@ extern void disable_irq (unsigned int irq); /* Disable an irq without waiting. */ extern void disable_irq_nosync (unsigned int irq); -extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* !__ASSEMBLY__ */ #endif /* __V850_IRQ_H__ */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9e0fefd7884a..70741e170114 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -7,32 +7,13 @@ #include #include #include +#include #include #include #include #include #include -/* - * For 2.4.x compatibility, 2.4.x can use - * - * typedef void irqreturn_t; - * #define IRQ_NONE - * #define IRQ_HANDLED - * #define IRQ_RETVAL(x) - * - * To mix old-style and new-style irq handler returns. - * - * IRQ_NONE means we didn't handle it. - * IRQ_HANDLED means that we did have a valid interrupt and handled it. - * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled) - */ -typedef int irqreturn_t; - -#define IRQ_NONE (0) -#define IRQ_HANDLED (1) -#define IRQ_RETVAL(x) ((x) != 0) - struct irqaction { irqreturn_t (*handler)(int, void *, struct pt_regs *); unsigned long flags; diff --git a/include/linux/irq.h b/include/linux/irq.h index e8a07e75e4fb..676e00dfb21a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -175,7 +176,7 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) extern int no_irq_affinity; extern int noirqdebug_setup(char *str); -extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +extern fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action); extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs); extern void note_interrupt(unsigned int irq, irq_desc_t *desc, diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h new file mode 100644 index 000000000000..881883c2009d --- /dev/null +++ b/include/linux/irqreturn.h @@ -0,0 +1,25 @@ +/* irqreturn.h */ +#ifndef _LINUX_IRQRETURN_H +#define _LINUX_IRQRETURN_H + +/* + * For 2.4.x compatibility, 2.4.x can use + * + * typedef void irqreturn_t; + * #define IRQ_NONE + * #define IRQ_HANDLED + * #define IRQ_RETVAL(x) + * + * To mix old-style and new-style irq handler returns. + * + * IRQ_NONE means we didn't handle it. + * IRQ_HANDLED means that we did have a valid interrupt and handled it. 
+ * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled) + */ +typedef int irqreturn_t; + +#define IRQ_NONE (0) +#define IRQ_HANDLED (1) +#define IRQ_RETVAL(x) ((x) != 0) + +#endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37db..0f6530117105 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /* * Have got an event to handle: */ -fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) { - int ret, retval = 0, status = 0; + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); -- cgit v1.2.3 From 78ce89c92bc6eaf5933b5664bff64253a7103bd7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 23 Jun 2006 02:06:05 -0700 Subject: [PATCH] JBD: split checkpoint lists Split the checkpoint list of the transaction into two lists. In the first list we keep the buffers that need to be submitted for IO. In the second list are kept buffers that were already submitted and we just have to wait for the IO to complete. This should simplify a handling of checkpoint lists a bit and can eventually be also a performance gain. Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: "Stephen C. Tweedie" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/checkpoint.c | 419 ++++++++++++++++++++++++++++++---------------------- include/linux/jbd.h | 8 +- 2 files changed, 246 insertions(+), 181 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 3f5102b069db..47678a26c13b 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,29 +24,67 @@ #include /* - * Unlink a buffer from a transaction. + * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. */ - -static inline void __buffer_unlink(struct journal_head *jh) +static inline void __buffer_unlink_first(struct journal_head *jh) { - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - jh->b_cp_transaction = NULL; + transaction_t *transaction = jh->b_cp_transaction; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) + if (transaction->t_checkpoint_list == jh) { transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. 
+ */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -57,12 +95,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - __journal_remove_checkpoint(jh); + ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); - ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -117,83 +154,54 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up a transaction's checkpoint list. - * - * We wait for any pending IO to complete and make sure any clean - * buffers are removed from the transaction. - * - * Return 1 if we performed any actions which might have destroyed the - * checkpoint. (journal_remove_checkpoint() deletes the transaction when - * the last checkpoint buffer is cleansed) + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. * * Called with j_list_lock held. */ -static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) +static void __wait_cp_io(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh, *next_jh, *last_jh; + struct journal_head *jh; struct buffer_head *bh; - int ret = 0; - - assert_spin_locked(&journal->j_list_lock); - jh = transaction->t_checkpoint_list; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - next_jh = jh; - do { - jh = next_jh; + tid_t this_tid; + int released = 0; + + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? 
*/ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - goto out_return_1; + spin_lock(&journal->j_list_lock); + goto restart; } - /* - * This is foul + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list */ - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - goto out_return_1; - } - - if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - goto out_return_1; - } - - /* - * AKPM: I think the buffer_jbddirty test is redundant - it - * shouldn't have NULL b_transaction? - */ - next_jh = jh->b_cpnext; - if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - __brelse(bh); - ret = 1; - } else { - jbd_unlock_bh_state(bh); - } - } while (jh != last_jh); - - return ret; -out_return_1: - spin_lock(&journal->j_list_lock); - return 1; + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + } } #define NR_BATCH 64 @@ -203,9 +211,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; - spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); - spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -221,19 +227,43 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held. 
+ * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __flush_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - int *drop_count) +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { - J_ASSERT_JH(jh, jh->b_transaction == NULL); + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + ret = 1; + } else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -246,45 +276,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; + __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } - } else { - int last_buffer = 0; - if (jh->b_cpnext == jh) { - /* We may be about to drop the transaction. Tell the - * caller that the lists have changed. - */ - last_buffer = 1; - } - if (__try_to_free_cp_buf(jh)) { - (*drop_count)++; - ret = last_buffer; - } } return ret; } /* - * Perform an actual checkpoint. We don't write out only enough to - * satisfy the current blocked requests: rather we submit a reasonably - * sized chunk of the outstanding data to disk at once for - * efficiency. __log_wait_for_space() will retry if we didn't free enough. + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. * - * However, we _do_ take into account the amount requested so that once - * the IO has been queued, we can return as soon as enough of it has - * completed to disk. - * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { + transaction_t *transaction; + tid_t this_tid; int result; - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -299,79 +314,68 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Try to free up a - * quarter of the log in a single checkpoint if we can. + * OK, we need to start writing disk blocks. Take one transaction + * and write it. */ + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: /* - * AKPM: check this code. 
I had a feeling a while back that it - * degenerates into a busy loop at unmount time. + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). */ - spin_lock(&journal->j_list_lock); - while (journal->j_checkpoint_transactions) { - transaction_t *transaction; - struct journal_head *jh, *last_jh, *next_jh; - int drop_count = 0; - int cleanup_ret, retry = 0; - tid_t this_tid; - - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; - jh = transaction->t_checkpoint_list; - last_jh = jh->b_cpprev; - next_jh = jh; - do { + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0; + + while (!retry && transaction->t_checkpoint_list) { struct buffer_head *bh; - jh = next_jh; - next_jh = jh->b_cpnext; + jh = transaction->t_checkpoint_list; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); - if (cond_resched_lock(&journal->j_list_lock)) { + retry = __process_buffer(journal, jh, bhs,&batch_count); + if (!retry && lock_need_resched(&journal->j_list_lock)){ + spin_unlock(&journal->j_list_lock); retry = 1; break; } - } while (jh != last_jh && !retry); + } if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } __flush_batch(journal, bhs, &batch_count); - retry = 1; } + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } /* - * If someone cleaned up this transaction while we slept, we're - * done - */ - if (journal->j_checkpoint_transactions != transaction) - break; - if (retry) - continue; - /* - * Maybe it's a new transaction, but it fell at the same - * address - */ - if (transaction->t_tid != this_tid) - continue; - /* - * We have walked the whole transaction list without - * finding anything to write to disk. We had better be - * able to make some progress or we are in trouble. + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one */ - cleanup_ret = __cleanup_transaction(journal, transaction); - J_ASSERT(drop_count != 0 || cleanup_ret != 0); - if (journal->j_checkpoint_transactions != transaction) - break; + __wait_cp_io(journal, transaction); } +out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; - return 0; } @@ -455,6 +459,54 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ +/* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. 
+ * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + /* * journal_clean_checkpoint_list * @@ -462,46 +514,44 @@ int cleanup_journal_tail(journal_t *journal) * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; int ret = 0; + int released; transaction = journal->j_checkpoint_transactions; - if (transaction == 0) + if (!transaction) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { - struct journal_head *jh; - transaction = next_transaction; next_transaction = transaction->t_cpnext; - jh = transaction->t_checkpoint_list; - if (jh) { - struct journal_head *last_jh = jh->b_cpprev; - struct journal_head *next_jh = jh; - - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranknig */ - if (jbd_trylock_bh_state(jh2bh(jh))) - ret += __try_to_free_cp_buf(jh); - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - goto out; - } while (jh != last_jh); - } + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; } while (transaction != last_transaction); out: return ret; @@ -516,18 +566,22 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * list until they have been rewritten, at which point this function is + * lists until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint list. + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. * * This function is called with the journal locked. * This function is called with j_list_lock held. 
+ * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -void __journal_remove_checkpoint(struct journal_head *jh) +int __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; + int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -538,8 +592,10 @@ void __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); + jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL) + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -565,8 +621,10 @@ void __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... */ wake_up(&journal->j_wait_logspace); + ret = 1; out: JBUFFER_TRACE(jh, "exit"); + return ret; } /* @@ -628,6 +686,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 6a425e370cb3..20eb34403d0c 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -500,6 +500,12 @@ struct transaction_s */ struct journal_head *t_checkpoint_list; + /* + * Doubly-linked circular list of all buffers submitted for IO while + * checkpointing. [j_list_lock] + */ + struct journal_head *t_checkpoint_io_list; + /* * Doubly-linked circular list of temporary buffers currently undergoing * IO in the log [j_list_lock] @@ -849,7 +855,7 @@ extern void journal_commit_transaction(journal_t *); /* Checkpoint list management */ int __journal_clean_checkpoint_list(journal_t *journal); -void __journal_remove_checkpoint(struct journal_head *); +int __journal_remove_checkpoint(struct journal_head *); void __journal_insert_checkpoint(struct journal_head *, transaction_t *); /* Buffer IO */ -- cgit v1.2.3 From fda151d9feafc0e8418f23c716587c44394fcdbf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Jun 2006 12:09:50 +0200 Subject: [PATCH] blktrace_api.h: endian annotations Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index eb1a867ed245..a7e8cef73d15 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -90,9 +90,9 @@ struct blk_io_trace { * The remap event */ struct blk_io_trace_remap { - u32 device; + __be32 device; u32 __pad; - u64 sector; + __be64 sector; }; enum { @@ -224,7 +224,7 @@ static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, struct bio *bio, unsigned int pdu) { struct blk_trace *bt = q->blk_trace; - u64 rpdu = cpu_to_be64(pdu); + __be64 rpdu = cpu_to_be64(pdu); if (likely(!bt)) return; -- cgit v1.2.3 From b31dc66a54ad986b6b73bdc49c8efc17cbad1833 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Jun 2006 08:26:10 +0200 Subject: [PATCH] Kill PF_SYNCWRITE flag A process flag to indicate whether we are doing sync io is incredibly ugly. 
It also causes performance problems when one does a lot of async io and then proceeds to sync it. Part of the io will go out as async, and the other part as sync. This causes a disconnect between the previously submitted io and the synced io. For io schedulers such as CFQ, this will cause us lost merges and suboptimal behaviour in scheduling. Remove PF_SYNCWRITE completely from the fsync/msync paths, and let the O_DIRECT path just directly indicate that the writes are sync by using WRITE_SYNC instead. Signed-off-by: Jens Axboe --- block/as-iosched.c | 2 +- block/cfq-iosched.c | 4 +--- block/ll_rw_blk.c | 3 +++ drivers/usb/gadget/file_storage.c | 2 -- fs/buffer.c | 2 -- fs/direct-io.c | 18 ++++++++---------- fs/fs-writeback.c | 2 -- include/linux/blkdev.h | 2 ++ include/linux/sched.h | 11 +++++------ mm/msync.c | 3 --- 10 files changed, 20 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/block/as-iosched.c b/block/as-iosched.c index 9b13d72ffefa..56c99fa037df 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -1339,7 +1339,7 @@ static void as_add_request(request_queue_t *q, struct request *rq) arq->state = AS_RQ_NEW; if (rq_data_dir(arq->request) == READ - || current->flags&PF_SYNCWRITE) + || (arq->request->flags & REQ_RW_SYNC)) arq->is_sync = 1; else arq->is_sync = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c88f161d3fb3..4c4e9cc3ae26 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -277,8 +277,6 @@ static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsi static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask); -#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) - /* * lots of deadline iosched dupes, can be abstracted later... 
*/ @@ -334,7 +332,7 @@ static int cfq_queue_empty(request_queue_t *q) static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) { - if (rw == READ || process_sync(task)) + if (rw == READ || rw == WRITE_SYNC) return task->pid; return CFQ_KEY_ASYNC; diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 17c42ddd31db..2270bb451385 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -2827,6 +2827,9 @@ static void init_request_from_bio(struct request *req, struct bio *bio) if (unlikely(bio_barrier(bio))) req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); + if (bio_sync(bio)) + req->flags |= REQ_RW_SYNC; + req->errors = 0; req->hard_sector = req->sector = bio->bi_sector; req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio); diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index 6f887478b148..a43dc908ac59 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -1906,7 +1906,6 @@ static int fsync_sub(struct lun *curlun) inode = filp->f_dentry->d_inode; mutex_lock(&inode->i_mutex); - current->flags |= PF_SYNCWRITE; rc = filemap_fdatawrite(inode->i_mapping); err = filp->f_op->fsync(filp, filp->f_dentry, 1); if (!rc) @@ -1914,7 +1913,6 @@ static int fsync_sub(struct lun *curlun) err = filemap_fdatawait(inode->i_mapping); if (!rc) rc = err; - current->flags &= ~PF_SYNCWRITE; mutex_unlock(&inode->i_mutex); VLDBG(curlun, "fdatasync -> %d\n", rc); return rc; diff --git a/fs/buffer.c b/fs/buffer.c index 23f1f3a68077..373bb6292bdc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync) goto out; } - current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(mapping); /* @@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync) err = filemap_fdatawait(mapping); if (!ret) ret = err; - current->flags &= ~PF_SYNCWRITE; out: return ret; } diff --git a/fs/direct-io.c b/fs/direct-io.c index b05d1b218776..538fb0418fba 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio) NULL); /* vmas */ up_read(¤t->mm->mmap_sem); - if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { + if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(dio->curr_user_address); /* * A memory fault, but the filesystem has some outstanding @@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio) map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; - create = dio->rw == WRITE; + create = dio->rw & WRITE; if (dio->lock_type == DIO_LOCKING) { if (dio->block_in_file < (i_size_read(dio->inode) >> dio->blkbits)) @@ -867,7 +867,7 @@ do_holes: loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ - if (dio->rw == WRITE) { + if (dio->rw & WRITE) { page_cache_release(page); return -ENOTBLK; } @@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, } } /* end iovec loop */ - if (ret == -ENOTBLK && rw == WRITE) { + if (ret == -ENOTBLK && (rw & WRITE)) { /* * The remaining part of the request will be * be handled by buffered I/O when we return @@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (dio->is_async) { int should_wait = 0; - if (dio->result < dio->size && rw == WRITE) { + if (dio->result < dio->size && (rw & WRITE)) { dio->waiter = current; should_wait = 1; } @@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ret = transferred; /* We could have also come here 
on an AIO file extend */ - if (!is_sync_kiocb(iocb) && rw == WRITE && + if (!is_sync_kiocb(iocb) && (rw & WRITE) && ret >= 0 && dio->result == dio->size) /* * For AIO writes where we have completed the @@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - current->flags |= PF_SYNCWRITE; + rw = WRITE_SYNC; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * even for AIO, we need to wait for i/o to complete before * returning in this case. */ - dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) && + dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && (end > i_size_read(inode))); retval = direct_io_worker(rw, iocb, inode, iov, offset, @@ -1284,8 +1284,6 @@ out: mutex_unlock(&inode->i_mutex); else if (acquire_i_mutex) mutex_lock(&inode->i_mutex); - if (rw & WRITE) - current->flags &= ~PF_SYNCWRITE; return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6db95cf3aaa2..031b27a4bc9a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int int need_write_inode_now = 0; int err2; - current->flags |= PF_SYNCWRITE; if (what & OSYNC_DATA) err = filemap_fdatawrite(mapping); if (what & (OSYNC_METADATA|OSYNC_DATA)) { @@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int if (!err) err = err2; } - current->flags &= ~PF_SYNCWRITE; spin_lock(&inode_lock); if ((inode->i_state & I_DIRTY) && diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3457e7b97363..482a21d67627 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -241,6 +241,7 @@ enum rq_flag_bits { __REQ_PM_RESUME, /* resume request */ __REQ_PM_SHUTDOWN, /* shutdown request */ __REQ_ORDERED_COLOR, /* is before or after barrier */ + __REQ_RW_SYNC, /* request is sync (O_DIRECT) */ __REQ_NR_BITS, /* stops here */ }; @@ -270,6 +271,7 @@ enum rq_flag_bits { #define REQ_PM_RESUME (1 << __REQ_PM_RESUME) #define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) +#define REQ_RW_SYNC (1 << __REQ_RW_SYNC) /* * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME diff --git a/include/linux/sched.h b/include/linux/sched.h index a9d23c7d1b25..38b4791e6a5d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -941,12 +941,11 @@ static inline void put_task_struct(struct task_struct *t) #define PF_KSWAPD 0x00040000 /* I am kswapd */ #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ -#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ -#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ -#define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ -#define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ -#define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */ -#define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */ +#define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ +#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ +#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ +#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ +#define PF_SPREAD_SLAB 0x02000000 /* Spread some 
slab caches over cpuset */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ /* diff --git a/mm/msync.c b/mm/msync.c index bc6c95376366..d083544df21b 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) * just ignore them, but return -ENOMEM at the end. */ down_read(¤t->mm->mmap_sem); - if (flags & MS_SYNC) - current->flags |= PF_SYNCWRITE; vma = find_vma(current->mm, start); if (!vma) { error = -ENOMEM; @@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) } } while (vma && !done); out_unlock: - current->flags &= ~PF_SYNCWRITE; up_read(¤t->mm->mmap_sem); out: return error; -- cgit v1.2.3 From ad3caddaa1708e506f20b8e25a4a8ae586fc7d5b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Jun 2006 08:46:57 +0200 Subject: [PATCH] Get rid of struct request request_pm_state member The IDE power management can just use the ->end_io_data member to store it's data. Signed-off-by: Jens Axboe --- drivers/ide/ide-io.c | 91 ++++++++++++++++++++++++++++---------------------- drivers/ide/ide.c | 4 +-- include/linux/blkdev.h | 5 --- 3 files changed, 54 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c01615dec202..4f2f138de2ca 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -142,38 +142,41 @@ enum { static void ide_complete_power_step(ide_drive_t *drive, struct request *rq, u8 stat, u8 error) { + struct request_pm_state *pm = rq->end_io_data; + if (drive->media != ide_disk) return; - switch (rq->pm->pm_step) { + switch (pm->pm_step) { case ide_pm_flush_cache: /* Suspend step 1 (flush cache) complete */ - if (rq->pm->pm_state == PM_EVENT_FREEZE) - rq->pm->pm_step = ide_pm_state_completed; + if (pm->pm_state == PM_EVENT_FREEZE) + pm->pm_step = ide_pm_state_completed; else - rq->pm->pm_step = idedisk_pm_standby; + pm->pm_step = idedisk_pm_standby; break; case idedisk_pm_standby: /* Suspend step 2 (standby) complete */ - rq->pm->pm_step = ide_pm_state_completed; + pm->pm_step = ide_pm_state_completed; break; case idedisk_pm_idle: /* Resume step 1 (idle) complete */ - rq->pm->pm_step = ide_pm_restore_dma; + pm->pm_step = ide_pm_restore_dma; break; } } static ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { + struct request_pm_state *pm = rq->end_io_data; ide_task_t *args = rq->special; memset(args, 0, sizeof(*args)); if (drive->media != ide_disk) { /* skip idedisk_pm_idle for ATAPI devices */ - if (rq->pm->pm_step == idedisk_pm_idle) - rq->pm->pm_step = ide_pm_restore_dma; + if (pm->pm_step == idedisk_pm_idle) + pm->pm_step = ide_pm_restore_dma; } - switch (rq->pm->pm_step) { + switch (pm->pm_step) { case ide_pm_flush_cache: /* Suspend step 1 (flush cache) */ if (drive->media != ide_disk) break; @@ -215,7 +218,7 @@ static ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request * drive->hwif->ide_dma_check(drive); break; } - rq->pm->pm_step = ide_pm_state_completed; + pm->pm_step = ide_pm_state_completed; return ide_stopped; } @@ -362,12 +365,13 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err) } } } else if (blk_pm_request(rq)) { + struct request_pm_state *pm = rq->end_io_data; #ifdef DEBUG_PM printk("%s: complete_power_step(step: %d, stat: %x, err: %x)\n", drive->name, rq->pm->pm_step, stat, err); #endif ide_complete_power_step(drive, rq, stat, err); - if (rq->pm->pm_step == ide_pm_state_completed) + if (pm->pm_step == 
ide_pm_state_completed) ide_complete_pm_request(drive, rq); return; } @@ -871,6 +875,39 @@ done: return ide_stopped; } +static void ide_check_pm_state(ide_drive_t *drive, struct request *rq) +{ + struct request_pm_state *pm = rq->end_io_data; + + if (blk_pm_suspend_request(rq) && + pm->pm_step == ide_pm_state_start_suspend) + /* Mark drive blocked when starting the suspend sequence. */ + drive->blocked = 1; + else if (blk_pm_resume_request(rq) && + pm->pm_step == ide_pm_state_start_resume) { + /* + * The first thing we do on wakeup is to wait for BSY bit to + * go away (with a looong timeout) as a drive on this hwif may + * just be POSTing itself. + * We do that before even selecting as the "other" device on + * the bus may be broken enough to walk on our toes at this + * point. + */ + int rc; +#ifdef DEBUG_PM + printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name); +#endif + rc = ide_wait_not_busy(HWIF(drive), 35000); + if (rc) + printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); + SELECT_DRIVE(drive); + HWIF(drive)->OUTB(8, HWIF(drive)->io_ports[IDE_CONTROL_OFFSET]); + rc = ide_wait_not_busy(HWIF(drive), 10000); + if (rc) + printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); + } +} + /** * start_request - start of I/O and command issuing for IDE * @@ -909,33 +946,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (block == 0 && drive->remap_0_to_1 == 1) block = 1; /* redirect MBR access to EZ-Drive partn table */ - if (blk_pm_suspend_request(rq) && - rq->pm->pm_step == ide_pm_state_start_suspend) - /* Mark drive blocked when starting the suspend sequence. */ - drive->blocked = 1; - else if (blk_pm_resume_request(rq) && - rq->pm->pm_step == ide_pm_state_start_resume) { - /* - * The first thing we do on wakeup is to wait for BSY bit to - * go away (with a looong timeout) as a drive on this hwif may - * just be POSTing itself. - * We do that before even selecting as the "other" device on - * the bus may be broken enough to walk on our toes at this - * point. 
- */ - int rc; -#ifdef DEBUG_PM - printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name); -#endif - rc = ide_wait_not_busy(HWIF(drive), 35000); - if (rc) - printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name); - SELECT_DRIVE(drive); - HWIF(drive)->OUTB(8, HWIF(drive)->io_ports[IDE_CONTROL_OFFSET]); - rc = ide_wait_not_busy(HWIF(drive), 10000); - if (rc) - printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); - } + if (blk_pm_request(rq)) + ide_check_pm_state(drive, rq); SELECT_DRIVE(drive); if (ide_wait_stat(&startstop, drive, drive->ready_stat, BUSY_STAT|DRQ_STAT, WAIT_READY)) { @@ -950,13 +962,14 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) else if (rq->flags & REQ_DRIVE_TASKFILE) return execute_drive_cmd(drive, rq); else if (blk_pm_request(rq)) { + struct request_pm_state *pm = rq->end_io_data; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, rq->pm->pm_step); #endif startstop = ide_start_power_step(drive, rq); if (startstop == ide_stopped && - rq->pm->pm_step == ide_pm_state_completed) + pm->pm_step == ide_pm_state_completed) ide_complete_pm_request(drive, rq); return startstop; } diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 9799aed772e1..59fe358048b3 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -1226,7 +1226,7 @@ static int generic_ide_suspend(struct device *dev, pm_message_t state) memset(&args, 0, sizeof(args)); rq.flags = REQ_PM_SUSPEND; rq.special = &args; - rq.pm = &rqpm; + rq.end_io_data = &rqpm; rqpm.pm_step = ide_pm_state_start_suspend; rqpm.pm_state = state.event; @@ -1245,7 +1245,7 @@ static int generic_ide_resume(struct device *dev) memset(&args, 0, sizeof(args)); rq.flags = REQ_PM_RESUME; rq.special = &args; - rq.pm = &rqpm; + rq.end_io_data = &rqpm; rqpm.pm_step = ide_pm_state_start_resume; rqpm.pm_state = PM_EVENT_ON; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 482a21d67627..371c0ce5f630 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -195,11 +195,6 @@ struct request { unsigned int timeout; int retries; - /* - * For Power Management requests - */ - struct request_pm_state *pm; - /* * completion callback. end_io_data should be folded in with waiting */ -- cgit v1.2.3 From 8f34ee75decb80007ba77bba5a7384eadff4866d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Jun 2006 09:02:34 +0200 Subject: [PATCH] Rearrange a few struct request members This saves 8 bytes of data in 64-bit archs. 
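The 8 bytes come purely from padding: the compiler must align 8-byte members to 8-byte boundaries on 64-bit targets, so a lone 4-byte member sitting between pointers drags 4 bytes of padding with it, and grouping members of equal size closes those holes. A minimal, hypothetical illustration -- neither struct below is part of this patch:

	struct bad_layout {		/* sizes alternate		*/
		int	a;		/* 4 bytes + 4 bytes padding	*/
		void	*p;		/* 8 bytes			*/
		int	b;		/* 4 bytes + 4 bytes padding	*/
		void	*q;		/* 8 bytes			*/
	};				/* sizeof() == 32 on LP64	*/

	struct good_layout {		/* same members, grouped	*/
		void	*p;		/* 8 bytes			*/
		void	*q;		/* 8 bytes			*/
		int	a;		/* 4 bytes			*/
		int	b;		/* 4 bytes, no tail padding	*/
	};				/* sizeof() == 24 on LP64	*/

The hunk below applies the same idea to struct request, e.g. by pairing the two unsigned shorts and the two unsigned ints and by keeping the pointer members together.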
Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 371c0ce5f630..aafe82788b4e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -151,11 +151,9 @@ struct request { void *elevator_private; void *completion_data; - unsigned short ioprio; - int rq_status; /* should split this into a few status bits */ - struct gendisk *rq_disk; int errors; + struct gendisk *rq_disk; unsigned long start_time; /* Number of scatter-gather DMA addr+len pairs after @@ -170,8 +168,9 @@ struct request { */ unsigned short nr_hw_segments; + unsigned short ioprio; + int tag; - char *buffer; int ref_count; request_queue_t *q; @@ -179,6 +178,7 @@ struct request { struct completion *waiting; void *special; + char *buffer; /* * when request is used as a packet command carrier @@ -187,9 +187,8 @@ struct request { unsigned char cmd[BLK_MAX_CDB]; unsigned int data_len; - void *data; - unsigned int sense_len; + void *data; void *sense; unsigned int timeout; -- cgit v1.2.3 From dd67d051529387f6e44d22d1d5540ef281965fdd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Jun 2006 09:36:18 +0200 Subject: [PATCH] rbtree: support functions used by the io schedulers They all duplicate macros to check for empty root and/or node, and clearing a node. So put those in rbtree.h. Signed-off-by: Jens Axboe --- block/as-iosched.c | 17 +++++++---------- block/cfq-iosched.c | 22 ++++++++-------------- block/deadline-iosched.c | 13 +++++-------- include/linux/rbtree.h | 4 ++++ 4 files changed, 24 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/block/as-iosched.c b/block/as-iosched.c index 56c99fa037df..1ec5df466708 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -347,9 +347,6 @@ static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset) /* * rb tree support functions */ -#define RB_EMPTY(root) ((root)->rb_node == NULL) -#define ON_RB(node) (rb_parent(node) != node) -#define RB_CLEAR(node) (rb_set_parent(node, node)) #define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node) #define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync]) #define rq_rb_key(rq) (rq)->sector @@ -418,13 +415,13 @@ static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq) static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq) { - if (!ON_RB(&arq->rb_node)) { + if (!RB_EMPTY_NODE(&arq->rb_node)) { WARN_ON(1); return; } rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); - RB_CLEAR(&arq->rb_node); + RB_CLEAR_NODE(&arq->rb_node); } static struct request * @@ -545,7 +542,7 @@ static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last) struct rb_node *rbprev = rb_prev(&last->rb_node); struct as_rq *arq_next, *arq_prev; - BUG_ON(!ON_RB(&last->rb_node)); + BUG_ON(!RB_EMPTY_NODE(&last->rb_node)); if (rbprev) arq_prev = rb_entry_arq(rbprev); @@ -1122,7 +1119,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) struct request *rq = arq->request; const int data_dir = arq->is_sync; - BUG_ON(!ON_RB(&arq->rb_node)); + BUG_ON(!RB_EMPTY_NODE(&arq->rb_node)); as_antic_stop(ad); ad->antic_status = ANTIC_OFF; @@ -1247,7 +1244,7 @@ static int as_dispatch_request(request_queue_t *q, int force) */ if (reads) { - BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC])); + BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_SYNC])); if (writes && ad->batch_data_dir == REQ_SYNC) /* @@ -1271,7 +1268,7 @@ static int 
as_dispatch_request(request_queue_t *q, int force) if (writes) { dispatch_writes: - BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC])); + BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[REQ_ASYNC])); if (ad->batch_data_dir == REQ_SYNC) { ad->changed_batch = 1; @@ -1591,7 +1588,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, if (arq) { memset(arq, 0, sizeof(*arq)); - RB_CLEAR(&arq->rb_node); + RB_CLEAR_NODE(&arq->rb_node); arq->request = rq; arq->state = AS_RQ_PRESCHED; arq->io_context = NULL; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 940364edf2b9..e25223e147a2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -60,11 +60,6 @@ static DEFINE_SPINLOCK(cfq_exit_lock); /* * rb-tree defines */ -#define RB_EMPTY(node) ((node)->rb_node == NULL) -#define RB_CLEAR(node) do { \ - memset(node, 0, sizeof(*node)); \ -} while (0) -#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) #define rq_rb_key(rq) (rq)->sector @@ -559,7 +554,7 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq) rb_erase(&crq->rb_node, &cfqq->sort_list); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list)) + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); } @@ -914,7 +909,7 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) struct cfq_io_context *cic; unsigned long sl; - WARN_ON(!RB_EMPTY(&cfqq->sort_list)); + WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); WARN_ON(cfqq != cfqd->active_queue); /* @@ -1042,7 +1037,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * if queue has requests, dispatch one. if not, check if * enough slice is left to wait for one */ - if (!RB_EMPTY(&cfqq->sort_list)) + if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto keep_queue; else if (cfq_cfqq_dispatched(cfqq)) { cfqq = NULL; @@ -1066,7 +1061,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, { int dispatched = 0; - BUG_ON(RB_EMPTY(&cfqq->sort_list)); + BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); do { struct cfq_rq *crq; @@ -1090,7 +1085,7 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_cic = crq->io_context; } - if (RB_EMPTY(&cfqq->sort_list)) + if (RB_EMPTY_ROOT(&cfqq->sort_list)) break; } while (dispatched < max_dispatch); @@ -1480,7 +1475,6 @@ retry: INIT_HLIST_NODE(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); - RB_CLEAR_ROOT(&cfqq->sort_list); INIT_LIST_HEAD(&cfqq->fifo); cfqq->key = key; @@ -1873,7 +1867,7 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq) if (cfqd->active_queue == cfqq) { if (time_after(now, cfqq->slice_end)) cfq_slice_expired(cfqd, 0); - else if (sync && RB_EMPTY(&cfqq->sort_list)) { + else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) { if (!cfq_arm_slice_timer(cfqd, cfqq)) cfq_schedule_dispatch(cfqd); } @@ -2059,7 +2053,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { - RB_CLEAR(&crq->rb_node); + RB_CLEAR_NODE(&crq->rb_node); crq->rb_key = 0; crq->request = rq; INIT_HLIST_NODE(&crq->hash); @@ -2151,7 +2145,7 @@ static void cfq_idle_slice_timer(unsigned long data) /* * not expired and it has a request pending, let it dispatch */ - if (!RB_EMPTY(&cfqq->sort_list)) { + if (!RB_EMPTY_ROOT(&cfqq->sort_list)) { cfq_mark_cfqq_must_dispatch(cfqq); goto out_kick; } diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index e5bccaaed563..4469dd84623c 100644 
--- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -159,9 +159,6 @@ deadline_find_drq_hash(struct deadline_data *dd, sector_t offset) /* * rb tree support functions */ -#define RB_EMPTY(root) ((root)->rb_node == NULL) -#define ON_RB(node) (rb_parent(node) != node) -#define RB_CLEAR(node) (rb_set_parent(node, node)) #define rb_entry_drq(node) rb_entry((node), struct deadline_rq, rb_node) #define DRQ_RB_ROOT(dd, drq) (&(dd)->sort_list[rq_data_dir((drq)->request)]) #define rq_rb_key(rq) (rq)->sector @@ -220,9 +217,9 @@ deadline_del_drq_rb(struct deadline_data *dd, struct deadline_rq *drq) dd->next_drq[data_dir] = rb_entry_drq(rbnext); } - BUG_ON(!ON_RB(&drq->rb_node)); + BUG_ON(!RB_EMPTY_NODE(&drq->rb_node)); rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq)); - RB_CLEAR(&drq->rb_node); + RB_CLEAR_NODE(&drq->rb_node); } static struct request * @@ -496,7 +493,7 @@ static int deadline_dispatch_requests(request_queue_t *q, int force) */ if (reads) { - BUG_ON(RB_EMPTY(&dd->sort_list[READ])); + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); if (writes && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; @@ -512,7 +509,7 @@ static int deadline_dispatch_requests(request_queue_t *q, int force) if (writes) { dispatch_writes: - BUG_ON(RB_EMPTY(&dd->sort_list[WRITE])); + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); dd->starved = 0; @@ -668,7 +665,7 @@ deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, drq = mempool_alloc(dd->drq_pool, gfp_mask); if (drq) { memset(drq, 0, sizeof(*drq)); - RB_CLEAR(&drq->rb_node); + RB_CLEAR_NODE(&drq->rb_node); drq->request = rq; INIT_HLIST_NODE(&drq->hash); diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index f37006f21664..8d5382e62c08 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -132,6 +132,10 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_NODE(node) (rb_parent(node) != node) +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) + extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); -- cgit v1.2.3
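For reference, the RB_EMPTY_ROOT/RB_EMPTY_NODE/RB_CLEAR_NODE helpers moved into <linux/rbtree.h> by the last patch are used by the io schedulers in roughly the following pattern. This is an illustrative sketch only -- struct my_rq and the function names are made up and appear in none of the patches above:

	#include <linux/rbtree.h>
	#include <linux/string.h>

	struct my_rq {
		struct rb_node	rb_node;
	};

	static void my_rq_init(struct my_rq *rq)
	{
		memset(rq, 0, sizeof(*rq));
		RB_CLEAR_NODE(&rq->rb_node);	/* parent == node: not queued */
	}

	static int my_rq_del(struct rb_root *root, struct my_rq *rq)
	{
		/*
		 * With the definitions added above, RB_EMPTY_NODE() expands
		 * to (rb_parent(node) != node), i.e. it is true while the
		 * node is linked into a tree.
		 */
		if (!RB_EMPTY_NODE(&rq->rb_node))
			return -1;		/* never inserted */

		rb_erase(&rq->rb_node, root);
		RB_CLEAR_NODE(&rq->rb_node);

		return RB_EMPTY_ROOT(root);	/* 1 if the tree is now empty */
	}

This mirrors as_del_arq_rb() and deadline_del_drq_rb() above, which test the node with RB_EMPTY_NODE() before rb_erase() and re-clear it afterwards.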