From 2999b58b795ad81f10e34bdbbfd2742172f247e4 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Sun, 1 Feb 2009 20:46:39 +0400
Subject: ide/libata: fix ata_id_is_cfa() (take 4)

When checking for the CFA feature set support, ata_id_is_cfa() tests bit 2 in
word 82 of the identify data instead the word 83;  it also checks the ATA/PI
version support in the word 80 (which the CompactFlash specifications have as
reserved), this having no slightest chance to work on the modern CF cards that
don't have 0x848A in the word 0...

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 include/linux/ata.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index a53318b8cbd0..08a86d5cdf1b 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -731,12 +731,17 @@ static inline int ata_id_current_chs_valid(const u16 *id)
 
 static inline int ata_id_is_cfa(const u16 *id)
 {
-	if (id[ATA_ID_CONFIG] == 0x848A)	/* Standard CF */
+	if (id[ATA_ID_CONFIG] == 0x848A)	/* Traditional CF */
 		return 1;
-	/* Could be CF hiding as standard ATA */
-	if (ata_id_major_version(id) >= 3 &&
-	    id[ATA_ID_COMMAND_SET_1] != 0xFFFF &&
-	   (id[ATA_ID_COMMAND_SET_1] & (1 << 2)))
+	/*
+	 * CF specs don't require specific value in the word 0 anymore and yet
+	 * they forbid to report the ATA version in the word 80 and require the
+	 * CFA feature set support to be indicated in the word 83 in this case.
+	 * Unfortunately, some cards only follow either of this requirements,
+	 * and while those that don't indicate CFA feature support need some
+	 * sort of quirk list, it seems impractical for the ones that do...
+	 */
+	if ((id[ATA_ID_COMMAND_SET_2] & 0xC004) == 0x4004)
 		return 1;
 	return 0;
 }
-- 
cgit v1.2.3


From 99cf610aa4840d822cdc67d194b23b55010ca9bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 29 Jan 2009 20:31:32 +0900
Subject: libata: clear dev->ering in smarter way

dev->ering used to be cleared together with the rest of ata_device in
ata_dev_init() which is called whenever a probing event occurs.
dev->ering is about to be used to track probing failures so it needs
to remain persistent over multiple porbing events.  This patch
achieves this by doing the following.

* Instead of CLEAR_OFFSET, define CLEAR_BEGIN and CLEAR_END and only
  clear between BEGIN and END.  ering is moved after END.  The split
  of persistent area is to allow hotter items remain at the head.

* ering is explicitly cleared on ata_dev_disable() and when device
  attach succeeds.  So, ering is persistent throug a device's life
  time (unless explicitly cleared of course) and also through periods
  inbetween disablement of an attached device and successful detection
  of the next one.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-core.c |  4 ++--
 drivers/ata/libata-eh.c   |  7 +++++++
 include/linux/libata.h    | 18 ++++++++++--------
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index d006e5c4768c..564c03c4ebb3 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5404,8 +5404,8 @@ void ata_dev_init(struct ata_device *dev)
 	dev->horkage = 0;
 	spin_unlock_irqrestore(ap->lock, flags);
 
-	memset((void *)dev + ATA_DEVICE_CLEAR_OFFSET, 0,
-	       sizeof(*dev) - ATA_DEVICE_CLEAR_OFFSET);
+	memset((void *)dev + ATA_DEVICE_CLEAR_BEGIN, 0,
+	       ATA_DEVICE_CLEAR_END - ATA_DEVICE_CLEAR_BEGIN);
 	dev->pio_mask = UINT_MAX;
 	dev->mwdma_mask = UINT_MAX;
 	dev->udma_mask = UINT_MAX;
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index aafe82bf5e2e..90092c1aae53 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1194,6 +1194,11 @@ void ata_dev_disable(struct ata_device *dev)
 	ata_acpi_on_disable(dev);
 	ata_down_xfermask_limit(dev, ATA_DNXFER_FORCE_PIO0 | ATA_DNXFER_QUIET);
 	dev->class++;
+
+	/* From now till the next successful probe, ering is used to
+	 * track probe failures.  Clear accumulated device error info.
+	 */
+	ata_ering_clear(&dev->ering);
 }
 
 /**
@@ -2765,6 +2770,8 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link,
 						     readid_flags, dev->id);
 			switch (rc) {
 			case 0:
+				/* clear error info accumulated during probe */
+				ata_ering_clear(&dev->ering);
 				new_mask |= 1 << dev->devno;
 				break;
 			case -ENOENT:
diff --git a/include/linux/libata.h b/include/linux/libata.h
index bca3ba25f52a..9dcefee5843c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -580,7 +580,7 @@ struct ata_device {
 	acpi_handle		acpi_handle;
 	union acpi_object	*gtf_cache;
 #endif
-	/* n_sector is used as CLEAR_OFFSET, read comment above CLEAR_OFFSET */
+	/* n_sector is CLEAR_BEGIN, read comment above CLEAR_BEGIN */
 	u64			n_sectors;	/* size of device, if ATA */
 	unsigned int		class;		/* ATA_DEV_xxx */
 	unsigned long		unpark_deadline;
@@ -605,20 +605,22 @@ struct ata_device {
 	u16			heads;		/* Number of heads */
 	u16			sectors;	/* Number of sectors per track */
 
-	/* error history */
-	int			spdn_cnt;
-	struct ata_ering	ering;
-
 	union {
 		u16		id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */
 		u32		gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */
 	};
+
+	/* error history */
+	int			spdn_cnt;
+	/* ering is CLEAR_END, read comment above CLEAR_END */
+	struct ata_ering	ering;
 };
 
-/* Offset into struct ata_device.  Fields above it are maintained
- * acress device init.  Fields below are zeroed.
+/* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are
+ * cleared to zero on ata_dev_init().
  */
-#define ATA_DEVICE_CLEAR_OFFSET		offsetof(struct ata_device, n_sectors)
+#define ATA_DEVICE_CLEAR_BEGIN		offsetof(struct ata_device, n_sectors)
+#define ATA_DEVICE_CLEAR_END		offsetof(struct ata_device, ering)
 
 struct ata_eh_info {
 	struct ata_device	*dev;		/* offending device */
-- 
cgit v1.2.3


From 9062712fa9ed13b531dfc2228086650b8bd6a255 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 29 Jan 2009 20:31:36 +0900
Subject: libata: implement HORKAGE_1_5_GBPS and apply it to WD My Book

3Gbps is often much more prone to transmission failures.  It's usually
okay to let EH handle speed down after transmission failures but some
WD My Book drives completely shutdown after certain transmission
failures and after it only power cycling can revive them.  Combined
with the fact that external drives often end up with cable assembly
which is longer than usual and more likely to have intervening gender,
this makes these drives very likely to shutdown under certain
configurations virtually rendering them unusable.

This patch implements HOARKGE_1_5_GBPS and applies it to WD My Book
such that 1.5Gbps is forced once the device is identified.

Please take a look at the following bz for related reports.

  http://bugzilla.kernel.org/show_bug.cgi?id=9913

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-core.c | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/libata.h    |  1 +
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 3c5965d56c47..9fbf0595f3d4 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2232,6 +2232,40 @@ retry:
 	return rc;
 }
 
+static int ata_do_link_spd_horkage(struct ata_device *dev)
+{
+	struct ata_link *plink = ata_dev_phys_link(dev);
+	u32 target, target_limit;
+
+	if (!sata_scr_valid(plink))
+		return 0;
+
+	if (dev->horkage & ATA_HORKAGE_1_5_GBPS)
+		target = 1;
+	else
+		return 0;
+
+	target_limit = (1 << target) - 1;
+
+	/* if already on stricter limit, no need to push further */
+	if (plink->sata_spd_limit <= target_limit)
+		return 0;
+
+	plink->sata_spd_limit = target_limit;
+
+	/* Request another EH round by returning -EAGAIN if link is
+	 * going faster than the target speed.  Forward progress is
+	 * guaranteed by setting sata_spd_limit to target_limit above.
+	 */
+	if (plink->sata_spd > target) {
+		ata_dev_printk(dev, KERN_INFO,
+			       "applying link speed limit horkage to %s\n",
+			       sata_spd_string(target));
+		return -EAGAIN;
+	}
+	return 0;
+}
+
 static inline u8 ata_dev_knobble(struct ata_device *dev)
 {
 	struct ata_port *ap = dev->link->ap;
@@ -2322,6 +2356,10 @@ int ata_dev_configure(struct ata_device *dev)
 		return 0;
 	}
 
+	rc = ata_do_link_spd_horkage(dev);
+	if (rc)
+		return rc;
+
 	/* let ACPI work its magic */
 	rc = ata_acpi_on_devcfg(dev);
 	if (rc)
@@ -4223,6 +4261,9 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	/* Devices that do not need bridging limits applied */
 	{ "MTRON MSP-SATA*",		NULL,	ATA_HORKAGE_BRIDGE_OK, },
 
+	/* Devices which aren't very happy with higher link speeds */
+	{ "WD My Book",			NULL,	ATA_HORKAGE_1_5_GBPS, },
+
 	/* End Marker */
 	{ }
 };
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9dcefee5843c..5d87bc09a1f5 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -380,6 +380,7 @@ enum {
 	ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands
 						    not multiple of 16 bytes */
 	ATA_HORKAGE_FIRMWARE_WARN = (1 << 12),	/* firwmare update warning */
+	ATA_HORKAGE_1_5_GBPS	= (1 << 13),	/* force 1.5 Gbps */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3


From 35626129abcd6a7547e84c817ef5b6eff7a8758b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 2 Feb 2009 16:00:29 -0800
Subject: sched: add missing kernel-doc in sched.h

Add kernel-doc notation for @lock:

include/linux/sched.h:457: No description found for parameter 'lock'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a7c76388731..2127e959e0f4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -443,6 +443,7 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
+ * @lock:		lock for fields in this struct
  *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
-- 
cgit v1.2.3


From 97c44836cdec1ea713a15d84098a1a908157e68f Mon Sep 17 00:00:00 2001
From: "Timothy S. Nelson" <wayland@wayland.id.au>
Date: Fri, 30 Jan 2009 06:12:47 +1100
Subject: PCI: return error on failure to read PCI ROMs

This patch makes the ROM reading code return an error to user space if
the size of the ROM read is equal to 0.

The patch also emits a warnings if the contents of the ROM are invalid,
and documents the effects of the "enable" file on ROM reading.

Signed-off-by: Timothy S. Nelson <wayland@wayland.id.au>
Acked-by: Alex Villacis-Lasso <a_villacis@palosanto.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/filesystems/sysfs-pci.txt | 13 ++++++++++++-
 arch/ia64/sn/kernel/io_acpi_init.c      |  2 +-
 arch/ia64/sn/kernel/io_init.c           |  2 +-
 drivers/pci/pci-sysfs.c                 |  4 ++--
 drivers/pci/rom.c                       |  8 +++++---
 include/linux/pci.h                     |  2 +-
 6 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 68ef48839c04..9f8740ca3f3b 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -9,6 +9,7 @@ that support it.  For example, a given bus might look like this:
      |   |-- class
      |   |-- config
      |   |-- device
+     |   |-- enable
      |   |-- irq
      |   |-- local_cpus
      |   |-- resource
@@ -32,6 +33,7 @@ files, each with their own function.
        class		   PCI class (ascii, ro)
        config		   PCI config space (binary, rw)
        device		   PCI device (ascii, ro)
+       enable	           Whether the device is enabled (ascii, rw)
        irq		   IRQ number (ascii, ro)
        local_cpus	   nearby CPU mask (cpumask, ro)
        resource		   PCI resource host addresses (ascii, ro)
@@ -57,10 +59,19 @@ used to do actual device programming from userspace.  Note that some platforms
 don't support mmapping of certain resources, so be sure to check the return
 value from any attempted mmap.
 
+The 'enable' file provides a counter that indicates how many times the device 
+has been enabled.  If the 'enable' file currently returns '4', and a '1' is
+echoed into it, it will then return '5'.  Echoing a '0' into it will decrease
+the count.  Even when it returns to 0, though, some of the initialisation
+may not be reversed.  
+
 The 'rom' file is special in that it provides read-only access to the device's
 ROM file, if available.  It's disabled by default, however, so applications
 should write the string "1" to the file to enable it before attempting a read
-call, and disable it following the access by writing "0" to the file.
+call, and disable it following the access by writing "0" to the file.  Note
+that the device must be enabled for a rom read to return data succesfully.
+In the event a driver is not bound to the device, it can be enabled using the
+'enable' file, documented above.
 
 Accessing legacy resources through sysfs
 ----------------------------------------
diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c
index c5a214026a77..d0223abbbbd4 100644
--- a/arch/ia64/sn/kernel/io_acpi_init.c
+++ b/arch/ia64/sn/kernel/io_acpi_init.c
@@ -443,7 +443,7 @@ sn_acpi_slot_fixup(struct pci_dev *dev)
 		size = pci_resource_len(dev, PCI_ROM_RESOURCE);
 		addr = ioremap(pcidev_info->pdi_pio_mapped_addr[PCI_ROM_RESOURCE],
 			       size);
-		image_size = pci_get_rom_size(addr, size);
+		image_size = pci_get_rom_size(dev, addr, size);
 		dev->resource[PCI_ROM_RESOURCE].start = (unsigned long) addr;
 		dev->resource[PCI_ROM_RESOURCE].end =
 					(unsigned long) addr + image_size - 1;
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index 4e1801bad83a..e2eb2da60f96 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -269,7 +269,7 @@ sn_io_slot_fixup(struct pci_dev *dev)
 
 			rom = ioremap(pci_resource_start(dev, PCI_ROM_RESOURCE),
 				      size + 1);
-			image_size = pci_get_rom_size(rom, size + 1);
+			image_size = pci_get_rom_size(dev, rom, size + 1);
 			dev->resource[PCI_ROM_RESOURCE].end =
 				dev->resource[PCI_ROM_RESOURCE].start +
 				image_size - 1;
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index db7ec14fa719..dfc4e0ddf241 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -768,8 +768,8 @@ pci_read_rom(struct kobject *kobj, struct bin_attribute *bin_attr,
 		return -EINVAL;
 	
 	rom = pci_map_rom(pdev, &size);	/* size starts out as PCI window size */
-	if (!rom)
-		return 0;
+	if (!rom || !size)
+		return -EIO;
 		
 	if (off >= size)
 		count = 0;
diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c
index 132a78159b60..29cbe47f219f 100644
--- a/drivers/pci/rom.c
+++ b/drivers/pci/rom.c
@@ -63,7 +63,7 @@ void pci_disable_rom(struct pci_dev *pdev)
  * The PCI window size could be much larger than the
  * actual image size.
  */
-size_t pci_get_rom_size(void __iomem *rom, size_t size)
+size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size)
 {
 	void __iomem *image;
 	int last_image;
@@ -72,8 +72,10 @@ size_t pci_get_rom_size(void __iomem *rom, size_t size)
 	do {
 		void __iomem *pds;
 		/* Standard PCI ROMs start out with these bytes 55 AA */
-		if (readb(image) != 0x55)
+		if (readb(image) != 0x55) {
+			dev_err(&pdev->dev, "Invalid ROM contents\n");
 			break;
+		}
 		if (readb(image + 1) != 0xAA)
 			break;
 		/* get the PCI data structure and check its signature */
@@ -159,7 +161,7 @@ void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size)
 	 * size is much larger than the actual size of the ROM.
 	 * True size is important if the ROM is going to be copied.
 	 */
-	*size = pci_get_rom_size(rom, *size);
+	*size = pci_get_rom_size(pdev, rom, *size);
 	return rom;
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 48890cf3f96e..7bd624bfdcfd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -684,7 +684,7 @@ int pci_enable_rom(struct pci_dev *pdev);
 void pci_disable_rom(struct pci_dev *pdev);
 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
 void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
-size_t pci_get_rom_size(void __iomem *rom, size_t size);
+size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom, size_t size);
 
 /* Power management related routines */
 int pci_save_state(struct pci_dev *dev);
-- 
cgit v1.2.3


From 7b2cd92adc5430b0c1adeb120971852b4ea1ab08 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 5 Feb 2009 16:48:24 +1100
Subject: crypto: api - Fix zeroing on free

Geert Uytterhoeven pointed out that we're not zeroing all the
memory when freeing a transform.  This patch fixes it by calling
ksize to ensure that we zero everything in sight.

Reported-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c           | 20 ++++++++++----------
 include/linux/crypto.h |  7 ++++++-
 2 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 9975a7bd246c..efe77df6863f 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -557,34 +557,34 @@ err:
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_tfm);
- 
+
 /*
- *	crypto_free_tfm - Free crypto transform
+ *	crypto_destroy_tfm - Free crypto transform
+ *	@mem: Start of tfm slab
  *	@tfm: Transform to free
  *
- *	crypto_free_tfm() frees up the transform and any associated resources,
+ *	This function frees up the transform and any associated resources,
  *	then drops the refcount on the associated algorithm.
  */
-void crypto_free_tfm(struct crypto_tfm *tfm)
+void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
 {
 	struct crypto_alg *alg;
 	int size;
 
-	if (unlikely(!tfm))
+	if (unlikely(!mem))
 		return;
 
 	alg = tfm->__crt_alg;
-	size = sizeof(*tfm) + alg->cra_ctxsize;
+	size = ksize(mem);
 
 	if (!tfm->exit && alg->cra_exit)
 		alg->cra_exit(tfm);
 	crypto_exit_ops(tfm);
 	crypto_mod_put(alg);
-	memset(tfm, 0, size);
-	kfree(tfm);
+	memset(mem, 0, size);
+	kfree(mem);
 }
-
-EXPORT_SYMBOL_GPL(crypto_free_tfm);
+EXPORT_SYMBOL_GPL(crypto_destroy_tfm);
 
 int crypto_has_alg(const char *name, u32 type, u32 mask)
 {
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3bacd71509fb..1f2e9020acc6 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -552,7 +552,12 @@ struct crypto_tfm *crypto_alloc_tfm(const char *alg_name,
 				    const struct crypto_type *frontend,
 				    u32 type, u32 mask);
 struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
-void crypto_free_tfm(struct crypto_tfm *tfm);
+void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm);
+
+static inline void crypto_free_tfm(struct crypto_tfm *tfm)
+{
+	return crypto_destroy_tfm(tfm, tfm);
+}
 
 int alg_test(const char *driver, const char *alg, u32 type, u32 mask);
 
-- 
cgit v1.2.3


From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Feb 2009 12:24:15 +0100
Subject: signal: re-add dead task accumulation stats.

We're going to split the process wide cpu accounting into two parts:

 - clocks; which can take all the time they want since they run
           from user context.

 - timers; which need constant time tracing but can affort the overhead
           because they're default off -- and rare.

The clock readout will go back to a full sum of the thread group, for this
we need to re-add the exit stats that were removed in the initial itimer
rework (f06febc9: timers: fix itimer/many thread hang).

Furthermore, since that full sum can be rather slow for large thread groups
and we have the complete dead task stats, revert the do_notify_parent time
computation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 10 +++++++++-
 kernel/exit.c         |  3 +++
 kernel/fork.c         |  3 ++-
 kernel/signal.c       |  8 ++++----
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2127e959e0f4..2e0646a30314 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -559,7 +559,7 @@ struct signal_struct {
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
-	cputime_t cutime, cstime;
+	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -567,6 +567,14 @@ struct signal_struct {
 	unsigned long inblock, oublock, cinblock, coublock;
 	struct task_io_accounting ioac;
 
+	/*
+	 * Cumulative ns of schedule CPU time fo dead threads in the
+	 * group, not including a zombie group leader, (This only differs
+	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
+	 * other than jiffies.)
+	 */
+	unsigned long long sum_sched_runtime;
+
 	/*
 	 * We don't bother to synchronize most readers of this at all,
 	 * because there is no reader checking a limit that actually needs
diff --git a/kernel/exit.c b/kernel/exit.c
index f80dec3f1875..efd30ccf3858 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
+		sig->utime = cputime_add(sig->utime, task_utime(tsk));
+		sig->stime = cputime_add(sig->stime, task_stime(tsk));
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
+		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 242a706e7721..e8e854a04ad2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->tty_old_pgrp = NULL;
 	sig->tty = NULL;
 
-	sig->cutime = sig->cstime = cputime_zero;
+	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
 	task_io_accounting_init(&sig->ioac);
+	sig->sum_sched_runtime = 0;
 	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
diff --git a/kernel/signal.c b/kernel/signal.c
index b6b36768b758..2a74fe87c0dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
-	struct task_cputime cputime;
 	int ret = sig;
 
 	BUG_ON(sig == -1);
@@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
-	thread_group_cputime(tsk, &cputime);
-	info.si_utime = cputime_to_jiffies(cputime.utime);
-	info.si_stime = cputime_to_jiffies(cputime.stime);
+	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
+				tsk->signal->utime));
+	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
+				tsk->signal->stime));
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
-- 
cgit v1.2.3


From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Feb 2009 12:24:16 +0100
Subject: timers: split process wide cpu clocks/timers

Change the process wide cpu timers/clocks so that we:

 1) don't mess up the kernel with too many threads,
 2) don't have a per-cpu allocation for each process,
 3) have no impact when not used.

In order to accomplish this we're going to split it into two parts:

 - clocks; which can take all the time they want since they run
           from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID)

 - timers; which need constant time sampling but since they're
           explicity used, the user can pay the overhead.

The clock readout will go back to a full sum of the thread group, while the
timers will run of a global 'clock' that only runs when needed, so only
programs that make use of the facility pay the price.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h | 11 +++---
 include/linux/sched.h     | 54 +++++++++++++++------------
 kernel/itimer.c           |  4 +-
 kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched_stats.h      | 45 ++++++++++++----------
 5 files changed, 155 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index ea0ea1a4c36f..e752d973fa21 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -48,12 +48,11 @@ extern struct fs_struct init_fs;
 	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
-	.cputime	= { .totals = {					\
-		.utime = cputime_zero,					\
-		.stime = cputime_zero,					\
-		.sum_exec_runtime = 0,					\
-		.lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock),	\
-	}, },								\
+	.cputimer	= { 						\
+		.cputime = INIT_CPUTIME,				\
+		.running = 0,						\
+		.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+	},								\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e0646a30314..082d7619b3a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -443,7 +443,6 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
- * @lock:		lock for fields in this struct
  *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
@@ -454,23 +453,33 @@ struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
 	unsigned long long sum_exec_runtime;
-	spinlock_t lock;
 };
 /* Alternate field names when used to cache expirations. */
 #define prof_exp	stime
 #define virt_exp	utime
 #define sched_exp	sum_exec_runtime
 
+#define INIT_CPUTIME	\
+	(struct task_cputime) {					\
+		.utime = cputime_zero,				\
+		.stime = cputime_zero,				\
+		.sum_exec_runtime = 0,				\
+	}
+
 /**
- * struct thread_group_cputime - thread group interval timer counts
- * @totals:		thread group interval timers; substructure for
- *			uniprocessor kernel, per-cpu for SMP kernel.
+ * struct thread_group_cputimer - thread group interval timer counts
+ * @cputime:		thread group interval timers.
+ * @running:		non-zero when there are timers running and
+ * 			@cputime receives updates.
+ * @lock:		lock for fields in this struct.
  *
  * This structure contains the version of task_cputime, above, that is
- * used for thread group CPU clock calculations.
+ * used for thread group CPU timer calculations.
  */
-struct thread_group_cputime {
-	struct task_cputime totals;
+struct thread_group_cputimer {
+	struct task_cputime cputime;
+	int running;
+	spinlock_t lock;
 };
 
 /*
@@ -519,10 +528,10 @@ struct signal_struct {
 	cputime_t it_prof_incr, it_virt_incr;
 
 	/*
-	 * Thread group totals for process CPU clocks.
-	 * See thread_group_cputime(), et al, for details.
+	 * Thread group totals for process CPU timers.
+	 * See thread_group_cputimer(), et al, for details.
 	 */
-	struct thread_group_cputime cputime;
+	struct thread_group_cputimer cputimer;
 
 	/* Earliest-expiration cache. */
 	struct task_cputime cputime_expires;
@@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock)
 /*
  * Thread group CPU time accounting.
  */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 
 static inline
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
-	struct task_cputime *totals = &tsk->signal->cputime.totals;
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	unsigned long flags;
 
-	spin_lock_irqsave(&totals->lock, flags);
-	*times = *totals;
-	spin_unlock_irqrestore(&totals->lock, flags);
+	WARN_ON(!cputimer->running);
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	*times = cputimer->cputime;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
-	sig->cputime.totals = (struct task_cputime){
-		.utime = cputime_zero,
-		.stime = cputime_zero,
-		.sum_exec_runtime = 0,
-	};
-
-	spin_lock_init(&sig->cputime.totals.lock);
+	sig->cputimer.cputime = INIT_CPUTIME;
+	spin_lock_init(&sig->cputimer.lock);
+	sig->cputimer.running = 0;
 }
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6a5fe93dd8bd..58762f7077ec 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value)
 			struct task_cputime cputime;
 			cputime_t utime;
 
-			thread_group_cputime(tsk, &cputime);
+			thread_group_cputimer(tsk, &cputime);
 			utime = cputime.utime;
 			if (cputime_le(cval, utime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
 			struct task_cputime times;
 			cputime_t ptime;
 
-			thread_group_cputime(tsk, &times);
+			thread_group_cputimer(tsk, &times);
 			ptime = cputime_add(times.utime, times.stime);
 			if (cputime_le(cval, ptime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da94d7be..db107c9bbc05 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+	struct sighand_struct *sighand;
+	struct signal_struct *sig;
+	struct task_struct *t;
+
+	*times = INIT_CPUTIME;
+
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
+	if (!sighand)
+		goto out;
+
+	sig = tsk->signal;
+
+	t = tsk;
+	do {
+		times->utime = cputime_add(times->utime, t->utime);
+		times->stime = cputime_add(times->stime, t->stime);
+		times->sum_exec_runtime += t->se.sum_exec_runtime;
+
+		t = next_thread(t);
+	} while (t != tsk);
+
+	times->utime = cputime_add(times->utime, sig->utime);
+	times->stime = cputime_add(times->stime, sig->stime);
+	times->sum_exec_runtime += sig->sum_sched_runtime;
+out:
+	rcu_read_unlock();
+}
+
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
@@ -475,6 +506,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 					     now);
 }
 
+/*
+ * Enable the process wide cpu timer accounting.
+ *
+ * serialized using ->sighand->siglock
+ */
+static void start_process_timers(struct task_struct *tsk)
+{
+	tsk->signal->cputimer.running = 1;
+	barrier();
+}
+
+/*
+ * Release the process wide timer accounting -- timer stops ticking when
+ * nobody cares about it.
+ *
+ * serialized using ->sighand->siglock
+ */
+static void stop_process_timers(struct task_struct *tsk)
+{
+	tsk->signal->cputimer.running = 0;
+	barrier();
+}
+
 /*
  * Insert the timer on the appropriate list before any timers that
  * expire later.  This must be called with the tasklist_lock held
@@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	BUG_ON(!irqs_disabled());
 	spin_lock(&p->sighand->siglock);
 
+	if (!CPUCLOCK_PERTHREAD(timer->it_clock))
+		start_process_timers(p);
+
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
@@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk,
 	    sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
 	    list_empty(&timers[CPUCLOCK_VIRT]) &&
 	    cputime_eq(sig->it_virt_expires, cputime_zero) &&
-	    list_empty(&timers[CPUCLOCK_SCHED]))
+	    list_empty(&timers[CPUCLOCK_SCHED])) {
+		stop_process_timers(tsk);
 		return;
+	}
 
 	/*
 	 * Collect the current process totals.
 	 */
-	thread_group_cputime(tsk, &cputime);
+	thread_group_cputimer(tsk, &cputime);
 	utime = cputime.utime;
 	ptime = cputime_add(utime, cputime.stime);
 	sum_sched_runtime = cputime.sum_exec_runtime;
@@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (!task_cputime_zero(&sig->cputime_expires)) {
 		struct task_cputime group_sample;
 
-		thread_group_cputime(tsk, &group_sample);
+		thread_group_cputimer(tsk, &group_sample);
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
 	}
@@ -1328,6 +1387,33 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	}
 }
 
+/*
+ * Sample a process (thread group) timer for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_timer_sample_group(const clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputimer(p, &cputime);
+	switch (CPUCLOCK_WHICH(which_clock)) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		break;
+	case CPUCLOCK_VIRT:
+		cpu->cpu = cputime.utime;
+		break;
+	case CPUCLOCK_SCHED:
+		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		break;
+	}
+	return 0;
+}
+
 /*
  * Set one of the process-wide special case CPU timers.
  * The tsk->sighand->siglock must be held by the caller.
@@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	cpu_clock_sample_group(clock_idx, tsk, &now);
+	start_process_timers(tsk);
+	cpu_timer_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
 		if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8ab0cef8ecab..a8f93dd374e1 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 static inline void account_group_user_time(struct task_struct *tsk,
 					   cputime_t cputime)
 {
-	struct task_cputime *times;
-	struct signal_struct *sig;
+	struct thread_group_cputimer *cputimer;
 
 	/* tsk == current, ensure it is safe to use ->signal */
 	if (unlikely(tsk->exit_state))
 		return;
 
-	sig = tsk->signal;
-	times = &sig->cputime.totals;
+	cputimer = &tsk->signal->cputimer;
 
-	spin_lock(&times->lock);
-	times->utime = cputime_add(times->utime, cputime);
-	spin_unlock(&times->lock);
+	if (!cputimer->running)
+		return;
+
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.utime =
+		cputime_add(cputimer->cputime.utime, cputime);
+	spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk,
 static inline void account_group_system_time(struct task_struct *tsk,
 					     cputime_t cputime)
 {
-	struct task_cputime *times;
-	struct signal_struct *sig;
+	struct thread_group_cputimer *cputimer;
 
 	/* tsk == current, ensure it is safe to use ->signal */
 	if (unlikely(tsk->exit_state))
 		return;
 
-	sig = tsk->signal;
-	times = &sig->cputime.totals;
+	cputimer = &tsk->signal->cputimer;
+
+	if (!cputimer->running)
+		return;
 
-	spin_lock(&times->lock);
-	times->stime = cputime_add(times->stime, cputime);
-	spin_unlock(&times->lock);
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.stime =
+		cputime_add(cputimer->cputime.stime, cputime);
+	spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 static inline void account_group_exec_runtime(struct task_struct *tsk,
 					      unsigned long long ns)
 {
-	struct task_cputime *times;
+	struct thread_group_cputimer *cputimer;
 	struct signal_struct *sig;
 
 	sig = tsk->signal;
@@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (unlikely(!sig))
 		return;
 
-	times = &sig->cputime.totals;
+	cputimer = &sig->cputimer;
+
+	if (!cputimer->running)
+		return;
 
-	spin_lock(&times->lock);
-	times->sum_exec_runtime += ns;
-	spin_unlock(&times->lock);
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.sum_exec_runtime += ns;
+	spin_unlock(&cputimer->lock);
 }
-- 
cgit v1.2.3


From ac7b9004909d03d67016368093e81d37cae72895 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 4 Feb 2009 15:11:59 -0800
Subject: generic swap(): don't return a value from swap()

The swap() macro is accidentally retuning the value of its first argument.
Change it into a doesn't-return-anything macro before someone goes and
relies upon this behaviour.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <wfg@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 343df9ef2412..7fa371898e3e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -480,7 +480,8 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
 /*
  * swap - swap value of @a and @b
  */
-#define swap(a, b) ({ typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; })
+#define swap(a, b) \
+	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
 /**
  * container_of - cast a member of a structure out to the containing structure
-- 
cgit v1.2.3


From 1f5e31d7e55ac7fbd4ec5e5b20c8868b0e4564c9 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Wed, 4 Feb 2009 15:12:03 -0800
Subject: fbmem: don't call copy_from/to_user() with mutex held

Avoid calling copy_from/to_user() with fb_info->lock mutex held in fbmem
ioctl().

fb_mmap() is called under mm->mmap_sem (A) held, that also acquires
fb_info->lock (B); fb_ioctl() takes fb_info->lock (B) and does
copy_from/to_user() that might acquire mm->mmap_sem (A), causing a
deadlock.

NOTE: it doesn't push down the fb_info->lock in each own driver's
fb_ioctl(), so there are still potential deadlocks elsewhere.

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Cc: Dave Jones <davej@redhat.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Johannes Weiner <hannes@saeurebad.de>
Cc: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/fbcmap.c |  20 +++++---
 drivers/video/fbmem.c  | 135 +++++++++++++++++++++++++------------------------
 include/linux/fb.h     |  15 ++++++
 3 files changed, 98 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbcmap.c b/drivers/video/fbcmap.c
index 91b78e691505..f53b9f1d6aba 100644
--- a/drivers/video/fbcmap.c
+++ b/drivers/video/fbcmap.c
@@ -250,10 +250,6 @@ int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *info)
 	int rc, size = cmap->len * sizeof(u16);
 	struct fb_cmap umap;
 
-	if (cmap->start < 0 || (!info->fbops->fb_setcolreg &&
-			        !info->fbops->fb_setcmap))
-		return -EINVAL;
-
 	memset(&umap, 0, sizeof(struct fb_cmap));
 	rc = fb_alloc_cmap(&umap, cmap->len, cmap->transp != NULL);
 	if (rc)
@@ -262,11 +258,23 @@ int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *info)
 	    copy_from_user(umap.green, cmap->green, size) ||
 	    copy_from_user(umap.blue, cmap->blue, size) ||
 	    (cmap->transp && copy_from_user(umap.transp, cmap->transp, size))) {
-		fb_dealloc_cmap(&umap);
-		return -EFAULT;
+		rc = -EFAULT;
+		goto out;
 	}
 	umap.start = cmap->start;
+	if (!lock_fb_info(info)) {
+		rc = -ENODEV;
+		goto out;
+	}
+	if (cmap->start < 0 || (!info->fbops->fb_setcolreg &&
+				!info->fbops->fb_setcmap)) {
+		rc = -EINVAL;
+		goto out1;
+	}
 	rc = fb_set_cmap(&umap, info);
+out1:
+	unlock_fb_info(info);
+out:
 	fb_dealloc_cmap(&umap);
 	return rc;
 }
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 756efeb91abc..cfd9dce1ce0b 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1013,132 +1013,139 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 	struct fb_var_screeninfo var;
 	struct fb_fix_screeninfo fix;
 	struct fb_con2fbmap con2fb;
+	struct fb_cmap cmap_from;
 	struct fb_cmap_user cmap;
 	struct fb_event event;
 	void __user *argp = (void __user *)arg;
 	long ret = 0;
 
-	fb = info->fbops;
-	if (!fb)
-		return -ENODEV;
-
 	switch (cmd) {
 	case FBIOGET_VSCREENINFO:
-		ret = copy_to_user(argp, &info->var,
-				    sizeof(var)) ? -EFAULT : 0;
+		if (!lock_fb_info(info))
+			return -ENODEV;
+		var = info->var;
+		unlock_fb_info(info);
+
+		ret = copy_to_user(argp, &var, sizeof(var)) ? -EFAULT : 0;
 		break;
 	case FBIOPUT_VSCREENINFO:
-		if (copy_from_user(&var, argp, sizeof(var))) {
-			ret =  -EFAULT;
-			break;
-		}
+		if (copy_from_user(&var, argp, sizeof(var)))
+			return -EFAULT;
+		if (!lock_fb_info(info))
+			return -ENODEV;
 		acquire_console_sem();
 		info->flags |= FBINFO_MISC_USEREVENT;
 		ret = fb_set_var(info, &var);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
 		release_console_sem();
-		if (ret == 0 && copy_to_user(argp, &var, sizeof(var)))
+		unlock_fb_info(info);
+		if (!ret && copy_to_user(argp, &var, sizeof(var)))
 			ret = -EFAULT;
 		break;
 	case FBIOGET_FSCREENINFO:
-		ret = copy_to_user(argp, &info->fix,
-				    sizeof(fix)) ? -EFAULT : 0;
+		if (!lock_fb_info(info))
+			return -ENODEV;
+		fix = info->fix;
+		unlock_fb_info(info);
+
+		ret = copy_to_user(argp, &fix, sizeof(fix)) ? -EFAULT : 0;
 		break;
 	case FBIOPUTCMAP:
 		if (copy_from_user(&cmap, argp, sizeof(cmap)))
-			ret = -EFAULT;
-		else
-			ret = fb_set_user_cmap(&cmap, info);
+			return -EFAULT;
+		ret = fb_set_user_cmap(&cmap, info);
 		break;
 	case FBIOGETCMAP:
 		if (copy_from_user(&cmap, argp, sizeof(cmap)))
-			ret = -EFAULT;
-		else
-			ret = fb_cmap_to_user(&info->cmap, &cmap);
+			return -EFAULT;
+		if (!lock_fb_info(info))
+			return -ENODEV;
+		cmap_from = info->cmap;
+		unlock_fb_info(info);
+		ret = fb_cmap_to_user(&cmap_from, &cmap);
 		break;
 	case FBIOPAN_DISPLAY:
-		if (copy_from_user(&var, argp, sizeof(var))) {
-			ret = -EFAULT;
-			break;
-		}
+		if (copy_from_user(&var, argp, sizeof(var)))
+			return -EFAULT;
+		if (!lock_fb_info(info))
+			return -ENODEV;
 		acquire_console_sem();
 		ret = fb_pan_display(info, &var);
 		release_console_sem();
+		unlock_fb_info(info);
 		if (ret == 0 && copy_to_user(argp, &var, sizeof(var)))
-			ret = -EFAULT;
+			return -EFAULT;
 		break;
 	case FBIO_CURSOR:
 		ret = -EINVAL;
 		break;
 	case FBIOGET_CON2FBMAP:
 		if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
-			ret = -EFAULT;
-		else if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
-			ret = -EINVAL;
-		else {
-			con2fb.framebuffer = -1;
-			event.info = info;
-			event.data = &con2fb;
-			fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP,
-								&event);
-			ret = copy_to_user(argp, &con2fb,
-				    sizeof(con2fb)) ? -EFAULT : 0;
-		}
+			return -EFAULT;
+		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
+			return -EINVAL;
+		con2fb.framebuffer = -1;
+		event.data = &con2fb;
+
+		if (!lock_fb_info(info))
+			return -ENODEV;
+		event.info = info;
+		fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event);
+		unlock_fb_info(info);
+
+		ret = copy_to_user(argp, &con2fb, sizeof(con2fb)) ? -EFAULT : 0;
 		break;
 	case FBIOPUT_CON2FBMAP:
-		if (copy_from_user(&con2fb, argp, sizeof(con2fb))) {
-			ret = -EFAULT;
-			break;
-		}
-		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) {
-			ret = -EINVAL;
-			break;
-		}
-		if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) {
-			ret = -EINVAL;
-			break;
-		}
+		if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
+			return -EFAULT;
+		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
+			return -EINVAL;
+		if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX)
+			return -EINVAL;
 		if (!registered_fb[con2fb.framebuffer])
 			request_module("fb%d", con2fb.framebuffer);
 		if (!registered_fb[con2fb.framebuffer]) {
 			ret = -EINVAL;
 			break;
 		}
-		event.info = info;
 		event.data = &con2fb;
+		if (!lock_fb_info(info))
+			return -ENODEV;
+		event.info = info;
 		ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP,
 					      &event);
+		unlock_fb_info(info);
 		break;
 	case FBIOBLANK:
+		if (!lock_fb_info(info))
+			return -ENODEV;
 		acquire_console_sem();
 		info->flags |= FBINFO_MISC_USEREVENT;
 		ret = fb_blank(info, arg);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
 		release_console_sem();
-		break;;
+		unlock_fb_info(info);
+		break;
 	default:
-		if (fb->fb_ioctl == NULL)
-			ret = -ENOTTY;
-		else
+		if (!lock_fb_info(info))
+			return -ENODEV;
+		fb = info->fbops;
+		if (fb->fb_ioctl)
 			ret = fb->fb_ioctl(info, cmd, arg);
+		else
+			ret = -ENOTTY;
+		unlock_fb_info(info);
 	}
 	return ret;
 }
 
 static long fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-__acquires(&info->lock)
-__releases(&info->lock)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int fbidx = iminor(inode);
-	struct fb_info *info;
-	long ret;
+	struct fb_info *info = registered_fb[fbidx];
 
-	info = registered_fb[fbidx];
-	mutex_lock(&info->lock);
-	ret = do_fb_ioctl(info, cmd, arg);
-	mutex_unlock(&info->lock);
-	return ret;
+	return do_fb_ioctl(info, cmd, arg);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1257,8 +1264,6 @@ static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd,
 
 static long fb_compat_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
-__acquires(&info->lock)
-__releases(&info->lock)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int fbidx = iminor(inode);
@@ -1266,7 +1271,6 @@ __releases(&info->lock)
 	struct fb_ops *fb = info->fbops;
 	long ret = -ENOIOCTLCMD;
 
-	mutex_lock(&info->lock);
 	switch(cmd) {
 	case FBIOGET_VSCREENINFO:
 	case FBIOPUT_VSCREENINFO:
@@ -1292,7 +1296,6 @@ __releases(&info->lock)
 			ret = fb->fb_compat_ioctl(info, cmd, arg);
 		break;
 	}
-	mutex_unlock(&info->lock);
 	return ret;
 }
 #endif
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 818fe21257e8..31527e17076b 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -960,6 +960,21 @@ extern struct fb_info *registered_fb[FB_MAX];
 extern int num_registered_fb;
 extern struct class *fb_class;
 
+static inline int lock_fb_info(struct fb_info *info)
+{
+	mutex_lock(&info->lock);
+	if (!info->fbops) {
+		mutex_unlock(&info->lock);
+		return 0;
+	}
+	return 1;
+}
+
+static inline void unlock_fb_info(struct fb_info *info)
+{
+	mutex_unlock(&info->lock);
+}
+
 static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch,
 					   u8 *src, u32 s_pitch, u32 height)
 {
-- 
cgit v1.2.3


From 777c6c5f1f6e757ae49ecca2ed72d6b1f523c007 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 4 Feb 2009 15:12:14 -0800
Subject: wait: prevent exclusive waiter starvation

With exclusive waiters, every process woken up through the wait queue must
ensure that the next waiter down the line is woken when it has finished.

Interruptible waiters don't do that when aborting due to a signal.  And if
an aborting waiter is concurrently woken up through the waitqueue, noone
will ever wake up the next waiter.

This has been observed with __wait_on_bit_lock() used by
lock_page_killable(): the first contender on the queue was aborting when
the actual lock holder woke it up concurrently.  The aborted contender
didn't acquire the lock and therefor never did an unlock followed by
waking up the next waiter.

Add abort_exclusive_wait() which removes the process' wait descriptor from
the waitqueue, iff still queued, or wakes up the next waiter otherwise.
It does so under the waitqueue lock.  Racing with a wake up means the
aborting process is either already woken (removed from the queue) and will
wake up the next waiter, or it will remove itself from the queue and the
concurrent wake up will apply to the next waiter after it.

Use abort_exclusive_wait() in __wait_event_interruptible_exclusive() and
__wait_on_bit_lock() when they were interrupted by other means than a wake
up through the queue.

[akpm@linux-foundation.org: coding-style fixes]
Reported-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Mentored-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chuck Lever <cel@citi.umich.edu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>		["after some testing"]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h | 11 ++++++++--
 kernel/sched.c       |  4 ++--
 kernel/wait.c        | 59 +++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 63 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index ef609f842fac..a210ede73b56 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,6 +132,8 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 	list_del(&old->task_list);
 }
 
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
 extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -333,16 +335,19 @@ do {									\
 	for (;;) {							\
 		prepare_to_wait_exclusive(&wq, &__wait,			\
 					TASK_INTERRUPTIBLE);		\
-		if (condition)						\
+		if (condition) {					\
+			finish_wait(&wq, &__wait);			\
 			break;						\
+		}							\
 		if (!signal_pending(current)) {				\
 			schedule();					\
 			continue;					\
 		}							\
 		ret = -ERESTARTSYS;					\
+		abort_exclusive_wait(&wq, &__wait, 			\
+				TASK_INTERRUPTIBLE, NULL);		\
 		break;							\
 	}								\
-	finish_wait(&wq, &__wait);					\
 } while (0)
 
 #define wait_event_interruptible_exclusive(wq, condition)		\
@@ -431,6 +436,8 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
 void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+			unsigned int mode, void *key);
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 242d0d47a70d..8ee437a5ec1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4697,8 +4697,8 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			     int nr_exclusive, int sync, void *key)
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
 
diff --git a/kernel/wait.c b/kernel/wait.c
index cd87131f2fc2..42a2dbc181c8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
+/*
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
 {
 	unsigned long flags;
@@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
 }
 EXPORT_SYMBOL(finish_wait);
 
+/*
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @state: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and noone wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+			unsigned int mode, void *key)
+{
+	unsigned long flags;
+
+	__set_current_state(TASK_RUNNING);
+	spin_lock_irqsave(&q->lock, flags);
+	if (!list_empty(&wait->task_list))
+		list_del_init(&wait->task_list);
+	else if (waitqueue_active(q))
+		__wake_up_common(q, mode, 1, 0, key);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
 	int ret = default_wake_function(wait, mode, sync, key);
@@ -177,17 +219,20 @@ int __sched
 __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
 			int (*action)(void *), unsigned mode)
 {
-	int ret = 0;
-
 	do {
+		int ret;
+
 		prepare_to_wait_exclusive(wq, &q->wait, mode);
-		if (test_bit(q->key.bit_nr, q->key.flags)) {
-			if ((ret = (*action)(q->key.flags)))
-				break;
-		}
+		if (!test_bit(q->key.bit_nr, q->key.flags))
+			continue;
+		ret = action(q->key.flags);
+		if (!ret)
+			continue;
+		abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+		return ret;
 	} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
 	finish_wait(wq, &q->wait);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(__wait_on_bit_lock);
 
-- 
cgit v1.2.3


From 7d8e23df69820e6be42bcc41d441f4860e8c76f7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 6 Feb 2009 14:57:51 +0100
Subject: timers: split process wide cpu clocks/timers, remove spurious warning

Mike Galbraith reported that the new warning in thread_group_cputimer()
triggers en masse with Amarok running.

Oleg Nesterov observed:

  Can't fastpath_timer_check()->thread_group_cputimer() have the
  false warning too? Suppose we had the timer, then posix_cpu_timer_del()
  removes this timer, but task_cputime_zero(&sig->cputime_expires) still
  not true.

Remove the spurious debug warning.

Reported-by: Mike Galbraith <efault@gmx.de>
Explained-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 082d7619b3a1..79392916d6c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2208,8 +2208,6 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	unsigned long flags;
 
-	WARN_ON(!cputimer->running);
-
 	spin_lock_irqsave(&cputimer->lock, flags);
 	*times = cputimer->cputime;
 	spin_unlock_irqrestore(&cputimer->lock, flags);
-- 
cgit v1.2.3


From 7f9a50a5b89b87f8e754f59ae9968da28be618a5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 7 Feb 2009 18:15:56 +1030
Subject: module: remove over-zealous check in __module_get()

Impact: fix spurious BUG_ON() triggered under load

module_refcount() isn't reliable outside stop_machine(), as demonstrated
by Karsten Keil <kkeil@suse.de>, networking can trigger it under load
(an inc on one cpu and dec on another while module_refcount() is tallying
 can give false results, for example).

Almost noone should be using __module_get, but that's another issue.

Cc: Karsten Keil <kkeil@suse.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/module.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index f3b8329eb5b8..145a75528cc1 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -407,7 +407,6 @@ static inline local_t *__module_ref_addr(struct module *mod, int cpu)
 static inline void __module_get(struct module *module)
 {
 	if (module) {
-		BUG_ON(module_refcount(module) == 0);
 		local_inc(__module_ref_addr(module, get_cpu()));
 		put_cpu();
 	}
-- 
cgit v1.2.3


From 766ccb9ed406c230d13c145def08ebea1b932982 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 20 Jan 2009 15:31:31 +0100
Subject: async: Rename _special -> _domain for clarity.

Rename the async_*_special() functions to async_*_domain(), which
describes the purpose of these functions much better.
[Broke up long lines to silence checkpatch]

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/super.c            |  4 ++--
 include/linux/async.h |  8 +++++---
 kernel/async.c        | 41 ++++++++++++++++++++++-------------------
 3 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 645e5403f2a0..61dce001dd57 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb)
 		/*
 		 * wait for asynchronous fs operations to finish before going further
 		 */
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
@@ -470,7 +470,7 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
diff --git a/include/linux/async.h b/include/linux/async.h
index c4ecacd0b327..68a9530196f2 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -17,9 +17,11 @@ typedef u64 async_cookie_t;
 typedef void (async_func_ptr) (void *data, async_cookie_t cookie);
 
 extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data);
-extern async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *list);
+extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+					    struct list_head *list);
 extern void async_synchronize_full(void);
-extern void async_synchronize_full_special(struct list_head *list);
+extern void async_synchronize_full_domain(struct list_head *list);
 extern void async_synchronize_cookie(async_cookie_t cookie);
-extern void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *list);
+extern void async_synchronize_cookie_domain(async_cookie_t cookie,
+					    struct list_head *list);
 
diff --git a/kernel/async.c b/kernel/async.c
index b5f0d4b94937..e23399d88bac 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -224,22 +224,23 @@ async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
 EXPORT_SYMBOL_GPL(async_schedule);
 
 /**
- * async_schedule_special - schedule a function for asynchronous execution with a special running queue
+ * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
  * @ptr: function to execute asynchronously
  * @data: data pointer to pass to the function
- * @running: list head to add to while running
+ * @running: running list for the domain
  *
  * Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_special() functions
- * to wait on a special running queue rather than on the global running
- * queue.
+ * @running may be used in the async_synchronize_*_domain() functions
+ * to wait within a certain synchronization domain rather than globally.
+ * A synchronization domain is specified via the running queue @running to use.
  * Note: This function may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+				     struct list_head *running)
 {
 	return __async_schedule(ptr, data, running);
 }
-EXPORT_SYMBOL_GPL(async_schedule_special);
+EXPORT_SYMBOL_GPL(async_schedule_domain);
 
 /**
  * async_synchronize_full - synchronize all asynchronous function calls
@@ -255,27 +256,29 @@ void async_synchronize_full(void)
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
 /**
- * async_synchronize_full_special - synchronize all asynchronous function calls for a running list
+ * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
  * @list: running list to synchronize on
  *
- * This function waits until all asynchronous function calls for the running
- * list @list have been done.
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list have been done.
  */
-void async_synchronize_full_special(struct list_head *list)
+void async_synchronize_full_domain(struct list_head *list)
 {
-	async_synchronize_cookie_special(next_cookie, list);
+	async_synchronize_cookie_domain(next_cookie, list);
 }
-EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
 /**
- * async_synchronize_cookie_special - synchronize asynchronous function calls on a running list with cookie checkpointing
+ * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
  * @cookie: async_cookie_t to use as checkpoint
  * @running: running list to synchronize on
  *
- * This function waits until all asynchronous function calls for the running
- * list @list submitted prior to @cookie have been done.
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list submitted
+ * prior to @cookie have been done.
  */
-void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie,
+				     struct list_head *running)
 {
 	ktime_t starttime, delta, endtime;
 
@@ -295,7 +298,7 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
 			(long long)ktime_to_ns(delta) >> 10);
 	}
 }
-EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
 
 /**
  * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
@@ -306,7 +309,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
  */
 void async_synchronize_cookie(async_cookie_t cookie)
 {
-	async_synchronize_cookie_special(cookie, &async_running);
+	async_synchronize_cookie_domain(cookie, &async_running);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
 
-- 
cgit v1.2.3


From a5ef7ca0e2636bad0ccd07b996d775348ae2b65e Mon Sep 17 00:00:00 2001
From: Kyle McMartin <kyle@redhat.com>
Date: Sun, 8 Feb 2009 17:39:58 -0500
Subject: x86: spinlocks: define dummy __raw_spin_is_contended

Architectures other than mips and x86 are not using ticket spinlocks.
Therefore, the contention on the lock is meaningless, since there is
nobody known to be waiting on it (arguably /fairly/ unfair locks).

Dummy it out to return 0 on other architectures.

Signed-off-by: Kyle McMartin <kyle@redhat.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/include/asm/spinlock.h | 1 +
 arch/x86/include/asm/paravirt.h  | 1 +
 arch/x86/include/asm/spinlock.h  | 1 +
 include/linux/spinlock.h         | 5 +++++
 4 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 1a1f320c30d8..0884947ebe27 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -51,6 +51,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
 
 	return (((counters >> 14) - counters) & 0x1fff) > 1;
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aedc..c09a14127584 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1402,6 +1402,7 @@ static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
 {
 	return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
 {
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index d17c91981da2..8247e94ac6b1 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -245,6 +245,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
 {
 	return __ticket_spin_is_contended(lock);
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index e0c0fccced46..a0c66a2e00ad 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -124,7 +124,12 @@ do {								\
 #ifdef CONFIG_GENERIC_LOCKBREAK
 #define spin_is_contended(lock) ((lock)->break_lock)
 #else
+
+#ifdef __raw_spin_is_contended
 #define spin_is_contended(lock)	__raw_spin_is_contended(&(lock)->raw_lock)
+#else
+#define spin_is_contended(lock)	(((void)(lock), 0))
+#endif /*__raw_spin_is_contended*/
 #endif
 
 /**
-- 
cgit v1.2.3


From 43a990765a9e874350bae1009366d00809dbc9d8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 10 Feb 2009 00:00:22 +0100
Subject: sound: Remove OSSlib stuff from linux/soundcard.h

Removed OSSlib stuff from linux/soundcard.h to fix the warnings for
'make headers_check'.

This patch breaks building against OSSlib with the kernel headers
instead of its own headers. It should still work with any
version of the library from the 2003 onwards which provide
their own headers for the latest interface.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/soundcard.h | 74 +++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soundcard.h b/include/linux/soundcard.h
index 523d069c862c..1904afedb82f 100644
--- a/include/linux/soundcard.h
+++ b/include/linux/soundcard.h
@@ -1045,50 +1045,36 @@ typedef struct mixer_vol_table {
  */
 #define LOCL_STARTAUDIO		1
 
-#if (!defined(__KERNEL__) && !defined(KERNEL) && !defined(INKERNEL) && !defined(_KERNEL)) || defined(USE_SEQ_MACROS) 
+#if !defined(__KERNEL__) || defined(USE_SEQ_MACROS)
 /*
  *	Some convenience macros to simplify programming of the
  *	/dev/sequencer interface
  *
- *	These macros define the API which should be used when possible.
+ *	This is a legacy interface for applications written against
+ *	the OSSlib-3.8 style interface. It is no longer possible
+ *	to actually link against OSSlib with this header, but we
+ *	still provide these macros for programs using them.
+ *
+ *	If you want to use OSSlib, it is recommended that you get
+ *	the GPL version of OSS-4.x and build against that version
+ *	of the header.
+ *
+ *	We redefine the extern keyword so that make headers_check
+ *	does not complain about SEQ_USE_EXTBUF.
  */
 #define SEQ_DECLAREBUF()		SEQ_USE_EXTBUF()
 
 void seqbuf_dump(void);	/* This function must be provided by programs */
 
-extern int OSS_init(int seqfd, int buflen);
-extern void OSS_seqbuf_dump(int fd, unsigned char *buf, int buflen);
-extern void OSS_seq_advbuf(int len, int fd, unsigned char *buf, int buflen);
-extern void OSS_seq_needbuf(int len, int fd, unsigned char *buf, int buflen);
-extern void OSS_patch_caching(int dev, int chn, int patch,
-			      int fd, unsigned char *buf, int buflen);
-extern void OSS_drum_caching(int dev, int chn, int patch,
-			      int fd, unsigned char *buf, int buflen);
-extern void OSS_write_patch(int fd, unsigned char *buf, int len);
-extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
-
 #define SEQ_PM_DEFINES int __foo_bar___
-#ifdef OSSLIB
-#  define SEQ_USE_EXTBUF() \
-		extern unsigned char *_seqbuf; \
-		extern int _seqbuflen;extern int _seqbufptr
-#  define SEQ_DEFINEBUF(len) SEQ_USE_EXTBUF();static int _requested_seqbuflen=len
-#  define _SEQ_ADVBUF(len) OSS_seq_advbuf(len, seqfd, _seqbuf, _seqbuflen)
-#  define _SEQ_NEEDBUF(len) OSS_seq_needbuf(len, seqfd, _seqbuf, _seqbuflen)
-#  define SEQ_DUMPBUF() OSS_seqbuf_dump(seqfd, _seqbuf, _seqbuflen)
-
-#  define SEQ_LOAD_GMINSTR(dev, instr) \
-		OSS_patch_caching(dev, -1, instr, seqfd, _seqbuf, _seqbuflen)
-#  define SEQ_LOAD_GMDRUM(dev, drum) \
-		OSS_drum_caching(dev, -1, drum, seqfd, _seqbuf, _seqbuflen)
-#else /* !OSSLIB */
-
-#  define SEQ_LOAD_GMINSTR(dev, instr)
-#  define SEQ_LOAD_GMDRUM(dev, drum)
-
-#  define SEQ_USE_EXTBUF() \
-		extern unsigned char _seqbuf[]; \
-		extern int _seqbuflen;extern int _seqbufptr
+
+#define SEQ_LOAD_GMINSTR(dev, instr)
+#define SEQ_LOAD_GMDRUM(dev, drum)
+
+#define _SEQ_EXTERN extern
+#define SEQ_USE_EXTBUF() \
+		_SEQ_EXTERN unsigned char _seqbuf[]; \
+		_SEQ_EXTERN int _seqbuflen; _SEQ_EXTERN int _seqbufptr
 
 #ifndef USE_SIMPLE_MACROS
 /* Sample seqbuf_dump() implementation:
@@ -1131,7 +1117,6 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
  */
 #define _SEQ_NEEDBUF(len)	/* empty */
 #endif
-#endif /* !OSSLIB */
 
 #define SEQ_VOLUME_MODE(dev, mode)	{_SEQ_NEEDBUF(8);\
 					_seqbuf[_seqbufptr] = SEQ_EXTENDED;\
@@ -1215,14 +1200,8 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
 		_CHN_COMMON(dev, MIDI_CHN_PRESSURE, chn, pressure, 0, 0)
 
 #define SEQ_SET_PATCH SEQ_PGM_CHANGE
-#ifdef OSSLIB
-#   define SEQ_PGM_CHANGE(dev, chn, patch) \
-		{OSS_patch_caching(dev, chn, patch, seqfd, _seqbuf, _seqbuflen); \
-		 _CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0);}
-#else
-#   define SEQ_PGM_CHANGE(dev, chn, patch) \
+#define SEQ_PGM_CHANGE(dev, chn, patch) \
 		_CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0)
-#endif
 
 #define SEQ_CONTROL(dev, chn, controller, value) \
 		_CHN_COMMON(dev, MIDI_CTL_CHANGE, chn, controller, 0, value)
@@ -1300,19 +1279,12 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
 /*
  * Patch loading.
  */
-#ifdef OSSLIB
-#   define SEQ_WRPATCH(patchx, len) \
-		OSS_write_patch(seqfd, (char*)(patchx), len)
-#   define SEQ_WRPATCH2(patchx, len) \
-		OSS_write_patch2(seqfd, (char*)(patchx), len)
-#else
-#   define SEQ_WRPATCH(patchx, len) \
+#define SEQ_WRPATCH(patchx, len) \
 		{if (_seqbufptr) SEQ_DUMPBUF();\
 		 if (write(seqfd, (char*)(patchx), len)==-1) \
 		    perror("Write patch: /dev/sequencer");}
-#   define SEQ_WRPATCH2(patchx, len) \
+#define SEQ_WRPATCH2(patchx, len) \
 		(SEQ_DUMPBUF(), write(seqfd, (char*)(patchx), len))
-#endif
 
 #endif
 #endif
-- 
cgit v1.2.3


From 7f5aa215088b817add9c71914b83650bdd49f8a9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 10 Feb 2009 11:15:34 -0500
Subject: jbd2: Avoid possible NULL dereference in
 jbd2_journal_begin_ordered_truncate()

If we race with commit code setting i_transaction to NULL, we could
possibly dereference it.  Proper locking requires the journal pointer
(to access journal->j_list_lock), which we don't have.  So we have to
change the prototype of the function so that filesystem passes us the
journal pointer.  Also add a more detailed comment about why the
function jbd2_journal_begin_ordered_truncate() does what it does and
how it should be used.

Thanks to Dan Carpenter <error27@gmail.com> for pointing to the
suspitious code.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Joel Becker <joel.becker@oracle.com>
CC: linux-ext4@vger.kernel.org
CC: ocfs2-devel@oss.oracle.com
CC: mfasheh@suse.de
CC: Dan Carpenter <error27@gmail.com>
---
 fs/ext4/inode.c       |  6 ++++--
 fs/jbd2/transaction.c | 42 +++++++++++++++++++++++++++++++-----------
 fs/ocfs2/journal.h    |  6 ++++--
 include/linux/jbd2.h  |  3 ++-
 4 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20be1329..658c4a7f2578 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+					EXT4_SB(inode->i_sb)->s_journal,
+					&EXT4_I(inode)->jinode,
+					new_size);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e347ed7d..28ce21d8598e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
 }
 
 /*
- * This function must be called when inode is journaled in ordered mode
- * before truncation happens. It starts writeout of truncated part in
- * case it is in the committing transaction so that we stand to ordered
- * mode consistency guarantees.
+ * File truncate and transaction commit interact with each other in a
+ * non-trivial way.  If a transaction writing data block A is
+ * committing, we cannot discard the data by truncate until we have
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
  */
-int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+					struct jbd2_inode *jinode,
 					loff_t new_size)
 {
-	journal_t *journal;
-	transaction_t *commit_trans;
+	transaction_t *inode_trans, *commit_trans;
 	int ret = 0;
 
-	if (!inode->i_transaction && !inode->i_next_transaction)
+	/* This is a quick check to avoid locking if not necessary */
+	if (!jinode->i_transaction)
 		goto out;
-	journal = inode->i_transaction->t_journal;
+	/* Locks are here just to force reading of recent values, it is
+	 * enough that the transaction was not committing before we started
+	 * a transaction adding the inode to orphan list */
 	spin_lock(&journal->j_state_lock);
 	commit_trans = journal->j_committing_transaction;
 	spin_unlock(&journal->j_state_lock);
-	if (inode->i_transaction == commit_trans) {
-		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+	spin_lock(&journal->j_list_lock);
+	inode_trans = jinode->i_transaction;
+	spin_unlock(&journal->j_list_lock);
+	if (inode_trans == commit_trans) {
+		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
 			new_size, LLONG_MAX);
 		if (ret)
 			jbd2_journal_abort(journal, ret);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e1307c..172850a9a12a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
 					       loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+				OCFS2_SB(inode->i_sb)->journal->j_journal,
+				&OCFS2_I(inode)->ip_jinode,
+				new_size);
 }
 
 #endif /* OCFS2_JOURNAL_H */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b28b37eb11c6..4d248b3f1323 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1150,7 +1150,8 @@ extern int	   jbd2_journal_clear_err  (journal_t *);
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
 extern int	   jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
-extern int	   jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
+extern int	   jbd2_journal_begin_ordered_truncate(journal_t *journal,
+				struct jbd2_inode *inode, loff_t new_size);
 extern void	   jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
 extern void	   jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
 
-- 
cgit v1.2.3


From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 10 Feb 2009 14:02:27 +0000
Subject: Do not account for the address space used by hugetlbfs using
 VM_ACCOUNT

When overcommit is disabled, the core VM accounts for pages used by anonymous
shared, private mappings and special mappings. It keeps track of VMAs that
should be accounted for with VM_ACCOUNT and VMAs that never had a reserve
with VM_NORESERVE.

Overcommit for hugetlbfs is much riskier than overcommit for base pages
due to contiguity requirements. It avoids overcommiting on both shared and
private mappings using reservation counters that are checked and updated
during mmap(). This ensures (within limits) that hugepages exist in the
future when faults occurs or it is too easy to applications to be SIGKILLed.

As hugetlbfs makes its own reservations of a different unit to the base page
size, VM_ACCOUNT should never be set. Even if the units were correct, we would
double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may
be set because an application can request no reserves be made for hugetlbfs
at the risk of getting killed later.

With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and
VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This
breaks the accounting for both the core VM and hugetlbfs, can trigger an
OOM storm when hugepage pools are too small lockups and corrupted counters
otherwise are used. This patch brings hugetlbfs more in line with how the
core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    |  8 +++++---
 include/linux/hugetlb.h |  5 +++--
 include/linux/mm.h      |  3 +--
 ipc/shm.c               |  8 +++++---
 mm/fremap.c             |  2 +-
 mm/hugetlb.c            | 39 +++++++++++++++++++++++++--------------
 mm/mmap.c               | 38 ++++++++++++++++++++++----------------
 mm/mprotect.c           |  5 +++--
 8 files changed, 65 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	if (hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma))
+				len >> huge_page_shift(h), vma,
+				vma->vm_flags))
 		goto out;
 
 	ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
 			can_do_mlock());
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 
 	error = -ENOMEM;
 	if (hugetlb_reserve_pages(inode, 0,
-			size >> huge_page_shift(hstate_inode(inode)), NULL))
+			size >> huge_page_shift(hstate_inode(inode)), NULL,
+			acctflag))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f1d2fba19ea0..af09660001c7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
-						struct vm_area_struct *vma);
+						struct vm_area_struct *vma,
+						int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long hugepages_treat_as_movable;
@@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t);
+struct file *hugetlb_file_setup(const char *name, size_t, int);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8ddc98b8405..323561582c10 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long flag, unsigned long pgoff);
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long flags,
-	unsigned int vm_flags, unsigned long pgoff,
-	int accountable);
+	unsigned int vm_flags, unsigned long pgoff);
 
 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
diff --git a/ipc/shm.c b/ipc/shm.c
index f8f69fad3a27..05d51d2a792c 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	struct file * file;
 	char name[13];
 	int id;
+	int acctflag = 0;
 
 	if (size < SHMMIN || size > ns->shm_ctlmax)
 		return -EINVAL;
@@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 
 	sprintf (name, "SYSV%08x", key);
 	if (shmflg & SHM_HUGETLB) {
-		/* hugetlb_file_setup takes care of mlock user accounting */
-		file = hugetlb_file_setup(name, size);
+		/* hugetlb_file_setup applies strict accounting */
+		if (shmflg & SHM_NORESERVE)
+			acctflag = VM_NORESERVE;
+		file = hugetlb_file_setup(name, size, acctflag);
 		shp->mlock_user = current_user();
 	} else {
-		int acctflag = 0;
 		/*
 		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
 	 	 * if it's asked for.
diff --git a/mm/fremap.c b/mm/fremap.c
index 736ba7f3306a..b6ec85abbb39 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			flags &= MAP_NONBLOCK;
 			get_file(file);
 			addr = mmap_region(file, start, size,
-					flags, vma->vm_flags, pgoff, 1);
+					flags, vma->vm_flags, pgoff);
 			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 618e98304080..207464209546 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 
 int hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
-					struct vm_area_struct *vma)
+					struct vm_area_struct *vma,
+					int acctflag)
 {
-	long ret, chg;
+	long ret = 0, chg;
 	struct hstate *h = hstate_inode(inode);
 
-	if (vma && vma->vm_flags & VM_NORESERVE)
-		return 0;
-
 	/*
 	 * Shared mappings base their reservation on the number of pages that
 	 * are already allocated on behalf of the file. Private mappings need
@@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 */
 	if (!vma || vma->vm_flags & VM_SHARED)
 		chg = region_chg(&inode->i_mapping->private_list, from, to);
-	else {
-		struct resv_map *resv_map = resv_map_alloc();
-		if (!resv_map)
-			return -ENOMEM;
-
+	else
 		chg = to - from;
 
-		set_vma_resv_map(vma, resv_map);
-		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
-	}
-
 	if (chg < 0)
 		return chg;
 
 	if (hugetlb_get_quota(inode->i_mapping, chg))
 		return -ENOSPC;
+
+	/*
+	 * Only apply hugepage reservation if asked. We still have to
+	 * take the filesystem quota because it is an upper limit
+	 * defined for the mount and not necessarily memory as a whole
+	 */
+	if (acctflag & VM_NORESERVE) {
+		reset_vma_resv_huge_pages(vma);
+		return 0;
+	}
+
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
@@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode,
 	}
 	if (!vma || vma->vm_flags & VM_SHARED)
 		region_add(&inode->i_mapping->private_list, from, to);
+	else {
+		struct resv_map *resv_map = resv_map_alloc();
+
+		if (!resv_map)
+			return -ENOMEM;
+
+		set_vma_resv_map(vma, resv_map);
+		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+	}
+
 	return 0;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 214b6a258eeb..eb1270bebe67 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	struct inode *inode;
 	unsigned int vm_flags;
 	int error;
-	int accountable = 1;
 	unsigned long reqprot = prot;
 
 	/*
@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
 			}
-			if (is_file_hugepages(file))
-				accountable = 0;
 
 			if (!file->f_op || !file->f_op->mmap)
 				return -ENODEV;
@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (error)
 		return error;
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff,
-			   accountable);
+	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
 }
 EXPORT_SYMBOL(do_mmap_pgoff);
 
@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 
 /*
  * We account for memory if it's a private writeable mapping,
- * and VM_NORESERVE wasn't set.
+ * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 {
+	/*
+	 * hugetlb has its own accounting separate from the core VM
+	 * VM_HUGETLB may not be set yet so we cannot check for that flag.
+	 */
+	if (file && is_file_hugepages(file))
+		return 0;
+
 	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
-			  unsigned int vm_flags, unsigned long pgoff,
-			  int accountable)
+			  unsigned int vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1128,18 +1130,22 @@ munmap_back:
 
 	/*
 	 * Set 'VM_NORESERVE' if we should not account for the
-	 * memory use of this mapping. We only honor MAP_NORESERVE
-	 * if we're allowed to overcommit memory.
+	 * memory use of this mapping.
 	 */
-	if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-		vm_flags |= VM_NORESERVE;
-	if (!accountable)
-		vm_flags |= VM_NORESERVE;
+	if ((flags & MAP_NORESERVE)) {
+		/* We honor MAP_NORESERVE if allowed to overcommit */
+		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+			vm_flags |= VM_NORESERVE;
+
+		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
+		if (file && is_file_hugepages(file))
+			vm_flags |= VM_NORESERVE;
+	}
 
 	/*
 	 * Private writable mapping: check memory availability
 	 */
-	if (accountable_mapping(vm_flags)) {
+	if (accountable_mapping(file, vm_flags)) {
 		charged = len >> PAGE_SHIFT;
 		if (security_vm_enough_memory(charged))
 			return -ENOMEM;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abe2694e13f4..258197b76fb4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
-	 * make it unwritable again.
+	 * make it unwritable again. hugetlb mapping were accounted for
+	 * even if read-only so there is no need to account for them here
 	 */
 	if (newflags & VM_WRITE) {
-		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
 						VM_SHARED|VM_NORESERVE))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
-- 
cgit v1.2.3


From 1db8508cf483dc1ecf66141f90a7c03659d69512 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Tue, 10 Feb 2009 23:27:32 +0100
Subject: hugetlbfs: fix build failure with !CONFIG_HUGETLBFS

Fix regression due to 5a6fe125950676015f5108fb71b2a67441755003,
"Do not account for the address space used by hugetlbfs using VM_ACCOUNT"
which added an argument to the function hugetlb_file_setup() but not to
the macro hugetlb_file_setup().

Reported-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index af09660001c7..03be7f29ca01 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -159,9 +159,9 @@ static inline void set_file_hugepages(struct file *file)
 }
 #else /* !CONFIG_HUGETLBFS */
 
-#define is_file_hugepages(file)		0
-#define set_file_hugepages(file)	BUG()
-#define hugetlb_file_setup(name,size)	ERR_PTR(-ENOSYS)
+#define is_file_hugepages(file)			0
+#define set_file_hugepages(file)		BUG()
+#define hugetlb_file_setup(name,size,acctflag)	ERR_PTR(-ENOSYS)
 
 #endif /* !CONFIG_HUGETLBFS */
 
-- 
cgit v1.2.3


From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 10 Feb 2009 16:37:31 +0100
Subject: timers: split process wide cpu clocks/timers, fix

To decrease the chance of a missed enable, always enable the timer when we
sample it, we'll always disable it when we find that there are no active timers
in the jiffy tick.

This fixes a flood of warnings reported by Mike Galbraith.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     |  1 +
 kernel/posix-cpu-timers.c | 42 ++++++++++++++----------------------------
 2 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 79392916d6c9..5d10fa0b6002 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2209,6 +2209,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 	unsigned long flags;
 
 	spin_lock_irqsave(&cputimer->lock, flags);
+	cputimer->running = 1;
 	*times = cputimer->cputime;
 	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index db107c9bbc05..e5d7bfdfa7d4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
 	struct task_cputime cputime;
 
-	thread_group_cputime(tsk, &cputime);
+	thread_group_cputimer(tsk, &cputime);
 	cleanup_timers(tsk->signal->cpu_timers,
 		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
 }
@@ -506,29 +506,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 					     now);
 }
 
-/*
- * Enable the process wide cpu timer accounting.
- *
- * serialized using ->sighand->siglock
- */
-static void start_process_timers(struct task_struct *tsk)
-{
-	tsk->signal->cputimer.running = 1;
-	barrier();
-}
-
-/*
- * Release the process wide timer accounting -- timer stops ticking when
- * nobody cares about it.
- *
- * serialized using ->sighand->siglock
- */
-static void stop_process_timers(struct task_struct *tsk)
-{
-	tsk->signal->cputimer.running = 0;
-	barrier();
-}
-
 /*
  * Insert the timer on the appropriate list before any timers that
  * expire later.  This must be called with the tasklist_lock held
@@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	BUG_ON(!irqs_disabled());
 	spin_lock(&p->sighand->siglock);
 
-	if (!CPUCLOCK_PERTHREAD(timer->it_clock))
-		start_process_timers(p);
-
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
@@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
+static void stop_process_timers(struct task_struct *tsk)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	unsigned long flags;
+
+	if (!cputimer->running)
+		return;
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	cputimer->running = 0;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them
  * off the tsk->*_timers list onto the firing list.  Per-thread timers
@@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	start_process_timers(tsk);
 	cpu_timer_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
-- 
cgit v1.2.3


From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 11 Feb 2009 11:30:27 +0100
Subject: timers: fix TIMER_ABSTIME for process wide cpu timers

The POSIX timer interface allows for absolute time expiry values through the
TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock
every time we start it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     | 13 +------------
 kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d10fa0b6002..8981e52c714f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2201,18 +2201,7 @@ static inline int spin_needbreak(spinlock_t *lock)
  * Thread group CPU time accounting.
  */
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
-
-static inline
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 1;
-	*times = cputimer->cputime;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
-}
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e5d7bfdfa7d4..2313a4cc14ea 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -261,6 +261,40 @@ out:
 	rcu_read_unlock();
 }
 
+static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+{
+	if (cputime_gt(b->utime, a->utime))
+		a->utime = b->utime;
+
+	if (cputime_gt(b->stime, a->stime))
+		a->stime = b->stime;
+
+	if (b->sum_exec_runtime > a->sum_exec_runtime)
+		a->sum_exec_runtime = b->sum_exec_runtime;
+}
+
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct task_cputime sum;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	if (!cputimer->running) {
+		cputimer->running = 1;
+		/*
+		 * The POSIX timer interface allows for absolute time expiry
+		 * values through the TIMER_ABSTIME flag, therefore we have
+		 * to synchronize the timer to the clock every time we start
+		 * it.
+		 */
+		thread_group_cputime(tsk, &sum);
+		update_gt_cputime(&cputimer->cputime, &sum);
+	}
+	*times = cputimer->cputime;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
-- 
cgit v1.2.3


From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Wed, 11 Feb 2009 15:10:27 +0100
Subject: x86, ptrace, mm: fix double-free on race

Ptrace_detach() races with __ptrace_unlink() if the traced task is
reaped while detaching. This might cause a double-free of the BTS
buffer.

Change the ptrace_detach() path to only do the memory accounting in
ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace()
which will be called from __ptrace_unlink().

The fix follows a proposal from Oleg Nesterov.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c | 16 ++++++++++------
 include/linux/mm.h       |  1 +
 mm/mlock.c               |  7 ++++++-
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a5df5f82fb9..5a4c23d89892 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -810,12 +810,16 @@ static void ptrace_bts_untrace(struct task_struct *child)
 
 static void ptrace_bts_detach(struct task_struct *child)
 {
-	if (unlikely(child->bts)) {
-		ds_release_bts(child->bts);
-		child->bts = NULL;
-
-		ptrace_bts_free_buffer(child);
-	}
+	/*
+	 * Ptrace_detach() races with ptrace_untrace() in case
+	 * the child dies and is reaped by another thread.
+	 *
+	 * We only do the memory accounting at this point and
+	 * leave the buffer deallocation and the bts tracer
+	 * release to ptrace_bts_untrace() which will be called
+	 * later on with tasklist_lock held.
+	 */
+	release_locked_buffer(child->bts_buffer, child->bts_size);
 }
 #else
 static inline void ptrace_bts_fork(struct task_struct *tsk) {}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8ddc98b8405..3d7fb44d7d7e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1305,5 +1305,6 @@ void vmemmap_populate_print_last(void);
 
 extern void *alloc_locked_buffer(size_t size);
 extern void free_locked_buffer(void *buffer, size_t size);
+extern void release_locked_buffer(void *buffer, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/mlock.c b/mm/mlock.c
index 028ec482fdd4..2b57f7e60390 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size)
 	return buffer;
 }
 
-void free_locked_buffer(void *buffer, size_t size)
+void release_locked_buffer(void *buffer, size_t size)
 {
 	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
@@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size)
 	current->mm->locked_vm -= pgsz;
 
 	up_write(&current->mm->mmap_sem);
+}
+
+void free_locked_buffer(void *buffer, size_t size)
+{
+	release_locked_buffer(buffer, size);
 
 	kfree(buffer);
 }
-- 
cgit v1.2.3


From cfebe563bd0a3ff97e1bc167123120d59c7a84db Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 11 Feb 2009 13:04:36 -0800
Subject: cgroups: fix lockdep subclasses overflow

I enabled all cgroup subsystems when compiling kernel, and then:
 # mount -t cgroup -o net_cls xxx /mnt
 # mkdir /mnt/0

This showed up immediately:
 BUG: MAX_LOCKDEP_SUBCLASSES too low!
 turning off the locking correctness validator.

It's caused by the cgroup hierarchy lock:
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss->root == root)
			mutex_lock_nested(&ss->hierarchy_mutex, i);
	}

Now we have 9 cgroup subsystems, and the above 'i' for net_cls is 8, but
MAX_LOCKDEP_SUBCLASSES is 8.

This patch uses different lockdep keys for different subsystems.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 1 +
 kernel/cgroup.c        | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e4e8e117d27d..499900d0cee7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -378,6 +378,7 @@ struct cgroup_subsys {
 	 * - initiating hotplug events
 	 */
 	struct mutex hierarchy_mutex;
+	struct lock_class_key subsys_key;
 
 	/*
 	 * Link to parent, and list entry in parent's children.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5a54ff42874e..e14db9c089b9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2351,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		if (ss->root == root)
-			mutex_lock_nested(&ss->hierarchy_mutex, i);
+			mutex_lock(&ss->hierarchy_mutex);
 	}
 }
 
@@ -2637,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(!list_empty(&init_task.tasks));
 
 	mutex_init(&ss->hierarchy_mutex);
+	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
 }
 
-- 
cgit v1.2.3


From 6c5979631b4b03c9288776562c18036765e398c1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 11 Feb 2009 13:04:38 -0800
Subject: syscall define: fix uml compile bug

With the new system call defines we get this on uml:

arch/um/sys-i386/built-in.o: In function `sys_call_table':
(.rodata+0x308): undefined reference to `sys_sigprocmask'

Reason for this is that uml passes the preprocessor option
-Dsigprocmask=kernel_sigprocmask to gcc when compiling the kernel.
This causes SYSCALL_DEFINE3(sigprocmask, ...) to be expanded to
SYSCALL_DEFINEx(3, kernel_sigprocmask, ...) and finally to a system
call named sys_kernel_sigprocmask.  However sys_sigprocmask is missing
because of this.

To avoid macro expansion for the system call name just concatenate the
name at first define instead of carrying it through severel levels.
This was pointed out by Al Viro.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: WANG Cong <wangcong@zeuux.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/syscalls.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0eda02ff2414..f9f900cfd066 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -95,13 +95,13 @@ struct old_linux_dirent;
 #define __SC_TEST5(t5, a5, ...)	__SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
 #define __SC_TEST6(t6, a6, ...)	__SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
 
-#define SYSCALL_DEFINE0(name)   asmlinkage long sys_##name(void)
-#define SYSCALL_DEFINE1(...)    SYSCALL_DEFINEx(1, __VA_ARGS__)
-#define SYSCALL_DEFINE2(...)    SYSCALL_DEFINEx(2, __VA_ARGS__)
-#define SYSCALL_DEFINE3(...)    SYSCALL_DEFINEx(3, __VA_ARGS__)
-#define SYSCALL_DEFINE4(...)    SYSCALL_DEFINEx(4, __VA_ARGS__)
-#define SYSCALL_DEFINE5(...)    SYSCALL_DEFINEx(5, __VA_ARGS__)
-#define SYSCALL_DEFINE6(...)    SYSCALL_DEFINEx(6, __VA_ARGS__)
+#define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
+#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
 
 #ifdef CONFIG_PPC64
 #define SYSCALL_ALIAS(alias, name)					\
@@ -121,21 +121,21 @@ struct old_linux_dirent;
 
 #define SYSCALL_DEFINE(name) static inline long SYSC_##name
 #define SYSCALL_DEFINEx(x, name, ...)					\
-	asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__));		\
-	static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__));	\
-	asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__))		\
+	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
+	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
+	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
 	{								\
 		__SC_TEST##x(__VA_ARGS__);				\
-		return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__));	\
+		return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
 	}								\
-	SYSCALL_ALIAS(sys_##name, SyS_##name);				\
-	static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__))
+	SYSCALL_ALIAS(sys##name, SyS##name);				\
+	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
 
 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
 
 #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
 #define SYSCALL_DEFINEx(x, name, ...)					\
-	asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__))
+	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
 
 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
 
-- 
cgit v1.2.3


From 7a0eb1960e8ddcb68ea631caf16815485af0e228 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 19 Jan 2009 14:57:52 +0200
Subject: KVM: Avoid using CONFIG_ in userspace visible headers

Kconfig symbols are not available in userspace, and are not stripped by
headers-install.  Avoid their use by adding #defines in <asm/kvm.h> to
suit each architecture.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm.h |  4 ++++
 arch/x86/include/asm/kvm.h  |  7 +++++++
 include/linux/kvm.h         | 10 +++++-----
 3 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index 68aa6da807c1..bfa86b6af7cd 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -25,6 +25,10 @@
 
 #include <linux/ioctl.h>
 
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_DEVICE_ASSIGNMENT
+
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index d2e3bf3608af..886c9402ec45 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -9,6 +9,13 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
 
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_PIT
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_DEVICE_ASSIGNMENT
+#define __KVM_HAVE_MSI
+#define __KVM_HAVE_USER_NMI
+
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 5715f1907601..0424326f1679 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -58,10 +58,10 @@ struct kvm_irqchip {
 	__u32 pad;
         union {
 		char dummy[512];  /* reserving space */
-#ifdef CONFIG_X86
+#ifdef __KVM_HAVE_PIT
 		struct kvm_pic_state pic;
 #endif
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#ifdef __KVM_HAVE_IOAPIC
 		struct kvm_ioapic_state ioapic;
 #endif
 	} chip;
@@ -384,16 +384,16 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
-#if defined(CONFIG_X86)||defined(CONFIG_IA64)
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
-#if defined(CONFIG_X86)
+#ifdef __KVM_HAVE_MSI
 #define KVM_CAP_DEVICE_MSI 20
 #endif
 /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
-#if defined(CONFIG_X86)
+#ifdef __KVM_HAVE_USER_NMI
 #define KVM_CAP_USER_NMI 22
 #endif
 
-- 
cgit v1.2.3


From ad8ba2cd44d4d39fb3fe55d5dcc565b19fc3a7fb Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Tue, 6 Jan 2009 10:03:02 +0800
Subject: KVM: Add kvm_arch_sync_events to sync with asynchronize events

kvm_arch_sync_events is introduced to quiet down all other events may happen
contemporary with VM destroy process, like IRQ handler and work struct for
assigned device.

For kvm_arch_sync_events is called at the very beginning of kvm_destroy_vm(), so
the state of KVM here is legal and can provide a environment to quiet down other
events.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c   | 4 ++++
 arch/powerpc/kvm/powerpc.c | 4 ++++
 arch/s390/kvm/kvm-s390.c   | 4 ++++
 arch/x86/kvm/x86.c         | 4 ++++
 include/linux/kvm_host.h   | 1 +
 virt/kvm/kvm_main.c        | 1 +
 6 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 4e586f6110aa..28f982045f29 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1337,6 +1337,10 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 	}
 }
 
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_iommu_unmap_guest(kvm);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2822c8ccfaaf..5f81256287f5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -125,6 +125,10 @@ static void kvmppc_free_vcpus(struct kvm *kvm)
 	}
 }
 
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvmppc_free_vcpus(kvm);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index be8497186b96..0d33893e1e89 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -212,6 +212,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
 	}
 }
 
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_free_vcpus(kvm);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc17546a2406..b0fc079f1bee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4127,6 +4127,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 }
 
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_free_all_assigned_devices(kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ec49d0be7f52..bf6f703642fc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -285,6 +285,7 @@ void kvm_free_physmem(struct kvm *kvm);
 struct  kvm *kvm_arch_create_vm(void);
 void kvm_arch_destroy_vm(struct kvm *kvm);
 void kvm_free_all_assigned_devices(struct kvm *kvm);
+void kvm_arch_sync_events(struct kvm *kvm);
 
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0b6f2f71271f..68e3f1ec1674 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -891,6 +891,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 {
 	struct mm_struct *mm = kvm->mm;
 
+	kvm_arch_sync_events(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
-- 
cgit v1.2.3


From 5955c7a2cfb6a35429adea5dc480002b15ca8cfc Mon Sep 17 00:00:00 2001
From: Zlatko Calusic <zlatko.calusic@iskon.hr>
Date: Wed, 18 Feb 2009 01:33:34 +0100
Subject: Add support for VT6415 PCIE PATA IDE Host Controller

Signed-off-by: Zlatko Calusic <zlatko.calusic@iskon.hr>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/ata/pata_via.c  | 4 +++-
 include/linux/pci_ids.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index 79a6c9a0b721..ba556d3e6963 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -110,7 +110,8 @@ static const struct via_isa_bridge {
 	{ "vt8237s",	PCI_DEVICE_ID_VIA_8237S,    0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
 	{ "vt8251",	PCI_DEVICE_ID_VIA_8251,     0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
 	{ "cx700",	PCI_DEVICE_ID_VIA_CX700,    0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA },
-	{ "vt6410",	PCI_DEVICE_ID_VIA_6410,     0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES},
+	{ "vt6410",	PCI_DEVICE_ID_VIA_6410,     0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
+	{ "vt6415",	PCI_DEVICE_ID_VIA_6415,     0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
 	{ "vt8237a",	PCI_DEVICE_ID_VIA_8237A,    0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
 	{ "vt8237",	PCI_DEVICE_ID_VIA_8237,     0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
 	{ "vt8235",	PCI_DEVICE_ID_VIA_8235,     0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
@@ -593,6 +594,7 @@ static int via_reinit_one(struct pci_dev *pdev)
 #endif
 
 static const struct pci_device_id via[] = {
+	{ PCI_VDEVICE(VIA, 0x0415), },
 	{ PCI_VDEVICE(VIA, 0x0571), },
 	{ PCI_VDEVICE(VIA, 0x0581), },
 	{ PCI_VDEVICE(VIA, 0x1571), },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 52a9fe08451c..918391b4b109 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1312,6 +1312,7 @@
 #define PCI_DEVICE_ID_VIA_VT3351	0x0351
 #define PCI_DEVICE_ID_VIA_VT3364	0x0364
 #define PCI_DEVICE_ID_VIA_8371_0	0x0391
+#define PCI_DEVICE_ID_VIA_6415		0x0415
 #define PCI_DEVICE_ID_VIA_8501_0	0x0501
 #define PCI_DEVICE_ID_VIA_82C561	0x0561
 #define PCI_DEVICE_ID_VIA_82C586_1	0x0571
-- 
cgit v1.2.3


From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 16 Feb 2009 10:25:40 +0100
Subject: block: fix bad definition of BIO_RW_SYNC

We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO
and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before
213d9417fec62ef4c3675621b9364a667954d4dd.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c             | 2 +-
 drivers/md/dm-io.c           | 2 +-
 drivers/md/dm-kcopyd.c       | 2 +-
 drivers/md/md.c              | 4 ++--
 include/linux/bio.h          | 2 --
 include/linux/blktrace_api.h | 1 +
 include/linux/fs.h           | 6 +++---
 kernel/power/swap.c          | 5 +++--
 mm/page_io.c                 | 2 +-
 9 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/blktrace.c b/block/blktrace.c
index 39cc3bfe56e4..7cf9d1ff45a0 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -142,7 +142,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 
 	what |= ddir_act[rw & WRITE];
 	what |= MASK_TC_BIT(rw, BARRIER);
-	what |= MASK_TC_BIT(rw, SYNC);
+	what |= MASK_TC_BIT(rw, SYNCIO);
 	what |= MASK_TC_BIT(rw, AHEAD);
 	what |= MASK_TC_BIT(rw, META);
 	what |= MASK_TC_BIT(rw, DISCARD);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index a34338567a2a..f14813be4eff 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -328,7 +328,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	struct dpages old_pages = *dp;
 
 	if (sync)
-		rw |= (1 << BIO_RW_SYNC);
+		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
 	/*
 	 * For multiple regions we need to be careful to rewind
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3073618269ea..0a225da21272 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -344,7 +344,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_rw = job->rw | (1 << BIO_RW_SYNC),
+		.bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG),
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = job->offset,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4495104f6c9f..03b4cd0a6344 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -474,7 +474,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
+	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
@@ -531,7 +531,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	struct completion event;
 	int ret;
 
-	rw |= (1 << BIO_RW_SYNC);
+	rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2aa283ab062b..1b16108a5417 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -171,8 +171,6 @@ struct bio {
 #define BIO_RW_FAILFAST_TRANSPORT	8
 #define BIO_RW_FAILFAST_DRIVER		9
 
-#define BIO_RW_SYNC	(BIO_RW_SYNCIO | BIO_RW_UNPLUG)
-
 #define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
 
 /*
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 25379cba2370..6e915878e88c 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -15,6 +15,7 @@ enum blktrace_cat {
 	BLK_TC_WRITE	= 1 << 1,	/* writes */
 	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
 	BLK_TC_SYNC	= 1 << 3,	/* sync IO */
+	BLK_TC_SYNCIO	= BLK_TC_SYNC,
 	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
 	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
 	BLK_TC_ISSUE	= 1 << 6,	/* issue */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6022f44043f2..67857dc16365 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -87,10 +87,10 @@ struct inodes_stat_t {
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SWRITE 3	/* for ll_rw_block() - wait for buffer lock */
-#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
+#define READ_SYNC	(READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define READ_META	(READ | (1 << BIO_RW_META))
-#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
-#define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNC))
+#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
+#define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define WRITE_BARRIER	(WRITE | (1 << BIO_RW_BARRIER))
 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 6da14358537c..505f319e489c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -60,6 +60,7 @@ static struct block_device *resume_bdev;
 static int submit(int rw, pgoff_t page_off, struct page *page,
 			struct bio **bio_chain)
 {
+	const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
@@ -80,7 +81,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 	bio_get(bio);
 
 	if (bio_chain == NULL) {
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		submit_bio(bio_rw, bio);
 		wait_on_page_locked(page);
 		if (rw == READ)
 			bio_set_pages_dirty(bio);
@@ -90,7 +91,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 			get_page(page);	/* These pages are freed later */
 		bio->bi_private = *bio_chain;
 		*bio_chain = bio;
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		submit_bio(bio_rw, bio);
 	}
 	return 0;
 }
diff --git a/mm/page_io.c b/mm/page_io.c
index dc6ce0afbded..3023c475e041 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		goto out;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		rw |= (1 << BIO_RW_SYNC);
+		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);
-- 
cgit v1.2.3


From c296861291669f305deef19b78042330d7135017 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 18 Feb 2009 14:48:12 -0800
Subject: vmalloc: add __get_vm_area_caller()

We have get_vm_area_caller() and __get_vm_area() but not
__get_vm_area_caller()

On powerpc, I use __get_vm_area() to separate the ranges of addresses
given to vmalloc vs.  ioremap (various good reasons for that) so in order
to be able to implement the new caller tracking in /proc/vmallocinfo, I
need a "_caller" variant of it.

(akpm: needed for ongoing powerpc development, so merge it early)

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h | 4 ++++
 mm/vmalloc.c            | 8 ++++++++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 506e7620a986..9c0890c7a06a 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -84,6 +84,10 @@ extern struct vm_struct *get_vm_area_caller(unsigned long size,
 					unsigned long flags, void *caller);
 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 					unsigned long start, unsigned long end);
+extern struct vm_struct *__get_vm_area_caller(unsigned long size,
+					unsigned long flags,
+					unsigned long start, unsigned long end,
+					void *caller);
 extern struct vm_struct *get_vm_area_node(unsigned long size,
 					  unsigned long flags, int node,
 					  gfp_t gfp_mask);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 75f49d312e8c..4dd2636d0b92 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1106,6 +1106,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 }
 EXPORT_SYMBOL_GPL(__get_vm_area);
 
+struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
+				       unsigned long start, unsigned long end,
+				       void *caller)
+{
+	return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+				  caller);
+}
+
 /**
  *	get_vm_area  -  reserve a contiguous kernel virtual area
  *	@size:		size of the area
-- 
cgit v1.2.3


From 55ec82176eca52e4e0530a82a0eb59160a1a95a1 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Wed, 18 Feb 2009 14:48:15 -0800
Subject: vfs: separate FMODE_PREAD/FMODE_PWRITE into separate flags

Separate FMODE_PREAD and FMODE_PWRITE into separate flags to reflect the
reality that the read and write paths may have independent restrictions.

A git grep verifies that these flags are always cleared together so this
new behavior will only apply to interfaces that change to clear flags
individually.

This is required for "seq_file: properly cope with pread", a post-2.6.25
regression fix.

[akpm@linux-foundation.org: add comment]
Signed-off-by: Paul Turner <pjt@google.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc:  Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6022f44043f2..5852bd6afbe4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -54,24 +54,30 @@ struct inodes_stat_t {
 #define MAY_ACCESS 16
 #define MAY_OPEN 32
 
+/*
+ * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
+ * to O_WRONLY and O_RDWR via the strange trick in __dentry_open()
+ */
+
 /* file is open for reading */
 #define FMODE_READ		((__force fmode_t)1)
 /* file is open for writing */
 #define FMODE_WRITE		((__force fmode_t)2)
 /* file is seekable */
 #define FMODE_LSEEK		((__force fmode_t)4)
-/* file can be accessed using pread/pwrite */
+/* file can be accessed using pread */
 #define FMODE_PREAD		((__force fmode_t)8)
-#define FMODE_PWRITE		FMODE_PREAD	/* These go hand in hand */
+/* file can be accessed using pwrite */
+#define FMODE_PWRITE		((__force fmode_t)16)
 /* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC		((__force fmode_t)16)
+#define FMODE_EXEC		((__force fmode_t)32)
 /* File is opened with O_NDELAY (only set for block devices) */
-#define FMODE_NDELAY		((__force fmode_t)32)
+#define FMODE_NDELAY		((__force fmode_t)64)
 /* File is opened with O_EXCL (only set for block devices) */
-#define FMODE_EXCL		((__force fmode_t)64)
+#define FMODE_EXCL		((__force fmode_t)128)
 /* File is opened using open(.., 3, ..) and is writeable only for ioctls
    (specialy hack for floppy.c) */
-#define FMODE_WRITE_IOCTL	((__force fmode_t)128)
+#define FMODE_WRITE_IOCTL	((__force fmode_t)256)
 
 /*
  * Don't update ctime and mtime.
-- 
cgit v1.2.3


From 8f19d472935c83d823fa4cf02bcc0a7b9952db30 Mon Sep 17 00:00:00 2001
From: Eric Biederman <ebiederm@xmission.com>
Date: Wed, 18 Feb 2009 14:48:16 -0800
Subject: seq_file: properly cope with pread

Currently seq_read assumes that the offset passed to it is always the
offset it passed to user space.  In the case pread this assumption is
broken and we do the wrong thing when presented with pread.

To solve this I introduce an offset cache inside of struct seq_file so we
know where our logical file position is.  Then in seq_read if we try to
read from another offset we reset our data structures and attempt to go to
the offset user space wanted.

[akpm@linux-foundation.org: restore FMODE_PWRITE]
[pjt@google.com: seq_open needs its fmode opened up to take advantage of this]
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Paul Turner <pjt@google.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c            | 36 ++++++++++++++++++++++++++++++++----
 include/linux/seq_file.h |  1 +
 2 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5267098532bf..a1a4cfe19210 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op)
 	 */
 	file->f_version = 0;
 
-	/* SEQ files support lseek, but not pread/pwrite */
-	file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
+	/*
+	 * seq_files support lseek() and pread().  They do not implement
+	 * write() at all, but we clear FMODE_PWRITE here for historical
+	 * reasons.
+	 *
+	 * If a client of seq_files a) implements file.write() and b) wishes to
+	 * support pwrite() then that client will need to implement its own
+	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+	 */
+	file->f_mode &= ~FMODE_PWRITE;
 	return 0;
 }
 EXPORT_SYMBOL(seq_open);
@@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 	int err = 0;
 
 	mutex_lock(&m->lock);
+
+	/* Don't assume *ppos is where we left it */
+	if (unlikely(*ppos != m->read_pos)) {
+		m->read_pos = *ppos;
+		while ((err = traverse(m, *ppos)) == -EAGAIN)
+			;
+		if (err) {
+			/* With prejudice... */
+			m->read_pos = 0;
+			m->version = 0;
+			m->index = 0;
+			m->count = 0;
+			goto Done;
+		}
+	}
+
 	/*
 	 * seq_file->op->..m_start/m_stop/m_next may do special actions
 	 * or optimisations based on the file->f_version, so we want to
@@ -230,8 +254,10 @@ Fill:
 Done:
 	if (!copied)
 		copied = err;
-	else
+	else {
 		*ppos += copied;
+		m->read_pos += copied;
+	}
 	file->f_version = m->version;
 	mutex_unlock(&m->lock);
 	return copied;
@@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 			if (offset < 0)
 				break;
 			retval = offset;
-			if (offset != file->f_pos) {
+			if (offset != m->read_pos) {
 				while ((retval=traverse(m, offset)) == -EAGAIN)
 					;
 				if (retval) {
 					/* with extreme prejudice... */
 					file->f_pos = 0;
+					m->read_pos = 0;
 					m->version = 0;
 					m->index = 0;
 					m->count = 0;
 				} else {
+					m->read_pos = offset;
 					retval = file->f_pos = offset;
 				}
 			}
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 40ea5058c2ec..f616f31576d7 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -19,6 +19,7 @@ struct seq_file {
 	size_t from;
 	size_t count;
 	loff_t index;
+	loff_t read_pos;
 	u64 version;
 	struct mutex lock;
 	const struct seq_operations *op;
-- 
cgit v1.2.3


From 610d18f4128ebbd88845d0fc60cce67b49af881e Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Wed, 18 Feb 2009 14:48:18 -0800
Subject: timerfd: add flags check

As requested by Michael, add a missing check for valid flags in
timerfd_settime(), and make it return EINVAL in case some extra bits are
set.

Michael said:
If this is to be any use to userland apps that want to check flag
support (perhaps it is too late already), then the sooner we get it
into the kernel the better: 2.6.29 would be good; earlier stables as
well would be even better.

[akpm@linux-foundation.org: remove unused TFD_FLAGS_SET]
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: <stable@kernel.org>		[2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/timerfd.c            | 12 ++++++------
 include/linux/timerfd.h | 16 ++++++++++++----
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8ff3f5..b042bd7034b1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
 
-	if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
-		return -EINVAL;
-	if (clockid != CLOCK_MONOTONIC &&
-	    clockid != CLOCK_REALTIME)
+	if ((flags & ~TFD_CREATE_FLAGS) ||
+	    (clockid != CLOCK_MONOTONIC &&
+	     clockid != CLOCK_REALTIME))
 		return -EINVAL;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
 
 	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
-			       flags & (O_CLOEXEC | O_NONBLOCK));
+			       flags & TFD_SHARED_FCNTL_FLAGS);
 	if (ufd < 0)
 		kfree(ctx);
 
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
 
-	if (!timespec_valid(&ktmr.it_value) ||
+	if ((flags & ~TFD_SETTIME_FLAGS) ||
+	    !timespec_valid(&ktmr.it_value) ||
 	    !timespec_valid(&ktmr.it_interval))
 		return -EINVAL;
 
diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h
index 86cb0501d3e2..2d0792983f8c 100644
--- a/include/linux/timerfd.h
+++ b/include/linux/timerfd.h
@@ -11,13 +11,21 @@
 /* For O_CLOEXEC and O_NONBLOCK */
 #include <linux/fcntl.h>
 
-/* Flags for timerfd_settime.  */
+/*
+ * CAREFUL: Check include/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to re-use O_* flags that couldn't possibly have a meaning
+ * from eventfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
 #define TFD_TIMER_ABSTIME (1 << 0)
-
-/* Flags for timerfd_create.  */
 #define TFD_CLOEXEC O_CLOEXEC
 #define TFD_NONBLOCK O_NONBLOCK
 
+#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK)
+/* Flags for timerfd_create.  */
+#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS
+/* Flags for timerfd_settime.  */
+#define TFD_SETTIME_FLAGS TFD_TIMER_ABSTIME
 
 #endif /* _LINUX_TIMERFD_H */
-
-- 
cgit v1.2.3


From 1cf6e7d83bf334cc5916137862c920a97aabc018 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 18 Feb 2009 14:48:18 -0800
Subject: mm: task dirty accounting fix

YAMAMOTO-san noticed that task_dirty_inc doesn't seem to be called properly for
cases where set_page_dirty is not used to dirty a page (eg. mark_buffer_dirty).

Additionally, there is some inconsistency about when task_dirty_inc is
called.  It is used for dirty balancing, however it even gets called for
__set_page_dirty_no_writeback.

So rather than increment it in a set_page_dirty wrapper, move it down to
exactly where the dirty page accounting stats are incremented.

Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c         |  1 +
 include/linux/mm.h  |  1 +
 mm/page-writeback.c | 13 +++----------
 3 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 665d446b25bc..ff4d1cdd779b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page,
 			__inc_zone_page_state(page, NR_FILE_DIRTY);
 			__inc_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
+			task_dirty_inc(current);
 			task_io_account_write(PAGE_CACHE_SIZE);
 		}
 		radix_tree_tag_set(&mapping->page_tree,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7dc04ff5ab89..10074212a35b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1159,6 +1159,7 @@ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
+void task_dirty_inc(struct task_struct *tsk);
 
 /* readahead.c */
 #define VM_MAX_READAHEAD	128	/* kbytes */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3c84128596ba..74dc57c74349 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 
-static inline void task_dirty_inc(struct task_struct *tsk)
+void task_dirty_inc(struct task_struct *tsk)
 {
 	prop_inc_single(&vm_dirties, &tsk->dirties);
 }
@@ -1230,6 +1230,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
 				__inc_bdi_stat(mapping->backing_dev_info,
 						BDI_RECLAIMABLE);
+				task_dirty_inc(current);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
 			radix_tree_tag_set(&mapping->page_tree,
@@ -1262,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-static int __set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
@@ -1280,14 +1281,6 @@ static int __set_page_dirty(struct page *page)
 	}
 	return 0;
 }
-
-int set_page_dirty(struct page *page)
-{
-	int ret = __set_page_dirty(page);
-	if (ret)
-		task_dirty_inc(current);
-	return ret;
-}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*
-- 
cgit v1.2.3


From 287d859222e0adbc67666a6154aaf42d7d5bbb54 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 18 Feb 2009 14:48:26 -0800
Subject: atmel-mci: fix initialization of dma slave data

The conversion of atmel-mci to dma_request_channel missed the
initialization of the channel dma_slave information.  The filter_fn passed
to dma_request_channel is responsible for initializing the channel's
private data.  This implementation has the additional benefit of enabling
a generic client-channel data passing mechanism.

Reviewed-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/dma/dmaengine.c      | 2 ++
 drivers/dma/dw_dmac.c        | 5 ++---
 drivers/dma/dw_dmac_regs.h   | 2 --
 drivers/mmc/host/atmel-mci.c | 5 +++--
 include/linux/dmaengine.h    | 2 ++
 5 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index a58993011edb..280a9d263eb3 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -518,6 +518,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v
 				       dma_chan_name(chan), err);
 			else
 				break;
+			chan->private = NULL;
 			chan = NULL;
 		}
 	}
@@ -536,6 +537,7 @@ void dma_release_channel(struct dma_chan *chan)
 	WARN_ONCE(chan->client_count != 1,
 		  "chan reference count %d != 1\n", chan->client_count);
 	dma_chan_put(chan);
+	chan->private = NULL;
 	mutex_unlock(&dma_list_mutex);
 }
 EXPORT_SYMBOL_GPL(dma_release_channel);
diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c
index 6b702cc46b3d..a97c07eef7ec 100644
--- a/drivers/dma/dw_dmac.c
+++ b/drivers/dma/dw_dmac.c
@@ -560,7 +560,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 		unsigned long flags)
 {
 	struct dw_dma_chan	*dwc = to_dw_dma_chan(chan);
-	struct dw_dma_slave	*dws = dwc->dws;
+	struct dw_dma_slave	*dws = chan->private;
 	struct dw_desc		*prev;
 	struct dw_desc		*first;
 	u32			ctllo;
@@ -790,7 +790,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 	cfghi = DWC_CFGH_FIFO_MODE;
 	cfglo = 0;
 
-	dws = dwc->dws;
+	dws = chan->private;
 	if (dws) {
 		/*
 		 * We need controller-specific data to set up slave
@@ -866,7 +866,6 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	spin_lock_bh(&dwc->lock);
 	list_splice_init(&dwc->free_list, &list);
 	dwc->descs_allocated = 0;
-	dwc->dws = NULL;
 
 	/* Disable interrupts */
 	channel_clear_bit(dw, MASK.XFER, dwc->mask);
diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h
index 00fdd187bb0c..b252b202c5cf 100644
--- a/drivers/dma/dw_dmac_regs.h
+++ b/drivers/dma/dw_dmac_regs.h
@@ -139,8 +139,6 @@ struct dw_dma_chan {
 	struct list_head	queue;
 	struct list_head	free_list;
 
-	struct dw_dma_slave	*dws;
-
 	unsigned int		descs_allocated;
 };
 
diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c
index 76bfe16c09b1..2b1196e6142c 100644
--- a/drivers/mmc/host/atmel-mci.c
+++ b/drivers/mmc/host/atmel-mci.c
@@ -1548,9 +1548,10 @@ static bool filter(struct dma_chan *chan, void *slave)
 {
 	struct dw_dma_slave *dws = slave;
 
-	if (dws->dma_dev == chan->device->dev)
+	if (dws->dma_dev == chan->device->dev) {
+		chan->private = dws;
 		return true;
-	else
+	} else
 		return false;
 }
 #endif
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3e68469c1885..f0413845f20e 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -121,6 +121,7 @@ struct dma_chan_percpu {
  * @local: per-cpu pointer to a struct dma_chan_percpu
  * @client-count: how many clients are using this channel
  * @table_count: number of appearances in the mem-to-mem allocation table
+ * @private: private data for certain client-channel associations
  */
 struct dma_chan {
 	struct dma_device *device;
@@ -134,6 +135,7 @@ struct dma_chan {
 	struct dma_chan_percpu *local;
 	int client_count;
 	int table_count;
+	void *private;
 };
 
 /**
-- 
cgit v1.2.3


From f2dbcfa738368c8a40d4a5f0b65dc9879577cb21 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 18 Feb 2009 14:48:32 -0800
Subject: mm: clean up for early_pfn_to_nid()

What's happening is that the assertion in mm/page_alloc.c:move_freepages()
is triggering:

	BUG_ON(page_zone(start_page) != page_zone(end_page));

Once I knew this is what was happening, I added some annotations:

	if (unlikely(page_zone(start_page) != page_zone(end_page))) {
		printk(KERN_ERR "move_freepages: Bogus zones: "
		       "start_page[%p] end_page[%p] zone[%p]\n",
		       start_page, end_page, zone);
		printk(KERN_ERR "move_freepages: "
		       "start_zone[%p] end_zone[%p]\n",
		       page_zone(start_page), page_zone(end_page));
		printk(KERN_ERR "move_freepages: "
		       "start_pfn[0x%lx] end_pfn[0x%lx]\n",
		       page_to_pfn(start_page), page_to_pfn(end_page));
		printk(KERN_ERR "move_freepages: "
		       "start_nid[%d] end_nid[%d]\n",
		       page_to_nid(start_page), page_to_nid(end_page));
 ...

And here's what I got:

	move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00]
	move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00]
	move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff]
	move_freepages: start_nid[1] end_nid[0]

My memory layout on this box is:

[    0.000000] Zone PFN ranges:
[    0.000000]   Normal   0x00000000 -> 0x0081ff5d
[    0.000000] Movable zone start PFN for each node
[    0.000000] early_node_map[8] active PFN ranges
[    0.000000]     0: 0x00000000 -> 0x00020000
[    0.000000]     1: 0x00800000 -> 0x0081f7ff
[    0.000000]     1: 0x0081f800 -> 0x0081fe50
[    0.000000]     1: 0x0081fed1 -> 0x0081fed8
[    0.000000]     1: 0x0081feda -> 0x0081fedb
[    0.000000]     1: 0x0081fedd -> 0x0081fee5
[    0.000000]     1: 0x0081fee7 -> 0x0081ff51
[    0.000000]     1: 0x0081ff59 -> 0x0081ff5d

So it's a block move in that 0x81f600-->0x81f7ff region which triggers
the problem.

This patch:

Declaration of early_pfn_to_nid() is scattered over per-arch include
files, and it seems it's complicated to know when the declaration is used.
 I think it makes fix-for-memmap-init not easy.

This patch moves all declaration to include/linux/mm.h

After this,
  if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
     -> Use static definition in include/linux/mm.h
  else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
     -> Use generic definition in mm/page_alloc.c
  else
     -> per-arch back end function will be called.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: David Miller <davem@davemlloft.net>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/include/asm/mmzone.h   |  4 ----
 arch/ia64/mm/numa.c              |  2 +-
 arch/x86/include/asm/mmzone_32.h |  2 --
 arch/x86/include/asm/mmzone_64.h |  2 --
 arch/x86/mm/numa_64.c            |  2 +-
 include/linux/mm.h               | 19 ++++++++++++++++---
 mm/page_alloc.c                  |  8 +++++++-
 7 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/mmzone.h b/arch/ia64/include/asm/mmzone.h
index 34efe88eb849..f2ca32069b3f 100644
--- a/arch/ia64/include/asm/mmzone.h
+++ b/arch/ia64/include/asm/mmzone.h
@@ -31,10 +31,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 #endif
 }
 
-#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-extern int early_pfn_to_nid(unsigned long pfn);
-#endif
-
 #ifdef CONFIG_IA64_DIG /* DIG systems are small */
 # define MAX_PHYSNODE_ID	8
 # define NR_NODE_MEMBLKS	(MAX_NUMNODES * 8)
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
index b73bf1838e57..5061c3fb6796 100644
--- a/arch/ia64/mm/numa.c
+++ b/arch/ia64/mm/numa.c
@@ -58,7 +58,7 @@ paddr_to_nid(unsigned long paddr)
  * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where
  * the section resides.
  */
-int early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn)
 {
 	int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec;
 
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 07f1af494ca5..105fb90a0635 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -32,8 +32,6 @@ static inline void get_memcfg_numa(void)
 	get_memcfg_numa_flat();
 }
 
-extern int early_pfn_to_nid(unsigned long pfn);
-
 extern void resume_map_numa_kva(pgd_t *pgd);
 
 #else /* !CONFIG_NUMA */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a5b3817d4b9e..a29f48c2a322 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -40,8 +40,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn +	\
 				 NODE_DATA(nid)->node_spanned_pages)
 
-extern int early_pfn_to_nid(unsigned long pfn);
-
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE	(64 * 1024 * 1024)
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89e..f3516da035d1 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -145,7 +145,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
 	return shift;
 }
 
-int early_pfn_to_nid(unsigned long pfn)
+int __meminit  __early_pfn_to_nid(unsigned long pfn)
 {
 	return phys_to_nid(pfn << PAGE_SHIFT);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 10074212a35b..065cdf8c09fb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1041,10 +1041,23 @@ extern void free_bootmem_with_active_regions(int nid,
 typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
 extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-extern int early_pfn_to_nid(unsigned long pfn);
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \
+    !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
+static inline int __early_pfn_to_nid(unsigned long pfn)
+{
+	return 0;
+}
+#else
+/* please see mm/page_alloc.c */
+extern int __meminit early_pfn_to_nid(unsigned long pfn);
+#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+/* there is a per-arch backend function. */
+extern int __meminit __early_pfn_to_nid(unsigned long pfn);
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+#endif
+
 extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void memmap_init_zone(unsigned long, int, unsigned long,
 				unsigned long, enum memmap_context);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5675b3073854..c5dd74602efc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2989,7 +2989,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
  * was used and there are no special requirements, this is a convenient
  * alternative
  */
-int __meminit early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn)
 {
 	int i;
 
@@ -3005,6 +3005,12 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
 }
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+	return __early_pfn_to_nid(pfn);
+}
+
+
 /* Basic iterator support to walk early_node_map[] */
 #define for_each_active_range_index_in_nid(i, nid) \
 	for (i = first_active_region_index_in_nid(nid); i != -1; \
-- 
cgit v1.2.3


From cc2559bccc72767cb446f79b071d96c30c26439b Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 18 Feb 2009 14:48:33 -0800
Subject: mm: fix memmap init for handling memory hole

Now, early_pfn_in_nid(PFN, NID) may returns false if PFN is a hole.
and memmap initialization was not done. This was a trouble for
sparc boot.

To fix this, the PFN should be initialized and marked as PG_reserved.
This patch changes early_pfn_in_nid() return true if PFN is a hole.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reported-by: David Miller <davem@davemlloft.net>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/numa.c    |  2 +-
 include/linux/mmzone.h |  2 +-
 mm/page_alloc.c        | 23 ++++++++++++++++++++---
 3 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
index 5061c3fb6796..3efea7d0a351 100644
--- a/arch/ia64/mm/numa.c
+++ b/arch/ia64/mm/numa.c
@@ -70,7 +70,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
 			return node_memblk[i].nid;
 	}
 
-	return 0;
+	return -1;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 09c14e213b63..1aca6cebbb78 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1071,7 +1071,7 @@ void sparse_init(void);
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
-#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
+bool early_pfn_in_nid(unsigned long pfn, int nid);
 #else
 #define early_pfn_in_nid(pfn, nid)	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c5dd74602efc..5c44ed49ca93 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3000,16 +3000,33 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
 		if (start_pfn <= pfn && pfn < end_pfn)
 			return early_node_map[i].nid;
 	}
-
-	return 0;
+	/* This is a memory hole */
+	return -1;
 }
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
-	return __early_pfn_to_nid(pfn);
+	int nid;
+
+	nid = __early_pfn_to_nid(pfn);
+	if (nid >= 0)
+		return nid;
+	/* just returns 0 */
+	return 0;
 }
 
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+	int nid;
+
+	nid = __early_pfn_to_nid(pfn);
+	if (nid >= 0 && nid != node)
+		return false;
+	return true;
+}
+#endif
 
 /* Basic iterator support to walk early_node_map[] */
 #define for_each_active_range_index_in_nid(i, nid) \
-- 
cgit v1.2.3


From ffa7525c13eb3db0fd19a3e1cffe2ce6f561f5f3 Mon Sep 17 00:00:00 2001
From: Adam Lackorzynski <adam@os.inf.tu-dresden.de>
Date: Wed, 18 Feb 2009 14:48:34 -0800
Subject: jsm: additional device support

I have a Digi Neo 8 PCI card (114f:00b1) Serial controller: Digi
International Digi Neo 8 (rev 05)

that works with the jsm driver after using the following patch.

Signed-off-by: Adam Lackorzynski <adam@os.inf.tu-dresden.de>
Cc: Scott H Kilau <Scott_Kilau@digi.com>
Cc: Wendy Xiong <wendyx@us.ibm.com>
Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/jsm/jsm_driver.c | 3 +++
 include/linux/pci_ids.h         | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/serial/jsm/jsm_driver.c b/drivers/serial/jsm/jsm_driver.c
index 92187e28608a..ac79cbe4c2cf 100644
--- a/drivers/serial/jsm/jsm_driver.c
+++ b/drivers/serial/jsm/jsm_driver.c
@@ -84,6 +84,8 @@ static int jsm_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	brd->pci_dev = pdev;
 	if (pdev->device == PCIE_DEVICE_ID_NEO_4_IBM)
 		brd->maxports = 4;
+	else if (pdev->device == PCI_DEVICE_ID_DIGI_NEO_8)
+		brd->maxports = 8;
 	else
 		brd->maxports = 2;
 
@@ -212,6 +214,7 @@ static struct pci_device_id jsm_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_NEO_2RJ45), 0, 0, 2 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_NEO_2RJ45PRI), 0, 0, 3 },
 	{ PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCIE_DEVICE_ID_NEO_4_IBM), 0, 0, 4 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_DIGI, PCI_DEVICE_ID_DIGI_NEO_8), 0, 0, 5 },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, jsm_pci_tbl);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 918391b4b109..114b8192eab9 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1445,6 +1445,7 @@
 #define PCI_DEVICE_ID_DIGI_DF_M_E	0x0071
 #define PCI_DEVICE_ID_DIGI_DF_M_IOM2_A	0x0072
 #define PCI_DEVICE_ID_DIGI_DF_M_A	0x0073
+#define PCI_DEVICE_ID_DIGI_NEO_8	0x00B1
 #define PCI_DEVICE_ID_NEO_2DB9          0x00C8
 #define PCI_DEVICE_ID_NEO_2DB9PRI       0x00C9
 #define PCI_DEVICE_ID_NEO_2RJ45         0x00CA
-- 
cgit v1.2.3


From 97bef7dd05563807539122c488a5dd93ed327722 Mon Sep 17 00:00:00 2001
From: Bernhard Walle <bernhard.walle@gmx.de>
Date: Wed, 18 Feb 2009 14:48:40 -0800
Subject: Bernhard has moved

Since I don't work for SUSE any more and the bwalle@suse.de address is
invalid, correct it in the copyright headers and documentation.

Signed-off-by: Bernhard Walle <bernhard.walle@gmx.de>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ABI/testing/sysfs-firmware-memmap | 2 +-
 drivers/firmware/memmap.c                       | 2 +-
 include/linux/firmware-map.h                    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap
index 0d99ee6ae02e..eca0d65087dc 100644
--- a/Documentation/ABI/testing/sysfs-firmware-memmap
+++ b/Documentation/ABI/testing/sysfs-firmware-memmap
@@ -1,6 +1,6 @@
 What:		/sys/firmware/memmap/
 Date:		June 2008
-Contact:	Bernhard Walle <bwalle@suse.de>
+Contact:	Bernhard Walle <bernhard.walle@gmx.de>
 Description:
 		On all platforms, the firmware provides a memory map which the
 		kernel reads. The resources from that memory map are registered
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index 261b9aa3f248..05aa2d406ac6 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -1,7 +1,7 @@
 /*
  * linux/drivers/firmware/memmap.c
  *  Copyright (C) 2008 SUSE LINUX Products GmbH
- *  by Bernhard Walle <bwalle@suse.de>
+ *  by Bernhard Walle <bernhard.walle@gmx.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License v2.0 as published by
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h
index 6e199c8dfacc..cca686b39123 100644
--- a/include/linux/firmware-map.h
+++ b/include/linux/firmware-map.h
@@ -1,7 +1,7 @@
 /*
  * include/linux/firmware-map.h:
  *  Copyright (C) 2008 SUSE LINUX Products GmbH
- *  by Bernhard Walle <bwalle@suse.de>
+ *  by Bernhard Walle <bernhard.walle@gmx.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License v2.0 as published by
-- 
cgit v1.2.3


From 3ef0e5ba467366125f04b423f4638baca54a4fc1 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 20 Feb 2009 15:38:41 -0800
Subject: slab: introduce kzfree()

kzfree() is a wrapper for kfree() that additionally zeroes the underlying
memory before releasing it to the slab allocator.

Currently there is code which memset()s the memory region of an object
before releasing it back to the slab allocator to make sure
security-sensitive data are really zeroed out after use.

These callsites can then just use kzfree() which saves some code, makes
users greppable and allows for a stupid destructor that isn't necessarily
aware of the actual object size.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Matt Mackall <mpm@selenic.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h |  1 +
 mm/util.c            | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index f96d13c281e8..24c5602bee99 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -127,6 +127,7 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
 void * __must_check __krealloc(const void *, size_t, gfp_t);
 void * __must_check krealloc(const void *, size_t, gfp_t);
 void kfree(const void *);
+void kzfree(const void *);
 size_t ksize(const void *);
 
 /*
diff --git a/mm/util.c b/mm/util.c
index cb00b748ce47..37eaccdf3054 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -129,6 +129,26 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
 }
 EXPORT_SYMBOL(krealloc);
 
+/**
+ * kzfree - like kfree but zero memory
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before freed.
+ * If @p is %NULL, kzfree() does nothing.
+ */
+void kzfree(const void *p)
+{
+	size_t ks;
+	void *mem = (void *)p;
+
+	if (unlikely(ZERO_OR_NULL_PTR(mem)))
+		return;
+	ks = ksize(mem);
+	memset(mem, 0, ks);
+	kfree(mem);
+}
+EXPORT_SYMBOL(kzfree);
+
 /*
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
-- 
cgit v1.2.3


From 01b24fee285b098c74318a8be801ef3711ec18d7 Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Fri, 20 Feb 2009 15:38:49 -0800
Subject: spi_bitbang: add more lowlevel function documentation

This adds more documentation of the lowlevel API to avoid future bugs.

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/spi/spi_bitbang.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h
index bf8de281b4ed..eed4254bd503 100644
--- a/include/linux/spi/spi_bitbang.h
+++ b/include/linux/spi/spi_bitbang.h
@@ -83,6 +83,13 @@ extern int spi_bitbang_stop(struct spi_bitbang *spi);
  *  int getmiso(struct spi_device *);
  *  void spidelay(unsigned);
  *
+ * setsck()'s is_on parameter is a zero/nonzero boolean.
+ *
+ * setmosi()'s is_on parameter is a zero/nonzero boolean.
+ *
+ * getmiso() is required to return 0 or 1 only. Any other value is invalid
+ * and will result in improper operation.
+ *
  * A non-inlined routine would call bitbang_txrx_*() routines.  The
  * main loop could easily compile down to a handful of instructions,
  * especially if the delay is a NOP (to run at peak speed).
-- 
cgit v1.2.3


From b6adea334c6c89d5e6c94f9196bbf3a279cb53bd Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Fri, 20 Feb 2009 15:38:52 -0800
Subject: 8250: fix boot hang with serial console when using with Serial Over
 Lan port

Intel 8257x Ethernet boards have a feature called Serial Over Lan.

This feature works by emulating a serial port, and it is detected by
kernel as a normal 8250 port.  However, this emulation is not perfect, as
also noticed on changeset 7500b1f602aad75901774a67a687ee985d85893f.

Before this patch, the kernel were trying to check if the serial TX is
capable of work using IRQ's.

This were done with a code similar this:

        serial_outp(up, UART_IER, UART_IER_THRI);
        lsr = serial_in(up, UART_LSR);
        iir = serial_in(up, UART_IIR);
        serial_outp(up, UART_IER, 0);

        if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT)
		up->bugs |= UART_BUG_TXEN;

This works fine for other 8250 ports, but, on 8250-emulated SoL port, the
chip is a little lazy to down UART_IIR_NO_INT at UART_IIR register.

Due to that, UART_BUG_TXEN is sometimes enabled.  However, as TX IRQ keeps
working, and the TX polling is now enabled, the driver miss-interprets the
IRQ received later, hanging up the machine until a key is pressed at the
serial console.

This is the 6 version of this patch.  Previous versions were trying to
introduce a large enough delay between serial_outp and serial_in(up,
UART_IIR), but not taking forever.  However, the needed delay couldn't be
safely determined.

At the experimental tests, a delay of 1us solves most of the cases, but
still hangs sometimes.  Increasing the delay to 5us was better, but still
doesn't solve.  A very high delay of 50 ms seemed to work every time.

However, poking around with delays and pray for it to be enough doesn't
seem to be a good approach, even for a quirk.

So, instead of playing with random large arbitrary delays, let's just
disable UART_BUG_TXEN for all SoL ports.

[akpm@linux-foundation.org: fix warnings]
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250.c       | 15 +++++++++++++++
 drivers/serial/8250_pci.c   | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h     |  3 +++
 include/linux/serial_core.h |  1 +
 4 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 0d934bfbdd9b..b4b39811b445 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -2083,6 +2083,20 @@ static int serial8250_startup(struct uart_port *port)
 
 	serial8250_set_mctrl(&up->port, up->port.mctrl);
 
+	/* Serial over Lan (SoL) hack:
+	   Intel 8257x Gigabit ethernet chips have a
+	   16550 emulation, to be used for Serial Over Lan.
+	   Those chips take a longer time than a normal
+	   serial device to signalize that a transmission
+	   data was queued. Due to that, the above test generally
+	   fails. One solution would be to delay the reading of
+	   iir. However, this is not reliable, since the timeout
+	   is variable. So, let's just don't test if we receive
+	   TX irq. This way, we'll never enable UART_BUG_TXEN.
+	 */
+	if (up->port.flags & UPF_NO_TXEN_TEST)
+		goto dont_test_tx_en;
+
 	/*
 	 * Do a quick test to see if we receive an
 	 * interrupt when we enable the TX irq.
@@ -2102,6 +2116,7 @@ static int serial8250_startup(struct uart_port *port)
 		up->bugs &= ~UART_BUG_TXEN;
 	}
 
+dont_test_tx_en:
 	spin_unlock_irqrestore(&up->port.lock, flags);
 
 	/*
diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 536d8e510f66..533f82025adf 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -798,6 +798,21 @@ pci_default_setup(struct serial_private *priv,
 	return setup_port(priv, port, bar, offset, board->reg_shift);
 }
 
+static int skip_tx_en_setup(struct serial_private *priv,
+			const struct pciserial_board *board,
+			struct uart_port *port, int idx)
+{
+	port->flags |= UPF_NO_TXEN_TEST;
+	printk(KERN_DEBUG "serial8250: skipping TxEn test for device "
+			  "[%04x:%04x] subsystem [%04x:%04x]\n",
+			  priv->dev->vendor,
+			  priv->dev->device,
+			  priv->dev->subsystem_vendor,
+			  priv->dev->subsystem_device);
+
+	return pci_default_setup(priv, board, port, idx);
+}
+
 /* This should be in linux/pci_ids.h */
 #define PCI_VENDOR_ID_SBSMODULARIO	0x124B
 #define PCI_SUBVENDOR_ID_SBSMODULARIO	0x124B
@@ -864,6 +879,27 @@ static struct pci_serial_quirk pci_serial_quirks[] __refdata = {
 		.init		= pci_inteli960ni_init,
 		.setup		= pci_default_setup,
 	},
+	{
+		.vendor		= PCI_VENDOR_ID_INTEL,
+		.device		= PCI_DEVICE_ID_INTEL_8257X_SOL,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.setup		= skip_tx_en_setup,
+	},
+	{
+		.vendor		= PCI_VENDOR_ID_INTEL,
+		.device		= PCI_DEVICE_ID_INTEL_82573L_SOL,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.setup		= skip_tx_en_setup,
+	},
+	{
+		.vendor		= PCI_VENDOR_ID_INTEL,
+		.device		= PCI_DEVICE_ID_INTEL_82573E_SOL,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.setup		= skip_tx_en_setup,
+	},
 	/*
 	 * ITE
 	 */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 114b8192eab9..aca8c458aa8a 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2323,6 +2323,9 @@
 #define PCI_DEVICE_ID_INTEL_82378	0x0484
 #define PCI_DEVICE_ID_INTEL_I960	0x0960
 #define PCI_DEVICE_ID_INTEL_I960RM	0x0962
+#define PCI_DEVICE_ID_INTEL_8257X_SOL	0x1062
+#define PCI_DEVICE_ID_INTEL_82573E_SOL	0x1085
+#define PCI_DEVICE_ID_INTEL_82573L_SOL	0x108F
 #define PCI_DEVICE_ID_INTEL_82815_MC	0x1130
 #define PCI_DEVICE_ID_INTEL_82815_CGC	0x1132
 #define PCI_DEVICE_ID_INTEL_82092AA_0	0x1221
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 90bbbf0b1161..df9245c7bd3b 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -296,6 +296,7 @@ struct uart_port {
 #define UPF_HARDPPS_CD		((__force upf_t) (1 << 11))
 #define UPF_LOW_LATENCY		((__force upf_t) (1 << 13))
 #define UPF_BUGGY_UART		((__force upf_t) (1 << 14))
+#define UPF_NO_TXEN_TEST	((__force upf_t) (1 << 15))
 #define UPF_MAGIC_MULTIPLIER	((__force upf_t) (1 << 16))
 #define UPF_CONS_FLOW		((__force upf_t) (1 << 23))
 #define UPF_SHARE_IRQ		((__force upf_t) (1 << 24))
-- 
cgit v1.2.3


From 216773a787c3c46ef26bf1742c1fdba37d26be45 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 14 Feb 2009 01:59:06 +0100
Subject: Consolidate driver_probe_done() loops into one place

there's a few places that currently loop over driver_probe_done(), and
I'm about to add another one. This patch abstracts it into a helper
to reduce duplication.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <lenb@kernel.org>
Acked-by: Greg KH <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/dd.c      | 17 +++++++++++++++++
 include/linux/device.h |  2 ++
 init/do_mounts.c       | 13 +++++++++----
 init/do_mounts_md.c    |  5 +++--
 4 files changed, 31 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 315bed8d5e7f..135231239103 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -18,9 +18,11 @@
  */
 
 #include <linux/device.h>
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/wait.h>
+#include <linux/async.h>
 
 #include "base.h"
 #include "power/power.h"
@@ -167,6 +169,21 @@ int driver_probe_done(void)
 	return 0;
 }
 
+/**
+ * wait_for_device_probe
+ * Wait for device probing to be completed.
+ *
+ * Note: this function polls at 100 msec intervals.
+ */
+int wait_for_device_probe(void)
+{
+	/* wait for the known devices to complete their probing */
+	while (driver_probe_done() != 0)
+		msleep(100);
+	async_synchronize_full();
+	return 0;
+}
+
 /**
  * driver_probe_device - attempt to bind device & driver together
  * @drv: driver to bind a device to
diff --git a/include/linux/device.h b/include/linux/device.h
index 45e5b1921fbb..47f343c7bdda 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -147,6 +147,8 @@ extern void put_driver(struct device_driver *drv);
 extern struct device_driver *driver_find(const char *name,
 					 struct bus_type *bus);
 extern int driver_probe_done(void);
+extern int wait_for_device_probe(void);
+
 
 /* sysfs interface for exporting driver attributes */
 
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 708105e163df..8d4ff5afc1d8 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -370,10 +370,14 @@ void __init prepare_namespace(void)
 		ssleep(root_delay);
 	}
 
-	/* wait for the known devices to complete their probing */
-	while (driver_probe_done() != 0)
-		msleep(100);
-	async_synchronize_full();
+	/*
+	 * wait for the known devices to complete their probing
+	 *
+	 * Note: this is a potential source of long boot delays.
+	 * For example, it is not atypical to wait 5 seconds here
+	 * for the touchpad of a laptop to initialize.
+	 */
+	wait_for_device_probe();
 
 	md_run_setup();
 
@@ -399,6 +403,7 @@ void __init prepare_namespace(void)
 		while (driver_probe_done() != 0 ||
 			(ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
 			msleep(100);
+		async_synchronize_full();
 	}
 
 	is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR;
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index ff95e3192884..9bdddbcb3d6a 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -281,8 +281,9 @@ static void __init autodetect_raid(void)
 	 */
 	printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n");
 	printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n");
-	while (driver_probe_done() < 0)
-		msleep(100);
+
+	wait_for_device_probe();
+
 	fd = sys_open("/dev/md0", 0, 0);
 	if (fd >= 0) {
 		sys_ioctl(fd, RAID_AUTORUN, raid_autopart);
-- 
cgit v1.2.3


From 770824bdc421ff58a64db608294323571c949f4c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 22 Feb 2009 18:38:50 +0100
Subject: PM: Split up sysdev_[suspend|resume] from device_power_[down|up]

Move the sysdev_suspend/resume from the callee to the callers, with
no real change in semantics, so that we can rework the disabling of
interrupts during suspend/hibernation.

This is based on an earlier patch from Linus.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/apm_32.c  |  4 ++++
 drivers/base/base.h       |  2 --
 drivers/base/power/main.c |  3 ---
 drivers/xen/manage.c      |  8 ++++++++
 include/linux/pm.h        |  2 ++
 kernel/kexec.c            |  7 +++++++
 kernel/power/disk.c       | 11 +++++++++++
 kernel/power/main.c       |  8 ++++++--
 8 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 98807bb095ad..266ec6c18b6c 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1192,6 +1192,7 @@ static int suspend(int vetoable)
 	device_suspend(PMSG_SUSPEND);
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+	sysdev_suspend(PMSG_SUSPEND);
 
 	local_irq_enable();
 
@@ -1208,6 +1209,7 @@ static int suspend(int vetoable)
 	if (err != APM_SUCCESS)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 	local_irq_enable();
 	device_resume(PMSG_RESUME);
@@ -1228,6 +1230,7 @@ static void standby(void)
 
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+	sysdev_suspend(PMSG_SUSPEND);
 	local_irq_enable();
 
 	err = set_system_power_state(APM_STATE_STANDBY);
@@ -1235,6 +1238,7 @@ static void standby(void)
 		apm_error("standby", err);
 
 	local_irq_disable();
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 	local_irq_enable();
 }
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0a5f055dffba..9f50f1b545dc 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -88,8 +88,6 @@ extern void driver_detach(struct device_driver *drv);
 extern int driver_probe_device(struct device_driver *drv, struct device *dev);
 
 extern void sysdev_shutdown(void);
-extern int sysdev_suspend(pm_message_t state);
-extern int sysdev_resume(void);
 
 extern char *make_class_name(const char *name, struct kobject *kobj);
 
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 670c9d6c1407..2d14f4ae6c01 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -333,7 +333,6 @@ static void dpm_power_up(pm_message_t state)
  */
 void device_power_up(pm_message_t state)
 {
-	sysdev_resume();
 	dpm_power_up(state);
 }
 EXPORT_SYMBOL_GPL(device_power_up);
@@ -577,8 +576,6 @@ int device_power_down(pm_message_t state)
 		}
 		dev->power.status = DPM_OFF_IRQ;
 	}
-	if (!error)
-		error = sysdev_suspend(state);
 	if (error)
 		dpm_power_up(resume_event(state));
 	return error;
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 9b91617b9582..56892a142ee2 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -45,6 +45,13 @@ static int xen_suspend(void *data)
 		       err);
 		return err;
 	}
+	err = sysdev_suspend(PMSG_SUSPEND);
+	if (err) {
+		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
+			err);
+		device_power_up(PMSG_RESUME);
+		return err;
+	}
 
 	xen_mm_pin_all();
 	gnttab_suspend();
@@ -61,6 +68,7 @@ static int xen_suspend(void *data)
 	gnttab_resume();
 	xen_mm_unpin_all();
 
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 
 	if (!*cancelled) {
diff --git a/include/linux/pm.h b/include/linux/pm.h
index de2e0a8f6728..24ba5f67b3a3 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -381,10 +381,12 @@ struct dev_pm_info {
 
 #ifdef CONFIG_PM_SLEEP
 extern void device_pm_lock(void);
+extern int sysdev_resume(void);
 extern void device_power_up(pm_message_t state);
 extern void device_resume(pm_message_t state);
 
 extern void device_pm_unlock(void);
+extern int sysdev_suspend(pm_message_t state);
 extern int device_power_down(pm_message_t state);
 extern int device_suspend(pm_message_t state);
 extern int device_prepare_suspend(pm_message_t state);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8a6d7b08864e..483899578259 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1465,6 +1465,11 @@ int kernel_kexec(void)
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
 			goto Enable_irqs;
+
+		/* Suspend system devices */
+		error = sysdev_suspend(PMSG_FREEZE);
+		if (error)
+			goto Power_up_devices;
 	} else
 #endif
 	{
@@ -1477,6 +1482,8 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
+		sysdev_resume();
+ Power_up_devices:
 		device_power_up(PMSG_RESTORE);
  Enable_irqs:
 		local_irq_enable();
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 7b40e94b1d42..4a4a206b1979 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -227,6 +227,12 @@ static int create_image(int platform_mode)
 			"aborting hibernation\n");
 		goto Enable_irqs;
 	}
+	sysdev_suspend(PMSG_FREEZE);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down, "
+			"aborting hibernation\n");
+		goto Power_up_devices;
+	}
 
 	if (hibernation_test(TEST_CORE))
 		goto Power_up;
@@ -242,9 +248,11 @@ static int create_image(int platform_mode)
 	if (!in_suspend)
 		platform_leave(platform_mode);
  Power_up:
+	sysdev_resume();
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
+ Power_up_devices:
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Enable_irqs:
@@ -335,6 +343,7 @@ static int resume_target_kernel(void)
 			"aborting resume\n");
 		goto Enable_irqs;
 	}
+	sysdev_suspend(PMSG_QUIESCE);
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
 	error = restore_highmem();
@@ -357,6 +366,7 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
+	sysdev_resume();
 	device_power_up(PMSG_RECOVER);
  Enable_irqs:
 	local_irq_enable();
@@ -440,6 +450,7 @@ int hibernation_platform_enter(void)
 	local_irq_disable();
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
+		sysdev_suspend(PMSG_HIBERNATE);
 		hibernation_ops->enter();
 		/* We should never get here */
 		while (1);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b4d219016b6c..c9632f841f64 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -298,8 +298,12 @@ static int suspend_enter(suspend_state_t state)
 		goto Done;
 	}
 
-	if (!suspend_test(TEST_CORE))
-		error = suspend_ops->enter(state);
+	error = sysdev_suspend(PMSG_SUSPEND);
+	if (!error) {
+		if (!suspend_test(TEST_CORE))
+			error = suspend_ops->enter(state);
+		sysdev_resume();
+	}
 
 	device_power_up(PMSG_RESUME);
  Done:
-- 
cgit v1.2.3