From 704126ad81b8cb7d3d70adb9ecb143f4d3fb38af Mon Sep 17 00:00:00 2001
From: Yu Zhao
Date: Sun, 4 Jan 2009 16:28:52 +0800
Subject: VT-d: handle Invalidation Queue Error to avoid system hang

When hardware detects any error with a descriptor from the invalidation
queue, it stops fetching new descriptors from the queue until software
clears the Invalidation Queue Error bit in the Fault Status register.
The following fix handles the IQE so that the kernel won't be trapped
in an infinite loop.

Signed-off-by: Yu Zhao
Signed-off-by: David Woodhouse
---
 include/linux/intel-iommu.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index c4f6c101dbcd..d2e3cbfba14f 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -194,6 +194,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 /* FSTS_REG */
 #define DMA_FSTS_PPF ((u32)2)
 #define DMA_FSTS_PFO ((u32)1)
+#define DMA_FSTS_IQE (1 << 4)
 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
 
 /* FRCD_REG, 32 bits access */
@@ -328,7 +329,7 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 			  unsigned int size_order, u64 type,
 			  int non_present_entry_flush);
 
-extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
+extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
 extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t);
 extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t);
--
cgit v1.2.3

From 4ab0d47d0ab311eb181532c1ecb6d02905685071 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 24 Feb 2009 17:35:12 -0800
Subject: gpu/drm, x86, PAT: io_mapping_create_wc and resource_size_t

io_mapping_create_wc should take a resource_size_t parameter in place of
unsigned long; with unsigned long there is no way to map addresses above
4GB on i386/32-bit. On x86, addresses above 4GB cannot be mapped on i386
without PAE, so return an error in that case.

The patch also adds a structure for io_mapping that saves the base, size
and type on HAVE_ATOMIC_IOMAP archs, which can be used to verify the
offset on io_mapping_map calls.
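As an aside (not part of the patch), a minimal sketch of how a driver
might use the reworked API; the names and BAR values here are
hypothetical, and error handling is abbreviated:

    /* Hedged usage sketch: fb_map, example_init and example_write32 are
     * made-up names; the base/length would come from a PCI BAR. */
    #include <linux/io-mapping.h>

    static struct io_mapping *fb_map;

    static int example_init(resource_size_t bar_base, unsigned long bar_len)
    {
        /* Returns NULL on i386 without PAE if bar_base + bar_len > 4GB */
        fb_map = io_mapping_create_wc(bar_base, bar_len);
        if (!fb_map)
            return -ENOMEM;
        return 0;
    }

    static void example_write32(unsigned long offset, u32 val)
    {
        unsigned long page_base = offset & PAGE_MASK;
        unsigned int page_off = offset & ~PAGE_MASK;
        void __iomem *vaddr;

        /* The offset is now validated against the mapping size (BUG_ON) */
        vaddr = io_mapping_map_atomic_wc(fb_map, page_base);
        iowrite32(val, vaddr + page_off);
        io_mapping_unmap_atomic(vaddr);
    }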
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Suresh Siddha
Cc: Dave Airlie
Cc: Jesse Barnes
Cc: Eric Anholt
Cc: Keith Packard
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/iomap.h |  3 +++
 arch/x86/mm/iomap_32.c       | 18 +++++++++++++++++
 include/linux/io-mapping.h   | 46 +++++++++++++++++++++++++++++++++-----------
 3 files changed, 56 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index c1f06289b14b..86af26091d6c 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -23,6 +23,9 @@
 #include
 #include
 
+int
+is_io_mapping_possible(resource_size_t base, unsigned long size);
+
 void *
 iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index ca53224fc56c..6c2b1af16926 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -20,6 +20,24 @@
 #include
 #include
 
+#ifdef CONFIG_X86_PAE
+int
+is_io_mapping_possible(resource_size_t base, unsigned long size)
+{
+	return 1;
+}
+#else
+int
+is_io_mapping_possible(resource_size_t base, unsigned long size)
+{
+	/* There is no way to map greater than 1 << 32 address without PAE */
+	if (base + size > 0x100000000ULL)
+		return 0;
+
+	return 1;
+}
+#endif
+
 /* Map 'pfn' using fixed map 'type' and protections 'prot' */
 void *
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 82df31726a54..cbc2f0cd631b 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -30,11 +30,14 @@
  * See Documentation/io_mapping.txt
  */
 
-/* this struct isn't actually defined anywhere */
-struct io_mapping;
-
 #ifdef CONFIG_HAVE_ATOMIC_IOMAP
 
+struct io_mapping {
+	resource_size_t base;
+	unsigned long size;
+	pgprot_t prot;
+};
+
 /*
  * For small address space machines, mapping large objects
  * into the kernel virtual space isn't practical. Where
@@ -43,23 +46,40 @@ struct io_mapping;
  */
 static inline struct io_mapping *
-io_mapping_create_wc(unsigned long base, unsigned long size)
+io_mapping_create_wc(resource_size_t base, unsigned long size)
 {
-	return (struct io_mapping *) base;
+	struct io_mapping *iomap;
+
+	if (!is_io_mapping_possible(base, size))
+		return NULL;
+
+	iomap = kmalloc(sizeof(*iomap), GFP_KERNEL);
+	if (!iomap)
+		return NULL;
+
+	iomap->base = base;
+	iomap->size = size;
+	iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL));
+	return iomap;
 }
 
 static inline void
 io_mapping_free(struct io_mapping *mapping)
 {
+	kfree(mapping);
 }
 
 /* Atomic map/unmap */
 static inline void *
 io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
 {
-	offset += (unsigned long) mapping;
-	return iomap_atomic_prot_pfn(offset >> PAGE_SHIFT, KM_USER0,
-				     __pgprot(__PAGE_KERNEL_WC));
+	resource_size_t phys_addr;
+	unsigned long pfn;
+
+	BUG_ON(offset >= mapping->size);
+	phys_addr = mapping->base + offset;
+	pfn = (unsigned long) (phys_addr >> PAGE_SHIFT);
+	return iomap_atomic_prot_pfn(pfn, KM_USER0, mapping->prot);
 }
 
 static inline void
@@ -71,8 +91,9 @@ io_mapping_unmap_atomic(void *vaddr)
 static inline void *
 io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
 {
-	offset += (unsigned long) mapping;
-	return ioremap_wc(offset, PAGE_SIZE);
+	BUG_ON(offset >= mapping->size);
+	resource_size_t phys_addr = mapping->base + offset;
+	return ioremap_wc(phys_addr, PAGE_SIZE);
 }
 
 static inline void
@@ -83,9 +104,12 @@ io_mapping_unmap(void *vaddr)
 
 #else
 
+/* this struct isn't actually defined anywhere */
+struct io_mapping;
+
 /* Create the io_mapping object*/
 static inline struct io_mapping *
-io_mapping_create_wc(unsigned long base, unsigned long size)
+io_mapping_create_wc(resource_size_t base, unsigned long size)
 {
 	return (struct io_mapping *) ioremap_wc(base, size);
 }
--
cgit v1.2.3

From 8fed43684174b68f04d01d1210fd00536af790df Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz
Date: Wed, 25 Feb 2009 20:28:24 +0100
Subject: ide: fix refcounting in device drivers

During host driver module removal, del_gendisk() results in a final put
on drive->gendev and the drive being freed by drive_release_dev().

Convert device drivers from using struct kref to using struct device so
that the device driver's object holds a reference on ->gendev and
prevents the drive from going away prematurely.

Also fix the ->remove methods so they don't erroneously drop a reference
on the host driver, by using only put_device() instead of ide*_put().
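The conversion below follows the standard driver-model idiom: embed a
struct device in the driver object, give it a release() callback, and
let get_device()/put_device() manage the lifetime. A generalized sketch
of that idiom (struct foo and the foo_* names are hypothetical, not
taken from this patch):

    #include <linux/device.h>
    #include <linux/slab.h>

    struct foo {
        struct device dev;          /* replaces the old struct kref */
    };

    static void foo_release(struct device *dev)
    {
        struct foo *f = container_of(dev, struct foo, dev);

        kfree(f);                   /* runs once the last reference is gone */
    }

    static int foo_probe(struct device *parent)
    {
        struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

        if (!f)
            return -ENOMEM;

        /* device_add() takes a reference on the parent; this is what
         * keeps ->gendev pinned while the driver object is alive */
        f->dev.parent = parent;
        f->dev.release = foo_release;
        dev_set_name(&f->dev, "foo0");

        if (device_register(&f->dev)) {
            put_device(&f->dev);    /* foo_release() frees f */
            return -ENODEV;
        }
        return 0;
    }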
Reported-by: Stanislaw Gruszka
Tested-by: Stanislaw Gruszka
Signed-off-by: Bartlomiej Zolnierkiewicz
---
 drivers/ide/ide-cd.c   | 27 ++++++++++++++++++---------
 drivers/ide/ide-cd.h   |  2 +-
 drivers/ide/ide-gd.c   | 26 +++++++++++++++++---------
 drivers/ide/ide-gd.h   |  2 +-
 drivers/ide/ide-tape.c | 29 +++++++++++++++++++----------
 include/linux/ide.h    |  2 +-
 6 files changed, 57 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 690475b834de..ddfbea41d296 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -55,7 +55,7 @@
 
 static DEFINE_MUTEX(idecd_ref_mutex);
 
-static void ide_cd_release(struct kref *);
+static void ide_cd_release(struct device *);
 
 static struct cdrom_info *ide_cd_get(struct gendisk *disk)
 {
@@ -67,7 +67,7 @@ static struct cdrom_info *ide_cd_get(struct gendisk *disk)
 		if (ide_device_get(cd->drive))
 			cd = NULL;
 		else
-			kref_get(&cd->kref);
+			get_device(&cd->dev);
 	}
 	mutex_unlock(&idecd_ref_mutex);
 	return cd;
@@ -79,7 +79,7 @@ static void ide_cd_put(struct cdrom_info *cd)
 	ide_drive_t *drive = cd->drive;
 
 	mutex_lock(&idecd_ref_mutex);
-	kref_put(&cd->kref, ide_cd_release);
+	put_device(&cd->dev);
 	ide_device_put(drive);
 	mutex_unlock(&idecd_ref_mutex);
 }
@@ -1798,15 +1798,17 @@ static void ide_cd_remove(ide_drive_t *drive)
 	ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
 
 	ide_proc_unregister_driver(drive, info->driver);
-
+	device_del(&info->dev);
 	del_gendisk(info->disk);
 
-	ide_cd_put(info);
+	mutex_lock(&idecd_ref_mutex);
+	put_device(&info->dev);
+	mutex_unlock(&idecd_ref_mutex);
 }
 
-static void ide_cd_release(struct kref *kref)
+static void ide_cd_release(struct device *dev)
 {
-	struct cdrom_info *info = to_ide_drv(kref, cdrom_info);
+	struct cdrom_info *info = to_ide_drv(dev, cdrom_info);
 	struct cdrom_device_info *devinfo = &info->devinfo;
 	ide_drive_t *drive = info->drive;
 	struct gendisk *g = info->disk;
@@ -2005,7 +2007,12 @@ static int ide_cd_probe(ide_drive_t *drive)
 
 	ide_init_disk(g, drive);
 
-	kref_init(&info->kref);
+	info->dev.parent = &drive->gendev;
+	info->dev.release = ide_cd_release;
+	dev_set_name(&info->dev, dev_name(&drive->gendev));
+
+	if (device_register(&info->dev))
+		goto out_free_disk;
 
 	info->drive = drive;
 	info->driver = &ide_cdrom_driver;
@@ -2019,7 +2026,7 @@ static int ide_cd_probe(ide_drive_t *drive)
 	g->driverfs_dev = &drive->gendev;
 	g->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE;
 	if (ide_cdrom_setup(drive)) {
-		ide_cd_release(&info->kref);
+		put_device(&info->dev);
 		goto failed;
 	}
 
@@ -2029,6 +2036,8 @@ static int ide_cd_probe(ide_drive_t *drive)
 	add_disk(g);
 	return 0;
 
+out_free_disk:
+	put_disk(g);
 out_free_cd:
 	kfree(info);
 failed:
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index ac40d6cb90a2..c878bfcf1116 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -80,7 +80,7 @@ struct cdrom_info {
 	ide_drive_t		*drive;
 	struct ide_driver	*driver;
 	struct gendisk		*disk;
-	struct kref		kref;
+	struct device		dev;
 
 	/* Buffer for table of contents.  NULL if we haven't
 	   allocated a TOC buffer for this device yet. */
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 7857b209c6df..047109419902 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -25,7 +25,7 @@ module_param(debug_mask, ulong, 0644);
 
 static DEFINE_MUTEX(ide_disk_ref_mutex);
 
-static void ide_disk_release(struct kref *);
+static void ide_disk_release(struct device *);
 
 static struct ide_disk_obj *ide_disk_get(struct gendisk *disk)
 {
@@ -37,7 +37,7 @@ static struct ide_disk_obj *ide_disk_get(struct gendisk *disk)
 		if (ide_device_get(idkp->drive))
 			idkp = NULL;
 		else
-			kref_get(&idkp->kref);
+			get_device(&idkp->dev);
 	}
 	mutex_unlock(&ide_disk_ref_mutex);
 	return idkp;
@@ -48,7 +48,7 @@ static void ide_disk_put(struct ide_disk_obj *idkp)
 	ide_drive_t *drive = idkp->drive;
 
 	mutex_lock(&ide_disk_ref_mutex);
-	kref_put(&idkp->kref, ide_disk_release);
+	put_device(&idkp->dev);
 	ide_device_put(drive);
 	mutex_unlock(&ide_disk_ref_mutex);
 }
@@ -66,17 +66,18 @@ static void ide_gd_remove(ide_drive_t *drive)
 	struct gendisk *g = idkp->disk;
 
 	ide_proc_unregister_driver(drive, idkp->driver);
-
+	device_del(&idkp->dev);
 	del_gendisk(g);
-
 	drive->disk_ops->flush(drive);
 
-	ide_disk_put(idkp);
+	mutex_lock(&ide_disk_ref_mutex);
+	put_device(&idkp->dev);
+	mutex_unlock(&ide_disk_ref_mutex);
 }
 
-static void ide_disk_release(struct kref *kref)
+static void ide_disk_release(struct device *dev)
 {
-	struct ide_disk_obj *idkp = to_ide_drv(kref, ide_disk_obj);
+	struct ide_disk_obj *idkp = to_ide_drv(dev, ide_disk_obj);
 	ide_drive_t *drive = idkp->drive;
 	struct gendisk *g = idkp->disk;
 
@@ -348,7 +349,12 @@ static int ide_gd_probe(ide_drive_t *drive)
 
 	ide_init_disk(g, drive);
 
-	kref_init(&idkp->kref);
+	idkp->dev.parent = &drive->gendev;
+	idkp->dev.release = ide_disk_release;
+	dev_set_name(&idkp->dev, dev_name(&drive->gendev));
+
+	if (device_register(&idkp->dev))
+		goto out_free_disk;
 
 	idkp->drive = drive;
 	idkp->driver = &ide_gd_driver;
@@ -373,6 +379,8 @@ static int ide_gd_probe(ide_drive_t *drive)
 	add_disk(g);
 	return 0;
 
+out_free_disk:
+	put_disk(g);
 out_free_idkp:
 	kfree(idkp);
 failed:
diff --git a/drivers/ide/ide-gd.h b/drivers/ide/ide-gd.h
index a86779f0756b..b604bdd318a1 100644
--- a/drivers/ide/ide-gd.h
+++ b/drivers/ide/ide-gd.h
@@ -17,7 +17,7 @@ struct ide_disk_obj {
 	ide_drive_t	*drive;
 	struct ide_driver	*driver;
 	struct gendisk	*disk;
-	struct kref	kref;
+	struct device	dev;
 	unsigned int	openers;	/* protected by BKL for now */
 
 	/* Last failed packet command */
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d7ecd3c79757..bb450a7608c2 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -169,7 +169,7 @@ typedef struct ide_tape_obj {
 	ide_drive_t	*drive;
 	struct ide_driver *driver;
 	struct gendisk	*disk;
-	struct kref	kref;
+	struct device	dev;
 
 	/*
 	 *  failed_pc points to the last failed packet command, or contains
@@ -267,7 +267,7 @@ static DEFINE_MUTEX(idetape_ref_mutex);
 
 static struct class *idetape_sysfs_class;
 
-static void ide_tape_release(struct kref *);
+static void ide_tape_release(struct device *);
 
 static struct ide_tape_obj *ide_tape_get(struct gendisk *disk)
 {
@@ -279,7 +279,7 @@ static struct ide_tape_obj *ide_tape_get(struct gendisk *disk)
 		if (ide_device_get(tape->drive))
 			tape = NULL;
 		else
-			kref_get(&tape->kref);
+			get_device(&tape->dev);
 	}
 	mutex_unlock(&idetape_ref_mutex);
 	return tape;
@@ -290,7 +290,7 @@ static void ide_tape_put(struct ide_tape_obj *tape)
 	ide_drive_t *drive = tape->drive;
 
 	mutex_lock(&idetape_ref_mutex);
-	kref_put(&tape->kref, ide_tape_release);
+	put_device(&tape->dev);
 	ide_device_put(drive);
 	mutex_unlock(&idetape_ref_mutex);
 }
@@ -308,7 +308,7 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	mutex_lock(&idetape_ref_mutex);
 	tape = idetape_devs[i];
 	if (tape)
-		kref_get(&tape->kref);
+		get_device(&tape->dev);
 	mutex_unlock(&idetape_ref_mutex);
 	return tape;
 }
@@ -2256,15 +2256,17 @@ static void ide_tape_remove(ide_drive_t *drive)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_proc_unregister_driver(drive, tape->driver);
-
+	device_del(&tape->dev);
 	ide_unregister_region(tape->disk);
 
-	ide_tape_put(tape);
+	mutex_lock(&idetape_ref_mutex);
+	put_device(&tape->dev);
+	mutex_unlock(&idetape_ref_mutex);
 }
 
-static void ide_tape_release(struct kref *kref)
+static void ide_tape_release(struct device *dev)
 {
-	struct ide_tape_obj *tape = to_ide_drv(kref, ide_tape_obj);
+	struct ide_tape_obj *tape = to_ide_drv(dev, ide_tape_obj);
 	ide_drive_t *drive = tape->drive;
 	struct gendisk *g = tape->disk;
 
@@ -2407,7 +2409,12 @@ static int ide_tape_probe(ide_drive_t *drive)
 
 	ide_init_disk(g, drive);
 
-	kref_init(&tape->kref);
+	tape->dev.parent = &drive->gendev;
+	tape->dev.release = ide_tape_release;
+	dev_set_name(&tape->dev, dev_name(&drive->gendev));
+
+	if (device_register(&tape->dev))
+		goto out_free_disk;
 
 	tape->drive = drive;
 	tape->driver = &idetape_driver;
@@ -2436,6 +2443,8 @@ static int ide_tape_probe(ide_drive_t *drive)
 
 	return 0;
 
+out_free_disk:
+	put_disk(g);
 out_free_tape:
 	kfree(tape);
 failed:
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 194da5a4b0d6..fe235b65207e 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -663,7 +663,7 @@ typedef struct ide_drive_s ide_drive_t;
 #define to_ide_device(dev)	container_of(dev, ide_drive_t, gendev)
 
 #define to_ide_drv(obj, cont_type)	\
-	container_of(obj, struct cont_type, kref)
+	container_of(obj, struct cont_type, dev)
 
 #define ide_drv_g(disk, cont_type) \
 	container_of((disk)->private_data, struct cont_type, driver)
--
cgit v1.2.3

From 1e42807918d17e8c93bf14fbb74be84b141334c1 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 23 Feb 2009 09:03:10 +0100
Subject: block: reduce stack footprint of blk_recount_segments()

blk_recalc_rq_segments() requires a request structure to be passed in,
which we don't have from blk_recount_segments(). So the latter allocates
one on the stack, using more than 400 bytes of stack for that.
This can cause us to spill over one page of stack from ext4 at least:

 0)     4560     400   blk_recount_segments+0x43/0x62
 1)     4160      32   bio_phys_segments+0x1c/0x24
 2)     4128      32   blk_rq_bio_prep+0x2a/0xf9
 3)     4096      32   init_request_from_bio+0xf9/0xfe
 4)     4064     112   __make_request+0x33c/0x3f6
 5)     3952     144   generic_make_request+0x2d1/0x321
 6)     3808      64   submit_bio+0xb9/0xc3
 7)     3744      48   submit_bh+0xea/0x10e
 8)     3696     368   ext4_mb_init_cache+0x257/0xa6a [ext4]
 9)     3328     288   ext4_mb_regular_allocator+0x421/0xcd9 [ext4]
10)     3040     160   ext4_mb_new_blocks+0x211/0x4b4 [ext4]
11)     2880     336   ext4_ext_get_blocks+0xb61/0xd45 [ext4]
12)     2544      96   ext4_get_blocks_wrap+0xf2/0x200 [ext4]
13)     2448      80   ext4_da_get_block_write+0x6e/0x16b [ext4]
14)     2368     352   mpage_da_map_blocks+0x7e/0x4b3 [ext4]
15)     2016     352   ext4_da_writepages+0x2ce/0x43c [ext4]
16)     1664      32   do_writepages+0x2d/0x3c
17)     1632     144   __writeback_single_inode+0x162/0x2cd
18)     1488      96   generic_sync_sb_inodes+0x1e3/0x32b
19)     1392      16   sync_sb_inodes+0xe/0x10
20)     1376      48   writeback_inodes+0x69/0xb3
21)     1328     208   balance_dirty_pages_ratelimited_nr+0x187/0x2f9
22)     1120     224   generic_file_buffered_write+0x1d4/0x2c4
23)      896     176   __generic_file_aio_write_nolock+0x35f/0x393
24)      720      80   generic_file_aio_write+0x6c/0xc8
25)      640      80   ext4_file_write+0xa9/0x137 [ext4]
26)      560     320   do_sync_write+0xf0/0x137
27)      240      48   vfs_write+0xb3/0x13c
28)      192      64   sys_write+0x4c/0x74
29)      128     128   system_call_fastpath+0x16/0x1b

Split the segment counting out into a __blk_recalc_rq_segments() helper
to avoid allocating an onstack request just for checking the physical
segment count.

Signed-off-by: Jens Axboe
---
 block/blk-merge.c      | 94 ++++++++++++++++++++++++++++----------------------
 include/linux/blkdev.h |  2 ++
 2 files changed, 55 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index b92f5b0866b0..a104593e70c3 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -38,72 +38,84 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
 	}
 }
 
-void blk_recalc_rq_segments(struct request *rq)
+static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
+					     struct bio *bio,
+					     unsigned int *seg_size_ptr)
 {
-	int nr_phys_segs;
 	unsigned int phys_size;
 	struct bio_vec *bv, *bvprv = NULL;
-	int seg_size;
-	int cluster;
-	struct req_iterator iter;
-	int high, highprv = 1;
-	struct request_queue *q = rq->q;
+	int cluster, i, high, highprv = 1;
+	unsigned int seg_size, nr_phys_segs;
+	struct bio *fbio;
 
-	if (!rq->bio)
-		return;
+	if (!bio)
+		return 0;
 
+	fbio = bio;
 	cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	seg_size = 0;
 	phys_size = nr_phys_segs = 0;
-	rq_for_each_segment(bv, rq, iter) {
-		/*
-		 * the trick here is making sure that a high page is never
-		 * considered part of another segment, since that might
-		 * change with the bounce page.
-		 */
-		high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
-		if (high || highprv)
-			goto new_segment;
-		if (cluster) {
-			if (seg_size + bv->bv_len > q->max_segment_size)
-				goto new_segment;
-			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
-				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+	for_each_bio(bio) {
+		bio_for_each_segment(bv, bio, i) {
+			/*
+			 * the trick here is making sure that a high page is
+			 * never considered part of another segment, since that
+			 * might change with the bounce page.
+			 */
+			high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
+			if (high || highprv)
 				goto new_segment;
+			if (cluster) {
+				if (seg_size + bv->bv_len > q->max_segment_size)
+					goto new_segment;
+				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
+					goto new_segment;
+				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+					goto new_segment;
+
+				seg_size += bv->bv_len;
+				bvprv = bv;
+				continue;
+			}
+new_segment:
+			if (nr_phys_segs == 1 && seg_size >
+			    fbio->bi_seg_front_size)
+				fbio->bi_seg_front_size = seg_size;
 
-			seg_size += bv->bv_len;
+			nr_phys_segs++;
 			bvprv = bv;
-			continue;
+			seg_size = bv->bv_len;
+			highprv = high;
 		}
-new_segment:
-		if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
-			rq->bio->bi_seg_front_size = seg_size;
-
-		nr_phys_segs++;
-		bvprv = bv;
-		seg_size = bv->bv_len;
-		highprv = high;
 	}
 
-	if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
+	if (seg_size_ptr)
+		*seg_size_ptr = seg_size;
+
+	return nr_phys_segs;
+}
+
+void blk_recalc_rq_segments(struct request *rq)
+{
+	unsigned int seg_size = 0, phys_segs;
+
+	phys_segs = __blk_recalc_rq_segments(rq->q, rq->bio, &seg_size);
+
+	if (phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
 		rq->bio->bi_seg_front_size = seg_size;
 	if (seg_size > rq->biotail->bi_seg_back_size)
 		rq->biotail->bi_seg_back_size = seg_size;
 
-	rq->nr_phys_segments = nr_phys_segs;
+	rq->nr_phys_segments = phys_segs;
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-	struct request rq;
 	struct bio *nxt = bio->bi_next;
 
-	rq.q = q;
-	rq.bio = rq.biotail = bio;
 	bio->bi_next = NULL;
-	blk_recalc_rq_segments(&rq);
+	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, NULL);
 	bio->bi_next = nxt;
-	bio->bi_phys_segments = rq.nr_phys_segments;
 	bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dcaa0fd84b02..465d6babc847 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -708,6 +708,8 @@ struct req_iterator {
 };
 
 /* This should not be used directly - use rq_for_each_segment */
+#define for_each_bio(_bio)		\
+	for (; _bio; _bio = _bio->bi_next)
 #define __rq_for_each_bio(_bio, rq)	\
 	if ((rq->bio))			\
 		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
--
cgit v1.2.3

From b342501cd31e5546d0c9ca8ceff5ded1832f9e5b Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 26 Feb 2009 20:20:29 +0100
Subject: sched: allow architectures to specify sched_clock_stable

Allow CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures to still specify
that their sched_clock() implementation is reliable.

This will be used by x86 to switch on a faster sched_clock_cpu()
implementation on certain CPU types.
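For context, a hedged sketch of how an architecture could flip the new
flag from its CPU setup code; the predicate below is invented for
illustration (the actual x86 wiring landed in a follow-up patch):

    #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
    static void __init example_mark_clock_stable(void)
    {
        /*
         * If the cycle counter is known to be constant-rate and
         * non-stop across idle states, sched_clock_cpu() can fall
         * straight through to sched_clock():
         */
        if (cpu_has_invariant_counter())    /* hypothetical check */
            sched_clock_stable = 1;
    }
    #endif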
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 10 ++++++++++
 kernel/sched_clock.c  | 45 ++++++++++++++++++++-------------------------
 2 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8981e52c714f..a063d19b7a7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1670,6 +1670,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 	return set_cpus_allowed_ptr(p, &new_mask);
 }
 
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+extern int sched_clock_stable;
+#endif
+
 extern unsigned long long sched_clock(void);
 
 extern void sched_clock_init(void);
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852414cc..a755d023805a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -24,11 +24,11 @@
 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
 * consistent between cpus (never more than 2 jiffies difference).
 */
-#include
-#include
 #include
-#include
 #include
+#include
+#include
+#include
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -43,6 +43,10 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 static __read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__read_mostly int sched_clock_stable;
+#else
+static const int sched_clock_stable = 1;
+#endif
 
 struct sched_clock_data {
 	/*
@@ -87,7 +91,7 @@ void sched_clock_init(void)
 }
 
 /*
- * min,max except they take wrapping into account
+ * min, max except they take wrapping into account
 */
 
 static inline u64 wrap_min(u64 x, u64 y)
@@ -116,10 +120,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	if (unlikely(delta < 0))
 		delta = 0;
 
+	if (unlikely(!sched_clock_running))
+		return 0ull;
+
 	/*
 	 * scd->clock = clamp(scd->tick_gtod + delta,
-	 *		      max(scd->tick_gtod, scd->clock),
-	 *		      scd->tick_gtod + TICK_NSEC);
+	 *		      max(scd->tick_gtod, scd->clock),
+	 *		      scd->tick_gtod + TICK_NSEC);
 	 */
 
 	clock = scd->tick_gtod + delta;
@@ -148,12 +155,13 @@ static void lock_double_clock(struct sched_clock_data *data1,
 
 u64 sched_clock_cpu(int cpu)
 {
-	struct sched_clock_data *scd = cpu_sdc(cpu);
 	u64 now, clock, this_clock, remote_clock;
+	struct sched_clock_data *scd;
 
-	if (unlikely(!sched_clock_running))
-		return 0ull;
+	if (sched_clock_stable)
+		return sched_clock();
 
+	scd = cpu_sdc(cpu);
 	WARN_ON_ONCE(!irqs_disabled());
 	now = sched_clock();
 
@@ -193,6 +201,8 @@ u64 sched_clock_cpu(int cpu)
 	return clock;
 }
 
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+
 void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd = this_scd();
@@ -235,22 +245,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-
-void sched_clock_init(void)
-{
-	sched_clock_running = 1;
-}
-
-u64 sched_clock_cpu(int cpu)
-{
-	if (unlikely(!sched_clock_running))
-		return 0;
-
-	return sched_clock();
-}
-
-#endif
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 unsigned long long cpu_clock(int cpu)
 {
--
cgit v1.2.3

From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001
From: Dhaval Giani
Date: Fri, 27 Feb 2009 15:13:54 +0530
Subject: sched: don't allow setuid to succeed if the user does not have rt bandwidth

Impact: fix hung task with certain (non-default) rt-limit settings
Corey Hickey reported that when setuid is used to change the uid of an
rt process, the process becomes unkillable and does not run. This is
because there was no rt runtime for that user group.

Add a check to see whether a user can attach an rt task to its task
group. On failure, return EINVAL, which is also returned in
CONFIG_CGROUP_SCHED.

Reported-by: Corey Hickey
Signed-off-by: Dhaval Giani
Acked-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  4 ++++
 kernel/sched.c        | 13 +++++++++++--
 kernel/sys.c          | 31 ++++++++++++++++++-------------
 kernel/user.c         | 18 ++++++++++++++++++
 4 files changed, 53 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8981e52c714f..8c216e057c94 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg);
 extern int sched_group_set_rt_period(struct task_group *tg,
 				      long rt_period_us);
 extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
 #endif
 
+extern int task_can_switch_user(struct user_struct *up,
+					struct task_struct *tsk);
+
 #ifdef CONFIG_TASK_XACCT
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index c3baa9653d1d..8e2558c2ba67 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void)
 
 	return ret;
 }
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+	/* Don't accept realtime tasks when there is no way for them to run */
+	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+		return 0;
+
+	return 1;
+}
+
 #else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
@@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		      struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-	/* Don't accept realtime tasks when there is no way for them to run */
-	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
 		return -EINVAL;
 #else
 	/* We don't support RT-tasks being in separate groups */
diff --git a/kernel/sys.c b/kernel/sys.c
index f145c415bc16..37f458e6882a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -559,7 +559,7 @@ error:
 	abort_creds(new);
 	return retval;
 }
-
+
 /*
  * change the user struct in a credentials set to match the new UID
  */
@@ -571,6 +571,11 @@ static int set_user(struct cred *new)
 	if (!new_user)
 		return -EAGAIN;
 
+	if (!task_can_switch_user(new_user, current)) {
+		free_uid(new_user);
+		return -EINVAL;
+	}
+
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
 			new_user != INIT_USER) {
@@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 			goto error;
 	}
 
-	retval = -EAGAIN;
-	if (new->uid != old->uid && set_user(new) < 0)
-		goto error;
-
+	if (new->uid != old->uid) {
+		retval = set_user(new);
+		if (retval < 0)
+			goto error;
+	}
 	if (ruid != (uid_t) -1 ||
 	    (euid != (uid_t) -1 && euid != old->uid))
 		new->suid = new->euid;
@@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 	retval = -EPERM;
 	if (capable(CAP_SETUID)) {
 		new->suid = new->uid = uid;
-		if (uid != old->uid && set_user(new) < 0) {
-			retval = -EAGAIN;
-			goto error;
+		if (uid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
 		}
 	} else if (uid != old->uid && uid != new->suid) {
 		goto error;
 	}
 
@@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 			goto error;
 	}
 
-	retval = -EAGAIN;
 	if (ruid != (uid_t) -1) {
 		new->uid = ruid;
-		if (ruid != old->uid && set_user(new) < 0)
-			goto error;
+		if (ruid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
+		}
 	}
 	if (euid != (uid_t) -1)
 		new->euid = euid;
diff --git a/kernel/user.c b/kernel/user.c
index 3551ac742395..6a9b696128c8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags)
 
 #endif
 
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
+/*
+ * We need to check if a setuid can take place. This function should be called
+ * before successfully completing the setuid.
+ */
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+
+	return sched_rt_can_attach(up->tg, tsk);
+
+}
+#else
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+	return 1;
+}
+#endif
+
 /*
  * Locate the user_struct for the passed UID. If found, take a ref on it. The
  * caller must undo that ref with free_uid().
--
cgit v1.2.3

From 5170836679185357dc1b7660bad13287b39e1e33 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 27 Feb 2009 14:03:03 -0800
Subject: Fix recursive lock in free_uid()/free_user_ns()

free_uid() and free_user_ns() are corecursive when CONFIG_USER_SCHED=n,
but free_user_ns() is called from free_uid() by way of uid_hash_remove(),
which requires uidhash_lock to be held. free_user_ns() then calls
free_uid() to complete the destruction.

Fix this by deferring the destruction of the user_namespace.

Signed-off-by: David Howells
Acked-by: Serge Hallyn
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/user_namespace.h |  1 +
 kernel/user_namespace.c        | 21 +++++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 315bcd375224..cc4f45361dbb 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -13,6 +13,7 @@ struct user_namespace {
 	struct kref		kref;
 	struct hlist_head	uidhash_table[UIDHASH_SZ];
 	struct user_struct	*creator;
+	struct work_struct	destroyer;
 };
 
 extern struct user_namespace init_user_ns;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 79084311ee57..076c7c8215b0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -60,12 +60,25 @@ int create_user_ns(struct cred *new)
 	return 0;
 }
 
-void free_user_ns(struct kref *kref)
+/*
+ * Deferred destructor for a user namespace. This is required because
+ * free_user_ns() may be called with uidhash_lock held, but we need to call
+ * back to free_uid() which will want to take the lock again.
+ */
+static void free_user_ns_work(struct work_struct *work)
 {
-	struct user_namespace *ns;
-
-	ns = container_of(kref, struct user_namespace, kref);
+	struct user_namespace *ns =
+		container_of(work, struct user_namespace, destroyer);
 	free_uid(ns->creator);
 	kfree(ns);
 }
+
+void free_user_ns(struct kref *kref)
+{
+	struct user_namespace *ns =
+		container_of(kref, struct user_namespace, kref);
+
+	INIT_WORK(&ns->destroyer, free_user_ns_work);
+	schedule_work(&ns->destroyer);
+}
 EXPORT_SYMBOL(free_user_ns);
--
cgit v1.2.3
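The fix above is an instance of a common kernel idiom: when a destructor
can run in a context that holds a lock the teardown path also needs,
defer the real work to a workqueue, which runs in process context with
no caller locks held. A minimal standalone sketch of that idiom (struct
bar and the bar_* names are hypothetical, not from this patch):

    #include <linux/kref.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct bar {
        struct kref kref;
        struct work_struct destroyer;
    };

    static void bar_destroy_work(struct work_struct *work)
    {
        struct bar *b = container_of(work, struct bar, destroyer);

        kfree(b);               /* safe: no caller locks held here */
    }

    static void bar_release(struct kref *kref)
    {
        struct bar *b = container_of(kref, struct bar, kref);

        /* may be called with a spinlock held: defer the teardown */
        INIT_WORK(&b->destroyer, bar_destroy_work);
        schedule_work(&b->destroyer);
    }

    static void bar_put(struct bar *b)
    {
        kref_put(&b->kref, bar_release);
    }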