From c15227de132f1295f3db6b7df9079956b1020fd8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:52:12 +0200 Subject: block: use normal I/O path for discard requests prepare_discard_fn() was being called in a place where memory allocation was effectively impossible. This makes it inappropriate for all but the most trivial translations of Linux's DISCARD operation to the block command set. Additionally adding a payload there makes the ownership of the bio backing unclear as it's now allocated by the device driver and not the submitter as usual. It is replaced with QUEUE_FLAG_DISCARD which is used to indicate whether the queue supports discard operations or not. blkdev_issue_discard now allocates a one-page, sector-length payload which is the right thing for the common ATA and SCSI implementations. The mtd implementation of prepare_discard_fn() is replaced with simply checking for the request being a discard. Largely based on a previous patch from Matthew Wilcox which did the prepare_discard_fn but not the different payload allocation yet. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e23a86cae5ac..f62d45e87618 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -82,7 +82,6 @@ enum rq_cmd_type_bits { enum { REQ_LB_OP_EJECT = 0x40, /* eject request */ REQ_LB_OP_FLUSH = 0x41, /* flush request */ - REQ_LB_OP_DISCARD = 0x42, /* discard sectors */ }; /* @@ -261,7 +260,6 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); -typedef int (prepare_discard_fn) (struct request_queue *, struct request *); struct bio_vec; struct bvec_merge_data { @@ -340,7 +338,6 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; - prepare_discard_fn *prepare_discard_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; @@ -460,6 +457,7 @@ struct request_queue #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ #define QUEUE_FLAG_CQ 16 /* hardware does queuing */ +#define QUEUE_FLAG_DISCARD 17 /* supports DISCARD */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_CLUSTER) | \ @@ -591,6 +589,7 @@ enum { #define blk_queue_flushing(q) ((q)->ordseq) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) +#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) @@ -955,7 +954,6 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); -extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *); extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); -- cgit v1.2.3 From 67efc9258010da35b27b3854d0880c7e193004ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:54:20 +0200 Subject: block: allow large discard requests Currently we set the bio size to the byte equivalent of the blocks to be trimmed when submitting the initial DISCARD ioctl. That means it is subject to the max_hw_sectors limitation of the HBA which is much lower than the size of a DISCARD request we can support. Add a separate max_discard_sectors tunable to limit the size for discard requests. We limit the max discard request size in bytes to 32bit as that is the limit for bio->bi_size. This could be much larger if we had a way to pass that information through the block layer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-barrier.c | 10 ++++++---- block/blk-core.c | 3 ++- block/blk-settings.c | 13 +++++++++++++ include/linux/blkdev.h | 3 +++ 4 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 21f5025c3945..8873b9b439ff 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -385,6 +385,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, while (nr_sects && !ret) { unsigned int sector_size = q->limits.logical_block_size; + unsigned int max_discard_sectors = + min(q->limits.max_discard_sectors, UINT_MAX >> 9); bio = bio_alloc(gfp_mask, 1); if (!bio) @@ -411,10 +413,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * touch many more blocks on disk than the actual payload * length. */ - if (nr_sects > queue_max_hw_sectors(q)) { - bio->bi_size = queue_max_hw_sectors(q) << 9; - nr_sects -= queue_max_hw_sectors(q); - sector += queue_max_hw_sectors(q); + if (nr_sects > max_discard_sectors) { + bio->bi_size = max_discard_sectors << 9; + nr_sects -= max_discard_sectors; + sector += max_discard_sectors; } else { bio->bi_size = nr_sects << 9; nr_sects = 0; diff --git a/block/blk-core.c b/block/blk-core.c index 80a020dd1580..34504f309728 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1436,7 +1436,8 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - if (unlikely(nr_sectors > queue_max_hw_sectors(q))) { + if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) && + nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), diff --git a/block/blk-settings.c b/block/blk-settings.c index d29498ef1eb5..e0695bca7027 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segment_size = MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; + lim->max_discard_sectors = SAFE_MAX_SECTORS; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -238,6 +239,18 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) } EXPORT_SYMBOL(blk_queue_max_hw_sectors); +/** + * blk_queue_max_discard_sectors - set max sectors for a single discard + * @q: the request queue for the device + * @max_discard: maximum number of sectors to discard + **/ +void blk_queue_max_discard_sectors(struct request_queue *q, + unsigned int max_discard_sectors) +{ + q->limits.max_discard_sectors = max_discard_sectors; +} +EXPORT_SYMBOL(blk_queue_max_discard_sectors); + /** * blk_queue_max_phys_segments - set max phys segments for a request for this queue * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f62d45e87618..1a03b715dfad 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -311,6 +311,7 @@ struct queue_limits { unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; + unsigned int max_discard_sectors; unsigned short logical_block_size; unsigned short max_hw_segments; @@ -928,6 +929,8 @@ extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); +extern void blk_queue_max_discard_sectors(struct request_queue *q, + unsigned int max_discard_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); extern void blk_queue_alignment_offset(struct request_queue *q, -- cgit v1.2.3 From 8e2967555571659d2c8a70dd120710110ed7bba4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2009 16:26:03 +0200 Subject: cfq-iosched: implement slower async initiate and queue ramp up This slowly ramps up the async queue depth based on the time passed since the sync IO, and doesn't allow async at all until a sync slice period has passed. Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++++ block/cfq-iosched.c | 56 ++++++++++++++++++++++++++++++++------------------ include/linux/blkdev.h | 4 ++++ 3 files changed, 48 insertions(+), 20 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index ddaaea4fdffc..a8c7fbe52e24 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2492,6 +2492,14 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *work, + unsigned long delay) +{ + return queue_delayed_work(kblockd_workqueue, work, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work); + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 70b48ea0e3e9..fce8a749f4be 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -150,7 +150,7 @@ struct cfq_data { * idle window management */ struct timer_list idle_slice_timer; - struct work_struct unplug_work; + struct delayed_work unplug_work; struct cfq_queue *active_queue; struct cfq_io_context *active_cic; @@ -268,11 +268,13 @@ static inline int cfq_bio_sync(struct bio *bio) * scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing */ -static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) +static inline void cfq_schedule_dispatch(struct cfq_data *cfqd, + unsigned long delay) { if (cfqd->busy_queues) { cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); + kblockd_schedule_delayed_work(cfqd->queue, &cfqd->unplug_work, + delay); } } @@ -1316,8 +1318,6 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) * Does this cfqq already have too much IO in flight? */ if (cfqq->dispatched >= max_dispatch) { - unsigned long load_at = cfqd->last_end_sync_rq + cfq_slice_sync; - /* * idle queue must always only have a single IO in flight */ @@ -1331,20 +1331,36 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) return 0; /* - * If a sync request has completed recently, don't overload - * the dispatch queue yet with async requests. + * Sole queue user, allow bigger slice */ - if (cfqd->cfq_desktop && !cfq_cfqq_sync(cfqq) - && time_before(jiffies, load_at)) - return 0; + max_dispatch *= 4; + } + + /* + * Async queues must wait a bit before being allowed dispatch. + * We also ramp up the dispatch depth gradually for async IO, + * based on the last sync IO we serviced + */ + if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_desktop) { + unsigned long last_sync = jiffies - cfqd->last_end_sync_rq; + unsigned int depth; /* - * we are the only queue, allow up to 4 times of 'quantum' + * must wait a bit longer */ - if (cfqq->dispatched >= 4 * max_dispatch) + if (last_sync < cfq_slice_sync) { + cfq_schedule_dispatch(cfqd, cfq_slice_sync - last_sync); return 0; + } + + depth = last_sync / cfq_slice_sync; + if (depth < max_dispatch) + max_dispatch = depth; } + if (cfqq->dispatched >= max_dispatch) + return 0; + /* * Dispatch a request from this cfqq */ @@ -1389,7 +1405,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); + cfq_schedule_dispatch(cfqd, 0); } kmem_cache_free(cfq_pool, cfqq); @@ -1484,7 +1500,7 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (unlikely(cfqq == cfqd->active_queue)) { __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); + cfq_schedule_dispatch(cfqd, 0); } cfq_put_queue(cfqq); @@ -2201,7 +2217,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } if (!rq_in_driver(cfqd)) - cfq_schedule_dispatch(cfqd); + cfq_schedule_dispatch(cfqd, 0); } /* @@ -2331,7 +2347,7 @@ queue_fail: if (cic) put_io_context(cic->ioc); - cfq_schedule_dispatch(cfqd); + cfq_schedule_dispatch(cfqd, 0); spin_unlock_irqrestore(q->queue_lock, flags); cfq_log(cfqd, "set_request fail"); return 1; @@ -2340,7 +2356,7 @@ queue_fail: static void cfq_kick_queue(struct work_struct *work) { struct cfq_data *cfqd = - container_of(work, struct cfq_data, unplug_work); + container_of(work, struct cfq_data, unplug_work.work); struct request_queue *q = cfqd->queue; spin_lock_irq(q->queue_lock); @@ -2394,7 +2410,7 @@ static void cfq_idle_slice_timer(unsigned long data) expire: cfq_slice_expired(cfqd, timed_out); out_kick: - cfq_schedule_dispatch(cfqd); + cfq_schedule_dispatch(cfqd, 0); out_cont: spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); } @@ -2402,7 +2418,7 @@ out_cont: static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) { del_timer_sync(&cfqd->idle_slice_timer); - cancel_work_sync(&cfqd->unplug_work); + cancel_delayed_work_sync(&cfqd->unplug_work); } static void cfq_put_async_queues(struct cfq_data *cfqd) @@ -2484,7 +2500,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->idle_slice_timer.function = cfq_idle_slice_timer; cfqd->idle_slice_timer.data = (unsigned long) cfqd; - INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); + INIT_DELAYED_WORK(&cfqd->unplug_work, cfq_kick_queue); cfqd->cfq_quantum = cfq_quantum; cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a03b715dfad..a7323930d2ba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1147,7 +1147,11 @@ static inline void put_dev_sector(Sector p) } struct work_struct; +struct delayed_work; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); +int kblockd_schedule_delayed_work(struct request_queue *q, + struct delayed_work *work, + unsigned long delay); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) -- cgit v1.2.3 From ac481c20ef8f6c6f2be75d581863f40c43874ef7 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Sat, 3 Oct 2009 20:52:01 +0200 Subject: block: Topology ioctls Not all users of the topology information want to use libblkid. Provide the topology information through bdev ioctls. Also clarify sector size comments for existing BLK ioctls. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/compat_ioctl.c | 13 +++++++++++++ block/ioctl.c | 17 +++++++++++++++-- include/linux/blkdev.h | 35 ++++++++++++++++++++++++++++++----- include/linux/fs.h | 4 ++++ 4 files changed, 62 insertions(+), 7 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7865a34e0faa..9bd086c1a4d5 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -21,6 +21,11 @@ static int compat_put_int(unsigned long arg, int val) return put_user(val, (compat_int_t __user *)compat_ptr(arg)); } +static int compat_put_uint(unsigned long arg, unsigned int val) +{ + return put_user(val, (compat_uint_t __user *)compat_ptr(arg)); +} + static int compat_put_long(unsigned long arg, long val) { return put_user(val, (compat_long_t __user *)compat_ptr(arg)); @@ -734,6 +739,14 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) switch (cmd) { case HDIO_GETGEO: return compat_hdio_getgeo(disk, bdev, compat_ptr(arg)); + case BLKPBSZGET: + return compat_put_uint(arg, bdev_physical_block_size(bdev)); + case BLKIOMIN: + return compat_put_uint(arg, bdev_io_min(bdev)); + case BLKIOOPT: + return compat_put_uint(arg, bdev_io_opt(bdev)); + case BLKALIGNOFF: + return compat_put_int(arg, bdev_alignment_offset(bdev)); case BLKFLSBUF: case BLKROSET: case BLKDISCARD: diff --git a/block/ioctl.c b/block/ioctl.c index d3e6b5827a34..1f4d1de12b09 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -138,6 +138,11 @@ static int put_int(unsigned long arg, int val) return put_user(val, (int __user *)arg); } +static int put_uint(unsigned long arg, unsigned int val) +{ + return put_user(val, (unsigned int __user *)arg); +} + static int put_long(unsigned long arg, long val) { return put_user(val, (long __user *)arg); @@ -263,10 +268,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); - case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */ + case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ return put_int(arg, block_size(bdev)); - case BLKSSZGET: /* get block device hardware sector size */ + case BLKSSZGET: /* get block device logical block size */ return put_int(arg, bdev_logical_block_size(bdev)); + case BLKPBSZGET: /* get block device physical block size */ + return put_uint(arg, bdev_physical_block_size(bdev)); + case BLKIOMIN: + return put_uint(arg, bdev_io_min(bdev)); + case BLKIOOPT: + return put_uint(arg, bdev_io_opt(bdev)); + case BLKALIGNOFF: + return put_int(arg, bdev_alignment_offset(bdev)); case BLKSECTGET: return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a7323930d2ba..25119041e034 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1081,25 +1081,37 @@ static inline unsigned int queue_physical_block_size(struct request_queue *q) return q->limits.physical_block_size; } +static inline int bdev_physical_block_size(struct block_device *bdev) +{ + return queue_physical_block_size(bdev_get_queue(bdev)); +} + static inline unsigned int queue_io_min(struct request_queue *q) { return q->limits.io_min; } +static inline int bdev_io_min(struct block_device *bdev) +{ + return queue_io_min(bdev_get_queue(bdev)); +} + static inline unsigned int queue_io_opt(struct request_queue *q) { return q->limits.io_opt; } +static inline int bdev_io_opt(struct block_device *bdev) +{ + return queue_io_opt(bdev_get_queue(bdev)); +} + static inline int queue_alignment_offset(struct request_queue *q) { - if (q && q->limits.misaligned) + if (q->limits.misaligned) return -1; - if (q && q->limits.alignment_offset) - return q->limits.alignment_offset; - - return 0; + return q->limits.alignment_offset; } static inline int queue_sector_alignment_offset(struct request_queue *q, @@ -1109,6 +1121,19 @@ static inline int queue_sector_alignment_offset(struct request_queue *q, & (q->limits.io_min - 1); } +static inline int bdev_alignment_offset(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q->limits.misaligned) + return -1; + + if (bdev != bdev->bd_contains) + return bdev->bd_part->alignment_offset; + + return q->limits.alignment_offset; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2adaa2529f18..883eaacfd924 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -300,6 +300,10 @@ struct inodes_stat_t { #define BLKTRACESTOP _IO(0x12,117) #define BLKTRACETEARDOWN _IO(0x12,118) #define BLKDISCARD _IO(0x12,119) +#define BLKIOMIN _IO(0x12,120) +#define BLKIOOPT _IO(0x12,121) +#define BLKALIGNOFF _IO(0x12,122) +#define BLKPBSZGET _IO(0x12,123) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3