From 5478755616ae2ef1ce144dded589b62b2a50d575 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 29 Nov 2010 10:03:55 +0100 Subject: block: check for proper length of iov entries earlier in blk_rq_map_user_iov() commit 9284bcf checks for proper length of iov entries in blk_rq_map_user_iov(). But if the map is unaligned, kernel will break out the loop without checking for the proper length. So we need to check the proper length before the unalign check. Signed-off-by: Xiaotian Feng Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 5d5dbe47c228..e663ac2d8e68 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -201,12 +201,13 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, for (i = 0; i < iov_count; i++) { unsigned long uaddr = (unsigned long)iov[i].iov_base; + if (!iov[i].iov_len) + return -EINVAL; + if (uaddr & queue_dma_alignment(q)) { unaligned = 1; break; } - if (!iov[i].iov_len) - return -EINVAL; } if (unaligned || (q->dma_pad_mask & len) || map_data) -- cgit v1.2.3 From d1ae8ffdfaa16b2ab2e9346e81cf0ab6eaaae347 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Dec 2010 19:34:46 +0100 Subject: blk-throttle: Trim/adjust slice_end once a bio has been dispatched o During some testing I did following and noticed throttling stops working. - Put a very low limit on a cgroup, say 1 byte per second. - Start some reads, this will set slice_end to a very high value. - Change the limit to higher value say 1MB/s - Now IO unthrottles and finishes as expected. - Try to do the read again but IO is not limited to 1MB/s as expected. o What is happening. - Initially low value of limit sets slice_end to a very high value. - During updation of limit, slice_end is not being truncated. - Very high value of slice_end leads to keeping the existing slice valid for a very long time and new slice does not start. - tg_may_dispatch() is called in blk_throtle_bio(), and trim_slice() is not called in this path. So slice_start is some old value and practically we are able to do huge amount of IO. o There are many ways it can be fixed. I have fixed it by trying to adjust/cleanup slice_end in trim_slice(). Generally we extend slices if bio is big and can't be dispatched in one slice. After dispatch of bio, readjust the slice_end to make sure we don't end up with huge values. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 004be80fd894..2d134b7c40a3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -355,6 +355,12 @@ throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) tg->slice_end[rw], jiffies); } +static inline void throtl_set_slice_end(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); +} + static inline void throtl_extend_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { @@ -391,6 +397,16 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) if (throtl_slice_used(td, tg, rw)) return; + /* + * A bio has been dispatched. Also adjust slice_end. It might happen + * that initially cgroup limit was very low resulting in high + * slice_end, but later limit was bumped up and bio was dispached + * sooner, then we need to reduce slice_end. A high bogus slice_end + * is bad because it does not allow new slice to start. + */ + + throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); + time_elapsed = jiffies - tg->slice_start[rw]; nr_slices = time_elapsed / throtl_slice; -- cgit v1.2.3 From 04a6b516cdc6efc2500b52a540cf65be8c5aaf9e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Dec 2010 19:34:52 +0100 Subject: blk-throttle: Correct the placement of smp_rmb() o I was discussing what are the variable being updated without spin lock and why do we need barriers and Oleg pointed out that location of smp_rmb() should be between read of td->limits_changed and tg->limits_changed. This patch fixes it. o Following is one possible sequence of events. Say cpu0 is executing throtl_update_blkio_group_read_bps() and cpu1 is executing throtl_process_limit_change(). cpu0 cpu1 tg->limits_changed = true; smp_mb__before_atomic_inc(); atomic_inc(&td->limits_changed); if (!atomic_read(&td->limits_changed)) return; if (tg->limits_changed) do_something; If cpu0 has updated tg->limits_changed and td->limits_changed, we want to make sure that if update to td->limits_changed is visible on cpu1, then update to tg->limits_changed should also be visible. Oleg pointed out to ensure that we need to insert an smp_rmb() between td->limits_changed read and tg->limits_changed read. o I had erroneously put smp_rmb() before atomic_read(&td->limits_changed). This patch fixes it. Reported-by: Oleg Nesterov Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2d134b7c40a3..381b09bb562b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -725,26 +725,21 @@ static void throtl_process_limit_change(struct throtl_data *td) struct throtl_grp *tg; struct hlist_node *pos, *n; - /* - * Make sure atomic_inc() effects from - * throtl_update_blkio_group_read_bps(), group of functions are - * visible. - * Is this required or smp_mb__after_atomic_inc() was suffcient - * after the atomic_inc(). - */ - smp_rmb(); if (!atomic_read(&td->limits_changed)) return; throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); - hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { - /* - * Do I need an smp_rmb() here to make sure tg->limits_changed - * update is visible. I am relying on smp_rmb() at the - * beginning of function and not putting a new one here. - */ + /* + * Make sure updates from throtl_update_blkio_group_read_bps() group + * of functions to tg->limits_changed are visible. We do not + * want update td->limits_changed to be visible but update to + * tg->limits_changed not being visible yet on this cpu. Hence + * the read barrier. + */ + smp_rmb(); + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { if (throtl_tg_on_rr(tg) && tg->limits_changed) { throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" " riops=%u wiops=%u", tg->bps[READ], -- cgit v1.2.3 From e692cb668fdd5a712c6ed2a2d6f2a36ee83997b4 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Dec 2010 19:41:49 +0100 Subject: block: Deprecate QUEUE_FLAG_CLUSTER and use queue_limits instead When stacking devices, a request_queue is not always available. This forced us to have a no_cluster flag in the queue_limits that could be used as a carrier until the request_queue had been set up for a metadevice. There were several problems with that approach. First of all it was up to the stacking device to remember to set queue flag after stacking had completed. Also, the queue flag and the queue limits had to be kept in sync at all times. We got that wrong, which could lead to us issuing commands that went beyond the max scatterlist limit set by the driver. The proper fix is to avoid having two flags for tracking the same thing. We deprecate QUEUE_FLAG_CLUSTER and use the queue limit directly in the block layer merging functions. The queue_limit 'no_cluster' is turned into 'cluster' to avoid double negatives and to ease stacking. Clustering defaults to being enabled as before. The queue flag logic is removed from the stacking function, and explicitly setting the cluster flag is no longer necessary in DM and MD. Reported-by: Ed Lin Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 6 +++--- block/blk-settings.c | 25 ++----------------------- block/blk-sysfs.c | 2 +- drivers/md/dm-table.c | 5 ----- drivers/md/md.c | 3 --- drivers/scsi/scsi_lib.c | 3 +-- include/linux/blkdev.h | 9 ++++++--- 7 files changed, 13 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b5..74bc4a768f32 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, return 0; fbio = bio; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; for_each_bio(bio) { @@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments); static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { - if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (!blk_queue_cluster(q)) return 0; if (bio->bi_seg_back_size + nxt->bi_seg_front_size > @@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, int nsegs, cluster; nsegs = 0; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); /* * for each bio in rq diff --git a/block/blk-settings.c b/block/blk-settings.c index 701859fb9647..e55f5fc4ca22 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -126,7 +126,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; - lim->no_cluster = 0; + lim->cluster = 1; } EXPORT_SYMBOL(blk_set_default_limits); @@ -464,15 +464,6 @@ EXPORT_SYMBOL(blk_queue_io_opt); void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) { blk_stack_limits(&t->limits, &b->limits, 0); - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - spin_lock_irqsave(t->queue_lock, flags); - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(blk_queue_stack_limits); @@ -545,7 +536,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm(t->io_opt, b->io_opt); - t->no_cluster |= b->no_cluster; + t->cluster &= b->cluster; t->discard_zeroes_data &= b->discard_zeroes_data; /* Physical block size a multiple of the logical block size? */ @@ -641,7 +632,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) { struct request_queue *t = disk->queue; - struct request_queue *b = bdev_get_queue(bdev); if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; @@ -652,17 +642,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", top, bottom); } - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - - spin_lock_irqsave(t->queue_lock, flags); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 013457f47fdc..41fb69150b4d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -119,7 +119,7 @@ static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char * static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (blk_queue_cluster(q)) return queue_var_show(queue_max_segment_size(q), (page)); return queue_var_show(PAGE_CACHE_SIZE, (page)); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 90267f8d64ee..e2da1912a2cb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1131,11 +1131,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ q->limits = *limits; - if (limits->no_cluster) - queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); - else - queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); - if (!dm_table_supports_discards(t)) queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); else diff --git a/drivers/md/md.c b/drivers/md/md.c index 84c46a161927..52694d29663d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4296,9 +4296,6 @@ static int md_alloc(dev_t dev, char *name) goto abort; mddev->queue->queuedata = mddev; - /* Can be unlocked because the queue is new: no concurrency */ - queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); - blk_queue_make_request(mddev->queue, md_make_request); disk = alloc_disk(1 << shift); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eafeeda6e194..9d7ba07dc5ef 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1642,9 +1642,8 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, blk_queue_max_segment_size(q, dma_get_max_seg_size(dev)); - /* New queue, no concurrency on queue_flags */ if (!shost->use_clustering) - queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); + q->limits.cluster = 0; /* * set a reasonable default alignment on word boundaries: the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..95aeeeb49e8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -250,7 +250,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; - unsigned char no_cluster; + unsigned char cluster; signed char discard_zeroes_data; }; @@ -380,7 +380,6 @@ struct request_queue #endif }; -#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ @@ -403,7 +402,6 @@ struct request_queue #define QUEUE_FLAG_SECDISCARD 19 /* supports SECDISCARD */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_CLUSTER) | \ (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) @@ -510,6 +508,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define rq_data_dir(rq) ((rq)->cmd_flags & 1) +static inline unsigned int blk_queue_cluster(struct request_queue *q) +{ + return q->limits.cluster; +} + /* * We regard a request as sync, if either a read or a sync write */ -- cgit v1.2.3 From 72d4cd9f38b5ed96b75df4c622be25e1c2648dd3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 17 Dec 2010 08:34:20 +0100 Subject: block: max hardware sectors limit wrapper Implement blk_limits_max_hw_sectors() and make blk_queue_max_hw_sectors() a wrapper around it. DM needs this to avoid setting queue_limits' max_hw_sectors and max_sectors directly. dm_set_device_limits() now leverages blk_limits_max_hw_sectors() logic to establish the appropriate max_hw_sectors minimum (PAGE_SIZE). Fixes issue where DM was incorrectly setting max_sectors rather than max_hw_sectors (which caused dm_merge_bvec()'s max_hw_sectors check to be ineffective). Signed-off-by: Mike Snitzer Cc: stable@kernel.org Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 ++++++++++++++++++++------ drivers/md/dm-table.c | 5 ++--- include/linux/blkdev.h | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index e55f5fc4ca22..36c8c1f2af18 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -229,8 +229,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device + * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request + * @limits: the queue limits * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -244,7 +244,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) { if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); @@ -252,9 +252,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } - q->limits.max_hw_sectors = max_hw_sectors; - q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); +} +EXPORT_SYMBOL(blk_limits_max_hw_sectors); + +/** + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * + * Description: + * See description for blk_limits_max_hw_sectors(). + **/ +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +{ + blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e2da1912a2cb..4d705cea0f8c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -517,9 +517,8 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ if (q->merge_bvec_fn && !ti->type->merge) - limits->max_sectors = - min_not_zero(limits->max_sectors, - (unsigned int) (PAGE_SIZE >> 9)); + blk_limits_max_hw_sectors(limits, + (unsigned int) (PAGE_SIZE >> 9)); return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95aeeeb49e8b..36ab42c9bb99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -808,6 +808,7 @@ extern struct request_queue *blk_init_allocated_queue(struct request_queue *, extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); +extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); -- cgit v1.2.3