Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig           |  11
-rw-r--r--  block/Makefile          |   2
-rw-r--r--  block/as-iosched.c      |  10
-rw-r--r--  block/blk-barrier.c     |  31
-rw-r--r--  block/blk-core.c        | 198
-rw-r--r--  block/blk-integrity.c   |   1
-rw-r--r--  block/blk-iopoll.c      | 227
-rw-r--r--  block/blk-merge.c       |  45
-rw-r--r--  block/blk-settings.c    | 105
-rw-r--r--  block/blk-sysfs.c       |  20
-rw-r--r--  block/blk.h             |   1
-rw-r--r--  block/bsg.c             |   6
-rw-r--r--  block/cfq-iosched.c     | 260
-rw-r--r--  block/cmd-filter.c      | 233
-rw-r--r--  block/elevator.c        |   3
-rw-r--r--  block/genhd.c           |  32
-rw-r--r--  block/ioctl.c           |  49
-rw-r--r--  block/scsi_ioctl.c      |  44
18 files changed, 735 insertions, 543 deletions
diff --git a/block/Kconfig b/block/Kconfig index 95a86adc33a1..9be0b56eaee1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -48,9 +48,9 @@ config LBDAF If unsure, say Y. config BLK_DEV_BSG - bool "Block layer SG support v4 (EXPERIMENTAL)" - depends on EXPERIMENTAL - ---help--- + bool "Block layer SG support v4" + default y + help Saying Y here will enable generic SG (SCSI generic) v4 support for any block device. @@ -60,7 +60,10 @@ config BLK_DEV_BSG protocols (e.g. Task Management Functions and SMP in Serial Attached SCSI). - If unsure, say N. + This option is required by recent UDEV versions to properly + access device serial numbers, etc. + + If unsure, say Y. config BLK_DEV_INTEGRITY bool "Block layer data integrity support" diff --git a/block/Makefile b/block/Makefile index e9fa4dd690f2..ba74ca6bfa14 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - ioctl.o genhd.o scsi_ioctl.o cmd-filter.o + blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d3..ce8ba57c6557 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 30022b4e2f63..6593ab39cfe9 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err) clear_bit(BIO_UPTODATE, &bio->bi_flags); } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } @@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err) * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: DISCARD_FL_* flags to control behaviour * * Description: - * Issue a discard request for the sectors in question. Does not wait. + * Issue a discard request for the sectors in question. 
*/ -int blkdev_issue_discard(struct block_device *bdev, - sector_t sector, sector_t nr_sects, gfp_t gfp_mask) +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags) { - struct request_queue *q; - struct bio *bio; + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + int type = flags & DISCARD_FL_BARRIER ? + DISCARD_BARRIER : DISCARD_NOBARRIER; int ret = 0; - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); if (!q) return -ENXIO; @@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev, return -EOPNOTSUPP; while (nr_sects && !ret) { - bio = bio_alloc(gfp_mask, 0); + struct bio *bio = bio_alloc(gfp_mask, 0); if (!bio) return -ENOMEM; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; + if (flags & DISCARD_FL_WAIT) + bio->bi_private = &wait; bio->bi_sector = sector; @@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev, bio->bi_size = nr_sects << 9; nr_sects = 0; } + bio_get(bio); - submit_bio(DISCARD_BARRIER, bio); + submit_bio(type, bio); + + if (flags & DISCARD_FL_WAIT) + wait_for_completion(&wait); - /* Check if it failed immediately */ if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) diff --git a/block/blk-core.c b/block/blk-core.c index b06cf5c2a829..8135228e4b29 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); else { part_round_stats(cpu, part); - part_inc_in_flight(part); + part_inc_in_flight(part, rw); } part_stat_unlock(); @@ -501,6 +501,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.name = "block"; err = bdi_init(&q->backing_dev_info); if (err) { @@ -575,13 +576,6 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) return NULL; } - /* - * if caller didn't supply a lock, they get per-queue locking with - * our embedded lock - */ - if (!lock) - lock = &q->__queue_lock; - q->request_fn = rfn; q->prep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; @@ -595,8 +589,6 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) q->sg_reserved_size = INT_MAX; - blk_set_cmd_filter_defaults(&q->cmd_filter); - /* * all done */ @@ -1039,7 +1031,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, if (part->in_flight) { __part_stat_add(cpu, part, time_in_queue, - part->in_flight * (now - part->stamp)); + part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1120,31 +1112,27 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_type = REQ_TYPE_FS; /* - * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) + * Inherit FAILFAST from bio (for read-ahead, and explicit + * FAILFAST). FAILFAST flags are identical for req and bio. 
*/ - if (bio_rw_ahead(bio)) - req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | - REQ_FAILFAST_DRIVER); - if (bio_failfast_dev(bio)) - req->cmd_flags |= REQ_FAILFAST_DEV; - if (bio_failfast_transport(bio)) - req->cmd_flags |= REQ_FAILFAST_TRANSPORT; - if (bio_failfast_driver(bio)) - req->cmd_flags |= REQ_FAILFAST_DRIVER; - - if (unlikely(bio_discard(bio))) { + if (bio_rw_flagged(bio, BIO_RW_AHEAD)) + req->cmd_flags |= REQ_FAILFAST_MASK; + else + req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; + + if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { req->cmd_flags |= REQ_DISCARD; - if (bio_barrier(bio)) + if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_SOFTBARRIER; req->q->prepare_discard_fn(req->q, req); - } else if (unlikely(bio_barrier(bio))) + } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) req->cmd_flags |= REQ_HARDBARRIER; - if (bio_sync(bio)) + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) req->cmd_flags |= REQ_RW_SYNC; - if (bio_rw_meta(bio)) + if (bio_rw_flagged(bio, BIO_RW_META)) req->cmd_flags |= REQ_RW_META; - if (bio_noidle(bio)) + if (bio_rw_flagged(bio, BIO_RW_NOIDLE)) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; @@ -1159,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ static inline bool queue_should_plug(struct request_queue *q) { - return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); + return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); } static int __make_request(struct request_queue *q, struct bio *bio) @@ -1168,10 +1156,16 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const int sync = bio_sync(bio); - const int unplug = bio_unplug(bio); + const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); + const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; + if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) && + (q->next_ordered == QUEUE_ORDERED_NONE)) { + bio_endio(bio, -EOPNOTSUPP); + return 0; + } /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1181,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely(bio_barrier(bio)) || elv_queue_empty(q)) + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1194,6 +1188,9 @@ static int __make_request(struct request_queue *q, struct bio *bio) trace_block_bio_backmerge(q, bio); + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bytes; @@ -1213,6 +1210,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) trace_block_bio_frontmerge(q, bio); + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { + blk_rq_set_mixed_merge(req); + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= ff; + } + bio->bi_next = req->bio; req->bio = bio; @@ -1460,24 +1463,20 @@ static inline void __generic_make_request(struct bio *bio) if (old_sector != -1) trace_block_remap(q, bio, old_dev, old_sector); - trace_block_bio_queue(q, bio); - old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; if (bio_check_eod(bio, nr_sectors)) goto end_io; - if (bio_discard(bio) && !q->prepare_discard_fn) { - err = -EOPNOTSUPP; - goto end_io; - } - if 
(bio_barrier(bio) && bio_has_data(bio) && - (q->next_ordered == QUEUE_ORDERED_NONE)) { + if (bio_rw_flagged(bio, BIO_RW_DISCARD) && + !q->prepare_discard_fn) { err = -EOPNOTSUPP; goto end_io; } + trace_block_bio_queue(q, bio); + ret = q->make_request_fn(q, bio); } while (ret); @@ -1662,6 +1661,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); +/** + * blk_rq_err_bytes - determine number of bytes till the next failure boundary + * @rq: request to examine + * + * Description: + * A request could be merge of IOs which require different failure + * handling. This function determines the number of bytes which + * can be failed from the beginning of the request without + * crossing into area which need to be retried further. + * + * Return: + * The number of bytes to fail. + * + * Context: + * queue_lock must be held. + */ +unsigned int blk_rq_err_bytes(const struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + unsigned int bytes = 0; + struct bio *bio; + + if (!(rq->cmd_flags & REQ_MIXED_MERGE)) + return blk_rq_bytes(rq); + + /* + * Currently the only 'mixing' which can happen is between + * different fastfail types. We can safely fail portions + * which have all the failfast bits that the first one has - + * the ones which are at least as eager to fail as the first + * one. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + if ((bio->bi_rw & ff) != ff) + break; + bytes += bio->bi_size; + } + + /* this could lead to infinite loop */ + BUG_ON(blk_rq_bytes(rq) && !bytes); + return bytes; +} +EXPORT_SYMBOL_GPL(blk_rq_err_bytes); + static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { @@ -1695,7 +1738,7 @@ static void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rw); part_stat_unlock(); } @@ -1815,8 +1858,15 @@ void blk_dequeue_request(struct request *rq) * and to it is freed is accounted as io that is in progress at * the driver side. */ - if (blk_account_rq(rq)) + if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; + /* + * Mark this device as supporting hardware queuing, if + * we have more IOs in flight than 4. + */ + if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) + set_bit(QUEUE_FLAG_CQ, &q->queue_flags); + } } /** @@ -2008,6 +2058,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (blk_fs_request(req) || blk_discard_rq(req)) req->__sector += total_bytes >> 9; + /* mixed attributes always follow the first bio */ + if (req->cmd_flags & REQ_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; + } + /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. @@ -2145,7 +2201,7 @@ bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { return blk_end_bidi_request(rq, error, nr_bytes, 0); } -EXPORT_SYMBOL_GPL(blk_end_request); +EXPORT_SYMBOL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. 
@@ -2166,7 +2222,7 @@ void blk_end_request_all(struct request *rq, int error) pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } -EXPORT_SYMBOL_GPL(blk_end_request_all); +EXPORT_SYMBOL(blk_end_request_all); /** * blk_end_request_cur - Helper function to finish the current request chunk. @@ -2184,7 +2240,26 @@ bool blk_end_request_cur(struct request *rq, int error) { return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } -EXPORT_SYMBOL_GPL(blk_end_request_cur); +EXPORT_SYMBOL(blk_end_request_cur); + +/** + * blk_end_request_err - Finish a request till the next failure boundary. + * @rq: the request to finish till the next failure boundary for + * @error: must be negative errno + * + * Description: + * Complete @rq till the next failure boundary. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool blk_end_request_err(struct request *rq, int error) +{ + WARN_ON(error >= 0); + return blk_end_request(rq, error, blk_rq_err_bytes(rq)); +} +EXPORT_SYMBOL_GPL(blk_end_request_err); /** * __blk_end_request - Helper function for drivers to complete the request. @@ -2203,7 +2278,7 @@ bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { return __blk_end_bidi_request(rq, error, nr_bytes, 0); } -EXPORT_SYMBOL_GPL(__blk_end_request); +EXPORT_SYMBOL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. @@ -2224,7 +2299,7 @@ void __blk_end_request_all(struct request *rq, int error) pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } -EXPORT_SYMBOL_GPL(__blk_end_request_all); +EXPORT_SYMBOL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. @@ -2243,14 +2318,33 @@ bool __blk_end_request_cur(struct request *rq, int error) { return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } -EXPORT_SYMBOL_GPL(__blk_end_request_cur); +EXPORT_SYMBOL(__blk_end_request_cur); + +/** + * __blk_end_request_err - Finish a request till the next failure boundary. + * @rq: the request to finish till the next failure boundary for + * @error: must be negative errno + * + * Description: + * Complete @rq till the next failure boundary. Must be called + * with queue lock held. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool __blk_end_request_err(struct request *rq, int error) +{ + WARN_ON(error >= 0); + return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); +} +EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and - we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). 
*/ - rq->cmd_flags |= (bio->bi_rw & 3); + /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ + rq->cmd_flags |= bio->bi_rw & REQ_RW; if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -2365,7 +2459,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, __bio_clone(bio, bio_src); if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, gfp_mask)) + bio_integrity_clone(bio, bio_src, gfp_mask, bs)) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 73e28d355688..15c630813b1c 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -379,6 +379,7 @@ void blk_integrity_unregister(struct gendisk *disk) kobject_uevent(&bi->kobj, KOBJ_REMOVE); kobject_del(&bi->kobj); + kobject_put(&bi->kobj); kmem_cache_free(integrity_cachep, bi); disk->integrity = NULL; } diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c new file mode 100644 index 000000000000..ca564202ed7a --- /dev/null +++ b/block/blk-iopoll.c @@ -0,0 +1,227 @@ +/* + * Functions related to interrupt-poll handling in the block layer. This + * is similar to NAPI for network devices. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/blk-iopoll.h> +#include <linux/delay.h> + +#include "blk.h" + +int blk_iopoll_enabled = 1; +EXPORT_SYMBOL(blk_iopoll_enabled); + +static unsigned int blk_iopoll_budget __read_mostly = 256; + +static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); + +/** + * blk_iopoll_sched - Schedule a run of the iopoll handler + * @iop: The parent iopoll structure + * + * Description: + * Add this blk_iopoll structure to the pending poll list and trigger the + * raise of the blk iopoll softirq. The driver must already have gotten a + * succesful return from blk_iopoll_sched_prep() before calling this. + **/ +void blk_iopoll_sched(struct blk_iopoll *iop) +{ + unsigned long flags; + + local_irq_save(flags); + list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_iopoll_sched); + +/** + * __blk_iopoll_complete - Mark this @iop as un-polled again + * @iop: The parent iopoll structure + * + * Description: + * See blk_iopoll_complete(). This function must be called with interrupts + * disabled. + **/ +void __blk_iopoll_complete(struct blk_iopoll *iop) +{ + list_del(&iop->list); + smp_mb__before_clear_bit(); + clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(__blk_iopoll_complete); + +/** + * blk_iopoll_complete - Mark this @iop as un-polled again + * @iop: The parent iopoll structure + * + * Description: + * If a driver consumes less than the assigned budget in its run of the + * iopoll handler, it'll end the polled mode by calling this function. The + * iopoll handler will not be invoked again before blk_iopoll_sched_prep() + * is called. 
+ **/ +void blk_iopoll_complete(struct blk_iopoll *iopoll) +{ + unsigned long flags; + + local_irq_save(flags); + __blk_iopoll_complete(iopoll); + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_iopoll_complete); + +static void blk_iopoll_softirq(struct softirq_action *h) +{ + struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); + int rearm = 0, budget = blk_iopoll_budget; + unsigned long start_time = jiffies; + + local_irq_disable(); + + while (!list_empty(list)) { + struct blk_iopoll *iop; + int work, weight; + + /* + * If softirq window is exhausted then punt. + */ + if (budget <= 0 || time_after(jiffies, start_time)) { + rearm = 1; + break; + } + + local_irq_enable(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. + */ + iop = list_entry(list->next, struct blk_iopoll, list); + + weight = iop->weight; + work = 0; + if (test_bit(IOPOLL_F_SCHED, &iop->state)) + work = iop->poll(iop, weight); + + budget -= work; + + local_irq_disable(); + + /* + * Drivers must not modify the iopoll state, if they + * consume their assigned weight (or more, some drivers can't + * easily just stop processing, they have to complete an + * entire mask of commands).In such cases this code + * still "owns" the iopoll instance and therefore can + * move the instance around on the list at-will. + */ + if (work >= weight) { + if (blk_iopoll_disable_pending(iop)) + __blk_iopoll_complete(iop); + else + list_move_tail(&iop->list, list); + } + } + + if (rearm) + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + + local_irq_enable(); +} + +/** + * blk_iopoll_disable - Disable iopoll on this @iop + * @iop: The parent iopoll structure + * + * Description: + * Disable io polling and wait for any pending callbacks to have completed. + **/ +void blk_iopoll_disable(struct blk_iopoll *iop) +{ + set_bit(IOPOLL_F_DISABLE, &iop->state); + while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) + msleep(1); + clear_bit(IOPOLL_F_DISABLE, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_disable); + +/** + * blk_iopoll_enable - Enable iopoll on this @iop + * @iop: The parent iopoll structure + * + * Description: + * Enable iopoll on this @iop. Note that the handler run will not be + * scheduled, it will only mark it as active. + **/ +void blk_iopoll_enable(struct blk_iopoll *iop) +{ + BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); + smp_mb__before_clear_bit(); + clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_enable); + +/** + * blk_iopoll_init - Initialize this @iop + * @iop: The parent iopoll structure + * @weight: The default weight (or command completion budget) + * @poll_fn: The handler to invoke + * + * Description: + * Initialize this blk_iopoll structure. Before being actively used, the + * driver must call blk_iopoll_enable(). 
+ **/ +void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) +{ + memset(iop, 0, sizeof(*iop)); + INIT_LIST_HEAD(&iop->list); + iop->weight = weight; + iop->poll = poll_fn; + set_bit(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_init); + +static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + &__get_cpu_var(blk_cpu_iopoll)); + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = { + .notifier_call = blk_iopoll_cpu_notify, +}; + +static __init int blk_iopoll_setup(void) +{ + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); + + open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); + register_hotcpu_notifier(&blk_iopoll_cpu_notifier); + return 0; +} +subsys_initcall(blk_iopoll_setup); diff --git a/block/blk-merge.c b/block/blk-merge.c index 39ce64432ba6..99cb5cf1f447 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 1; } +/** + * blk_rq_set_mixed_merge - mark a request as mixed merge + * @rq: request to mark as mixed merge + * + * Description: + * @rq is about to be mixed merged. Make sure the attributes + * which can be mixed are set in each bio and mark @rq as mixed + * merged. + */ +void blk_rq_set_mixed_merge(struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + struct bio *bio; + + if (rq->cmd_flags & REQ_MIXED_MERGE) + return; + + /* + * @rq will no longer represent mixable attributes for all the + * contained bios. It will just track those of the first one. + * Distributes the attributs to each bio. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && + (bio->bi_rw & REQ_FAILFAST_MASK) != ff); + bio->bi_rw |= ff; + } + rq->cmd_flags |= REQ_MIXED_MERGE; +} + static void blk_account_io_merge(struct request *req) { if (blk_do_io_stat(req)) { @@ -321,7 +351,7 @@ static void blk_account_io_merge(struct request *req) part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rq_data_dir(req)); part_stat_unlock(); } @@ -360,6 +390,19 @@ static int attempt_merge(struct request_queue *q, struct request *req, return 0; /* + * If failfast settings disagree or any of the two is already + * a mixed merge, mark both as mixed before proceeding. This + * makes sure that all involved bios have mixable attributes + * set properly. + */ + if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + (req->cmd_flags & REQ_FAILFAST_MASK) != + (next->cmd_flags & REQ_FAILFAST_MASK)) { + blk_rq_set_mixed_merge(req); + blk_rq_set_mixed_merge(next); + } + + /* * At this point we have either done a back merge * or front merge. 
We need the smaller start_time of * the merged requests to be the current request diff --git a/block/blk-settings.c b/block/blk-settings.c index bd582a7f5310..83413ff83739 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -7,6 +7,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ +#include <linux/gcd.h> #include "blk.h" @@ -165,6 +166,13 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) blk_set_default_limits(&q->limits); /* + * If the caller didn't supply a lock, fall back to our embedded + * per-queue locks + */ + if (!q->queue_lock) + q->queue_lock = &q->__queue_lock; + + /* * by default assume old behaviour and bounce for any highmem page */ blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); @@ -377,8 +385,8 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) EXPORT_SYMBOL(blk_queue_alignment_offset); /** - * blk_queue_io_min - set minimum request size for the queue - * @q: the request queue for the device + * blk_limits_io_min - set minimum request size for a device + * @limits: the queue limits * @min: smallest I/O size in bytes * * Description: @@ -387,30 +395,73 @@ EXPORT_SYMBOL(blk_queue_alignment_offset); * smallest I/O the device can perform without incurring a performance * penalty. */ -void blk_queue_io_min(struct request_queue *q, unsigned int min) +void blk_limits_io_min(struct queue_limits *limits, unsigned int min) { - q->limits.io_min = min; + limits->io_min = min; - if (q->limits.io_min < q->limits.logical_block_size) - q->limits.io_min = q->limits.logical_block_size; + if (limits->io_min < limits->logical_block_size) + limits->io_min = limits->logical_block_size; - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; + if (limits->io_min < limits->physical_block_size) + limits->io_min = limits->physical_block_size; +} +EXPORT_SYMBOL(blk_limits_io_min); + +/** + * blk_queue_io_min - set minimum request size for the queue + * @q: the request queue for the device + * @min: smallest I/O size in bytes + * + * Description: + * Storage devices may report a granularity or preferred minimum I/O + * size which is the smallest request the device can perform without + * incurring a performance penalty. For disk drives this is often the + * physical block size. For RAID arrays it is often the stripe chunk + * size. A properly aligned multiple of minimum_io_size is the + * preferred request size for workloads where a high number of I/O + * operations is desired. + */ +void blk_queue_io_min(struct request_queue *q, unsigned int min) +{ + blk_limits_io_min(&q->limits, min); } EXPORT_SYMBOL(blk_queue_io_min); /** + * blk_limits_io_opt - set optimal request size for a device + * @limits: the queue limits + * @opt: smallest I/O size in bytes + * + * Description: + * Storage devices may report an optimal I/O size, which is the + * device's preferred unit for sustained I/O. This is rarely reported + * for disk drives. For RAID arrays it is usually the stripe width or + * the internal track size. A properly aligned multiple of + * optimal_io_size is the preferred request size for workloads where + * sustained throughput is desired. 
+ */ +void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) +{ + limits->io_opt = opt; +} +EXPORT_SYMBOL(blk_limits_io_opt); + +/** * blk_queue_io_opt - set optimal request size for the queue * @q: the request queue for the device * @opt: optimal request size in bytes * * Description: - * Drivers can call this function to set the preferred I/O request - * size for devices that report such a value. + * Storage devices may report an optimal I/O size, which is the + * device's preferred unit for sustained I/O. This is rarely reported + * for disk drives. For RAID arrays it is usually the stripe width or + * the internal track size. A properly aligned multiple of + * optimal_io_size is the preferred request size for workloads where + * sustained throughput is desired. */ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { - q->limits.io_opt = opt; + blk_limits_io_opt(&q->limits, opt); } EXPORT_SYMBOL(blk_queue_io_opt); @@ -426,27 +477,7 @@ EXPORT_SYMBOL(blk_queue_io_opt); **/ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) { - /* zero is "infinity" */ - t->limits.max_sectors = min_not_zero(queue_max_sectors(t), - queue_max_sectors(b)); - - t->limits.max_hw_sectors = min_not_zero(queue_max_hw_sectors(t), - queue_max_hw_sectors(b)); - - t->limits.seg_boundary_mask = min_not_zero(queue_segment_boundary(t), - queue_segment_boundary(b)); - - t->limits.max_phys_segments = min_not_zero(queue_max_phys_segments(t), - queue_max_phys_segments(b)); - - t->limits.max_hw_segments = min_not_zero(queue_max_hw_segments(t), - queue_max_hw_segments(b)); - - t->limits.max_segment_size = min_not_zero(queue_max_segment_size(t), - queue_max_segment_size(b)); - - t->limits.logical_block_size = max(queue_logical_block_size(t), - queue_logical_block_size(b)); + blk_stack_limits(&t->limits, &b->limits, 0); if (!t->queue_lock) WARN_ON_ONCE(1); @@ -516,6 +547,16 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, return -1; } + /* Find lcm() of optimal I/O size */ + if (t->io_opt && b->io_opt) + t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); + else if (b->io_opt) + t->io_opt = b->io_opt; + + /* Verify that optimal I/O size is a multiple of io_min */ + if (t->io_min && t->io_opt % t->io_min) + return -1; + return 0; } EXPORT_SYMBOL(blk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b1cd04087d6a..b78c9c3e2670 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,9 +16,9 @@ struct queue_sysfs_entry { }; static ssize_t -queue_var_show(unsigned int var, char *page) +queue_var_show(unsigned long var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%lu\n", var); } static ssize_t @@ -40,7 +40,12 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) { struct request_list *rl = &q->rq; unsigned long nr; - int ret = queue_var_store(&nr, page, count); + int ret; + + if (!q->request_fn) + return -EINVAL; + + ret = queue_var_store(&nr, page, count); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; @@ -77,7 +82,8 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); + unsigned long ra_kb = q->backing_dev_info.ra_pages << + (PAGE_CACHE_SHIFT - 10); return queue_var_show(ra_kb, (page)); } @@ -132,7 +138,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) return 
-EINVAL; spin_lock_irq(q->queue_lock); - blk_queue_max_sectors(q, max_sectors_kb << 1); + q->limits.max_sectors = max_sectors_kb << 1; spin_unlock_irq(q->queue_lock); return ret; @@ -189,9 +195,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) { - unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); - return queue_var_show(set != 0, page); + return queue_var_show(set, page); } static ssize_t diff --git a/block/blk.h b/block/blk.h index 3fae6add5430..5ee3d7e72feb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); void blk_recalc_rq_segments(struct request *rq); +void blk_rq_set_mixed_merge(struct request *rq); void blk_queue_congestion_threshold(struct request_queue *q); diff --git a/block/bsg.c b/block/bsg.c index e7d475254248..0676301f16d0 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -186,7 +186,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(&q->cmd_filter, rq->cmd, has_write_perm)) + if (blk_verify_command(rq->cmd, has_write_perm)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -1062,7 +1062,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue); static struct cdev bsg_cdev; -static char *bsg_nodename(struct device *dev) +static char *bsg_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); } @@ -1087,7 +1087,7 @@ static int __init bsg_init(void) ret = PTR_ERR(bsg_class); goto destroy_kmemcache; } - bsg_class->nodename = bsg_nodename; + bsg_class->devnode = bsg_devnode; ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); if (ret) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 833ec18eaa63..1ca813b16e78 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -71,6 +71,51 @@ struct cfq_rb_root { #define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } /* + * Per process-grouping structure + */ +struct cfq_queue { + /* reference count */ + atomic_t ref; + /* various state flags, see below */ + unsigned int flags; + /* parent cfq_data */ + struct cfq_data *cfqd; + /* service_tree member */ + struct rb_node rb_node; + /* service_tree key */ + unsigned long rb_key; + /* prio tree member */ + struct rb_node p_node; + /* prio tree root we belong to, if any */ + struct rb_root *p_root; + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct request *next_rq; + /* requests queued in sort_list */ + int queued[2]; + /* currently allocated requests */ + int allocated[2]; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + unsigned long slice_end; + long slice_resid; + unsigned int slice_dispatch; + + /* pending metadata requests */ + int meta_pending; + /* number of requests that are on the 
dispatch list or inside driver */ + int dispatched; + + /* io prio of this group */ + unsigned short ioprio, org_ioprio; + unsigned short ioprio_class, org_ioprio_class; + + pid_t pid; +}; + +/* * Per block device queue structure */ struct cfq_data { @@ -89,13 +134,8 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - /* - * Used to track any pending rt requests so we can pre-empt current - * non-RT cfqq in service when this value is non-zero. - */ - unsigned int busy_rt_queues; - int rq_in_driver; + int rq_in_driver[2]; int sync_flight; /* @@ -135,58 +175,17 @@ struct cfq_data { unsigned int cfq_slice_idle; struct list_head cic_list; -}; - -/* - * Per process-grouping structure - */ -struct cfq_queue { - /* reference count */ - atomic_t ref; - /* various state flags, see below */ - unsigned int flags; - /* parent cfq_data */ - struct cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - unsigned long rb_key; - /* prio tree member */ - struct rb_node p_node; - /* prio tree root we belong to, if any */ - struct rb_root *p_root; - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct request *next_rq; - /* requests queued in sort_list */ - int queued[2]; - /* currently allocated requests */ - int allocated[2]; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - unsigned long slice_end; - long slice_resid; - unsigned int slice_dispatch; - /* pending metadata requests */ - int meta_pending; - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - - /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; - - pid_t pid; + /* + * Fallback dummy cfqq for extreme OOM conditions + */ + struct cfq_queue oom_cfqq; }; enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ - CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ @@ -213,7 +212,6 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ CFQ_CFQQ_FNS(on_rr); CFQ_CFQQ_FNS(wait_request); CFQ_CFQQ_FNS(must_dispatch); -CFQ_CFQQ_FNS(must_alloc); CFQ_CFQQ_FNS(must_alloc_slice); CFQ_CFQQ_FNS(fifo_expire); CFQ_CFQQ_FNS(idle_window); @@ -234,6 +232,11 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); +static inline int rq_in_driver(struct cfq_data *cfqd) +{ + return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; +} + static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, int is_sync) { @@ -252,7 +255,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, */ static inline int cfq_bio_sync(struct bio *bio) { - if (bio_data_dir(bio) == READ || bio_sync(bio)) + if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO)) return 1; return 0; @@ -643,8 +646,6 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -668,8 +669,6 @@ static 
void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues--; } /* @@ -755,9 +754,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver++; + cfqd->rq_in_driver[rq_is_sync(rq)]++; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - cfqd->rq_in_driver); + rq_in_driver(cfqd)); cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } @@ -765,11 +764,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) static void cfq_deactivate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; + const int sync = rq_is_sync(rq); - WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; + WARN_ON(!cfqd->rq_in_driver[sync]); + cfqd->rq_in_driver[sync]--; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - cfqd->rq_in_driver); + rq_in_driver(cfqd)); } static void cfq_remove_request(struct request *rq) @@ -1075,7 +1075,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * still requests with the driver, don't idle */ - if (cfqd->rq_in_driver) + if (rq_in_driver(cfqd)) return; /* @@ -1110,6 +1110,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); cfq_remove_request(rq); cfqq->dispatched++; elv_dispatch_sort(q, rq); @@ -1174,20 +1175,6 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto expire; /* - * If we have a RT cfqq waiting, then we pre-empt the current non-rt - * cfqq. - */ - if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { - /* - * We simulate this as cfqq timed out so that it gets to bank - * the remaining of its time slice. - */ - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - goto new_queue; - } - - /* * The active queue has requests and isn't expired, allow it to * dispatch. 
*/ @@ -1307,6 +1294,12 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) return 0; /* + * Drain async requests before we start sync IO + */ + if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + return 0; + + /* * If this is an async queue and we have sync IO in flight, let it wait */ if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) @@ -1357,7 +1350,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) cfq_slice_expired(cfqd, 0); } - cfq_log(cfqd, "dispatched a request"); + cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); return 1; } @@ -1422,7 +1415,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1431,7 +1424,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1557,7 +1550,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -1641,6 +1634,26 @@ static void cfq_ioc_set_ioprio(struct io_context *ioc) ioc->ioprio_changed = 0; } +static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, + pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&cfqq->rb_node); + RB_CLEAR_NODE(&cfqq->p_node); + INIT_LIST_HEAD(&cfqq->fifo); + + atomic_set(&cfqq->ref, 0); + cfqq->cfqd = cfqd; + + cfq_mark_cfqq_prio_changed(cfqq); + + if (is_sync) { + if (!cfq_class_idle(cfqq)) + cfq_mark_cfqq_idle_window(cfqq); + cfq_mark_cfqq_sync(cfqq); + } + cfqq->pid = pid; +} + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, gfp_t gfp_mask) @@ -1653,56 +1666,40 @@ retry: /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); - if (!cfqq) { + /* + * Always try a new alloc if we fell back to the OOM cfqq + * originally, since it should just be a temporary situation. + */ + if (!cfqq || cfqq == &cfqd->oom_cfqq) { + cfqq = NULL; if (new_cfqq) { cfqq = new_cfqq; new_cfqq = NULL; } else if (gfp_mask & __GFP_WAIT) { - /* - * Inform the allocator of the fact that we will - * just repeat this allocation if it fails, to allow - * the allocator to do whatever it needs to attempt to - * free memory. 
- */ spin_unlock_irq(cfqd->queue->queue_lock); new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_NOFAIL | __GFP_ZERO, + gfp_mask | __GFP_ZERO, cfqd->queue->node); spin_lock_irq(cfqd->queue->queue_lock); - goto retry; + if (new_cfqq) + goto retry; } else { cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask | __GFP_ZERO, cfqd->queue->node); - if (!cfqq) - goto out; } - RB_CLEAR_NODE(&cfqq->rb_node); - RB_CLEAR_NODE(&cfqq->p_node); - INIT_LIST_HEAD(&cfqq->fifo); - - atomic_set(&cfqq->ref, 0); - cfqq->cfqd = cfqd; - - cfq_mark_cfqq_prio_changed(cfqq); - - cfq_init_prio_data(cfqq, ioc); - - if (is_sync) { - if (!cfq_class_idle(cfqq)) - cfq_mark_cfqq_idle_window(cfqq); - cfq_mark_cfqq_sync(cfqq); - } - cfqq->pid = current->pid; - cfq_log_cfqq(cfqd, cfqq, "alloced"); + if (cfqq) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, ioc); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + } else + cfqq = &cfqd->oom_cfqq; } if (new_cfqq) kmem_cache_free(cfq_pool, new_cfqq); -out: - WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); return cfqq; } @@ -1735,11 +1732,8 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, cfqq = *async_cfqq; } - if (!cfqq) { + if (!cfqq) cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); - if (!cfqq) - return NULL; - } /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -2124,11 +2118,11 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { - if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = cfqd->rq_in_driver; + if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) + cfqd->rq_in_driver_peak = rq_in_driver(cfqd); if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) + rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -2155,9 +2149,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver); + WARN_ON(!cfqd->rq_in_driver[sync]); WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver--; + cfqd->rq_in_driver[sync]--; cfqq->dispatched--; if (cfq_cfqq_sync(cfqq)) @@ -2191,7 +2185,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_arm_slice_timer(cfqd); } - if (!cfqd->rq_in_driver) + if (!rq_in_driver(cfqd)) cfq_schedule_dispatch(cfqd); } @@ -2223,8 +2217,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) static inline int __cfq_may_queue(struct cfq_queue *cfqq) { - if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && - !cfq_cfqq_must_alloc_slice(cfqq)) { + if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; } @@ -2305,17 +2298,12 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) goto queue_fail; cfqq = cic_to_cfqq(cic, is_sync); - if (!cfqq) { + if (!cfqq || cfqq == &cfqd->oom_cfqq) { cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); - - if (!cfqq) - goto queue_fail; - cic_set_cfqq(cic, cfqq, is_sync); } cfqq->allocated[rw]++; - cfq_clear_cfqq_must_alloc(cfqq); atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); @@ -2465,6 +2453,14 @@ static void *cfq_init_queue(struct request_queue *q) for (i = 0; i < CFQ_PRIO_LISTS; i++) cfqd->prio_trees[i] = RB_ROOT; + /* + * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 
+ * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); + atomic_inc(&cfqd->oom_cfqq.ref); + INIT_LIST_HEAD(&cfqd->cic_list); cfqd->queue = q; @@ -2658,7 +2654,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/block/cmd-filter.c b/block/cmd-filter.c deleted file mode 100644 index 572bbc2f900d..000000000000 --- a/block/cmd-filter.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright 2004 Peter M. Jones <pjones@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - * - */ - -#include <linux/list.h> -#include <linux/genhd.h> -#include <linux/spinlock.h> -#include <linux/capability.h> -#include <linux/bitops.h> -#include <linux/blkdev.h> - -#include <scsi/scsi.h> -#include <linux/cdrom.h> - -int blk_verify_command(struct blk_cmd_filter *filter, - unsigned char *cmd, fmode_t has_write_perm) -{ - /* root can do any command. */ - if (capable(CAP_SYS_RAWIO)) - return 0; - - /* if there's no filter set, assume we're filtering everything out */ - if (!filter) - return -EPERM; - - /* Anybody who can open the device can do a read-safe command */ - if (test_bit(cmd[0], filter->read_ok)) - return 0; - - /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) - return 0; - - return -EPERM; -} -EXPORT_SYMBOL(blk_verify_command); - -#if 0 -/* and now, the sysfs stuff */ -static ssize_t rcf_cmds_show(struct blk_cmd_filter *filter, char *page, - int rw) -{ - char *npage = page; - unsigned long *okbits; - int i; - - if (rw == READ) - okbits = filter->read_ok; - else - okbits = filter->write_ok; - - for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) { - if (test_bit(i, okbits)) { - npage += sprintf(npage, "0x%02x", i); - if (i < BLK_SCSI_MAX_CMDS - 1) - sprintf(npage++, " "); - } - } - - if (npage != page) - npage += sprintf(npage, "\n"); - - return npage - page; -} - -static ssize_t rcf_readcmds_show(struct blk_cmd_filter *filter, char *page) -{ - return rcf_cmds_show(filter, page, READ); -} - -static ssize_t rcf_writecmds_show(struct blk_cmd_filter *filter, - char *page) -{ - return rcf_cmds_show(filter, page, WRITE); -} - -static ssize_t rcf_cmds_store(struct blk_cmd_filter *filter, - const char *page, size_t count, int rw) -{ - unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits; - int cmd, set; - char *p, *status; - - if (rw == READ) { - memcpy(&okbits, filter->read_ok, sizeof(okbits)); - target_okbits = filter->read_ok; - } else { - memcpy(&okbits, filter->write_ok, sizeof(okbits)); - target_okbits = filter->write_ok; - } - - while ((p = strsep((char **)&page, " ")) != NULL) { - set = 1; - - if (p[0] == '+') { - p++; - } else if (p[0] == '-') { - 
set = 0; - p++; - } - - cmd = simple_strtol(p, &status, 16); - - /* either of these cases means invalid input, so do nothing. */ - if ((status == p) || cmd >= BLK_SCSI_MAX_CMDS) - return -EINVAL; - - if (set) - __set_bit(cmd, okbits); - else - __clear_bit(cmd, okbits); - } - - memcpy(target_okbits, okbits, sizeof(okbits)); - return count; -} - -static ssize_t rcf_readcmds_store(struct blk_cmd_filter *filter, - const char *page, size_t count) -{ - return rcf_cmds_store(filter, page, count, READ); -} - -static ssize_t rcf_writecmds_store(struct blk_cmd_filter *filter, - const char *page, size_t count) -{ - return rcf_cmds_store(filter, page, count, WRITE); -} - -struct rcf_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_cmd_filter *, char *); - ssize_t (*store)(struct blk_cmd_filter *, const char *, size_t); -}; - -static struct rcf_sysfs_entry rcf_readcmds_entry = { - .attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR }, - .show = rcf_readcmds_show, - .store = rcf_readcmds_store, -}; - -static struct rcf_sysfs_entry rcf_writecmds_entry = { - .attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR }, - .show = rcf_writecmds_show, - .store = rcf_writecmds_store, -}; - -static struct attribute *default_attrs[] = { - &rcf_readcmds_entry.attr, - &rcf_writecmds_entry.attr, - NULL, -}; - -#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr) - -static ssize_t -rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page) -{ - struct rcf_sysfs_entry *entry = to_rcf(attr); - struct blk_cmd_filter *filter; - - filter = container_of(kobj, struct blk_cmd_filter, kobj); - if (entry->show) - return entry->show(filter, page); - - return 0; -} - -static ssize_t -rcf_attr_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct rcf_sysfs_entry *entry = to_rcf(attr); - struct blk_cmd_filter *filter; - - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - - if (!entry->store) - return -EINVAL; - - filter = container_of(kobj, struct blk_cmd_filter, kobj); - return entry->store(filter, page, length); -} - -static struct sysfs_ops rcf_sysfs_ops = { - .show = rcf_attr_show, - .store = rcf_attr_store, -}; - -static struct kobj_type rcf_ktype = { - .sysfs_ops = &rcf_sysfs_ops, - .default_attrs = default_attrs, -}; - -int blk_register_filter(struct gendisk *disk) -{ - int ret; - struct blk_cmd_filter *filter = &disk->queue->cmd_filter; - - ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, - &disk_to_dev(disk)->kobj, - "%s", "cmd_filter"); - if (ret < 0) - return ret; - - return 0; -} -EXPORT_SYMBOL(blk_register_filter); - -void blk_unregister_filter(struct gendisk *disk) -{ - struct blk_cmd_filter *filter = &disk->queue->cmd_filter; - - kobject_put(&filter->kobj); -} -EXPORT_SYMBOL(blk_unregister_filter); -#endif diff --git a/block/elevator.c b/block/elevator.c index ca861927ba41..1975b619c86d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) /* * Don't merge file system requests and discard requests */ - if (bio_discard(bio) != bio_discard(rq->bio)) + if (bio_rw_flagged(bio, BIO_RW_DISCARD) != + bio_rw_flagged(rq->bio, BIO_RW_DISCARD)) return 0; /* diff --git a/block/genhd.c b/block/genhd.c index f4c64c2b303a..517e4332cb37 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static 
DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_alignment_offset.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -901,7 +903,7 @@ static struct attribute_group disk_attr_group = { .attrs = disk_attrs, }; -static struct attribute_group *disk_attr_groups[] = { +static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, NULL }; @@ -996,12 +998,12 @@ struct class block_class = { .name = "block", }; -static char *block_nodename(struct device *dev) +static char *block_devnode(struct device *dev, mode_t *mode) { struct gendisk *disk = dev_to_disk(dev); - if (disk->nodename) - return disk->nodename(disk); + if (disk->devnode) + return disk->devnode(disk, mode); return NULL; } @@ -1009,7 +1011,7 @@ static struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, - .nodename = block_nodename, + .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS @@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[1]), (unsigned long long)part_stat_read(hd, sectors[1]), jiffies_to_msecs(part_stat_read(hd, ticks[1])), - hd->in_flight, + part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); @@ -1215,6 +1217,16 @@ void put_disk(struct gendisk *disk) EXPORT_SYMBOL(put_disk); +static void set_disk_ro_uevent(struct gendisk *gd, int ro) +{ + char event[] = "DISK_RO=1"; + char *envp[] = { event, NULL }; + + if (!ro) + event[8] = '0'; + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); +} + void set_device_ro(struct block_device *bdev, int flag) { bdev->bd_part->policy = flag; @@ -1227,8 +1239,12 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); + if (disk->part0.policy != flag) { + set_disk_ro_uevent(disk, flag); + disk->part0.policy = flag; + } + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) part->policy = flag; disk_part_iter_exit(&piter); diff --git a/block/ioctl.c b/block/ioctl.c index 500e4c73cc52..d3e6b5827a34 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev) return res; } -static void blk_ioc_discard_endio(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - complete(bio->bi_private); -} - static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len) { - struct request_queue *q = bdev_get_queue(bdev); - int ret = 0; - if (start & 511) return -EINVAL; if (len & 511) @@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; - - if (!q->prepare_discard_fn) - return -EOPNOTSUPP; - - while (len && !ret) { - DECLARE_COMPLETION_ONSTACK(wait); - struct bio *bio; - - bio = 
bio_alloc(GFP_KERNEL, 0); - - bio->bi_end_io = blk_ioc_discard_endio; - bio->bi_bdev = bdev; - bio->bi_private = &wait; - bio->bi_sector = start; - - if (len > queue_max_hw_sectors(q)) { - bio->bi_size = queue_max_hw_sectors(q) << 9; - len -= queue_max_hw_sectors(q); - start += queue_max_hw_sectors(q); - } else { - bio->bi_size = len << 9; - len = 0; - } - submit_bio(DISCARD_NOBARRIER, bio); - - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; + return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, + DISCARD_FL_WAIT); } static int put_ushort(unsigned long arg, unsigned short val) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 5f8e798ede4e..e5b10017a50b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -32,6 +32,11 @@ #include <scsi/scsi_ioctl.h> #include <scsi/scsi_cmnd.h> +struct blk_cmd_filter { + unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; + unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; +} blk_default_cmd_filter; + /* Command group 3 is reserved and should never be used. */ const unsigned char scsi_command_size_tbl[8] = { @@ -105,7 +110,7 @@ static int sg_emulated_host(struct request_queue *q, int __user *p) return put_user(1, p); } -void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) +static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) { /* Basic read-only commands */ __set_bit(TEST_UNIT_READY, filter->read_ok); @@ -187,14 +192,37 @@ void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_SET_STREAMING, filter->write_ok); __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); } -EXPORT_SYMBOL_GPL(blk_set_cmd_filter_defaults); + +int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +{ + struct blk_cmd_filter *filter = &blk_default_cmd_filter; + + /* root can do any command. */ + if (capable(CAP_SYS_RAWIO)) + return 0; + + /* if there's no filter set, assume we're filtering everything out */ + if (!filter) + return -EPERM; + + /* Anybody who can open the device can do a read-safe command */ + if (test_bit(cmd[0], filter->read_ok)) + return 0; + + /* Write-safe commands require a writable open */ + if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + return 0; + + return -EPERM; +} +EXPORT_SYMBOL(blk_verify_command); static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, struct sg_io_hdr *hdr, fmode_t mode) { if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(&q->cmd_filter, rq->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) return -EPERM; /* @@ -427,7 +455,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(&q->cmd_filter, rq->cmd, mode & FMODE_WRITE); + err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); if (err) goto error; @@ -645,5 +673,11 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod blk_put_queue(q); return err; } - EXPORT_SYMBOL(scsi_cmd_ioctl); + +int __init blk_scsi_ioctl_init(void) +{ + blk_set_cmd_filter_defaults(&blk_default_cmd_filter); + return 0; +} +fs_initcall(blk_scsi_ioctl_init); |
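
The new block/blk-iopoll.c above adds a NAPI-style polled-completion interface for block drivers. A minimal driver-side sketch of how the interrupt and completion paths might use it follows; it is not part of the patch, all my_* names are hypothetical, the weight of 32 is arbitrary, and blk_iopoll_sched_prep() is the helper referenced by the patch's kernel-doc but defined in <linux/blk-iopoll.h>, outside this diff.

#include <linux/interrupt.h>
#include <linux/blk-iopoll.h>

struct my_hba {
	struct blk_iopoll iopoll;
};

/* Driver-specific hardware helpers (hypothetical). */
int my_one_completion_pending(struct my_hba *hba);
void my_handle_one_completion(struct my_hba *hba);
void my_mask_irq(struct my_hba *hba);
void my_unmask_irq(struct my_hba *hba);

/* Poll handler: consume up to 'budget' completions, return work done. */
static int my_iopoll(struct blk_iopoll *iop, int budget)
{
	struct my_hba *hba = container_of(iop, struct my_hba, iopoll);
	int done = 0;

	while (done < budget && my_one_completion_pending(hba)) {
		my_handle_one_completion(hba);
		done++;
	}

	if (done < budget) {
		/* Ring drained: leave polled mode and unmask the IRQ. */
		blk_iopoll_complete(iop);
		my_unmask_irq(hba);
	}

	return done;
}

static irqreturn_t my_isr(int irq, void *data)
{
	struct my_hba *hba = data;

	/* Switch to polled completion; keep the IRQ masked until done. */
	if (blk_iopoll_sched_prep(&hba->iopoll)) {
		my_mask_irq(hba);
		blk_iopoll_sched(&hba->iopoll);
	}

	return IRQ_HANDLED;
}

static void my_setup(struct my_hba *hba)
{
	blk_iopoll_init(&hba->iopoll, 32 /* weight */, my_iopoll);
	blk_iopoll_enable(&hba->iopoll);
}

The other interface change callers need to know about: blkdev_issue_discard() now takes a flags argument, so code that previously open-coded a synchronous discard (as block/ioctl.c did) can request blocking behaviour directly, e.g. ret = blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL, DISCARD_FL_WAIT); which is exactly what the reworked blk_ioctl_discard() above does.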