From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Mar 2011 08:52:07 +0100 Subject: block: remove per-queue plugging Code has been converted over to the new explicit on-stack plugging, and delay users have been converted to use the new API for that. So lets kill off the old plugging along with aops->sync_page(). Signed-off-by: Jens Axboe --- drivers/md/bitmap.c | 3 +- drivers/md/dm-crypt.c | 9 +----- drivers/md/dm-kcopyd.c | 52 +++--------------------------- drivers/md/dm-raid.c | 2 +- drivers/md/dm-raid1.c | 2 -- drivers/md/dm-table.c | 24 -------------- drivers/md/dm.c | 33 +++---------------- drivers/md/linear.c | 17 ---------- drivers/md/md.c | 7 ---- drivers/md/multipath.c | 31 ------------------ drivers/md/raid0.c | 16 ---------- drivers/md/raid1.c | 83 ++++++++++------------------------------------- drivers/md/raid10.c | 87 +++++++++++--------------------------------------- drivers/md/raid5.c | 62 +++++------------------------------ drivers/md/raid5.h | 2 +- 15 files changed, 58 insertions(+), 372 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 9a35320fb59f..54bfc274b39a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1339,8 +1339,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&bitmap->lock); - md_unplug(bitmap->mddev); - schedule(); + io_schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4e054bd91664..2c62c1169f78 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -991,11 +991,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void kcryptd_unplug(struct crypt_config *cc) -{ - blk_unplug(bdev_get_queue(cc->dev->bdev)); -} - static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->target->private; @@ -1008,10 +1003,8 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) * one in order to decrypt the whole bio data *afterwards*. */ clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); - if (!clone) { - kcryptd_unplug(cc); + if (!clone) return 1; - } crypt_inc_pending(io); diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 924f5f0084c2..400cf35094a4 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -37,13 +37,6 @@ struct dm_kcopyd_client { unsigned int nr_pages; unsigned int nr_free_pages; - /* - * Block devices to unplug. - * Non-NULL pointer means that a block device has some pending requests - * and needs to be unplugged. - */ - struct block_device *unplug[2]; - struct dm_io_client *io_client; wait_queue_head_t destroyq; @@ -315,31 +308,6 @@ static int run_complete_job(struct kcopyd_job *job) return 0; } -/* - * Unplug the block device at the specified index. - */ -static void unplug(struct dm_kcopyd_client *kc, int rw) -{ - if (kc->unplug[rw] != NULL) { - blk_unplug(bdev_get_queue(kc->unplug[rw])); - kc->unplug[rw] = NULL; - } -} - -/* - * Prepare block device unplug. If there's another device - * to be unplugged at the same array index, we unplug that - * device first. - */ -static void prepare_unplug(struct dm_kcopyd_client *kc, int rw, - struct block_device *bdev) -{ - if (likely(kc->unplug[rw] == bdev)) - return; - unplug(kc, rw); - kc->unplug[rw] = bdev; -} - static void complete_io(unsigned long error, void *context) { struct kcopyd_job *job = (struct kcopyd_job *) context; @@ -386,15 +354,12 @@ static int run_io_job(struct kcopyd_job *job) .client = job->kc->io_client, }; - if (job->rw == READ) { + if (job->rw == READ) r = dm_io(&io_req, 1, &job->source, NULL); - prepare_unplug(job->kc, READ, job->source.bdev); - } else { + else { if (job->num_dests > 1) io_req.bi_rw |= REQ_UNPLUG; r = dm_io(&io_req, job->num_dests, job->dests, NULL); - if (!(io_req.bi_rw & REQ_UNPLUG)) - prepare_unplug(job->kc, WRITE, job->dests[0].bdev); } return r; @@ -466,6 +431,7 @@ static void do_work(struct work_struct *work) { struct dm_kcopyd_client *kc = container_of(work, struct dm_kcopyd_client, kcopyd_work); + struct blk_plug plug; /* * The order that these are called is *very* important. @@ -473,18 +439,12 @@ static void do_work(struct work_struct *work) * Pages jobs when successful will jump onto the io jobs * list. io jobs call wake when they complete and it all * starts again. - * - * Note that io_jobs add block devices to the unplug array, - * this array is cleared with "unplug" calls. It is thus - * forbidden to run complete_jobs after io_jobs and before - * unplug because the block device could be destroyed in - * job completion callback. */ + blk_start_plug(&plug); process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->io_jobs, kc, run_io_job); - unplug(kc, READ); - unplug(kc, WRITE); + blk_finish_plug(&plug); } /* @@ -665,8 +625,6 @@ int dm_kcopyd_client_create(unsigned int nr_pages, INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); - memset(kc->unplug, 0, sizeof(kc->unplug)); - kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!kc->job_pool) goto bad_slab; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b9e1e15ef11c..5ef136cdba91 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -394,7 +394,7 @@ static void raid_unplug(struct dm_target_callbacks *cb) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - md_raid5_unplug_device(rs->md.private); + md_raid5_kick_device(rs->md.private); } /* diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index dee326775c60..976ad4688afc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -842,8 +842,6 @@ static void do_mirror(struct work_struct *work) do_reads(ms, &reads); do_writes(ms, &writes); do_failures(ms, &failures); - - dm_table_unplug_all(ms->ti->table); } /*----------------------------------------------------------------- diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 38e4eb1bb965..f50a7b952257 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1275,29 +1275,6 @@ int dm_table_any_busy_target(struct dm_table *t) return 0; } -void dm_table_unplug_all(struct dm_table *t) -{ - struct dm_dev_internal *dd; - struct list_head *devices = dm_table_get_devices(t); - struct dm_target_callbacks *cb; - - list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); - char b[BDEVNAME_SIZE]; - - if (likely(q)) - blk_unplug(q); - else - DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s", - dm_device_name(t->md), - bdevname(dd->dm_dev.bdev, b)); - } - - list_for_each_entry(cb, &t->target_callbacks, list) - if (cb->unplug_fn) - cb->unplug_fn(cb); -} - struct mapped_device *dm_table_get_md(struct dm_table *t) { return t->md; @@ -1345,4 +1322,3 @@ EXPORT_SYMBOL(dm_table_get_mode); EXPORT_SYMBOL(dm_table_get_md); EXPORT_SYMBOL(dm_table_put); EXPORT_SYMBOL(dm_table_get); -EXPORT_SYMBOL(dm_table_unplug_all); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index eaa3af0e0632..d22b9905c168 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -807,8 +807,6 @@ void dm_requeue_unmapped_request(struct request *clone) dm_unprep_request(rq); spin_lock_irqsave(q->queue_lock, flags); - if (elv_queue_empty(q)) - blk_plug_device(q); blk_requeue_request(q, rq); spin_unlock_irqrestore(q->queue_lock, flags); @@ -1613,10 +1611,10 @@ static void dm_request_fn(struct request_queue *q) * number of in-flight I/Os after the queue is stopped in * dm_suspend(). */ - while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { + while (!blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) - goto plug_and_out; + goto delay_and_out; /* always use block 0 to find the target for flushes for now */ pos = 0; @@ -1627,7 +1625,7 @@ static void dm_request_fn(struct request_queue *q) BUG_ON(!dm_target_is_valid(ti)); if (ti->type->busy && ti->type->busy(ti)) - goto plug_and_out; + goto delay_and_out; blk_start_request(rq); clone = rq->special; @@ -1647,11 +1645,8 @@ requeued: BUG_ON(!irqs_disabled()); spin_lock(q->queue_lock); -plug_and_out: - if (!elv_queue_empty(q)) - /* Some requests still remain, retry later */ - blk_plug_device(q); - +delay_and_out: + blk_delay_queue(q, HZ / 10); out: dm_table_put(map); @@ -1680,20 +1675,6 @@ static int dm_lld_busy(struct request_queue *q) return r; } -static void dm_unplug_all(struct request_queue *q) -{ - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); - - if (map) { - if (dm_request_based(md)) - generic_unplug_device(q); - - dm_table_unplug_all(map); - dm_table_put(map); - } -} - static int dm_any_congested(void *congested_data, int bdi_bits) { int r = bdi_bits; @@ -1817,7 +1798,6 @@ static void dm_init_md_queue(struct mapped_device *md) md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - md->queue->unplug_fn = dm_unplug_all; blk_queue_merge_bvec(md->queue, dm_merge_bvec); blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); } @@ -2263,8 +2243,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) int r = 0; DECLARE_WAITQUEUE(wait, current); - dm_unplug_all(md->queue); - add_wait_queue(&md->wait, &wait); while (1) { @@ -2539,7 +2517,6 @@ int dm_resume(struct mapped_device *md) clear_bit(DMF_SUSPENDED, &md->flags); - dm_table_unplug_all(map); r = 0; out: dm_table_put(map); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 8a2f767f26d8..38861b5b9d90 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q, return maxsectors << 9; } -static void linear_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - linear_conf_t *conf; - int i; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); - - for (i=0; i < mddev->raid_disks; i++) { - struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); - blk_unplug(r_queue); - } - rcu_read_unlock(); -} - static int linear_congested(void *data, int bits) { mddev_t *mddev = data; @@ -225,7 +209,6 @@ static int linear_run (mddev_t *mddev) md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->unplug_fn = linear_unplug; mddev->queue->backing_dev_info.congested_fn = linear_congested; mddev->queue->backing_dev_info.congested_data = mddev; md_integrity_register(mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0cc30ecda4c1..ca0d79c264b9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4812,7 +4812,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) __md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; - mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; /* tell userspace to handle 'inactive' */ @@ -6669,8 +6668,6 @@ EXPORT_SYMBOL_GPL(md_allow_write); void md_unplug(mddev_t *mddev) { - if (mddev->queue) - blk_unplug(mddev->queue); if (mddev->plug) mddev->plug->unplug_fn(mddev->plug); } @@ -6853,7 +6850,6 @@ void md_do_sync(mddev_t *mddev) >= mddev->resync_max - mddev->curr_resync_completed )) { /* time to update curr_resync_completed */ - md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; @@ -6929,7 +6925,6 @@ void md_do_sync(mddev_t *mddev) * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - md_unplug(mddev); cond_resched(); currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 @@ -6948,8 +6943,6 @@ void md_do_sync(mddev_t *mddev) * this also signals 'finished resyncing' to md_stop */ out: - md_unplug(mddev); - wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 6d7ddf32ef2e..1cc8ed44e4ad 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -106,36 +106,6 @@ static void multipath_end_request(struct bio *bio, int error) rdev_dec_pending(rdev, conf->mddev); } -static void unplug_slaves(mddev_t *mddev) -{ - multipath_conf_t *conf = mddev->private; - int i; - - rcu_read_lock(); - for (i=0; iraid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) - && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); -} - -static void multipath_unplug(struct request_queue *q) -{ - unplug_slaves(q->queuedata); -} - - static int multipath_make_request(mddev_t *mddev, struct bio * bio) { multipath_conf_t *conf = mddev->private; @@ -518,7 +488,6 @@ static int multipath_run (mddev_t *mddev) */ md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_data = mddev; md_integrity_register(mddev); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 637a96855edb..6338c0fe6208 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -25,21 +25,6 @@ #include "raid0.h" #include "raid5.h" -static void raid0_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - raid0_conf_t *conf = mddev->private; - mdk_rdev_t **devlist = conf->devlist; - int raid_disks = conf->strip_zone[0].nb_dev; - int i; - - for (i=0; i < raid_disks; i++) { - struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); - - blk_unplug(r_queue); - } -} - static int raid0_congested(void *data, int bits) { mddev_t *mddev = data; @@ -272,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) mdname(mddev), (unsigned long long)smallest->sectors); } - mddev->queue->unplug_fn = raid0_unplug; mddev->queue->backing_dev_info.congested_fn = raid0_congested; mddev->queue->backing_dev_info.congested_data = mddev; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a23ffa397ba9..b67d822d57ae 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -52,23 +52,16 @@ #define NR_RAID1_BIOS 256 -static void unplug_slaves(mddev_t *mddev); - static void allow_barrier(conf_t *conf); static void lower_barrier(conf_t *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; - r1bio_t *r1_bio; int size = offsetof(r1bio_t, bios[pi->raid_disks]); /* allocate a r1bio with room for raid_disks entries in the bios array */ - r1_bio = kzalloc(size, gfp_flags); - if (!r1_bio && pi->mddev) - unplug_slaves(pi->mddev); - - return r1_bio; + return kzalloc(size, gfp_flags); } static void r1bio_pool_free(void *r1_bio, void *data) @@ -91,10 +84,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) int i, j; r1_bio = r1bio_pool_alloc(gfp_flags, pi); - if (!r1_bio) { - unplug_slaves(pi->mddev); + if (!r1_bio) return NULL; - } /* * Allocate bios : 1 for reading, n-1 for writing @@ -520,37 +511,6 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) return new_disk; } -static void unplug_slaves(mddev_t *mddev) -{ - conf_t *conf = mddev->private; - int i; - - rcu_read_lock(); - for (i=0; iraid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); -} - -static void raid1_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - - unplug_slaves(mddev); - md_wakeup_thread(mddev->thread); -} - static int raid1_congested(void *data, int bits) { mddev_t *mddev = data; @@ -580,20 +540,16 @@ static int raid1_congested(void *data, int bits) } -static int flush_pending_writes(conf_t *conf) +static void flush_pending_writes(conf_t *conf) { /* Any writes that have been queued but are awaiting * bitmap updates get flushed here. - * We return 1 if any requests were actually submitted. */ - int rv = 0; - spin_lock_irq(&conf->device_lock); if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(conf->mddev->queue); spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to * disk before proceeding w/ I/O */ @@ -605,10 +561,14 @@ static int flush_pending_writes(conf_t *conf) generic_make_request(bio); bio = next; } - rv = 1; } else spin_unlock_irq(&conf->device_lock); - return rv; +} + +static void md_kick_device(mddev_t *mddev) +{ + blk_flush_plug(current); + md_wakeup_thread(mddev->thread); } /* Barriers.... @@ -640,8 +600,7 @@ static void raise_barrier(conf_t *conf) /* Wait until no block IO is waiting */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, - conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + conf->resync_lock, md_kick_device(conf->mddev)); /* block any new IO from starting */ conf->barrier++; @@ -649,8 +608,7 @@ static void raise_barrier(conf_t *conf) /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + conf->resync_lock, md_kick_device(conf->mddev)); spin_unlock_irq(&conf->resync_lock); } @@ -672,7 +630,7 @@ static void wait_barrier(conf_t *conf) conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + md_kick_device(conf->mddev)); conf->nr_waiting--; } conf->nr_pending++; @@ -709,7 +667,7 @@ static void freeze_array(conf_t *conf) conf->nr_pending == conf->nr_queued+1, conf->resync_lock, ({ flush_pending_writes(conf); - raid1_unplug(conf->mddev->queue); })); + md_kick_device(conf->mddev); })); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -959,7 +917,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_inc(&r1_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); - blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); } r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL); @@ -968,7 +925,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (do_sync) + if (do_sync || !bitmap) md_wakeup_thread(mddev->thread); return 0; @@ -1558,7 +1515,6 @@ static void raid1d(mddev_t *mddev) unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; - int unplug=0; mdk_rdev_t *rdev; md_check_recovery(mddev); @@ -1566,7 +1522,7 @@ static void raid1d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - unplug += flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -1580,10 +1536,9 @@ static void raid1d(mddev_t *mddev) mddev = r1_bio->mddev; conf = mddev->private; - if (test_bit(R1BIO_IsSync, &r1_bio->state)) { + if (test_bit(R1BIO_IsSync, &r1_bio->state)) sync_request_write(mddev, r1_bio); - unplug = 1; - } else { + else { int disk; /* we got a read error. Maybe the drive is bad. Maybe just @@ -1633,14 +1588,11 @@ static void raid1d(mddev_t *mddev) bio->bi_end_io = raid1_end_read_request; bio->bi_rw = READ | do_sync; bio->bi_private = r1_bio; - unplug = 1; generic_make_request(bio); } } cond_resched(); } - if (unplug) - unplug_slaves(mddev); } @@ -2064,7 +2016,6 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); - mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; md_integrity_register(mddev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3b607b28741b..e79f1c5bf71b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -57,23 +57,16 @@ */ #define NR_RAID10_BIOS 256 -static void unplug_slaves(mddev_t *mddev); - static void allow_barrier(conf_t *conf); static void lower_barrier(conf_t *conf); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { conf_t *conf = data; - r10bio_t *r10_bio; int size = offsetof(struct r10bio_s, devs[conf->copies]); /* allocate a r10bio with room for raid_disks entries in the bios array */ - r10_bio = kzalloc(size, gfp_flags); - if (!r10_bio && conf->mddev) - unplug_slaves(conf->mddev); - - return r10_bio; + return kzalloc(size, gfp_flags); } static void r10bio_pool_free(void *r10_bio, void *data) @@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) int nalloc; r10_bio = r10bio_pool_alloc(gfp_flags, conf); - if (!r10_bio) { - unplug_slaves(conf->mddev); + if (!r10_bio) return NULL; - } if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) nalloc = conf->copies; /* resync */ @@ -597,37 +588,6 @@ rb_out: return disk; } -static void unplug_slaves(mddev_t *mddev) -{ - conf_t *conf = mddev->private; - int i; - - rcu_read_lock(); - for (i=0; i < conf->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); -} - -static void raid10_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - - unplug_slaves(q->queuedata); - md_wakeup_thread(mddev->thread); -} - static int raid10_congested(void *data, int bits) { mddev_t *mddev = data; @@ -649,20 +609,16 @@ static int raid10_congested(void *data, int bits) return ret; } -static int flush_pending_writes(conf_t *conf) +static void flush_pending_writes(conf_t *conf) { /* Any writes that have been queued but are awaiting * bitmap updates get flushed here. - * We return 1 if any requests were actually submitted. */ - int rv = 0; - spin_lock_irq(&conf->device_lock); if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(conf->mddev->queue); spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to disk * before proceeding w/ I/O */ @@ -674,11 +630,16 @@ static int flush_pending_writes(conf_t *conf) generic_make_request(bio); bio = next; } - rv = 1; } else spin_unlock_irq(&conf->device_lock); - return rv; } + +static void md_kick_device(mddev_t *mddev) +{ + blk_flush_plug(current); + md_wakeup_thread(mddev->thread); +} + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -708,8 +669,7 @@ static void raise_barrier(conf_t *conf, int force) /* Wait until no block IO is waiting (unless 'force') */ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + conf->resync_lock, md_kick_device(conf->mddev)); /* block any new IO from starting */ conf->barrier++; @@ -717,8 +677,7 @@ static void raise_barrier(conf_t *conf, int force) /* No wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + conf->resync_lock, md_kick_device(conf->mddev)); spin_unlock_irq(&conf->resync_lock); } @@ -739,7 +698,7 @@ static void wait_barrier(conf_t *conf) conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + md_kick_device(conf->mddev)); conf->nr_waiting--; } conf->nr_pending++; @@ -776,7 +735,7 @@ static void freeze_array(conf_t *conf) conf->nr_pending == conf->nr_queued+1, conf->resync_lock, ({ flush_pending_writes(conf); - raid10_unplug(conf->mddev->queue); })); + md_kick_device(conf->mddev); })); spin_unlock_irq(&conf->resync_lock); } @@ -971,7 +930,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) atomic_inc(&r10_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); - blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); } @@ -988,7 +946,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - if (do_sync) + if (do_sync || !mddev->bitmap) md_wakeup_thread(mddev->thread); return 0; @@ -1681,7 +1639,6 @@ static void raid10d(mddev_t *mddev) unsigned long flags; conf_t *conf = mddev->private; struct list_head *head = &conf->retry_list; - int unplug=0; mdk_rdev_t *rdev; md_check_recovery(mddev); @@ -1689,7 +1646,7 @@ static void raid10d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - unplug += flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { @@ -1703,13 +1660,11 @@ static void raid10d(mddev_t *mddev) mddev = r10_bio->mddev; conf = mddev->private; - if (test_bit(R10BIO_IsSync, &r10_bio->state)) { + if (test_bit(R10BIO_IsSync, &r10_bio->state)) sync_request_write(mddev, r10_bio); - unplug = 1; - } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { + else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); - unplug = 1; - } else { + else { int mirror; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -1756,14 +1711,11 @@ static void raid10d(mddev_t *mddev) bio->bi_rw = READ | do_sync; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; - unplug = 1; generic_make_request(bio); } } cond_resched(); } - if (unplug) - unplug_slaves(mddev); } @@ -2376,7 +2328,6 @@ static int run(mddev_t *mddev) md_set_array_sectors(mddev, size); mddev->resync_max_sectors = size; - mddev->queue->unplug_fn = raid10_unplug; mddev->queue->backing_dev_info.congested_fn = raid10_congested; mddev->queue->backing_dev_info.congested_data = mddev; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 702812824195..e867ee42b152 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -433,8 +433,6 @@ static int has_failed(raid5_conf_t *conf) return 0; } -static void unplug_slaves(mddev_t *mddev); - static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, int previous, int noblock, int noquiesce) @@ -463,8 +461,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, - md_raid5_unplug_device(conf) - ); + md_raid5_kick_device(conf)); conf->inactive_blocked = 0; } else init_stripe(sh, sector, previous); @@ -1473,8 +1470,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list), conf->device_lock, - unplug_slaves(conf->mddev) - ); + blk_flush_plug(current)); osh = get_free_stripe(conf); spin_unlock_irq(&conf->device_lock); atomic_set(&nsh->count, 1); @@ -3645,58 +3641,19 @@ static void activate_bit_delay(raid5_conf_t *conf) } } -static void unplug_slaves(mddev_t *mddev) +void md_raid5_kick_device(raid5_conf_t *conf) { - raid5_conf_t *conf = mddev->private; - int i; - int devs = max(conf->raid_disks, conf->previous_raid_disks); - - rcu_read_lock(); - for (i = 0; i < devs; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); -} - -void md_raid5_unplug_device(raid5_conf_t *conf) -{ - unsigned long flags; - - spin_lock_irqsave(&conf->device_lock, flags); - - if (plugger_remove_plug(&conf->plug)) { - conf->seq_flush++; - raid5_activate_delayed(conf); - } + blk_flush_plug(current); + raid5_activate_delayed(conf); md_wakeup_thread(conf->mddev->thread); - - spin_unlock_irqrestore(&conf->device_lock, flags); - - unplug_slaves(conf->mddev); } -EXPORT_SYMBOL_GPL(md_raid5_unplug_device); +EXPORT_SYMBOL_GPL(md_raid5_kick_device); static void raid5_unplug(struct plug_handle *plug) { raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); - md_raid5_unplug_device(conf); -} -static void raid5_unplug_queue(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - md_raid5_unplug_device(mddev->private); + md_raid5_kick_device(conf); } int md_raid5_congested(mddev_t *mddev, int bits) @@ -4100,7 +4057,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) * add failed due to overlap. Flush everything * and wait a while */ - md_raid5_unplug_device(conf); + md_raid5_kick_device(conf); release_stripe(sh); schedule(); goto retry; @@ -4365,7 +4322,6 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski if (sector_nr >= max_sector) { /* just being told to finish up .. nothing much to do */ - unplug_slaves(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); @@ -4569,7 +4525,6 @@ static void raid5d(mddev_t *mddev) spin_unlock_irq(&conf->device_lock); async_tx_issue_pending_all(); - unplug_slaves(mddev); pr_debug("--- raid5d inactive\n"); } @@ -5205,7 +5160,6 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; mddev->queue->queue_lock = &conf->device_lock; - mddev->queue->unplug_fn = raid5_unplug_queue; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2ace0582b409..8d563a4f022a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -503,6 +503,6 @@ static inline int algorithm_is_DDF(int layout) } extern int md_raid5_congested(mddev_t *mddev, int bits); -extern void md_raid5_unplug_device(raid5_conf_t *conf); +extern void md_raid5_kick_device(raid5_conf_t *conf); extern int raid5_set_cache_size(mddev_t *mddev, int size); #endif -- cgit v1.2.3 From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2011 11:56:30 +0100 Subject: block: kill off REQ_UNPLUG With the plugging now being explicitly controlled by the submitter, callers need not pass down unplugging hints to the block layer. If they want to unplug, it's because they manually plugged on their own - in which case, they should just unplug at will. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 3 +-- drivers/block/drbd/drbd_receiver.c | 9 +-------- drivers/md/bitmap.c | 2 +- drivers/md/dm-io.c | 2 +- drivers/md/dm-kcopyd.c | 5 +---- drivers/md/md.c | 5 ++--- fs/btrfs/extent_io.c | 2 +- fs/buffer.c | 14 ++++---------- fs/direct-io.c | 2 +- fs/ext4/page-io.c | 3 +-- fs/gfs2/log.c | 4 ++-- fs/gfs2/lops.c | 12 ++++++------ fs/gfs2/meta_io.c | 2 +- fs/jbd/commit.c | 2 +- fs/jbd2/commit.c | 6 +++--- fs/nilfs2/segbuf.c | 2 +- fs/xfs/linux-2.6/xfs_aops.c | 3 +-- include/linux/blk_types.h | 2 -- include/linux/fs.h | 28 +++++++++------------------- kernel/power/block_io.c | 2 +- mm/page_io.c | 2 +- 24 files changed, 43 insertions(+), 75 deletions(-) (limited to 'drivers/md') diff --git a/block/blk-core.c b/block/blk-core.c index 82a45898ba76..7e9715ae18c8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1290,7 +1290,7 @@ get_rq: } plug = current->plug; - if (plug && !sync) { + if (plug) { if (!plug->should_sort && !list_empty(&plug->list)) { struct request *__rq; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 2096628d6e65..aca302492ff2 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -80,7 +80,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA; - rw |= REQ_UNPLUG | REQ_SYNC; + rw |= REQ_SYNC; bio = bio_alloc(GFP_NOIO, 1); bio->bi_bdev = bdev->md_bdev; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 0b5718e19586..b0bd27dfc1e8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -377,7 +377,7 @@ union p_header { #define DP_HARDBARRIER 1 /* depricated */ #define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 -#define DP_UNPLUG 8 /* equals REQ_UNPLUG */ +#define DP_UNPLUG 8 /* not used anymore */ #define DP_FUA 16 /* equals REQ_FUA */ #define DP_FLUSH 32 /* equals REQ_FLUSH */ #define DP_DISCARD 64 /* equals REQ_DISCARD */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6049cb85310d..8a43ce0edeed 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2477,12 +2477,11 @@ static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw) { if (mdev->agreed_pro_version >= 95) return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | - (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) | (bi_rw & REQ_FUA ? DP_FUA : 0) | (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); else - return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0; + return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; } /* Used to send write requests diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 84132f8bf8a4..8e68be939deb 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1100,8 +1100,6 @@ next_bio: /* > e->sector, unless this is the first bio */ bio->bi_sector = sector; bio->bi_bdev = mdev->ldev->backing_bdev; - /* we special case some flags in the multi-bio case, see below - * (REQ_UNPLUG) */ bio->bi_rw = rw; bio->bi_private = e; bio->bi_end_io = drbd_endio_sec; @@ -1130,10 +1128,6 @@ next_bio: bios = bios->bi_next; bio->bi_next = NULL; - /* strip off REQ_UNPLUG unless it is the last bio */ - if (bios) - bio->bi_rw &= ~REQ_UNPLUG; - drbd_generic_make_request(mdev, fault_type, bio); } while (bios); return 0; @@ -1621,12 +1615,11 @@ static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf) { if (mdev->agreed_pro_version >= 95) return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | - (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) | (dpf & DP_FUA ? REQ_FUA : 0) | (dpf & DP_FLUSH ? REQ_FUA : 0) | (dpf & DP_DISCARD ? REQ_DISCARD : 0); else - return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0; + return dpf & DP_RW_SYNC ? REQ_SYNC : 0; } /* mirrored write */ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 54bfc274b39a..ca203cb23f3c 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -347,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); - submit_bh(WRITE | REQ_UNPLUG | REQ_SYNC, bh); + submit_bh(WRITE | REQ_SYNC, bh); bh = bh->b_this_page; } diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 136d4f71a116..76a5af00a26b 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -352,7 +352,7 @@ static void dispatch_io(int rw, unsigned int num_regions, BUG_ON(num_regions > DM_IO_MAX_REGIONS); if (sync) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; /* * For multiple regions we need to be careful to rewind diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 400cf35094a4..1bb73a13ca40 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -356,11 +356,8 @@ static int run_io_job(struct kcopyd_job *job) if (job->rw == READ) r = dm_io(&io_req, 1, &job->source, NULL); - else { - if (job->num_dests > 1) - io_req.bi_rw |= REQ_UNPLUG; + else r = dm_io(&io_req, job->num_dests, job->dests, NULL); - } return r; } diff --git a/drivers/md/md.c b/drivers/md/md.c index ca0d79c264b9..28f9c1ee4e3a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -777,8 +777,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, bio->bi_end_io = super_written; atomic_inc(&mddev->pending_writes); - submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, - bio); + submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); } void md_super_wait(mddev_t *mddev) @@ -806,7 +805,7 @@ int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, struct completion event; int ret; - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 92ac5192c518..b76f7cd47401 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2182,7 +2182,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC_PLUG; + write_flags = WRITE_SYNC; else write_flags = WRITE; diff --git a/fs/buffer.c b/fs/buffer.c index f903f2e5b4fe..42534f67d71b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -767,7 +767,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * still in flight on potentially older * contents. */ - write_dirty_buffer(bh, WRITE_SYNC_PLUG); + write_dirty_buffer(bh, WRITE_SYNC); /* * Kick off IO for the previous mapping. Note @@ -1602,14 +1602,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * prevents this contention from occurring. * * If block_write_full_page() is called with wbc->sync_mode == - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this - * causes the writes to be flagged as synchronous writes, but the - * block device queue will NOT be unplugged, since usually many pages - * will be pushed to the out before the higher-level caller actually - * waits for the writes to be completed. The various wait functions, - * such as wait_on_writeback_range() will ultimately call sync_page() - * which will ultimately call blk_run_backing_dev(), which will end up - * unplugging the device queue. + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * causes the writes to be flagged as synchronous writes. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc, @@ -1622,7 +1616,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; int write_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE); + WRITE_SYNC : WRITE); BUG_ON(!PageLocked(page)); diff --git a/fs/direct-io.c b/fs/direct-io.c index df709b3b860a..426083136099 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1173,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct dio *dio; if (rw & WRITE) - rw = WRITE_ODIRECT_PLUG; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 955cc309142f..e2cd90e4bb7c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -310,8 +310,7 @@ static int io_submit_init(struct ext4_io_submit *io, io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE); + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); io->io_next_block = bh->b_blocknr; return 0; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index eb01f3575e10..7f1c11202342 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -121,7 +121,7 @@ __acquires(&sdp->sd_log_lock) lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); } else { unlock_buffer(bh); brelse(bh); @@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) lock_buffer(bh); if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); } else { unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index bf33f822058d..48b545a1979a 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -200,7 +200,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_log_unlock(sdp); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); gfs2_log_lock(sdp); n = 0; @@ -210,7 +210,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); lock_buffer(bd2->bd_bh); bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); gfs2_log_lock(sdp); if (++n >= num) break; @@ -352,7 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); bh = gfs2_log_get_buf(sdp); mh = (struct gfs2_meta_header *)bh->b_data; @@ -369,7 +369,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); } static void revoke_lo_before_scan(struct gfs2_jdesc *jd, @@ -571,7 +571,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, ptr = bh_log_ptr(bh); get_bh(bh); - submit_bh(WRITE_SYNC_PLUG, bh); + submit_bh(WRITE_SYNC, bh); gfs2_log_lock(sdp); while(!list_empty(list)) { bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); @@ -597,7 +597,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, } else { bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); } - submit_bh(WRITE_SYNC_PLUG, bh1); + submit_bh(WRITE_SYNC, bh1); gfs2_log_lock(sdp); ptr += 2; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index a566331db4e1..867b713cba92 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb struct buffer_head *bh, *head; int nr_underway = 0; int write_op = REQ_META | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE); + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 34a4861c14b8..66be299acb1b 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -333,7 +333,7 @@ void journal_commit_transaction(journal_t *journal) * instead we rely on sync_buffer() doing the unplug for us. */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC_PLUG; + write_op = WRITE_SYNC; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f3ad1598b201..3da1cc4346d5 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -137,9 +137,9 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); + ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); else - ret = submit_bh(WRITE_SYNC_PLUG, bh); + ret = submit_bh(WRITE_SYNC, bh); *cbh = bh; return ret; @@ -369,7 +369,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * instead we rely on sync_buffer() doing the unplug for us. */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC_PLUG; + write_op = WRITE_SYNC; trace_jbd2_commit_locking(journal, commit_transaction); stats.run.rs_wait = commit_transaction->t_max_wait; stats.run.rs_locked = jiffies; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 0f83e93935b2..2853ff20f85a 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); } diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 83c1c20d145a..6bbb0ee33253 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -413,8 +413,7 @@ xfs_submit_ioend_bio( if (xfs_ioend_new_eof(ioend)) xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE, bio); + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); } STATIC struct bio * diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 16b286473042..be50d9e70a7d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -128,7 +128,6 @@ enum rq_flag_bits { __REQ_NOIDLE, /* don't anticipate more IO after this one */ /* bio only flags */ - __REQ_UNPLUG, /* unplug the immediately after submission */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ @@ -172,7 +171,6 @@ enum rq_flag_bits { REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define REQ_CLONE_MASK REQ_COMMON_MASK -#define REQ_UNPLUG (1 << __REQ_UNPLUG) #define REQ_RAHEAD (1 << __REQ_RAHEAD) #define REQ_THROTTLED (1 << __REQ_THROTTLED) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9f2cf69911b8..543e226ea6a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -135,16 +135,10 @@ struct inodes_stat_t { * block layer could (in theory) choose to ignore this * request if it runs into resource problems. * WRITE A normal async write. Device will be plugged. - * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down + * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down * the hint that someone will be waiting on this IO - * shortly. The device must still be unplugged explicitly, - * WRITE_SYNC_PLUG does not do this as we could be - * submitting more writes before we actually wait on any - * of them. - * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device - * immediately after submission. The write equivalent - * of READ_SYNC. - * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. + * shortly. The write equivalent of READ_SYNC. + * WRITE_ODIRECT Special case write for O_DIRECT only. * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on * non-volatile media on completion. @@ -160,18 +154,14 @@ struct inodes_stat_t { #define WRITE RW_MASK #define READA RWA_MASK -#define READ_SYNC (READ | REQ_SYNC | REQ_UNPLUG) +#define READ_SYNC (READ | REQ_SYNC) #define READ_META (READ | REQ_META) -#define WRITE_SYNC_PLUG (WRITE | REQ_SYNC | REQ_NOIDLE) -#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) -#define WRITE_ODIRECT_PLUG (WRITE | REQ_SYNC) +#define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) +#define WRITE_ODIRECT (WRITE | REQ_SYNC) #define WRITE_META (WRITE | REQ_META) -#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FLUSH) -#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FUA) -#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ - REQ_FLUSH | REQ_FUA) +#define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH) +#define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) +#define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) #define SEL_IN 1 #define SEL_OUT 2 diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c02df9..d09dd10c5a5e 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + const int bio_rw = rw | REQ_SYNC; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf469..dc76b4d0611e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3 From a91a2785b200864aef2270ed6a3babac7a253a20 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 17 Mar 2011 11:11:05 +0100 Subject: block: Require subsystems to explicitly allocate bio_set integrity mempool MD and DM create a new bio_set for every metadevice. Each bio_set has an integrity mempool attached regardless of whether the metadevice is capable of passing integrity metadata. This is a waste of memory. Instead we defer the allocation decision to MD and DM since we know at metadevice creation time whether integrity passthrough is needed or not. Automatic integrity mempool allocation can then be removed from bioset_create() and we make an explicit integrity allocation for the fs_bio_set. Signed-off-by: Martin K. Petersen Reported-by: Zdenek Kabelac Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 7 +++++-- drivers/md/dm.c | 12 +++++++++--- drivers/md/dm.h | 2 +- drivers/md/linear.c | 3 +-- drivers/md/md.c | 8 ++++++-- drivers/md/multipath.c | 7 +++++-- drivers/md/raid0.c | 3 +-- drivers/md/raid1.c | 5 ++--- drivers/md/raid10.c | 7 +++++-- fs/bio-integrity.c | 3 +++ fs/bio.c | 6 +++--- 11 files changed, 41 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f50a7b952257..416d4e258df6 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -55,6 +55,7 @@ struct dm_table { struct dm_target *targets; unsigned discards_supported:1; + unsigned integrity_supported:1; /* * Indicates the rw permissions for the new logical @@ -859,7 +860,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t) return -EINVAL; } - t->mempools = dm_alloc_md_mempools(type); + t->mempools = dm_alloc_md_mempools(type, t->integrity_supported); if (!t->mempools) return -ENOMEM; @@ -935,8 +936,10 @@ static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device struct dm_dev_internal *dd; list_for_each_entry(dd, devices, list) - if (bdev_get_integrity(dd->dm_dev.bdev)) + if (bdev_get_integrity(dd->dm_dev.bdev)) { + t->integrity_supported = 1; return blk_integrity_register(dm_disk(md), NULL); + } return 0; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d22b9905c168..88820704a191 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2620,9 +2620,10 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) { struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); + unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; if (!pools) return NULL; @@ -2639,13 +2640,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) if (!pools->tio_pool) goto free_io_pool_and_out; - pools->bs = (type == DM_TYPE_BIO_BASED) ? - bioset_create(16, 0) : bioset_create(MIN_IOS, 0); + pools->bs = bioset_create(pool_size, 0); if (!pools->bs) goto free_tio_pool_and_out; + if (integrity && bioset_integrity_create(pools->bs, pool_size)) + goto free_bioset_and_out; + return pools; +free_bioset_and_out: + bioset_free(pools->bs); + free_tio_pool_and_out: mempool_destroy(pools->tio_pool); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 0c2dd5f4af76..1aaf16746da8 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -149,7 +149,7 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity); void dm_free_md_mempools(struct dm_md_mempools *pools); #endif diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 338804f8fb3b..abfb59a61ede 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -210,8 +210,7 @@ static int linear_run (mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->backing_dev_info.congested_fn = linear_congested; mddev->queue->backing_dev_info.congested_data = mddev; - md_integrity_register(mddev); - return 0; + return md_integrity_register(mddev); } static void free_conf(struct rcu_head *head) diff --git a/drivers/md/md.c b/drivers/md/md.c index 86ba66c0b28a..1876761f1828 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1803,8 +1803,12 @@ int md_integrity_register(mddev_t *mddev) mdname(mddev)); return -EINVAL; } - printk(KERN_NOTICE "md: data integrity on %s enabled\n", - mdname(mddev)); + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); + if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { + printk(KERN_ERR "md: failed to create integrity pool for %s\n", + mdname(mddev)); + return -EINVAL; + } return 0; } EXPORT_SYMBOL(md_integrity_register); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 5e694b151c30..c35890990985 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -315,7 +315,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number) p->rdev = rdev; goto abort; } - md_integrity_register(mddev); + err = md_integrity_register(mddev); } abort: @@ -489,7 +489,10 @@ static int multipath_run (mddev_t *mddev) mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_data = mddev; - md_integrity_register(mddev); + + if (md_integrity_register(mddev)) + goto out_free_conf; + return 0; out_free_conf: diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 95916fd6394a..e86bf3682e1e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -379,8 +379,7 @@ static int raid0_run(mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); - md_integrity_register(mddev); - return 0; + return md_integrity_register(mddev); } static int raid0_stop(mddev_t *mddev) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8f34ad5c478b..c2a21ae56d97 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1132,7 +1132,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) p->rdev = rdev; goto abort; } - md_integrity_register(mddev); + err = md_integrity_register(mddev); } abort: @@ -2017,8 +2017,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; - md_integrity_register(mddev); - return 0; + return md_integrity_register(mddev); } static int stop(mddev_t *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c0d0f5f7e407..f7b62370b374 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1188,7 +1188,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) p->rdev = rdev; goto abort; } - md_integrity_register(mddev); + err = md_integrity_register(mddev); } abort: @@ -2343,7 +2343,10 @@ static int run(mddev_t *mddev) if (conf->near_copies < conf->raid_disks) blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); - md_integrity_register(mddev); + + if (md_integrity_register(mddev)) + goto out_free_conf; + return 0; out_free_conf: diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index e49cce234c65..9c5e6b2cd11a 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) { unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); + if (bs->bio_integrity_pool) + return 0; + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); diff --git a/fs/bio.c b/fs/bio.c index 5694b756ed01..85e2eabb1f81 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (bioset_integrity_create(bs, pool_size)) - goto bad; - if (!biovec_create_pools(bs, pool_size)) return bs; @@ -1682,6 +1679,9 @@ static int __init init_bio(void) if (!fs_bio_set) panic("bio: can't allocate bios\n"); + if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) + panic("bio: can't create integrity pool\n"); + bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, sizeof(struct bio_pair)); if (!bio_split_pool) -- cgit v1.2.3 From 1e9bb8808ac11094d711d20d580e7b45a4992d0c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 22 Mar 2011 08:35:35 +0100 Subject: block: fix non-atomic access to genhd inflight structures After the stack plugging introduction, these are called lockless. Ensure that the counters are updated atomically. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/md/dm.c | 7 ++++--- fs/partitions/check.c | 3 ++- include/linux/genhd.h | 12 ++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 88820704a191..0cf68b478878 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -477,7 +477,8 @@ static void start_io_acct(struct dm_io *io) cpu = part_stat_lock(); part_round_stats(cpu, &dm_disk(md)->part0); part_stat_unlock(); - dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); + atomic_set(&dm_disk(md)->part0.in_flight[rw], + atomic_inc_return(&md->pending[rw])); } static void end_io_acct(struct dm_io *io) @@ -497,8 +498,8 @@ static void end_io_acct(struct dm_io *io) * After this is decremented the bio must not be touched if it is * a flush. */ - dm_disk(md)->part0.in_flight[rw] = pending = - atomic_dec_return(&md->pending[rw]); + pending = atomic_dec_return(&md->pending[rw]); + atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); pending += atomic_read(&md->pending[rw^0x1]); /* nudge anyone waiting on suspend queue */ diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 9c21119512b9..ac546975031f 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); + return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), + atomic_read(&p->in_flight[1])); } #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c0d5f6945c1e..d764a426e9fd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -109,7 +109,7 @@ struct hd_struct { int make_it_fail; #endif unsigned long stamp; - int in_flight[2]; + atomic_t in_flight[2]; #ifdef CONFIG_SMP struct disk_stats __percpu *dkstats; #else @@ -370,21 +370,21 @@ static inline void free_part_stats(struct hd_struct *part) static inline void part_inc_in_flight(struct hd_struct *part, int rw) { - part->in_flight[rw]++; + atomic_inc(&part->in_flight[rw]); if (part->partno) - part_to_disk(part)->part0.in_flight[rw]++; + atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); } static inline void part_dec_in_flight(struct hd_struct *part, int rw) { - part->in_flight[rw]--; + atomic_dec(&part->in_flight[rw]); if (part->partno) - part_to_disk(part)->part0.in_flight[rw]--; + atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } static inline int part_in_flight(struct hd_struct *part) { - return part->in_flight[0] + part->in_flight[1]; + return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) -- cgit v1.2.3 From 6b33aff368def952be78102c0935ebd219f9a748 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Mar 2011 16:42:13 -0700 Subject: md: use little-endian bitops As a preparation for removing ext2 non-atomic bit operations from asm/bitops.h. This converts ext2 non-atomic bit operations to little-endian bit operations. Signed-off-by: Akinobu Mita Acked-by: NeilBrown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 9a35320fb59f..a2ce0b2da281 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -854,7 +854,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) if (bitmap->flags & BITMAP_HOSTENDIAN) set_bit(bit, kaddr); else - ext2_set_bit(bit, kaddr); + __test_and_set_bit_le(bit, kaddr); kunmap_atomic(kaddr, KM_USER0); PRINTK("set file bit %lu page %lu\n", bit, page->index); } @@ -1050,7 +1050,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (bitmap->flags & BITMAP_HOSTENDIAN) b = test_bit(bit, paddr); else - b = ext2_test_bit(bit, paddr); + b = test_bit_le(bit, paddr); kunmap_atomic(paddr, KM_USER0); if (b) { /* if the disk bit is set, set the memory bit */ @@ -1226,7 +1226,7 @@ void bitmap_daemon_work(mddev_t *mddev) clear_bit(file_page_offset(bitmap, j), paddr); else - ext2_clear_bit(file_page_offset(bitmap, j), + __test_and_clear_bit_le(file_page_offset(bitmap, j), paddr); kunmap_atomic(paddr, KM_USER0); } else -- cgit v1.2.3 From bb5cda3d706f44e5696533c9a7353c458f2871e0 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Mar 2011 16:42:13 -0700 Subject: dm: use little-endian bitops As a preparation for removing ext2 non-atomic bit operations from asm/bitops.h. This converts ext2 non-atomic bit operations to little-endian bit operations. Signed-off-by: Akinobu Mita Cc: Alasdair Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 6951536ea29c..57968eb382c1 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -251,20 +251,20 @@ struct log_c { */ static inline int log_test_bit(uint32_t *bs, unsigned bit) { - return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; + return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; } static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - ext2_set_bit(bit, (unsigned long *) bs); + __test_and_set_bit_le(bit, (unsigned long *) bs); l->touched_cleaned = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - ext2_clear_bit(bit, (unsigned long *) bs); + __test_and_clear_bit_le(bit, (unsigned long *) bs); l->touched_dirtied = 1; } @@ -740,7 +740,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) return 0; do { - *region = ext2_find_next_zero_bit( + *region = find_next_zero_bit_le( (unsigned long *) lc->sync_bits, lc->region_count, lc->sync_search); -- cgit v1.2.3 From 024d37e95ec4a7ccc256973ab2feab01f4fbdd2d Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 24 Mar 2011 13:52:14 +0000 Subject: dm: fix opening log and cow devices for read only tables If a table is read-only, also open any log and cow devices it uses read-only. Previously, even read-only devices were opened read-write internally. After patch 75f1dc0d076d1c1168f2115f1941ea627d38bd5a block: check bdev_read_only() from blkdev_get() was applied, loading such tables began to fail. The patch was reverted by e51900f7d38cbcfb481d84567fd92540e7e1d23a block: revert block_dev read-only check but this patch fixes this part of the code to work with the original patch. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 2 +- drivers/md/dm-snap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 6951536ea29c..8e8a868ca857 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -543,7 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, return -EINVAL; } - r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev); + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); if (r) return r; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index fdde53cd12b7..a2d330942cb2 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1080,7 +1080,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv++; argc--; - r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); + r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); if (r) { ti->error = "Cannot get COW device"; goto bad_cow; -- cgit v1.2.3 From 3407ef5262b55ca5d7139d2b555ef792fe531eec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Mar 2011 13:54:24 +0000 Subject: dm: add flakey target This target is the same as the linear target except that it returns I/O errors periodically. It's been found useful in simulating failing devices for testing purposes. I needed a dm target to do some failure testing on btrfs's raid code, and Mike pointed me at this. Signed-off-by: Josef Bacik Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-flakey.txt | 17 +++ drivers/md/Kconfig | 6 + drivers/md/Makefile | 1 + drivers/md/dm-flakey.c | 212 ++++++++++++++++++++++++++++++ 4 files changed, 236 insertions(+) create mode 100644 Documentation/device-mapper/dm-flakey.txt create mode 100644 drivers/md/dm-flakey.c (limited to 'drivers/md') diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt new file mode 100644 index 000000000000..c8efdfd19a65 --- /dev/null +++ b/Documentation/device-mapper/dm-flakey.txt @@ -0,0 +1,17 @@ +dm-flakey +========= + +This target is the same as the linear target except that it returns I/O +errors periodically. It's been found useful in simulating failing +devices for testing purposes. + +Starting from the time the table is loaded, the device is available for + seconds, then returns errors for seconds, +and then this cycle repeats. + +Parameters: + : Full pathname to the underlying block-device, or a + "major:minor" device-number. + : Starting sector within the device. + : Number of seconds device is available. + : Number of seconds device returns errors. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 98d9ec85e0eb..8420129fc5ee 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -327,4 +327,10 @@ config DM_UEVENT ---help--- Generate udev events for DM events. +config DM_FLAKEY + tristate "Flakey target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + A target that intermittently fails I/O for debugging purposes. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d0138606c2e8..448838b1f92a 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c new file mode 100644 index 000000000000..ea790623c30b --- /dev/null +++ b/drivers/md/dm-flakey.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2003 Sistina Software (UK) Limited. + * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "flakey" + +/* + * Flakey: Used for testing only, simulates intermittent, + * catastrophic device failure. + */ +struct flakey_c { + struct dm_dev *dev; + unsigned long start_time; + sector_t start; + unsigned up_interval; + unsigned down_interval; +}; + +/* + * Construct a flakey mapping: + */ +static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct flakey_c *fc; + unsigned long long tmp; + + if (argc != 4) { + ti->error = "dm-flakey: Invalid argument count"; + return -EINVAL; + } + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) { + ti->error = "dm-flakey: Cannot allocate linear context"; + return -ENOMEM; + } + fc->start_time = jiffies; + + if (sscanf(argv[1], "%llu", &tmp) != 1) { + ti->error = "dm-flakey: Invalid device sector"; + goto bad; + } + fc->start = tmp; + + if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { + ti->error = "dm-flakey: Invalid up interval"; + goto bad; + } + + if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { + ti->error = "dm-flakey: Invalid down interval"; + goto bad; + } + + if (!(fc->up_interval + fc->down_interval)) { + ti->error = "dm-flakey: Total (up + down) interval is zero"; + goto bad; + } + + if (fc->up_interval + fc->down_interval < fc->up_interval) { + ti->error = "dm-flakey: Interval overflow"; + goto bad; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { + ti->error = "dm-flakey: Device lookup failed"; + goto bad; + } + + ti->num_flush_requests = 1; + ti->private = fc; + return 0; + +bad: + kfree(fc); + return -EINVAL; +} + +static void flakey_dtr(struct dm_target *ti) +{ + struct flakey_c *fc = ti->private; + + dm_put_device(ti, fc->dev); + kfree(fc); +} + +static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) +{ + struct flakey_c *fc = ti->private; + + return fc->start + (bi_sector - ti->begin); +} + +static void flakey_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct flakey_c *fc = ti->private; + + bio->bi_bdev = fc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); +} + +static int flakey_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct flakey_c *fc = ti->private; + unsigned elapsed; + + /* Are we alive ? */ + elapsed = (jiffies - fc->start_time) / HZ; + if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) + return -EIO; + + flakey_map_bio(ti, bio); + + return DM_MAPIO_REMAPPED; +} + +static int flakey_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct flakey_c *fc = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, + (unsigned long long)fc->start, fc->up_interval, + fc->down_interval); + break; + } + return 0; +} + +static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) +{ + struct flakey_c *fc = ti->private; + + return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg); +} + +static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct flakey_c *fc = ti->private; + struct request_queue *q = bdev_get_queue(fc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = fc->dev->bdev; + bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct flakey_c *fc = ti->private; + + return fn(ti, fc->dev, fc->start, ti->len, data); +} + +static struct target_type flakey_target = { + .name = "flakey", + .version = {1, 1, 0}, + .module = THIS_MODULE, + .ctr = flakey_ctr, + .dtr = flakey_dtr, + .map = flakey_map, + .status = flakey_status, + .ioctl = flakey_ioctl, + .merge = flakey_merge, + .iterate_devices = flakey_iterate_devices, +}; + +static int __init dm_flakey_init(void) +{ + int r = dm_register_target(&flakey_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_flakey_exit(void) +{ + dm_unregister_target(&flakey_target); +} + +/* Module hooks */ +module_init(dm_flakey_init); +module_exit(dm_flakey_exit); + +MODULE_DESCRIPTION(DM_NAME " flakey target"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From de8be5ac70f50a2340f24fd769a1aafa5a51ae34 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 24 Mar 2011 13:54:27 +0000 Subject: dm crypt: wipe keys string immediately after key is set Always wipe the original copy of the key after processing it in crypt_set_key(). Signed-off-by: Milan Broz Acked-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4e054bd91664..7dcae0d74db7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1331,20 +1331,29 @@ static int crypt_setkey_allcpus(struct crypt_config *cc) static int crypt_set_key(struct crypt_config *cc, char *key) { + int r = -EINVAL; + int key_string_len = strlen(key); + /* The key size may not be changed. */ - if (cc->key_size != (strlen(key) >> 1)) - return -EINVAL; + if (cc->key_size != (key_string_len >> 1)) + goto out; /* Hyphen (which gives a key_size of zero) means there is no key. */ if (!cc->key_size && strcmp(key, "-")) - return -EINVAL; + goto out; if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) - return -EINVAL; + goto out; set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return crypt_setkey_allcpus(cc); + r = crypt_setkey_allcpus(cc); + +out: + /* Hex key string not needed after here, so wipe it. */ + memset(key, '0', key_string_len); + + return r; } static int crypt_wipe_key(struct crypt_config *cc) -- cgit v1.2.3 From 6bb43b5d1f54fb44c0408d86d5e71e4405a3ebe1 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 24 Mar 2011 13:54:28 +0000 Subject: dm ioctl: prepare for crypt key wiping Prepare code for implementing buffer wipe flag. No functional change in this patch. Signed-off-by: Milan Broz Acked-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 6d12775a1061..3a8f6a0e6eb5 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1501,11 +1501,6 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) return r; } -static void free_params(struct dm_ioctl *param) -{ - vfree(param); -} - static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) { struct dm_ioctl tmp, *dmi; @@ -1520,13 +1515,15 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) if (!dmi) return -ENOMEM; - if (copy_from_user(dmi, user, tmp.data_size)) { - vfree(dmi); - return -EFAULT; - } + if (copy_from_user(dmi, user, tmp.data_size)) + goto bad; *param = dmi; return 0; + +bad: + vfree(dmi); + return -EFAULT; } static int validate_params(uint cmd, struct dm_ioctl *param) @@ -1564,7 +1561,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) unsigned int cmd; struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; - size_t param_size; + size_t input_param_size; /* only root can play with this */ if (!capable(CAP_SYS_ADMIN)) @@ -1605,6 +1602,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) * Copy the parameters into kernel space. */ r = copy_params(user, ¶m); + input_param_size = param->data_size; current->flags &= ~PF_MEMALLOC; @@ -1615,9 +1613,8 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (r) goto out; - param_size = param->data_size; param->data_size = sizeof(*param); - r = fn(param, param_size); + r = fn(param, input_param_size); /* * Copy the results back to userland. @@ -1625,8 +1622,8 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (!r && copy_to_user(user, param, param->data_size)) r = -EFAULT; - out: - free_params(param); +out: + vfree(param); return r; } -- cgit v1.2.3 From f868120549fc1664b2c451d4b9882a363928c698 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 24 Mar 2011 13:54:30 +0000 Subject: dm ioctl: add flag to wipe buffers for secure data Add DM_SECURE_DATA_FLAG which userspace can use to ensure that all buffers allocated for dm-ioctl are wiped immediately after use. The user buffer is wiped as well (we do not want to keep and return sensitive data back to userspace if the flag is set). Wiping is useful for cryptsetup to ensure that the key is present in memory only in defined places and only for the time needed. (For crypt, key can be present in table during load or table status, wait and message commands). Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 23 +++++++++++++++++++++-- include/linux/dm-ioctl.h | 12 +++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3a8f6a0e6eb5..4cacdad2270a 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1504,6 +1504,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) { struct dm_ioctl tmp, *dmi; + int secure_data; if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) return -EFAULT; @@ -1511,17 +1512,28 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) return -EINVAL; + secure_data = tmp.flags & DM_SECURE_DATA_FLAG; + dmi = vmalloc(tmp.data_size); - if (!dmi) + if (!dmi) { + if (secure_data && clear_user(user, tmp.data_size)) + return -EFAULT; return -ENOMEM; + } if (copy_from_user(dmi, user, tmp.data_size)) goto bad; + /* Wipe the user buffer so we do not return it to userspace */ + if (secure_data && clear_user(user, tmp.data_size)) + goto bad; + *param = dmi; return 0; bad: + if (secure_data) + memset(dmi, 0, tmp.data_size); vfree(dmi); return -EFAULT; } @@ -1531,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) /* Always clear this flag */ param->flags &= ~DM_BUFFER_FULL_FLAG; param->flags &= ~DM_UEVENT_GENERATED_FLAG; + param->flags &= ~DM_SECURE_DATA_FLAG; /* Ignores parameters */ if (cmd == DM_REMOVE_ALL_CMD || @@ -1558,6 +1571,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) static int ctl_ioctl(uint command, struct dm_ioctl __user *user) { int r = 0; + int wipe_buffer; unsigned int cmd; struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; @@ -1602,13 +1616,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) * Copy the parameters into kernel space. */ r = copy_params(user, ¶m); - input_param_size = param->data_size; current->flags &= ~PF_MEMALLOC; if (r) return r; + input_param_size = param->data_size; + wipe_buffer = param->flags & DM_SECURE_DATA_FLAG; + r = validate_params(cmd, param); if (r) goto out; @@ -1623,6 +1639,9 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) r = -EFAULT; out: + if (wipe_buffer) + memset(param, 0, input_param_size); + vfree(param); return r; } diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 78bbf47bbb96..3708455ee6c3 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 19 -#define DM_VERSION_PATCHLEVEL 1 -#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" +#define DM_VERSION_MINOR 20 +#define DM_VERSION_PATCHLEVEL 0 +#define DM_VERSION_EXTRA "-ioctl (2011-02-02)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -328,4 +328,10 @@ enum { */ #define DM_UUID_FLAG (1 << 14) /* In */ +/* + * If set, all buffers are wiped after use. Use when sending + * or requesting sensitive data such as an encryption key. + */ +#define DM_SECURE_DATA_FLAG (1 << 15) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ -- cgit v1.2.3 From 19040c0bc8efcb767221d8ef7bb9c32ff0586179 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Mar 2011 13:54:31 +0000 Subject: dm mpath: fail message ioctl if specified path is not valid Fail the reinstate_path and fail_path message ioctl if the specified path is not valid. The message ioctl would succeed for the 'reinistate_path' and 'fail_path' messages even if action was not taken because the specified device was not a valid path of the multipath device. Before, when /dev/vdb is not a path of mpathb: $ dmsetup message mpathb 0 reinstate_path /dev/vdb $ echo $? 0 After: $ dmsetup message mpathb 0 reinstate_path /dev/vdb device-mapper: message ioctl failed: Invalid argument Command failed $ echo $? 1 Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 4b0b63c290a6..52f0de394632 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1065,7 +1065,7 @@ out: static int action_dev(struct multipath *m, struct dm_dev *dev, action_fn action) { - int r = 0; + int r = -EINVAL; struct pgpath *pgpath; struct priority_group *pg; @@ -1669,7 +1669,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, -- cgit v1.2.3 From a490a07a67b7a37f588021410e06b721a752fc34 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Mar 2011 13:54:33 +0000 Subject: dm mpath: allow table load with no priority groups This patch adjusts the multipath target to allow a table with both 0 priority groups and 0 for the initial priority group number. If any mpath device is held open when all paths in the last priority group have failed, userspace multipathd will attempt to reload the associated DM table to reflect the fact that the device no longer has any priority groups. But the reload attempt always failed because the multipath target did not allow 0 priority groups. All multipath target messages related to priority group (enable_group, disable_group, switch_group) will handle a priority group of 0 (will cause error). When reloading a multipath table with 0 priority groups, userspace multipathd must be updated to specify an initial priority group number of 0 (rather than 1). Signed-off-by: Mike Snitzer Cc: Babu Moger Acked-by: Hannes Reinecke Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 52f0de394632..a550a057d991 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -844,8 +844,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, { /* target parameters */ static struct param _params[] = { - {1, 1024, "invalid number of priority groups"}, - {1, 1024, "invalid initial priority group number"}, + {0, 1024, "invalid number of priority groups"}, + {0, 1024, "invalid initial priority group number"}, }; int r; @@ -879,6 +879,13 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, if (r) goto bad; + if ((!m->nr_priority_groups && next_pg_num) || + (m->nr_priority_groups && !next_pg_num)) { + ti->error = "invalid initial priority group"; + r = -EINVAL; + goto bad; + } + /* parse the priority groups */ while (as.argc) { struct priority_group *pg; @@ -1415,7 +1422,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type, else if (m->current_pg) pg_num = m->current_pg->pg_num; else - pg_num = 1; + pg_num = (m->nr_priority_groups ? 1 : 0); DMEMIT("%u ", pg_num); -- cgit v1.2.3 From 29915202006c2e7bafe81348eb498ff9a724ac61 Mon Sep 17 00:00:00 2001 From: Mustafa Mesanovic Date: Thu, 24 Mar 2011 13:54:35 +0000 Subject: dm stripe: implement merge method Implement a merge function in the striped target. When the striped target's underlying devices provide a merge_bvec_fn (like all DM devices do via dm_merge_bvec) it is important to call down to them when building a biovec that doesn't span a stripe boundary. Without the merge method, a striped DM device stacked on DM devices causes bios with a single page to be submitted which results in unnecessary overhead that hurts performance. This change really helps filesystems (e.g. XFS and now ext4) which take care to assemble larger bios. By implementing stripe_merge(), DM and the stripe target no longer undermine the filesystem's work by only allowing a single page per bio. Buffered IO sees the biggest improvement (particularly uncached reads, buffered writes to a lesser degree). This is especially so for more capable "enterprise" storage LUNs. The performance improvement has been measured to be ~12-35% -- when a reasonable chunk_size is used (e.g. 64K) in conjunction with a stripe count that is a power of 2. In contrast, the performance penalty is ~5-7% for the pathological worst case stripe configuration (small chunk_size with a stripe count that is not a power of 2). The reason for this is that stripe_map_sector() is now called once for every call to dm_merge_bvec(). stripe_map_sector() will use slower division if stripe count isn't a power of 2. Signed-off-by: Mustafa Mesanovic Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index dddfa14f2982..3d80cf0c152d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -396,9 +396,29 @@ static void stripe_io_hints(struct dm_target *ti, blk_limits_io_opt(limits, chunk_size * sc->stripes); } +static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct stripe_c *sc = ti->private; + sector_t bvm_sector = bvm->bi_sector; + uint32_t stripe; + struct request_queue *q; + + stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector); + + q = bdev_get_queue(sc->stripe[stripe].dev->bdev); + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = sc->stripe[stripe].dev->bdev; + bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static struct target_type stripe_target = { .name = "striped", - .version = {1, 3, 1}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, @@ -407,6 +427,7 @@ static struct target_type stripe_target = { .status = stripe_status, .iterate_devices = stripe_iterate_devices, .io_hints = stripe_io_hints, + .merge = stripe_merge, }; int __init dm_stripe_init(void) -- cgit v1.2.3 From 89078d572eb9ce8d4c04264b8b0ba86de0d74c8f Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 28 Mar 2011 20:09:12 -0400 Subject: md: Fix integrity registration error when no devices are capable We incorrectly returned -EINVAL when none of the devices in the array had an integrity profile. This in turn prevented mdadm from starting the metadevice. Fix this so we only return errors on mismatched profiles and memory allocation failures. Reported-by: Giacomo Catenazzi Reported-by: Thomas Gleixner Signed-off-by: Martin K. Petersen Signed-off-by: Linus Torvalds --- drivers/md/md.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 06ecea751a39..8b66e04c2ea6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1777,12 +1777,6 @@ int md_integrity_register(mddev_t *mddev) continue; if (rdev->raid_disk < 0) continue; - /* - * If at least one rdev is not integrity capable, we can not - * enable data integrity for the md device. - */ - if (!bdev_get_integrity(rdev->bdev)) - return -EINVAL; if (!reference) { /* Use the first rdev as the reference */ reference = rdev; @@ -1793,6 +1787,8 @@ int md_integrity_register(mddev_t *mddev) rdev->bdev->bd_disk) < 0) return -EINVAL; } + if (!reference || !bdev_get_integrity(reference->bdev)) + return 0; /* * All component devices are integrity capable and have matching * profiles, register the common profile for the md device. -- cgit v1.2.3