From 9fd097b14918875bd6f125ed699d7bbbba5893ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2011 21:32:55 +0200 Subject: block: unexport DISK_EVENT_MEDIA_CHANGE for legacy/fringe drivers In-kernel disk event polling doesn't matter for legacy/fringe drivers and may lead to infinite event loop if ->check_events() implementation generates events on level condition instead of edge. Now that block layer supports suppressing exporting unlisted events, simply leaving disk->events cleared allows these drivers to keep the internal revalidation behavior intact while avoiding weird interactions with userland event handler. Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 1 - drivers/block/amiflop.c | 1 - drivers/block/ataflop.c | 1 - drivers/block/floppy.c | 1 - drivers/block/paride/pcd.c | 1 - drivers/block/paride/pd.c | 1 - drivers/block/paride/pf.c | 1 - drivers/block/swim.c | 1 - drivers/block/swim3.c | 1 - drivers/block/ub.c | 1 - drivers/block/xsysace.c | 1 - 11 files changed, 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 8066d086578a..e086fbbbe853 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -2547,7 +2547,6 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) disk->major = MajorNumber; disk->first_minor = n << DAC960_MaxPartitionsBits; disk->fops = &DAC960_BlockDeviceOperations; - disk->events = DISK_EVENT_MEDIA_CHANGE; } /* Indicate the Block Device Registration completed successfully, diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 456c0cc90dcf..8eba86bba599 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1736,7 +1736,6 @@ static int __init fd_probe_drives(void) disk->major = FLOPPY_MAJOR; disk->first_minor = drive; disk->fops = &floppy_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; sprintf(disk->disk_name, "fd%d", drive); disk->private_data = &unit[drive]; set_capacity(disk, 880*2); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index c871eae14120..ede16c64ff07 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1964,7 +1964,6 @@ static int __init atari_floppy_init (void) unit[i].disk->first_minor = i; sprintf(unit[i].disk->disk_name, "fd%d", i); unit[i].disk->fops = &floppy_fops; - unit[i].disk->events = DISK_EVENT_MEDIA_CHANGE; unit[i].disk->private_data = &unit[i]; unit[i].disk->queue = blk_init_queue(do_fd_request, &ataflop_lock); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 301d7a9a41a6..db8f88586c8d 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4205,7 +4205,6 @@ static int __init floppy_init(void) disks[dr]->major = FLOPPY_MAJOR; disks[dr]->first_minor = TOMINOR(dr); disks[dr]->fops = &floppy_fops; - disks[dr]->events = DISK_EVENT_MEDIA_CHANGE; sprintf(disks[dr]->disk_name, "fd%d", dr); init_timer(&motor_off_timer[dr]); diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 2f2ccf686251..8690e31d9932 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -320,7 +320,6 @@ static void pcd_init_units(void) disk->first_minor = unit; strcpy(disk->disk_name, cd->name); /* umm... */ disk->fops = &pcd_bdops; - disk->events = DISK_EVENT_MEDIA_CHANGE; } } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 21dfdb776869..869e7676d46f 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -837,7 +837,6 @@ static void pd_probe_drive(struct pd_unit *disk) p->fops = &pd_fops; p->major = major; p->first_minor = (disk - pd) << PD_BITS; - p->events = DISK_EVENT_MEDIA_CHANGE; disk->gd = p; p->private_data = disk; p->queue = pd_queue; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 7adeb1edbf43..f21b520ef419 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -294,7 +294,6 @@ static void __init pf_init_units(void) disk->first_minor = unit; strcpy(disk->disk_name, pf->name); disk->fops = &pf_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; if (!(*drives[unit])[D_PRT]) pf_drive_count++; } diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 24a482f2fbd6..fd5adcd55944 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -858,7 +858,6 @@ static int __devinit swim_floppy_init(struct swim_priv *swd) swd->unit[drive].disk->first_minor = drive; sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); swd->unit[drive].disk->fops = &floppy_fops; - swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE; swd->unit[drive].disk->private_data = &swd->unit[drive]; swd->unit[drive].disk->queue = swd->queue; set_capacity(swd->unit[drive].disk, 2880); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 4c10f56facbf..773bfa792777 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1163,7 +1163,6 @@ static int __devinit swim3_attach(struct macio_dev *mdev, const struct of_device disk->major = FLOPPY_MAJOR; disk->first_minor = i; disk->fops = &floppy_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = &floppy_states[i]; disk->queue = swim3_queue; disk->flags |= GENHD_FL_REMOVABLE; diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 68b9430c7cfe..0e376d46bdd1 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -2334,7 +2334,6 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum) disk->major = UB_MAJOR; disk->first_minor = lun->id * UB_PARTS_PER_LUN; disk->fops = &ub_bd_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; disk->private_data = lun; disk->driverfs_dev = &sc->intf->dev; diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 645ff765cd12..6c7fd7db6dff 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -1005,7 +1005,6 @@ static int __devinit ace_setup(struct ace_device *ace) ace->gd->major = ace_major; ace->gd->first_minor = ace->id * ACE_NUM_MINORS; ace->gd->fops = &ace_fops; - ace->gd->events = DISK_EVENT_MEDIA_CHANGE; ace->gd->queue = ace->queue; ace->gd->private_data = ace; snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a'); -- cgit v1.2.3 From 4ad12621e442b7a072e81270808f617cb65c5672 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 3 May 2011 09:23:36 -0700 Subject: libceph: fix ceph_osdc_alloc_request error checks ceph_osdc_alloc_request returns NULL on failure. Signed-off-by: Sage Weil --- drivers/block/rbd.c | 4 ++-- net/ceph/osd_client.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 16dc3645291c..3e904717c1c0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -777,9 +777,9 @@ static int rbd_do_request(struct request *rq, ops, false, GFP_NOIO, pages, bio); - if (IS_ERR(req)) { + if (!req) { up_read(&header->snap_rwsem); - ret = PTR_ERR(req); + ret = -ENOMEM; goto done_pages; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5a80f41c0cba..6b5dda1cb5df 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -470,8 +470,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, snapc, ops, use_mempool, GFP_NOFS, NULL, NULL); - if (IS_ERR(req)) - return req; + if (!req) + return NULL; /* calculate max write size */ calc_layout(osdc, vino, layout, off, plen, req, ops); -- cgit v1.2.3 From 11f770027b5c0de16544f3ec82b5c6f9f8d5a644 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 May 2011 16:13:54 -0700 Subject: rbd: fix leak of ops struct The ops vector must be freed by the rbd_do_request caller. Signed-off-by: Sage Weil --- drivers/block/rbd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3e904717c1c0..2146cab1c61b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -996,6 +996,8 @@ static int rbd_do_op(struct request *rq, ops, num_reply, rbd_req_cb, 0, NULL); + + rbd_destroy_ops(ops); done: kfree(seg_name); return ret; @@ -1063,7 +1065,9 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, { struct ceph_osd_req_op *ops; struct page **pages = NULL; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); + int ret; + + ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); if (ret < 0) return ret; -- cgit v1.2.3 From 1fec70932d867416ffe620dd17005f168cc84eb5 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 13 May 2011 13:52:56 -0700 Subject: rbd: fix split bio handling The rbd driver currently splits bios when they span an object boundary. However, the blk_end_request expects the completions to roll up the results in block device order, and the split rbd/ceph ops can complete in any order. This patch adds a struct rbd_req_coll to track completion of split requests and ensures that the results are passed back up to the block layer in order. This fixes errors where the file system gets completion of a read operation that spans an object boundary before the data has actually arrived. The bug is easily reproduced with iozone with a working set larger than available RAM. Reported-by: Fyodor Ustinov Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- drivers/block/rbd.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 151 insertions(+), 20 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2146cab1c61b..9712fad82bc6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -92,6 +92,8 @@ struct rbd_client { struct list_head node; }; +struct rbd_req_coll; + /* * a single io request */ @@ -100,6 +102,24 @@ struct rbd_request { struct bio *bio; /* cloned bio */ struct page **pages; /* list of used pages */ u64 len; + int coll_index; + struct rbd_req_coll *coll; +}; + +struct rbd_req_status { + int done; + int rc; + u64 bytes; +}; + +/* + * a collection of requests + */ +struct rbd_req_coll { + int total; + int num_done; + struct kref kref; + struct rbd_req_status status[0]; }; struct rbd_snap { @@ -416,6 +436,17 @@ static void rbd_put_client(struct rbd_device *rbd_dev) rbd_dev->client = NULL; } +/* + * Destroy requests collection + */ +static void rbd_coll_release(struct kref *kref) +{ + struct rbd_req_coll *coll = + container_of(kref, struct rbd_req_coll, kref); + + dout("rbd_coll_release %p\n", coll); + kfree(coll); +} /* * Create a new header structure, translate header format from the on-disk @@ -590,6 +621,14 @@ static u64 rbd_get_segment(struct rbd_image_header *header, return len; } +static int rbd_get_num_segments(struct rbd_image_header *header, + u64 ofs, u64 len) +{ + u64 start_seg = ofs >> header->obj_order; + u64 end_seg = (ofs + len - 1) >> header->obj_order; + return end_seg - start_seg + 1; +} + /* * bio helpers */ @@ -735,6 +774,50 @@ static void rbd_destroy_ops(struct ceph_osd_req_op *ops) kfree(ops); } +static void rbd_coll_end_req_index(struct request *rq, + struct rbd_req_coll *coll, + int index, + int ret, u64 len) +{ + struct request_queue *q; + int min, max, i; + + dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", + coll, index, ret, len); + + if (!rq) + return; + + if (!coll) { + blk_end_request(rq, ret, len); + return; + } + + q = rq->q; + + spin_lock_irq(q->queue_lock); + coll->status[index].done = 1; + coll->status[index].rc = ret; + coll->status[index].bytes = len; + max = min = coll->num_done; + while (max < coll->total && coll->status[max].done) + max++; + + for (i = min; istatus[i].rc, + coll->status[i].bytes); + coll->num_done++; + kref_put(&coll->kref, rbd_coll_release); + } + spin_unlock_irq(q->queue_lock); +} + +static void rbd_coll_end_req(struct rbd_request *req, + int ret, u64 len) +{ + rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); +} + /* * Send ceph osd request */ @@ -749,6 +832,8 @@ static int rbd_do_request(struct request *rq, int flags, struct ceph_osd_req_op *ops, int num_reply, + struct rbd_req_coll *coll, + int coll_index, void (*rbd_cb)(struct ceph_osd_request *req, struct ceph_msg *msg), struct ceph_osd_request **linger_req, @@ -763,12 +848,20 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request_head *reqhead; struct rbd_image_header *header = &dev->header; - ret = -ENOMEM; req_data = kzalloc(sizeof(*req_data), GFP_NOIO); - if (!req_data) - goto done; + if (!req_data) { + if (coll) + rbd_coll_end_req_index(rq, coll, coll_index, + -ENOMEM, len); + return -ENOMEM; + } + + if (coll) { + req_data->coll = coll; + req_data->coll_index = coll_index; + } - dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs); + dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); down_read(&header->snap_rwsem); @@ -828,7 +921,8 @@ static int rbd_do_request(struct request *rq, ret = ceph_osdc_wait_request(&dev->client->osdc, req); if (ver) *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%lld\n", le64_to_cpu(req->r_reassert_version.version)); + dout("reassert_ver=%lld\n", + le64_to_cpu(req->r_reassert_version.version)); ceph_osdc_put_request(req); } return ret; @@ -837,10 +931,8 @@ done_err: bio_chain_put(req_data->bio); ceph_osdc_put_request(req); done_pages: + rbd_coll_end_req(req_data, ret, len); kfree(req_data); -done: - if (rq) - blk_end_request(rq, ret, len); return ret; } @@ -874,7 +966,7 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) bytes = req_data->len; } - blk_end_request(req_data->rq, rc, bytes); + rbd_coll_end_req(req_data, rc, bytes); if (req_data->bio) bio_chain_put(req_data->bio); @@ -934,6 +1026,7 @@ static int rbd_req_sync_op(struct rbd_device *dev, flags, ops, 2, + NULL, 0, NULL, linger_req, ver); if (ret < 0) @@ -959,7 +1052,9 @@ static int rbd_do_op(struct request *rq, u64 snapid, int opcode, int flags, int num_reply, u64 ofs, u64 len, - struct bio *bio) + struct bio *bio, + struct rbd_req_coll *coll, + int coll_index) { char *seg_name; u64 seg_ofs; @@ -995,6 +1090,7 @@ static int rbd_do_op(struct request *rq, flags, ops, num_reply, + coll, coll_index, rbd_req_cb, 0, NULL); rbd_destroy_ops(ops); @@ -1010,13 +1106,15 @@ static int rbd_req_write(struct request *rq, struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 ofs, u64 len, - struct bio *bio) + struct bio *bio, + struct rbd_req_coll *coll, + int coll_index) { return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, 2, - ofs, len, bio); + ofs, len, bio, coll, coll_index); } /* @@ -1026,14 +1124,16 @@ static int rbd_req_read(struct request *rq, struct rbd_device *rbd_dev, u64 snapid, u64 ofs, u64 len, - struct bio *bio) + struct bio *bio, + struct rbd_req_coll *coll, + int coll_index) { return rbd_do_op(rq, rbd_dev, NULL, (snapid ? snapid : CEPH_NOSNAP), CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 2, - ofs, len, bio); + ofs, len, bio, coll, coll_index); } /* @@ -1081,6 +1181,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, CEPH_OSD_FLAG_READ, ops, 1, + NULL, 0, rbd_simple_req_cb, 0, NULL); rbd_destroy_ops(ops); @@ -1278,6 +1379,20 @@ static int rbd_req_sync_exec(struct rbd_device *dev, return ret; } +static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) +{ + struct rbd_req_coll *coll = + kzalloc(sizeof(struct rbd_req_coll) + + sizeof(struct rbd_req_status) * num_reqs, + GFP_ATOMIC); + + if (!coll) + return NULL; + coll->total = num_reqs; + kref_init(&coll->kref); + return coll; +} + /* * block device queue callback */ @@ -1295,6 +1410,8 @@ static void rbd_rq_fn(struct request_queue *q) bool do_write; int size, op_size = 0; u64 ofs; + int num_segs, cur_seg = 0; + struct rbd_req_coll *coll; /* peek at request from block layer */ if (!rq) @@ -1325,6 +1442,14 @@ static void rbd_rq_fn(struct request_queue *q) do_write ? "write" : "read", size, blk_rq_pos(rq) * 512ULL); + num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); + coll = rbd_alloc_coll(num_segs); + if (!coll) { + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENOMEM); + goto next; + } + do { /* a bio clone to be passed down to OSD req */ dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); @@ -1332,35 +1457,41 @@ static void rbd_rq_fn(struct request_queue *q) rbd_dev->header.block_name, ofs, size, NULL, NULL); + kref_get(&coll->kref); bio = bio_chain_clone(&rq_bio, &next_bio, &bp, op_size, GFP_ATOMIC); if (!bio) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENOMEM); - goto next; + rbd_coll_end_req_index(rq, coll, cur_seg, + -ENOMEM, op_size); + goto next_seg; } + /* init OSD command: write or read */ if (do_write) rbd_req_write(rq, rbd_dev, rbd_dev->header.snapc, ofs, - op_size, bio); + op_size, bio, + coll, cur_seg); else rbd_req_read(rq, rbd_dev, cur_snap_id(rbd_dev), ofs, - op_size, bio); + op_size, bio, + coll, cur_seg); +next_seg: size -= op_size; ofs += op_size; + cur_seg++; rq_bio = next_bio; } while (size > 0); + kref_put(&coll->kref, rbd_coll_release); if (bp) bio_pair_release(bp); - spin_lock_irq(q->queue_lock); next: rq = blk_fetch_request(q); -- cgit v1.2.3