diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/nvme-core.c | 203 | ||||
-rw-r--r-- | drivers/block/nvme-scsi.c | 36 | ||||
-rw-r--r-- | drivers/block/rbd.c | 242 |
3 files changed, 346 insertions, 135 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a842c71dcc21..02351e217165 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #include <linux/nvme.h> @@ -46,16 +42,26 @@ #include <scsi/sg.h> #include <asm-generic/io-64-nonatomic-lo-hi.h> -#define NVME_Q_DEPTH 1024 +#include <trace/events/block.h> + +#define NVME_Q_DEPTH 1024 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define ADMIN_TIMEOUT (60 * HZ) -#define IOD_TIMEOUT (4 * NVME_IO_TIMEOUT) +#define ADMIN_TIMEOUT (admin_timeout * HZ) +#define IOD_TIMEOUT (retry_time * HZ) + +static unsigned char admin_timeout = 60; +module_param(admin_timeout, byte, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); -unsigned char io_timeout = 30; -module_param(io_timeout, byte, 0644); +unsigned char nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); +static unsigned char retry_time = 30; +module_param(retry_time, byte, 0644); +MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); + static int nvme_major; module_param(nvme_major, int, 0); @@ -67,6 +73,7 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; +static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); @@ -199,16 +206,13 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE) +#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_FLUSH) - return; if (ctx == CMD_CTX_ABORT) { ++nvmeq->dev->abort_limit; return; @@ -247,8 +251,9 @@ static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, void *ctx; struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth) { - *fn = special_completion; + if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) { + if (fn) + *fn = special_completion; return CMD_CTX_INVALID; } if (fn) @@ -281,9 +286,17 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) { + struct nvme_queue *nvmeq; unsigned queue_id = get_cpu_var(*dev->io_queue); + rcu_read_lock(); - return rcu_dereference(dev->queues[queue_id]); + nvmeq = rcu_dereference(dev->queues[queue_id]); + if (nvmeq) + return nvmeq; + + rcu_read_unlock(); + put_cpu_var(*dev->io_queue); + return NULL; } static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) @@ -295,8 +308,15 @@ static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) __acquires(RCU) { + struct nvme_queue *nvmeq; + rcu_read_lock(); - return rcu_dereference(dev->queues[q_idx]); + nvmeq = rcu_dereference(dev->queues[q_idx]); + if (nvmeq) + return nvmeq; + + rcu_read_unlock(); + return NULL; } static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) @@ -387,25 +407,30 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) static void nvme_start_io_acct(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - const int rw = bio_data_dir(bio); - int cpu = part_stat_lock(); - part_round_stats(cpu, &disk->part0); - part_stat_inc(cpu, &disk->part0, ios[rw]); - part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio)); - part_inc_in_flight(&disk->part0, rw); - part_stat_unlock(); + if (blk_queue_io_stat(disk->queue)) { + const int rw = bio_data_dir(bio); + int cpu = part_stat_lock(); + part_round_stats(cpu, &disk->part0); + part_stat_inc(cpu, &disk->part0, ios[rw]); + part_stat_add(cpu, &disk->part0, sectors[rw], + bio_sectors(bio)); + part_inc_in_flight(&disk->part0, rw); + part_stat_unlock(); + } } static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) { struct gendisk *disk = bio->bi_bdev->bd_disk; - const int rw = bio_data_dir(bio); - unsigned long duration = jiffies - start_time; - int cpu = part_stat_lock(); - part_stat_add(cpu, &disk->part0, ticks[rw], duration); - part_round_stats(cpu, &disk->part0); - part_dec_in_flight(&disk->part0, rw); - part_stat_unlock(); + if (blk_queue_io_stat(disk->queue)) { + const int rw = bio_data_dir(bio); + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + part_stat_add(cpu, &disk->part0, ticks[rw], duration); + part_round_stats(cpu, &disk->part0); + part_dec_in_flight(&disk->part0, rw); + part_stat_unlock(); + } } static void bio_completion(struct nvme_queue *nvmeq, void *ctx, @@ -414,6 +439,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_iod *iod = ctx; struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; + int error = 0; if (unlikely(status)) { if (!(status & NVME_SC_DNR || @@ -426,6 +452,7 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, wake_up(&nvmeq->sq_full); return; } + error = -EIO; } if (iod->nents) { dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, @@ -433,10 +460,9 @@ static void bio_completion(struct nvme_queue *nvmeq, void *ctx, nvme_end_io_acct(bio, iod->start_time); } nvme_free_iod(nvmeq->dev, iod); - if (status) - bio_endio(bio, -EIO); - else - bio_endio(bio, 0); + + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); + bio_endio(bio, error); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -525,6 +551,8 @@ static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, if (!split) return -ENOMEM; + trace_block_split(bdev_get_queue(bio->bi_bdev), bio, + split->bi_iter.bi_sector); bio_chain(split, bio); if (!waitqueue_active(&nvmeq->sq_full)) @@ -627,16 +655,6 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; } -int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) -{ - int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - special_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - return nvme_submit_flush(nvmeq, ns, cmdid); -} - static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) { struct bio *bio = iod->private; @@ -652,7 +670,7 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) if (bio->bi_rw & REQ_DISCARD) return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if ((bio->bi_rw & REQ_FLUSH) && !iod->nents) + if (bio->bi_rw & REQ_FLUSH) return nvme_submit_flush(nvmeq, ns, cmdid); control = 0; @@ -686,6 +704,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } +static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) +{ + struct bio *split = bio_clone(bio, GFP_ATOMIC); + if (!split) + return -ENOMEM; + + split->bi_iter.bi_size = 0; + split->bi_phys_segments = 0; + bio->bi_rw &= ~REQ_FLUSH; + bio_chain(split, bio); + + if (!waitqueue_active(&nvmeq->sq_full)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, split); + bio_list_add(&nvmeq->sq_cong, bio); + wake_up_process(nvme_thread); + + return 0; +} + /* * Called with local interrupts disabled and the q_lock held. May not sleep. */ @@ -696,11 +734,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, int psegs = bio_phys_segments(ns->queue, bio); int result; - if ((bio->bi_rw & REQ_FLUSH) && psegs) { - result = nvme_submit_flush_data(nvmeq, ns); - if (result) - return result; - } + if ((bio->bi_rw & REQ_FLUSH) && psegs) + return nvme_split_flush_data(nvmeq, bio); iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); if (!iod) @@ -795,7 +830,6 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio) int result = -EBUSY; if (!nvmeq) { - put_nvmeq(NULL); bio_endio(bio, -EIO); return; } @@ -870,10 +904,8 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, struct nvme_queue *nvmeq; nvmeq = lock_nvmeq(dev, q_idx); - if (!nvmeq) { - unlock_nvmeq(nvmeq); + if (!nvmeq) return -ENODEV; - } cmdinfo.task = current; cmdinfo.status = -EINTR; @@ -898,9 +930,10 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, if (cmdinfo.status == -EINTR) { nvmeq = lock_nvmeq(dev, q_idx); - if (nvmeq) + if (nvmeq) { nvme_abort_command(nvmeq, cmdid); - unlock_nvmeq(nvmeq); + unlock_nvmeq(nvmeq); + } return -EINTR; } @@ -1358,7 +1391,8 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) return -EINTR; if (time_after(jiffies, timeout)) { dev_err(&dev->pci_dev->dev, - "Device not ready; aborting initialisation\n"); + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); return -ENODEV; } } @@ -1481,7 +1515,11 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, goto put_pages; } + err = -ENOMEM; iod = nvme_alloc_iod(count, length, GFP_KERNEL); + if (!iod) + goto put_pages; + sg = iod->sg; sg_init_table(sg, count); for (i = 0; i < count; i++) { @@ -1494,7 +1532,6 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, sg_mark_end(&sg[i - 1]); iod->nents = count; - err = -ENOMEM; nents = dma_map_sg(&dev->pci_dev->dev, sg, count, write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (!nents) @@ -1894,6 +1931,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); disk->major = nvme_major; disk->first_minor = 0; @@ -2062,8 +2101,13 @@ static int set_queue_count(struct nvme_dev *dev, int count) status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); - if (status) - return status < 0 ? -EIO : -EBUSY; + if (status < 0) + return status; + if (status > 0) { + dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", + status); + return -EBUSY; + } return min(result & 0xffff, result >> 16) + 1; } @@ -2072,14 +2116,25 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } +static void nvme_cpu_workfn(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); + if (dev->initialized) + nvme_assign_io_queues(dev); +} + static int nvme_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - struct nvme_dev *dev = container_of(self, struct nvme_dev, nb); + struct nvme_dev *dev; + switch (action) { case CPU_ONLINE: case CPU_DEAD: - nvme_assign_io_queues(dev); + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) + schedule_work(&dev->cpu_work); + spin_unlock(&dev_list_lock); break; } return NOTIFY_OK; @@ -2148,11 +2203,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_free_queues(dev, nr_io_queues + 1); nvme_assign_io_queues(dev); - dev->nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&dev->nb); - if (result) - goto free_queues; - return 0; free_queues: @@ -2184,6 +2234,7 @@ static int nvme_dev_add(struct nvme_dev *dev) res = nvme_identify(dev, 0, 1, dma_addr); if (res) { + dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); res = -EIO; goto out; } @@ -2192,6 +2243,7 @@ static int nvme_dev_add(struct nvme_dev *dev) nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; + dev->vwc = ctrl->vwc; memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); @@ -2450,8 +2502,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; dev->initialized = 0; - unregister_hotcpu_notifier(&dev->nb); - nvme_dev_list_remove(dev); if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { @@ -2722,6 +2772,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); + INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); @@ -2801,6 +2852,7 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); + flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); nvme_dev_remove(dev); nvme_dev_shutdown(dev); @@ -2889,11 +2941,18 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; - result = pci_register_driver(&nvme_driver); + nvme_nb.notifier_call = &nvme_cpu_notify; + result = register_hotcpu_notifier(&nvme_nb); if (result) goto unregister_blkdev; + + result = pci_register_driver(&nvme_driver); + if (result) + goto unregister_hotcpu; return 0; + unregister_hotcpu: + unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -2904,9 +2963,11 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); + unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); + _nvme_check_size(); } MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 2c3f5be06da1..a4cd6d691c63 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1,6 +1,6 @@ /* * NVM Express device driver - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2014, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ /* @@ -243,8 +239,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define READ_CAP_16_RESP_SIZE 32 /* NVMe Namespace and Command Defines */ -#define NVME_GET_SMART_LOG_PAGE 0x02 -#define NVME_GET_FEAT_TEMP_THRESH 0x04 #define BYTES_TO_DWORDS 4 #define NVME_MAX_FIRMWARE_SLOT 7 @@ -686,6 +680,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, u8 resp_data_format = 0x02; u8 protect; u8 cmdque = 0x01 << 1; + u8 fw_offset = sizeof(dev->firmware_rev); mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); @@ -721,7 +716,11 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ strncpy(&inq_response[8], "NVMe ", 8); strncpy(&inq_response[16], dev->model, 16); - strncpy(&inq_response[32], dev->firmware_rev, 4); + + while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + fw_offset--; + fw_offset -= 4; + strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); @@ -1018,8 +1017,8 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, c.common.opcode = nvme_admin_get_log_page; c.common.nsid = cpu_to_le32(0xFFFFFFFF); c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c = LOG_TEMP_UNKNOWN; @@ -1086,8 +1085,8 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_admin_get_log_page; c.common.nsid = cpu_to_le32(0xFFFFFFFF); c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); res = nvme_submit_admin_cmd(dev, &c, NULL); if (res != NVME_SC_SUCCESS) { temp_c_cur = LOG_TEMP_UNKNOWN; @@ -1477,7 +1476,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } id_ctrl = mem; - lowest_pow_st = id_ctrl->npss - 1; + lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); switch (pc) { case NVME_POWER_STATE_START_VALID: @@ -1494,20 +1493,19 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, break; case NVME_POWER_STATE_IDLE: /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ - /* min of desired state and (lps-1) because lps is STOP */ if (pcmod == 0x0) - ps_desired = min(POWER_STATE_1, (lowest_pow_st - 1)); + ps_desired = POWER_STATE_1; else if (pcmod == 0x1) - ps_desired = min(POWER_STATE_2, (lowest_pow_st - 1)); + ps_desired = POWER_STATE_2; else if (pcmod == 0x2) - ps_desired = min(POWER_STATE_3, (lowest_pow_st - 1)); + ps_desired = POWER_STATE_3; break; case NVME_POWER_STATE_STANDBY: /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ if (pcmod == 0x0) - ps_desired = max(0, (lowest_pow_st - 2)); + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); else if (pcmod == 0x1) - ps_desired = max(0, (lowest_pow_st - 1)); + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); break; case NVME_POWER_STATE_LU_CONTROL: default: diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4c95b503b09e..bbeb404b3a07 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -541,7 +541,6 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) return -ENOENT; (void) get_device(&rbd_dev->dev); - set_device_ro(bdev, rbd_dev->mapping.read_only); return 0; } @@ -559,10 +558,76 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) put_device(&rbd_dev->dev); } +static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) +{ + int ret = 0; + int val; + bool ro; + bool ro_changed = false; + + /* get_user() may sleep, so call it before taking rbd_dev->lock */ + if (get_user(val, (int __user *)(arg))) + return -EFAULT; + + ro = val ? true : false; + /* Snapshot doesn't allow to write*/ + if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) + return -EROFS; + + spin_lock_irq(&rbd_dev->lock); + /* prevent others open this device */ + if (rbd_dev->open_count > 1) { + ret = -EBUSY; + goto out; + } + + if (rbd_dev->mapping.read_only != ro) { + rbd_dev->mapping.read_only = ro; + ro_changed = true; + } + +out: + spin_unlock_irq(&rbd_dev->lock); + /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ + if (ret == 0 && ro_changed) + set_disk_ro(rbd_dev->disk, ro ? 1 : 0); + + return ret; +} + +static int rbd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + int ret = 0; + + switch (cmd) { + case BLKROSET: + ret = rbd_ioctl_set_ro(rbd_dev, arg); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return rbd_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + static const struct block_device_operations rbd_bd_ops = { .owner = THIS_MODULE, .open = rbd_open, .release = rbd_release, + .ioctl = rbd_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = rbd_compat_ioctl, +#endif }; /* @@ -1382,6 +1447,13 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } +static void rbd_img_request_get(struct rbd_img_request *img_request) +{ + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_get(&img_request->kref); +} + static bool img_request_child_test(struct rbd_img_request *img_request); static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); @@ -2142,6 +2214,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) img_request->next_completion = which; out: spin_unlock_irq(&img_request->completion_lock); + rbd_img_request_put(img_request); if (!more) rbd_img_request_complete(img_request); @@ -2242,6 +2315,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; + rbd_img_request_get(img_request); if (write_request) { osd_req_op_alloc_hint_init(osd_req, which, @@ -2872,56 +2946,55 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) } /* - * Request sync osd watch/unwatch. The value of "start" determines - * whether a watch request is being initiated or torn down. + * Initiate a watch request, synchronously. */ -static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; int ret; - rbd_assert(start ^ !!rbd_dev->watch_event); - rbd_assert(start ^ !!rbd_dev->watch_request); + rbd_assert(!rbd_dev->watch_event); + rbd_assert(!rbd_dev->watch_request); - if (start) { - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; - rbd_assert(rbd_dev->watch_event != NULL); - } + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + return ret; + + rbd_assert(rbd_dev->watch_event); - ret = -ENOMEM; obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) + OBJ_REQUEST_NODATA); + if (!obj_request) { + ret = -ENOMEM; goto out_cancel; + } obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, obj_request); - if (!obj_request->osd_req) - goto out_cancel; + if (!obj_request->osd_req) { + ret = -ENOMEM; + goto out_put; + } - if (start) - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - else - ceph_osdc_unregister_linger_request(osdc, - rbd_dev->watch_request->osd_req); + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, start ? 1 : 0); + rbd_dev->watch_event->cookie, 0, 1); rbd_osd_req_format_write(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); if (ret) - goto out_cancel; + goto out_linger; + ret = rbd_obj_request_wait(obj_request); if (ret) - goto out_cancel; + goto out_linger; + ret = obj_request->result; if (ret) - goto out_cancel; + goto out_linger; /* * A watch request is set to linger, so the underlying osd @@ -2931,36 +3004,84 @@ static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) * it. We'll drop that reference (below) after we've * unregistered it. */ - if (start) { - rbd_dev->watch_request = obj_request; + rbd_dev->watch_request = obj_request; - return 0; + return 0; + +out_linger: + ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req); +out_put: + rbd_obj_request_put(obj_request); +out_cancel: + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; + + return ret; +} + +/* + * Tear down a watch request, synchronously. + */ +static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + int ret; + + rbd_assert(rbd_dev->watch_event); + rbd_assert(rbd_dev->watch_request); + + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) { + ret = -ENOMEM; + goto out_cancel; + } + + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request); + if (!obj_request->osd_req) { + ret = -ENOMEM; + goto out_put; } + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, 0, 0); + rbd_osd_req_format_write(obj_request); + + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out_put; + + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out_put; + + ret = obj_request->result; + if (ret) + goto out_put; + /* We have successfully torn down the watch request */ + ceph_osdc_unregister_linger_request(osdc, + rbd_dev->watch_request->osd_req); rbd_obj_request_put(rbd_dev->watch_request); rbd_dev->watch_request = NULL; + +out_put: + rbd_obj_request_put(obj_request); out_cancel: - /* Cancel the event if we're tearing down, or on error */ ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; - if (obj_request) - rbd_obj_request_put(obj_request); return ret; } -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) -{ - return __rbd_dev_header_watch_sync(rbd_dev, true); -} - static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) { int ret; - ret = __rbd_dev_header_watch_sync(rbd_dev, false); + ret = __rbd_dev_header_unwatch_sync(rbd_dev); if (ret) { rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", ret); @@ -3058,7 +3179,6 @@ static void rbd_request_fn(struct request_queue *q) __releases(q->queue_lock) __acquires(q->queue_lock) { struct rbd_device *rbd_dev = q->queuedata; - bool read_only = rbd_dev->mapping.read_only; struct request *rq; int result; @@ -3094,7 +3214,7 @@ static void rbd_request_fn(struct request_queue *q) if (write_request) { result = -EROFS; - if (read_only) + if (rbd_dev->mapping.read_only) goto end_request; rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); } @@ -4683,6 +4803,38 @@ out_err: } /* + * Return pool id (>= 0) or a negative error code. + */ +static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) +{ + u64 newest_epoch; + unsigned long timeout = rbdc->client->options->mount_timeout * HZ; + int tries = 0; + int ret; + +again: + ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); + if (ret == -ENOENT && tries++ < 1) { + ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); + if (ret < 0) + return ret; + + if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { + ceph_monc_request_next_osdmap(&rbdc->client->monc); + (void) ceph_monc_wait_osdmap(&rbdc->client->monc, + newest_epoch, timeout); + goto again; + } else { + /* the osdmap we have is new enough */ + return -ENOENT; + } + } + + return ret; +} + +/* * An rbd format 2 image has a unique identifier, distinct from the * name given to it by the user. Internally, that identifier is * what's used to specify the names of objects related to the image. @@ -4752,7 +4904,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) image_id = ceph_extract_encoded_string(&p, p + ret, NULL, GFP_NOIO); - ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0; + ret = PTR_ERR_OR_ZERO(image_id); if (!ret) rbd_dev->image_format = 2; } else { @@ -4907,6 +5059,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) if (ret) goto err_out_disk; set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); ret = rbd_bus_add_dev(rbd_dev); if (ret) @@ -5053,7 +5206,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, struct rbd_options *rbd_opts = NULL; struct rbd_spec *spec = NULL; struct rbd_client *rbdc; - struct ceph_osd_client *osdc; bool read_only; int rc = -ENOMEM; @@ -5075,8 +5227,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, } /* pick the pool */ - osdc = &rbdc->client->osdc; - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); + rc = rbd_add_get_pool_id(rbdc, spec->pool_name); if (rc < 0) goto err_out_client; spec->pool_id = (u64)rc; @@ -5387,6 +5538,7 @@ err_out_slab: static void __exit rbd_exit(void) { + ida_destroy(&rbd_dev_id_ida); rbd_sysfs_cleanup(); if (single_major) unregister_blkdev(rbd_major, RBD_DRV_NAME); |