diff options
Diffstat (limited to 'drivers')
51 files changed, 1205 insertions, 804 deletions
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 0469aceaa230..485865fd0412 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3780,7 +3780,7 @@ static int compat_getdrvprm(int drive, v.native_format = UDP->native_format; mutex_unlock(&floppy_mutex); - if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_params))) + if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_params))) return -EFAULT; return 0; } @@ -3816,7 +3816,7 @@ static int compat_getdrvstat(int drive, bool poll, v.bufblocks = UDRS->bufblocks; mutex_unlock(&floppy_mutex); - if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_struct))) + if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_struct))) return -EFAULT; return 0; Eintr: diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ab7ca5989097..1410fa893653 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1755,6 +1755,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_SET_FD: case LOOP_CHANGE_FD: case LOOP_SET_BLOCK_SIZE: + case LOOP_SET_DIRECT_IO: err = lo_ioctl(bdev, mode, cmd, arg); break; default: diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index e21d2ded732b..a8e3815295fe 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -108,6 +108,7 @@ struct nbd_device { struct nbd_config *config; struct mutex config_lock; struct gendisk *disk; + struct workqueue_struct *recv_workq; struct list_head list; struct task_struct *task_recv; @@ -121,6 +122,7 @@ struct nbd_cmd { struct mutex lock; int index; int cookie; + int retries; blk_status_t status; unsigned long flags; u32 cmd_cookie; @@ -138,7 +140,6 @@ static struct dentry *nbd_dbg_dir; static unsigned int nbds_max = 16; static int max_part = 16; -static struct workqueue_struct *recv_workqueue; static int part_shift; static int nbd_dev_dbg_init(struct nbd_device *nbd); @@ -344,6 +345,22 @@ static void sock_shutdown(struct nbd_device *nbd) dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n"); } +static u32 req_to_nbd_cmd_type(struct request *req) +{ + switch (req_op(req)) { + case REQ_OP_DISCARD: + return NBD_CMD_TRIM; + case REQ_OP_FLUSH: + return NBD_CMD_FLUSH; + case REQ_OP_WRITE: + return NBD_CMD_WRITE; + case REQ_OP_READ: + return NBD_CMD_READ; + default: + return U32_MAX; + } +} + static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, bool reserved) { @@ -357,8 +374,10 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } config = nbd->config; - if (!mutex_trylock(&cmd->lock)) + if (!mutex_trylock(&cmd->lock)) { + nbd_config_put(nbd); return BLK_EH_RESET_TIMER; + } if (config->num_connections > 1) { dev_err_ratelimited(nbd_to_dev(nbd), @@ -389,10 +408,25 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, nbd_config_put(nbd); return BLK_EH_DONE; } - } else { - dev_err_ratelimited(nbd_to_dev(nbd), - "Connection timed out\n"); } + + if (!nbd->tag_set.timeout) { + /* + * Userspace sets timeout=0 to disable socket disconnection, + * so just warn and reset the timer. + */ + cmd->retries++; + dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n", + req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)), + (unsigned long long)blk_rq_pos(req) << 9, + blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries); + + mutex_unlock(&cmd->lock); + nbd_config_put(nbd); + return BLK_EH_RESET_TIMER; + } + + dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); set_bit(NBD_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); @@ -480,22 +514,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); - switch (req_op(req)) { - case REQ_OP_DISCARD: - type = NBD_CMD_TRIM; - break; - case REQ_OP_FLUSH: - type = NBD_CMD_FLUSH; - break; - case REQ_OP_WRITE: - type = NBD_CMD_WRITE; - break; - case REQ_OP_READ: - type = NBD_CMD_READ; - break; - default: + type = req_to_nbd_cmd_type(req); + if (type == U32_MAX) return -EIO; - } if (rq_data_dir(req) == WRITE && (config->flags & NBD_FLAG_READ_ONLY)) { @@ -526,6 +547,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) } cmd->index = index; cmd->cookie = nsock->cookie; + cmd->retries = 0; request.type = htonl(type | nbd_cmd_flags); if (type != NBD_CMD_FLUSH) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); @@ -1036,7 +1058,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) /* We take the tx_mutex in an error path in the recv_work, so we * need to queue_work outside of the tx_mutex. */ - queue_work(recv_workqueue, &args->work); + queue_work(nbd->recv_workq, &args->work); atomic_inc(&config->live_connections); wake_up(&config->conn_wait); @@ -1137,6 +1159,10 @@ static void nbd_config_put(struct nbd_device *nbd) kfree(nbd->config); nbd->config = NULL; + if (nbd->recv_workq) + destroy_workqueue(nbd->recv_workq); + nbd->recv_workq = NULL; + nbd->tag_set.timeout = 0; nbd->disk->queue->limits.discard_granularity = 0; nbd->disk->queue->limits.discard_alignment = 0; @@ -1165,6 +1191,14 @@ static int nbd_start_device(struct nbd_device *nbd) return -EINVAL; } + nbd->recv_workq = alloc_workqueue("knbd%d-recv", + WQ_MEM_RECLAIM | WQ_HIGHPRI | + WQ_UNBOUND, 0, nbd->index); + if (!nbd->recv_workq) { + dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n"); + return -ENOMEM; + } + blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); nbd->task_recv = current; @@ -1195,7 +1229,7 @@ static int nbd_start_device(struct nbd_device *nbd) INIT_WORK(&args->work, recv_work); args->nbd = nbd; args->index = i; - queue_work(recv_workqueue, &args->work); + queue_work(nbd->recv_workq, &args->work); } nbd_size_update(nbd); return error; @@ -1215,8 +1249,10 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b mutex_unlock(&nbd->config_lock); ret = wait_event_interruptible(config->recv_wq, atomic_read(&config->recv_threads) == 0); - if (ret) + if (ret) { sock_shutdown(nbd); + flush_workqueue(nbd->recv_workq); + } mutex_lock(&nbd->config_lock); nbd_bdev_reset(bdev); /* user requested, ignore socket errors */ @@ -1246,6 +1282,13 @@ static bool nbd_is_valid_blksize(unsigned long blksize) return true; } +static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout) +{ + nbd->tag_set.timeout = timeout * HZ; + if (timeout) + blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); +} + /* Must be called with config_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) @@ -1276,10 +1319,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_size_set(nbd, config->blksize, arg); return 0; case NBD_SET_TIMEOUT: - if (arg) { - nbd->tag_set.timeout = arg * HZ; - blk_queue_rq_timeout(nbd->disk->queue, arg * HZ); - } + nbd_set_cmd_timeout(nbd, arg); return 0; case NBD_SET_FLAGS: @@ -1799,11 +1839,9 @@ again: if (ret) goto out; - if (info->attrs[NBD_ATTR_TIMEOUT]) { - u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); - nbd->tag_set.timeout = timeout * HZ; - blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); - } + if (info->attrs[NBD_ATTR_TIMEOUT]) + nbd_set_cmd_timeout(nbd, + nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { config->dead_conn_timeout = nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); @@ -1875,6 +1913,12 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) nbd_disconnect(nbd); nbd_clear_sock(nbd); mutex_unlock(&nbd->config_lock); + /* + * Make sure recv thread has finished, so it does not drop the last + * config ref and try to destroy the workqueue from inside the work + * queue. + */ + flush_workqueue(nbd->recv_workq); if (test_and_clear_bit(NBD_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); @@ -1971,11 +2015,9 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (ret) goto out; - if (info->attrs[NBD_ATTR_TIMEOUT]) { - u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); - nbd->tag_set.timeout = timeout * HZ; - blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); - } + if (info->attrs[NBD_ATTR_TIMEOUT]) + nbd_set_cmd_timeout(nbd, + nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { config->dead_conn_timeout = nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); @@ -2261,20 +2303,12 @@ static int __init nbd_init(void) if (nbds_max > 1UL << (MINORBITS - part_shift)) return -EINVAL; - recv_workqueue = alloc_workqueue("knbd-recv", - WQ_MEM_RECLAIM | WQ_HIGHPRI | - WQ_UNBOUND, 0); - if (!recv_workqueue) - return -ENOMEM; - if (register_blkdev(NBD_MAJOR, "nbd")) { - destroy_workqueue(recv_workqueue); + if (register_blkdev(NBD_MAJOR, "nbd")) return -EIO; - } if (genl_register_family(&nbd_genl_family)) { unregister_blkdev(NBD_MAJOR, "nbd"); - destroy_workqueue(recv_workqueue); return -EINVAL; } nbd_dbg_init(); @@ -2316,7 +2350,6 @@ static void __exit nbd_cleanup(void) idr_destroy(&nbd_index_idr); genl_unregister_family(&nbd_genl_family); - destroy_workqueue(recv_workqueue); unregister_blkdev(NBD_MAJOR, "nbd"); } diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index a1b9929bd911..a235c45e22a7 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -2,6 +2,9 @@ #ifndef __BLK_NULL_BLK_H #define __BLK_NULL_BLK_H +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/blk-mq.h> @@ -90,13 +93,13 @@ int null_zone_init(struct nullb_device *dev); void null_zone_exit(struct nullb_device *dev); int null_zone_report(struct gendisk *disk, sector_t sector, struct blk_zone *zones, unsigned int *nr_zones); -void null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors); -void null_zone_reset(struct nullb_cmd *cmd, sector_t sector); +blk_status_t null_handle_zoned(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors); #else static inline int null_zone_init(struct nullb_device *dev) { - pr_err("null_blk: CONFIG_BLK_DEV_ZONED not enabled\n"); + pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); return -EINVAL; } static inline void null_zone_exit(struct nullb_device *dev) {} @@ -106,10 +109,11 @@ static inline int null_zone_report(struct gendisk *disk, sector_t sector, { return -EOPNOTSUPP; } -static inline void null_zone_write(struct nullb_cmd *cmd, sector_t sector, - unsigned int nr_sectors) +static inline blk_status_t null_handle_zoned(struct nullb_cmd *cmd, + enum req_opf op, sector_t sector, + sector_t nr_sectors) { + return BLK_STS_NOTSUPP; } -static inline void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) {} #endif /* CONFIG_BLK_DEV_ZONED */ #endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 99328ded60d1..0e7da5015ccd 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -141,8 +141,8 @@ static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); -static int nr_devices = 1; -module_param(nr_devices, int, 0444); +static unsigned int nr_devices = 1; +module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static bool g_blocking; @@ -1133,93 +1133,61 @@ static void null_restart_queue_async(struct nullb *nullb) blk_mq_start_stopped_hw_queues(q, true); } -static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) +static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; - int err = 0; + blk_status_t sts = BLK_STS_OK; + struct request *rq = cmd->rq; - if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { - struct request *rq = cmd->rq; - - if (!hrtimer_active(&nullb->bw_timer)) - hrtimer_restart(&nullb->bw_timer); - - if (atomic_long_sub_return(blk_rq_bytes(rq), - &nullb->cur_bytes) < 0) { - null_stop_queue(nullb); - /* race with timer */ - if (atomic_long_read(&nullb->cur_bytes) > 0) - null_restart_queue_async(nullb); - /* requeue request */ - return BLK_STS_DEV_RESOURCE; - } - } + if (!hrtimer_active(&nullb->bw_timer)) + hrtimer_restart(&nullb->bw_timer); - if (nullb->dev->badblocks.shift != -1) { - int bad_sectors; - sector_t sector, size, first_bad; - bool is_flush = true; - - if (dev->queue_mode == NULL_Q_BIO && - bio_op(cmd->bio) != REQ_OP_FLUSH) { - is_flush = false; - sector = cmd->bio->bi_iter.bi_sector; - size = bio_sectors(cmd->bio); - } - if (dev->queue_mode != NULL_Q_BIO && - req_op(cmd->rq) != REQ_OP_FLUSH) { - is_flush = false; - sector = blk_rq_pos(cmd->rq); - size = blk_rq_sectors(cmd->rq); - } - if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector, - size, &first_bad, &bad_sectors)) { - cmd->error = BLK_STS_IOERR; - goto out; - } + if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { + null_stop_queue(nullb); + /* race with timer */ + if (atomic_long_read(&nullb->cur_bytes) > 0) + null_restart_queue_async(nullb); + /* requeue request */ + sts = BLK_STS_DEV_RESOURCE; } + return sts; +} - if (dev->memory_backed) { - if (dev->queue_mode == NULL_Q_BIO) { - if (bio_op(cmd->bio) == REQ_OP_FLUSH) - err = null_handle_flush(nullb); - else - err = null_handle_bio(cmd); - } else { - if (req_op(cmd->rq) == REQ_OP_FLUSH) - err = null_handle_flush(nullb); - else - err = null_handle_rq(cmd); - } - } - cmd->error = errno_to_blk_status(err); +static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, + sector_t sector, + sector_t nr_sectors) +{ + struct badblocks *bb = &cmd->nq->dev->badblocks; + sector_t first_bad; + int bad_sectors; - if (!cmd->error && dev->zoned) { - sector_t sector; - unsigned int nr_sectors; - enum req_opf op; + if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_IOERR; - if (dev->queue_mode == NULL_Q_BIO) { - op = bio_op(cmd->bio); - sector = cmd->bio->bi_iter.bi_sector; - nr_sectors = cmd->bio->bi_iter.bi_size >> 9; - } else { - op = req_op(cmd->rq); - sector = blk_rq_pos(cmd->rq); - nr_sectors = blk_rq_sectors(cmd->rq); - } + return BLK_STS_OK; +} - if (op == REQ_OP_WRITE) - null_zone_write(cmd, sector, nr_sectors); - else if (op == REQ_OP_ZONE_RESET) - null_zone_reset(cmd, sector); - } -out: +static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, + enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + int err; + + if (dev->queue_mode == NULL_Q_BIO) + err = null_handle_bio(cmd); + else + err = null_handle_rq(cmd); + + return errno_to_blk_status(err); +} + +static inline void nullb_complete_cmd(struct nullb_cmd *cmd) +{ /* Complete IO by inline, softirq or timer */ - switch (dev->irqmode) { + switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: - switch (dev->queue_mode) { + switch (cmd->nq->dev->queue_mode) { case NULL_Q_MQ: blk_mq_complete_request(cmd->rq); break; @@ -1238,6 +1206,40 @@ out: null_cmd_end_timer(cmd); break; } +} + +static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors, enum req_opf op) +{ + struct nullb_device *dev = cmd->nq->dev; + struct nullb *nullb = dev->nullb; + blk_status_t sts; + + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { + sts = null_handle_throttled(cmd); + if (sts != BLK_STS_OK) + return sts; + } + + if (op == REQ_OP_FLUSH) { + cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + goto out; + } + + if (nullb->dev->badblocks.shift != -1) { + cmd->error = null_handle_badblocks(cmd, sector, nr_sectors); + if (cmd->error != BLK_STS_OK) + goto out; + } + + if (dev->memory_backed) + cmd->error = null_handle_memory_backed(cmd, op); + + if (!cmd->error && dev->zoned) + cmd->error = null_handle_zoned(cmd, op, sector, nr_sectors); + +out: + nullb_complete_cmd(cmd); return BLK_STS_OK; } @@ -1280,6 +1282,8 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb) static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) { + sector_t sector = bio->bi_iter.bi_sector; + sector_t nr_sectors = bio_sectors(bio); struct nullb *nullb = q->queuedata; struct nullb_queue *nq = nullb_to_queue(nullb); struct nullb_cmd *cmd; @@ -1287,7 +1291,7 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) cmd = alloc_cmd(nq, 1); cmd->bio = bio; - null_handle_cmd(cmd); + null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); return BLK_QC_T_NONE; } @@ -1311,7 +1315,7 @@ static bool should_requeue_request(struct request *rq) static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) { - pr_info("null: rq %p timed out\n", rq); + pr_info("rq %p timed out\n", rq); blk_mq_complete_request(rq); return BLK_EH_DONE; } @@ -1321,6 +1325,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); struct nullb_queue *nq = hctx->driver_data; + sector_t nr_sectors = blk_rq_sectors(bd->rq); + sector_t sector = blk_rq_pos(bd->rq); might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); @@ -1349,7 +1355,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, if (should_timeout_request(bd->rq)) return BLK_STS_OK; - return null_handle_cmd(cmd); + return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); } static const struct blk_mq_ops null_mq_ops = { @@ -1688,6 +1694,9 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); nullb->q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q); + blk_queue_required_elevator_features(nullb->q, + ELEVATOR_F_ZBD_SEQ_WRITE); } nullb->q->queuedata = nullb; @@ -1739,28 +1748,28 @@ static int __init null_init(void) struct nullb_device *dev; if (g_bs > PAGE_SIZE) { - pr_warn("null_blk: invalid block size\n"); - pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); + pr_warn("invalid block size\n"); + pr_warn("defaults block size to %lu\n", PAGE_SIZE); g_bs = PAGE_SIZE; } if (!is_power_of_2(g_zone_size)) { - pr_err("null_blk: zone_size must be power-of-two\n"); + pr_err("zone_size must be power-of-two\n"); return -EINVAL; } if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { - pr_err("null_blk: invalid home_node value\n"); + pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; } if (g_queue_mode == NULL_Q_RQ) { - pr_err("null_blk: legacy IO path no longer available\n"); + pr_err("legacy IO path no longer available\n"); return -EINVAL; } if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { - pr_warn("null_blk: submit_queues param is set to %u.\n", + pr_warn("submit_queues param is set to %u.\n", nr_online_nodes); g_submit_queues = nr_online_nodes; } @@ -1803,7 +1812,7 @@ static int __init null_init(void) } } - pr_info("null: module loaded\n"); + pr_info("module loaded\n"); return 0; err_dev: diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index cb28d93f2bd1..eabc116832a7 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -17,7 +17,7 @@ int null_zone_init(struct nullb_device *dev) unsigned int i; if (!is_power_of_2(dev->zone_size)) { - pr_err("null_blk: zone_size must be power-of-two\n"); + pr_err("zone_size must be power-of-two\n"); return -EINVAL; } @@ -31,7 +31,7 @@ int null_zone_init(struct nullb_device *dev) if (dev->zone_nr_conv >= dev->nr_zones) { dev->zone_nr_conv = dev->nr_zones - 1; - pr_info("null_blk: changed the number of conventional zones to %u", + pr_info("changed the number of conventional zones to %u", dev->zone_nr_conv); } @@ -84,7 +84,7 @@ int null_zone_report(struct gendisk *disk, sector_t sector, return 0; } -void null_zone_write(struct nullb_cmd *cmd, sector_t sector, +static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; @@ -95,14 +95,12 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector, case BLK_ZONE_COND_FULL: /* Cannot write to a full zone */ cmd->error = BLK_STS_IOERR; - break; + return BLK_STS_IOERR; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_IMP_OPEN: /* Writes must be at the write pointer position */ - if (sector != zone->wp) { - cmd->error = BLK_STS_IOERR; - break; - } + if (sector != zone->wp) + return BLK_STS_IOERR; if (zone->cond == BLK_ZONE_COND_EMPTY) zone->cond = BLK_ZONE_COND_IMP_OPEN; @@ -115,22 +113,51 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector, break; default: /* Invalid zone condition */ - cmd->error = BLK_STS_IOERR; - break; + return BLK_STS_IOERR; } + return BLK_STS_OK; } -void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) +static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector) { struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); struct blk_zone *zone = &dev->zones[zno]; + size_t i; + + switch (req_op(cmd->rq)) { + case REQ_OP_ZONE_RESET_ALL: + for (i = 0; i < dev->nr_zones; i++) { + if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL) + continue; + zone[i].cond = BLK_ZONE_COND_EMPTY; + zone[i].wp = zone[i].start; + } + break; + case REQ_OP_ZONE_RESET: + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { - cmd->error = BLK_STS_IOERR; - return; + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; + break; + default: + cmd->error = BLK_STS_NOTSUPP; + break; } + return BLK_STS_OK; +} - zone->cond = BLK_ZONE_COND_EMPTY; - zone->wp = zone->start; +blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op, + sector_t sector, sector_t nr_sectors) +{ + switch (op) { + case REQ_OP_WRITE: + return null_zone_write(cmd, sector, nr_sectors); + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + return null_zone_reset(cmd, sector); + default: + return BLK_STS_OK; + } } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 001dbdcbf355..636bfea2de6f 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -314,8 +314,8 @@ static void pcd_init_units(void) disk->queue = blk_mq_init_sq_queue(&cd->tag_set, &pcd_mq_ops, 1, BLK_MQ_F_SHOULD_MERGE); if (IS_ERR(disk->queue)) { - put_disk(disk); disk->queue = NULL; + put_disk(disk); continue; } @@ -723,9 +723,9 @@ static int pcd_detect(void) k = 0; if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ cd = pcd; - if (pi_init(cd->pi, 1, -1, -1, -1, -1, -1, pcd_buffer, - PI_PCD, verbose, cd->name)) { - if (!pcd_probe(cd, -1, id) && cd->disk) { + if (cd->disk && pi_init(cd->pi, 1, -1, -1, -1, -1, -1, + pcd_buffer, PI_PCD, verbose, cd->name)) { + if (!pcd_probe(cd, -1, id)) { cd->present = 1; k++; } else @@ -736,11 +736,13 @@ static int pcd_detect(void) int *conf = *drives[unit]; if (!conf[D_PRT]) continue; + if (!cd->disk) + continue; if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD], conf[D_UNI], conf[D_PRO], conf[D_DLY], pcd_buffer, PI_PCD, verbose, cd->name)) continue; - if (!pcd_probe(cd, conf[D_SLV], id) && cd->disk) { + if (!pcd_probe(cd, conf[D_SLV], id)) { cd->present = 1; k++; } else diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 1e9c50a7256c..6b7d4cab3687 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -300,8 +300,8 @@ static void __init pf_init_units(void) disk->queue = blk_mq_init_sq_queue(&pf->tag_set, &pf_mq_ops, 1, BLK_MQ_F_SHOULD_MERGE); if (IS_ERR(disk->queue)) { - put_disk(disk); disk->queue = NULL; + put_disk(disk); continue; } diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index a600934fdd9c..7543e395a2c6 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -4,6 +4,8 @@ * Initial release: Matias Bjorling <m@bjorling.me> */ +#define pr_fmt(fmt) "nvm: " fmt + #include <linux/list.h> #include <linux/types.h> #include <linux/sem.h> @@ -74,7 +76,7 @@ static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) for (i = lun_begin; i <= lun_end; i++) { if (test_and_set_bit(i, dev->lun_map)) { - pr_err("nvm: lun %d already allocated\n", i); + pr_err("lun %d already allocated\n", i); goto err; } } @@ -264,7 +266,7 @@ static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, int lun_end) { if (lun_begin > lun_end || lun_end >= geo->all_luns) { - pr_err("nvm: lun out of bound (%u:%u > %u)\n", + pr_err("lun out of bound (%u:%u > %u)\n", lun_begin, lun_end, geo->all_luns - 1); return -EINVAL; } @@ -297,7 +299,7 @@ static int __nvm_config_extended(struct nvm_dev *dev, if (e->op == 0xFFFF) { e->op = NVM_TARGET_DEFAULT_OP; } else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) { - pr_err("nvm: invalid over provisioning value\n"); + pr_err("invalid over provisioning value\n"); return -EINVAL; } @@ -334,23 +336,23 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) e = create->conf.e; break; default: - pr_err("nvm: config type not valid\n"); + pr_err("config type not valid\n"); return -EINVAL; } tt = nvm_find_target_type(create->tgttype); if (!tt) { - pr_err("nvm: target type %s not found\n", create->tgttype); + pr_err("target type %s not found\n", create->tgttype); return -EINVAL; } if ((tt->flags & NVM_TGT_F_HOST_L2P) != (dev->geo.dom & NVM_RSP_L2P)) { - pr_err("nvm: device is incompatible with target L2P type.\n"); + pr_err("device is incompatible with target L2P type.\n"); return -EINVAL; } if (nvm_target_exists(create->tgtname)) { - pr_err("nvm: target name already exists (%s)\n", + pr_err("target name already exists (%s)\n", create->tgtname); return -EINVAL; } @@ -367,7 +369,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); if (!tgt_dev) { - pr_err("nvm: could not create target device\n"); + pr_err("could not create target device\n"); ret = -ENOMEM; goto err_t; } @@ -493,8 +495,11 @@ static int nvm_remove_tgt(struct nvm_ioctl_remove *remove) } up_read(&nvm_lock); - if (!t) + if (!t) { + pr_err("failed to remove target %s\n", + remove->tgtname); return 1; + } __nvm_remove_target(t, true); kref_put(&dev->ref, nvm_free); @@ -686,7 +691,7 @@ static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, rqd->nr_ppas = nr_ppas; rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); if (!rqd->ppa_list) { - pr_err("nvm: failed to allocate dma memory\n"); + pr_err("failed to allocate dma memory\n"); return -ENOMEM; } @@ -731,7 +736,7 @@ static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd) return flags; } -int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf) { struct nvm_dev *dev = tgt_dev->parent; int ret; @@ -745,19 +750,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); /* In case of error, fail with right address format */ - ret = dev->ops->submit_io(dev, rqd); + ret = dev->ops->submit_io(dev, rqd, buf); if (ret) nvm_rq_dev_to_tgt(tgt_dev, rqd); return ret; } EXPORT_SYMBOL(nvm_submit_io); -int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) +static void nvm_sync_end_io(struct nvm_rq *rqd) +{ + struct completion *waiting = rqd->private; + + complete(waiting); +} + +static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd, + void *buf) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int ret = 0; + + rqd->end_io = nvm_sync_end_io; + rqd->private = &wait; + + ret = dev->ops->submit_io(dev, rqd, buf); + if (ret) + return ret; + + wait_for_completion_io(&wait); + + return 0; +} + +int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + void *buf) { struct nvm_dev *dev = tgt_dev->parent; int ret; - if (!dev->ops->submit_io_sync) + if (!dev->ops->submit_io) return -ENODEV; nvm_rq_tgt_to_dev(tgt_dev, rqd); @@ -765,9 +796,7 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) rqd->dev = tgt_dev; rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - /* In case of error, fail with right address format */ - ret = dev->ops->submit_io_sync(dev, rqd); - nvm_rq_dev_to_tgt(tgt_dev, rqd); + ret = nvm_submit_io_wait(dev, rqd, buf); return ret; } @@ -788,12 +817,13 @@ EXPORT_SYMBOL(nvm_end_io); static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd) { - if (!dev->ops->submit_io_sync) + if (!dev->ops->submit_io) return -ENODEV; + rqd->dev = NULL; rqd->flags = nvm_set_flags(&dev->geo, rqd); - return dev->ops->submit_io_sync(dev, rqd); + return nvm_submit_io_wait(dev, rqd, NULL); } static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) @@ -1048,7 +1078,7 @@ int nvm_set_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, return 0; if (nr_ppas > NVM_MAX_VLBA) { - pr_err("nvm: unable to update all blocks atomically\n"); + pr_err("unable to update all blocks atomically\n"); return -EINVAL; } @@ -1111,27 +1141,26 @@ static int nvm_init(struct nvm_dev *dev) int ret = -EINVAL; if (dev->ops->identity(dev)) { - pr_err("nvm: device could not be identified\n"); + pr_err("device could not be identified\n"); goto err; } - pr_debug("nvm: ver:%u.%u nvm_vendor:%x\n", - geo->major_ver_id, geo->minor_ver_id, - geo->vmnt); + pr_debug("ver:%u.%u nvm_vendor:%x\n", geo->major_ver_id, + geo->minor_ver_id, geo->vmnt); ret = nvm_core_init(dev); if (ret) { - pr_err("nvm: could not initialize core structures.\n"); + pr_err("could not initialize core structures.\n"); goto err; } - pr_info("nvm: registered %s [%u/%u/%u/%u/%u]\n", + pr_info("registered %s [%u/%u/%u/%u/%u]\n", dev->name, dev->geo.ws_min, dev->geo.ws_opt, dev->geo.num_chk, dev->geo.all_luns, dev->geo.num_ch); return 0; err: - pr_err("nvm: failed to initialize nvm\n"); + pr_err("failed to initialize nvm\n"); return ret; } @@ -1169,7 +1198,7 @@ int nvm_register(struct nvm_dev *dev) dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist", exp_pool_size); if (!dev->dma_pool) { - pr_err("nvm: could not create dma pool\n"); + pr_err("could not create dma pool\n"); kref_put(&dev->ref, nvm_free); return -ENOMEM; } @@ -1214,7 +1243,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) up_write(&nvm_lock); if (!dev) { - pr_err("nvm: device not found\n"); + pr_err("device not found\n"); return -EINVAL; } @@ -1288,7 +1317,7 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg) i++; if (i > 31) { - pr_err("nvm: max 31 devices can be reported.\n"); + pr_err("max 31 devices can be reported.\n"); break; } } @@ -1315,7 +1344,7 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && create.conf.e.rsv != 0) { - pr_err("nvm: reserved config field in use\n"); + pr_err("reserved config field in use\n"); return -EINVAL; } @@ -1331,7 +1360,7 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) flags &= ~NVM_TARGET_FACTORY; if (flags) { - pr_err("nvm: flag not supported\n"); + pr_err("flag not supported\n"); return -EINVAL; } } @@ -1349,7 +1378,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) remove.tgtname[DISK_NAME_LEN - 1] = '\0'; if (remove.flags != 0) { - pr_err("nvm: no flags supported\n"); + pr_err("no flags supported\n"); return -EINVAL; } @@ -1365,7 +1394,7 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg) return -EFAULT; if (init.flags != 0) { - pr_err("nvm: no flags supported\n"); + pr_err("no flags supported\n"); return -EINVAL; } diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index f546e6f28b8a..b413bafe93fd 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -507,7 +507,7 @@ void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) pblk->sec_per_write = sec_per_write; } -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf) { struct nvm_tgt_dev *dev = pblk->dev; @@ -518,7 +518,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_ERR; #endif - return nvm_submit_io(dev, rqd); + return nvm_submit_io(dev, rqd, buf); } void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) @@ -541,7 +541,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) } } -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf) { struct nvm_tgt_dev *dev = pblk->dev; int ret; @@ -553,7 +553,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_ERR; #endif - ret = nvm_submit_io_sync(dev, rqd); + ret = nvm_submit_io_sync(dev, rqd, buf); if (trace_pblk_chunk_state_enabled() && !ret && rqd->opcode == NVM_OP_PWRITE) @@ -562,65 +562,19 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) return ret; } -int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd) +static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd, + void *buf) { struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); int ret; pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io_sync(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd, buf); pblk_up_chunk(pblk, ppa_list[0]); return ret; } -static void pblk_bio_map_addr_endio(struct bio *bio) -{ - bio_put(bio); -} - -struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, - unsigned int nr_secs, unsigned int len, - int alloc_type, gfp_t gfp_mask) -{ - struct nvm_tgt_dev *dev = pblk->dev; - void *kaddr = data; - struct page *page; - struct bio *bio; - int i, ret; - - if (alloc_type == PBLK_KMALLOC_META) - return bio_map_kern(dev->q, kaddr, len, gfp_mask); - - bio = bio_kmalloc(gfp_mask, nr_secs); - if (!bio) - return ERR_PTR(-ENOMEM); - - for (i = 0; i < nr_secs; i++) { - page = vmalloc_to_page(kaddr); - if (!page) { - pblk_err(pblk, "could not map vmalloc bio\n"); - bio_put(bio); - bio = ERR_PTR(-ENOMEM); - goto out; - } - - ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { - pblk_err(pblk, "could not add page to bio\n"); - bio_put(bio); - bio = ERR_PTR(-ENOMEM); - goto out; - } - - kaddr += PAGE_SIZE; - } - - bio->bi_end_io = pblk_bio_map_addr_endio; -out: - return bio; -} - int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, unsigned long secs_to_flush, bool skip_meta) { @@ -722,9 +676,7 @@ u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line) int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) { - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_line_meta *lm = &pblk->lm; - struct bio *bio; struct ppa_addr *ppa_list; struct nvm_rq rqd; u64 paddr = pblk_line_smeta_start(pblk, line); @@ -736,16 +688,6 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) if (ret) return ret; - bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto clear_rqd; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - - rqd.bio = bio; rqd.opcode = NVM_OP_PREAD; rqd.nr_ppas = lm->smeta_sec; rqd.is_seq = 1; @@ -754,10 +696,9 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) for (i = 0; i < lm->smeta_sec; i++, paddr++) ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - ret = pblk_submit_io_sync(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd, line->smeta); if (ret) { pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - bio_put(bio); goto clear_rqd; } @@ -776,9 +717,7 @@ clear_rqd: static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, u64 paddr) { - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_line_meta *lm = &pblk->lm; - struct bio *bio; struct ppa_addr *ppa_list; struct nvm_rq rqd; __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); @@ -791,16 +730,6 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, if (ret) return ret; - bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto clear_rqd; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd.bio = bio; rqd.opcode = NVM_OP_PWRITE; rqd.nr_ppas = lm->smeta_sec; rqd.is_seq = 1; @@ -814,10 +743,9 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, meta->lba = lba_list[paddr] = addr_empty; } - ret = pblk_submit_io_sync_sem(pblk, &rqd); + ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta); if (ret) { pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - bio_put(bio); goto clear_rqd; } @@ -838,10 +766,8 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; void *ppa_list_buf, *meta_list; - struct bio *bio; struct ppa_addr *ppa_list; struct nvm_rq rqd; u64 paddr = line->emeta_ssec; @@ -867,17 +793,6 @@ next_rq: rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); rq_len = rq_ppas * geo->csecs; - bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, - l_mg->emeta_alloc_type, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto free_rqd_dma; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - - rqd.bio = bio; rqd.meta_list = meta_list; rqd.ppa_list = ppa_list_buf; rqd.dma_meta_list = dma_meta_list; @@ -896,7 +811,6 @@ next_rq: while (test_bit(pos, line->blk_bitmap)) { paddr += min; if (pblk_boundary_paddr_checks(pblk, paddr)) { - bio_put(bio); ret = -EINTR; goto free_rqd_dma; } @@ -906,7 +820,6 @@ next_rq: } if (pblk_boundary_paddr_checks(pblk, paddr + min)) { - bio_put(bio); ret = -EINTR; goto free_rqd_dma; } @@ -915,10 +828,9 @@ next_rq: ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id); } - ret = pblk_submit_io_sync(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf); if (ret) { pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - bio_put(bio); goto free_rqd_dma; } @@ -963,7 +875,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) /* The write thread schedules erases so that it minimizes disturbances * with writes. Thus, there is no need to take the LUN semaphore. */ - ret = pblk_submit_io_sync(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd, NULL); rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); @@ -1792,7 +1704,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) /* The write thread schedules erases so that it minimizes disturbances * with writes. Thus, there is no need to take the LUN semaphore. */ - err = pblk_submit_io(pblk, rqd); + err = pblk_submit_io(pblk, rqd, NULL); if (err) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -1923,13 +1835,11 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; unsigned int lba_list_size = lm->emeta_len[2]; struct pblk_w_err_gc *w_err_gc = line->w_err_gc; struct pblk_emeta *emeta = line->emeta; - w_err_gc->lba_list = pblk_malloc(lba_list_size, - l_mg->emeta_alloc_type, GFP_KERNEL); + w_err_gc->lba_list = kvmalloc(lba_list_size, GFP_KERNEL); memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf), lba_list_size); } diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 63ee205b41c4..2581eebcfc41 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -132,14 +132,12 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk, struct pblk_line *line) { struct line_emeta *emeta_buf; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; unsigned int lba_list_size = lm->emeta_len[2]; __le64 *lba_list; int ret; - emeta_buf = pblk_malloc(lm->emeta_len[0], - l_mg->emeta_alloc_type, GFP_KERNEL); + emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); if (!emeta_buf) return NULL; @@ -147,7 +145,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk, if (ret) { pblk_err(pblk, "line %d read emeta failed (%d)\n", line->id, ret); - pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + kvfree(emeta_buf); return NULL; } @@ -161,16 +159,16 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk, if (ret) { pblk_err(pblk, "inconsistent emeta (line %d)\n", line->id); - pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + kvfree(emeta_buf); return NULL; } - lba_list = pblk_malloc(lba_list_size, - l_mg->emeta_alloc_type, GFP_KERNEL); + lba_list = kvmalloc(lba_list_size, GFP_KERNEL); + if (lba_list) memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size); - pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); + kvfree(emeta_buf); return lba_list; } @@ -181,7 +179,6 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) ws); struct pblk *pblk = line_ws->pblk; struct pblk_line *line = line_ws->line; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line_meta *lm = &pblk->lm; struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -272,7 +269,7 @@ next_rq: goto next_rq; out: - pblk_mfree(lba_list, l_mg->emeta_alloc_type); + kvfree(lba_list); kfree(line_ws); kfree(invalid_bitmap); @@ -286,7 +283,7 @@ fail_free_gc_data: fail_free_gc_rq: kfree(gc_rq); fail_free_lba_list: - pblk_mfree(lba_list, l_mg->emeta_alloc_type); + kvfree(lba_list); fail_free_invalid_bitmap: kfree(invalid_bitmap); fail_free_ws: diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index b351c7f002de..9a967a2e83dd 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -543,7 +543,7 @@ static void pblk_line_mg_free(struct pblk *pblk) for (i = 0; i < PBLK_DATA_LINES; i++) { kfree(l_mg->sline_meta[i]); - pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); + kvfree(l_mg->eline_meta[i]->buf); kfree(l_mg->eline_meta[i]); } @@ -560,7 +560,7 @@ static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg, kfree(line->erase_bitmap); kfree(line->chks); - pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type); + kvfree(w_err_gc->lba_list); kfree(w_err_gc); } @@ -890,29 +890,14 @@ static int pblk_line_mg_init(struct pblk *pblk) if (!emeta) goto fail_free_emeta; - if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) { - l_mg->emeta_alloc_type = PBLK_VMALLOC_META; - - emeta->buf = vmalloc(lm->emeta_len[0]); - if (!emeta->buf) { - kfree(emeta); - goto fail_free_emeta; - } - - emeta->nr_entries = lm->emeta_sec[0]; - l_mg->eline_meta[i] = emeta; - } else { - l_mg->emeta_alloc_type = PBLK_KMALLOC_META; - - emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta->buf) { - kfree(emeta); - goto fail_free_emeta; - } - - emeta->nr_entries = lm->emeta_sec[0]; - l_mg->eline_meta[i] = emeta; + emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); + if (!emeta->buf) { + kfree(emeta); + goto fail_free_emeta; } + + emeta->nr_entries = lm->emeta_sec[0]; + l_mg->eline_meta[i] = emeta; } for (i = 0; i < l_mg->nr_lines; i++) @@ -926,10 +911,7 @@ static int pblk_line_mg_init(struct pblk *pblk) fail_free_emeta: while (--i >= 0) { - if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META) - vfree(l_mg->eline_meta[i]->buf); - else - kfree(l_mg->eline_meta[i]->buf); + kvfree(l_mg->eline_meta[i]->buf); kfree(l_mg->eline_meta[i]); } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d98ea392fe33..8efd14e683dc 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -342,7 +342,7 @@ split_retry: bio_put(int_bio); int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); goto split_retry; - } else if (pblk_submit_io(pblk, rqd)) { + } else if (pblk_submit_io(pblk, rqd, NULL)) { /* Submitting IO to drive failed, let's report an error */ rqd->error = -ENODEV; pblk_end_io_read(rqd); @@ -417,11 +417,7 @@ out: int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct bio *bio; struct nvm_rq rqd; - int data_len; int ret = NVM_IO_OK; memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -446,26 +442,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) if (!(gc_rq->secs_to_gc)) goto out; - data_len = (gc_rq->secs_to_gc) * geo->csecs; - bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, - PBLK_VMALLOC_META, GFP_KERNEL); - if (IS_ERR(bio)) { - pblk_err(pblk, "could not allocate GC bio (%lu)\n", - PTR_ERR(bio)); - ret = PTR_ERR(bio); - goto err_free_dma; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - rqd.opcode = NVM_OP_PREAD; rqd.nr_ppas = gc_rq->secs_to_gc; - rqd.bio = bio; - if (pblk_submit_io_sync(pblk, &rqd)) { + if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) { ret = -EIO; - goto err_free_bio; + goto err_free_dma; } pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); @@ -489,8 +471,6 @@ out: pblk_free_rqd_meta(pblk, &rqd); return ret; -err_free_bio: - bio_put(bio); err_free_dma: pblk_free_rqd_meta(pblk, &rqd); return ret; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index e6dda04de144..299ef47a17b2 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -178,12 +178,11 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, void *meta_list; struct pblk_pad_rq *pad_rq; struct nvm_rq *rqd; - struct bio *bio; struct ppa_addr *ppa_list; void *data; __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); u64 w_ptr = line->cur_sec; - int left_line_ppas, rq_ppas, rq_len; + int left_line_ppas, rq_ppas; int i, j; int ret = 0; @@ -212,28 +211,15 @@ next_pad_rq: goto fail_complete; } - rq_len = rq_ppas * geo->csecs; - - bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, - PBLK_VMALLOC_META, GFP_KERNEL); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - goto fail_complete; - } - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); ret = pblk_alloc_rqd_meta(pblk, rqd); if (ret) { pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - bio_put(bio); goto fail_complete; } - rqd->bio = bio; + rqd->bio = NULL; rqd->opcode = NVM_OP_PWRITE; rqd->is_seq = 1; rqd->nr_ppas = rq_ppas; @@ -275,13 +261,12 @@ next_pad_rq: kref_get(&pad_rq->ref); pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io(pblk, rqd, data); if (ret) { pblk_err(pblk, "I/O submission failed: %d\n", ret); pblk_up_chunk(pblk, ppa_list[0]); kref_put(&pad_rq->ref, pblk_recov_complete); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - bio_put(bio); goto fail_complete; } @@ -375,13 +360,12 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa_list; void *meta_list; struct nvm_rq *rqd; - struct bio *bio; void *data; dma_addr_t dma_ppa_list, dma_meta_list; __le64 *lba_list; u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; bool padded = false; - int rq_ppas, rq_len; + int rq_ppas; int i, j; int ret; u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; @@ -404,18 +388,9 @@ next_rq: rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); if (!rq_ppas) rq_ppas = pblk->min_write_pgs; - rq_len = rq_ppas * geo->csecs; retry_rq: - bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bio_get(bio); - - rqd->bio = bio; + rqd->bio = NULL; rqd->opcode = NVM_OP_PREAD; rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; @@ -445,10 +420,9 @@ retry_rq: addr_to_gen_ppa(pblk, paddr + j, line->id); } - ret = pblk_submit_io_sync(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd, data); if (ret) { pblk_err(pblk, "I/O submission failed: %d\n", ret); - bio_put(bio); return ret; } @@ -460,24 +434,20 @@ retry_rq: if (padded) { pblk_log_read_err(pblk, rqd); - bio_put(bio); return -EINTR; } pad_distance = pblk_pad_distance(pblk, line); ret = pblk_recov_pad_line(pblk, line, pad_distance); if (ret) { - bio_put(bio); return ret; } padded = true; - bio_put(bio); goto retry_rq; } pblk_get_packed_meta(pblk, rqd); - bio_put(bio); for (i = 0; i < rqd->nr_ppas; i++) { struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 4e63f9b5954c..b9a2aeba95ab 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -373,7 +373,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) struct pblk_emeta *emeta = meta_line->emeta; struct ppa_addr *ppa_list; struct pblk_g_ctx *m_ctx; - struct bio *bio; struct nvm_rq *rqd; void *data; u64 paddr; @@ -391,20 +390,9 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) rq_len = rq_ppas * geo->csecs; data = ((void *)emeta->buf) + emeta->mem; - bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, - l_mg->emeta_alloc_type, GFP_KERNEL); - if (IS_ERR(bio)) { - pblk_err(pblk, "failed to map emeta io"); - ret = PTR_ERR(bio); - goto fail_free_rqd; - } - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd->bio = bio; - ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); if (ret) - goto fail_free_bio; + goto fail_free_rqd; ppa_list = nvm_rq_to_ppa_list(rqd); for (i = 0; i < rqd->nr_ppas; ) { @@ -423,7 +411,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io(pblk, rqd, data); if (ret) { pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); goto fail_rollback; @@ -437,8 +425,6 @@ fail_rollback: pblk_dealloc_page(pblk, meta_line, rq_ppas); list_add(&meta_line->list, &meta_line->list); spin_unlock(&l_mg->close_lock); -fail_free_bio: - bio_put(bio); fail_free_rqd: pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); return ret; @@ -523,7 +509,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) meta_line = pblk_should_submit_meta_io(pblk, rqd); /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); + err = pblk_submit_io(pblk, rqd, NULL); if (err) { pblk_err(pblk, "data I/O submission failed: %d\n", err); return NVM_IO_ERR; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index a67855387f53..86ffa875bfe1 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -482,11 +482,6 @@ struct pblk_line { #define PBLK_DATA_LINES 4 enum { - PBLK_KMALLOC_META = 1, - PBLK_VMALLOC_META = 2, -}; - -enum { PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */ PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ @@ -521,9 +516,6 @@ struct pblk_line_mgmt { __le32 *vsc_list; /* Valid sector counts for all lines */ - /* Metadata allocation type: VMALLOC | KMALLOC */ - int emeta_alloc_type; - /* Pre-allocated metadata for data lines */ struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; @@ -783,14 +775,10 @@ struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, struct ppa_addr ppa); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf); +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf); int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd); -struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, - unsigned int nr_secs, unsigned int len, - int alloc_type, gfp_t gfp_mask); struct pblk_line *pblk_line_get(struct pblk *pblk); struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); struct pblk_line *pblk_line_replace_data(struct pblk *pblk); @@ -938,21 +926,6 @@ void pblk_rl_werr_line_out(struct pblk_rl *rl); int pblk_sysfs_init(struct gendisk *tdisk); void pblk_sysfs_exit(struct gendisk *tdisk); -static inline void *pblk_malloc(size_t size, int type, gfp_t flags) -{ - if (type == PBLK_KMALLOC_META) - return kmalloc(size, flags); - return vmalloc(size); -} - -static inline void pblk_mfree(void *ptr, int type) -{ - if (type == PBLK_KMALLOC_META) - kfree(ptr); - else - vfree(ptr); -} - static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) { return c_ctx - sizeof(struct nvm_rq); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 73f5319295bc..c12cd809ab19 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -105,8 +105,14 @@ struct closure_syncer { static void closure_sync_fn(struct closure *cl) { - cl->s->done = 1; - wake_up_process(cl->s->task); + struct closure_syncer *s = cl->s; + struct task_struct *p; + + rcu_read_lock(); + p = READ_ONCE(s->task); + s->done = 1; + wake_up_process(p); + rcu_read_unlock(); } void __sched __closure_sync(struct closure *cl) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8b123be05254..336f43910383 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, while (size) { struct keybuf_key *w; unsigned int bytes = min(i->bytes, size); - int err = copy_to_user(buf, i->buf, bytes); - if (err) - return err; + if (copy_to_user(buf, i->buf, bytes)) + return -EFAULT; ret += bytes; buf += bytes; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e2059af90791..627dcea0f5b6 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -964,6 +964,7 @@ KTYPE(bch_cache_set_internal); static int __bch_cache_cmp(const void *l, const void *r) { + cond_resched(); return *((uint16_t *)r) - *((uint16_t *)l); } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index c9e44ac1f9a6..3f8577e2c13b 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -408,6 +408,7 @@ static int map_request(struct dm_rq_target_io *tio) ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_rq_unprep_clone(clone); + blk_mq_cleanup_rq(clone); tio->ti->type->release_clone_rq(clone, &tio->info); tio->clone = NULL; return DM_MAPIO_REQUEUE; @@ -562,7 +563,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) if (err) goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(md->tag_set, md->queue); + q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true); if (IS_ERR(q)) { err = PTR_ERR(q); goto out_tag_set; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 7354466ddc90..c766c559d36d 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -258,6 +258,11 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_sector < start_sector)) goto out_of_bounds; + if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) { + bio_io_error(bio); + return true; + } + if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ struct bio *split = bio_split(bio, end_sector - bio_sector, diff --git a/drivers/md/md.c b/drivers/md/md.c index 24638ccedce4..1be7abeb24fd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -376,6 +376,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) struct mddev *mddev = q->queuedata; unsigned int sectors; + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + bio_io_error(bio); + return BLK_QC_T_NONE; + } + blk_queue_split(q, &bio); if (mddev == NULL || mddev->pers == NULL) { @@ -1232,6 +1237,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0) + mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; @@ -1647,6 +1654,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && + sb->level != 0) + return -EINVAL; + if (!refdev) { ret = 1; } else { @@ -1757,6 +1768,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0 && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) + mddev->layout = -1; + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); @@ -1826,8 +1841,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_BITMAP)) rdev->saved_raid_disk = -1; - } else - set_bit(In_sync, &rdev->flags); + } else { + /* + * If the array is FROZEN, then the device can't + * be in_sync with rest of array. + */ + if (!test_bit(MD_RECOVERY_FROZEN, + &mddev->recovery)) + set_bit(In_sync, &rdev->flags); + } rdev->raid_disk = role; break; } @@ -3664,11 +3686,7 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) return -EINVAL; if (decimals < 0) decimals = 0; - while (decimals < scale) { - result *= 10; - decimals ++; - } - *res = result; + *res = result * int_pow(10, scale - decimals); return 0; } @@ -4155,12 +4173,17 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, * active-idle * like active, but no writes have been seen for a while (100msec). * + * broken + * RAID0/LINEAR-only: same as clean, but array is missing a member. + * It's useful because RAID0/LINEAR mounted-arrays aren't stopped + * when a member is gone, so this state will at least alert the + * user that something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, - write_pending, active_idle, bad_word}; + write_pending, active_idle, broken, bad_word}; static char *array_states[] = { "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", - "write-pending", "active-idle", NULL }; + "write-pending", "active-idle", "broken", NULL }; static int match_word(const char *word, char **list) { @@ -4176,7 +4199,7 @@ array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; - if (mddev->pers) + if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { case 1: st = readonly; @@ -4196,7 +4219,10 @@ array_state_show(struct mddev *mddev, char *page) st = active; spin_unlock(&mddev->lock); } - else { + + if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) + st = broken; + } else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && mddev->dev_sectors == 0) @@ -4310,6 +4336,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case write_pending: case active_idle: + case broken: /* these cannot be set */ break; } @@ -5182,6 +5209,34 @@ static struct md_sysfs_entry md_consistency_policy = __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, consistency_policy_store); +static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->fail_last_dev); +} + +/* + * Setting fail_last_dev to true to allow last device to be forcibly removed + * from RAID1/RAID10. + */ +static ssize_t +fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) +{ + int ret; + bool value; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + if (value != mddev->fail_last_dev) + mddev->fail_last_dev = value; + + return len; +} +static struct md_sysfs_entry md_fail_last_dev = +__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, + fail_last_dev_store); + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -5198,6 +5253,7 @@ static struct attribute *md_default_attrs[] = { &md_array_size.attr, &max_corr_read_errors.attr, &md_consistency_policy.attr, + &md_fail_last_dev.attr, NULL, }; @@ -5744,9 +5800,6 @@ int md_run(struct mddev *mddev) md_update_sb(mddev, 0); md_new_event(mddev); - sysfs_notify_dirent_safe(mddev->sysfs_state); - sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; bitmap_abort: @@ -5767,6 +5820,7 @@ static int do_md_run(struct mddev *mddev) { int err; + set_bit(MD_NOT_READY, &mddev->flags); err = md_run(mddev); if (err) goto out; @@ -5787,9 +5841,14 @@ static int do_md_run(struct mddev *mddev) set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); + clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify(&mddev->kobj, NULL, "degraded"); out: + clear_bit(MD_NOT_READY, &mddev->flags); return err; } @@ -6849,6 +6908,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; + if (mddev->level == 0) + /* Cannot trust RAID0 layout info here */ + mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; if (mddev->persistent) { @@ -8900,6 +8962,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; + bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) mddev->safemode = 0; @@ -8945,7 +9008,7 @@ void md_check_recovery(struct mddev *mddev) } } - if (!mddev->external && !mddev->in_sync) { + if (try_set_sync && !mddev->external && !mddev->in_sync) { spin_lock(&mddev->lock); set_in_sync(mddev); spin_unlock(&mddev->lock); @@ -9043,7 +9106,8 @@ void md_reap_sync_thread(struct mddev *mddev) /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + mddev->degraded != mddev->raid_disks) { /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 10f98200e2f8..c5e3ff398b59 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -248,6 +248,12 @@ enum mddev_flags { MD_UPDATING_SB, /* md_check_recovery is updating the metadata * without explicitly holding reconfig_mutex. */ + MD_NOT_READY, /* do_md_run() is active, so 'array_state' + * must not report that array is ready yet + */ + MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop + * I/O in case an array member is gone/failed. + */ }; enum mddev_sb_flags { @@ -487,6 +493,7 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ bool has_superblocks:1; + bool fail_last_dev:1; }; enum recovery_flags { @@ -735,6 +742,19 @@ extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); +static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type) +{ + int flags = rdev->bdev->bd_disk->flags; + + if (!(flags & GENHD_FL_UP)) { + if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags)) + pr_warn("md: %s: %s array has a missing/failed member\n", + mdname(rdev->mddev), md_type); + return true; + } + return false; +} + static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { int faulty = test_bit(Faulty, &rdev->flags); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index bf5cf184a260..f61693e59684 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -19,6 +19,9 @@ #include "raid0.h" #include "raid5.h" +static int default_layout = 0; +module_param(default_layout, int, 0644); + #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ @@ -139,6 +142,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); + + if (conf->nr_strip_zones == 1) { + conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; + } else if (default_layout == RAID0_ORIG_LAYOUT || + default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = default_layout; + } else { + pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", + mdname(mddev)); + pr_err("md/raid0: please set raid.default_layout to 1 or 2\n"); + err = -ENOTSUPP; + goto abort; + } /* * now since we have the hard sector sizes, we can make sure * chunk size is a multiple of that sector size @@ -547,10 +566,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) static bool raid0_make_request(struct mddev *mddev, struct bio *bio) { + struct r0conf *conf = mddev->private; struct strip_zone *zone; struct md_rdev *tmp_dev; sector_t bio_sector; sector_t sector; + sector_t orig_sector; unsigned chunk_sects; unsigned sectors; @@ -584,8 +605,26 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + orig_sector = sector; zone = find_zone(mddev->private, §or); - tmp_dev = map_sector(mddev, zone, sector, §or); + switch (conf->layout) { + case RAID0_ORIG_LAYOUT: + tmp_dev = map_sector(mddev, zone, orig_sector, §or); + break; + case RAID0_ALT_MULTIZONE_LAYOUT: + tmp_dev = map_sector(mddev, zone, sector, §or); + break; + default: + WARN("md/raid0:%s: Invalid layout\n", mdname(mddev)); + bio_io_error(bio); + return true; + } + + if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) { + bio_io_error(bio); + return true; + } + bio_set_dev(bio, tmp_dev->bdev); bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 540e65d92642..3816e5477db1 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -8,11 +8,25 @@ struct strip_zone { int nb_dev; /* # of devices attached to the zone */ }; +/* Linux 3.14 (20d0189b101) made an unintended change to + * the RAID0 layout for multi-zone arrays (where devices aren't all + * the same size. + * RAID0_ORIG_LAYOUT restores the original layout + * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout + * The layouts are identical when there is only one zone (all + * devices the same size). + */ + +enum r0layout { + RAID0_ORIG_LAYOUT = 1, + RAID0_ALT_MULTIZONE_LAYOUT = 2, +}; struct r0conf { struct strip_zone *strip_zone; struct md_rdev **devlist; /* lists of rdevs, pointed to * by strip_zone->dev */ int nr_strip_zones; + enum r0layout layout; }; #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34e26834ad28..0466ee2453b4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -447,19 +447,21 @@ static void raid1_end_write_request(struct bio *bio) /* We never try FailFast to WriteMostly devices */ !test_bit(WriteMostly, &rdev->flags)) { md_error(r1_bio->mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - /* This is the only remaining device, - * We need to retry the write without - * FailFast - */ - set_bit(R1BIO_WriteError, &r1_bio->state); - else { - /* Finished with this branch */ - r1_bio->bios[mirror] = NULL; - to_put = bio; - } - } else + } + + /* + * When the device is faulty, it is not necessary to + * handle write error. + * For failfast, this is the only remaining device, + * We need to retry the write without FailFast. + */ + if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); + else { + /* Finished with this branch */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + } } else { /* * Set R1BIO_Uptodate in our master bio, so that we @@ -872,8 +874,11 @@ static void flush_pending_writes(struct r1conf *conf) * backgroup IO calls must call raise_barrier. Once that returns * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. + * + * If resync/recovery is interrupted, returns -EINTR; + * Otherwise, returns 0. */ -static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr) +static int raise_barrier(struct r1conf *conf, sector_t sector_nr) { int idx = sector_to_idx(sector_nr); @@ -1612,12 +1617,12 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) /* * If it is not operational, then we have already marked it as dead - * else if it is the last working disks, ignore the error, let the - * next level up know. + * else if it is the last working disks with "fail_last_dev == false", + * ignore the error, let the next level up know. * else mark the drive as failed */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) + if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev && (conf->raid_disks - mddev->degraded) == 1) { /* * Don't fail the drive, act as though we were just a @@ -1901,6 +1906,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) } while (sectors_to_go > 0); } +static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) +{ + if (atomic_dec_and_test(&r1_bio->remaining)) { + struct mddev *mddev = r1_bio->mddev; + int s = r1_bio->sectors; + + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); + } + } +} + static void end_sync_write(struct bio *bio) { int uptodate = !bio->bi_status; @@ -1927,16 +1948,7 @@ static void end_sync_write(struct bio *bio) ) set_bit(R1BIO_MadeGood, &r1_bio->state); - if (atomic_dec_and_test(&r1_bio->remaining)) { - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); - } - } + put_sync_write_buf(r1_bio, uptodate); } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, @@ -2219,17 +2231,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) generic_make_request(wbio); } - if (atomic_dec_and_test(&r1_bio->remaining)) { - /* if we're here, all write(s) have completed, so clean up */ - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, 1); - } - } + put_sync_write_buf(r1_bio, 1); } /* @@ -3127,6 +3129,13 @@ static int raid1_run(struct mddev *mddev) !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || test_bit(Faulty, &conf->mirrors[i].rdev->flags)) mddev->degraded++; + /* + * RAID1 needs at least one disk in active + */ + if (conf->raid_disks - mddev->degraded < 1) { + ret = -EINVAL; + goto abort; + } if (conf->raid_disks - mddev->degraded == 1) mddev->recovery_cp = MaxSector; @@ -3160,8 +3169,12 @@ static int raid1_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) { md_unregister_thread(&mddev->thread); - raid1_free(mddev, conf); + goto abort; } + return 0; + +abort: + raid1_free(mddev, conf); return ret; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8a1354a08a1a..299c7b1c9718 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -465,19 +465,21 @@ static void raid10_end_write_request(struct bio *bio) if (test_bit(FailFast, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { md_error(rdev->mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - /* This is the only remaining device, - * We need to retry the write without - * FailFast - */ - set_bit(R10BIO_WriteError, &r10_bio->state); - else { - r10_bio->devs[slot].bio = NULL; - to_put = bio; - dec_rdev = 1; - } - } else + } + + /* + * When the device is faulty, it is not necessary to + * handle write error. + * For failfast, this is the only remaining device, + * We need to retry the write without FailFast. + */ + if (!test_bit(Faulty, &rdev->flags)) set_bit(R10BIO_WriteError, &r10_bio->state); + else { + r10_bio->devs[slot].bio = NULL; + to_put = bio; + dec_rdev = 1; + } } } else { /* @@ -1638,12 +1640,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) /* * If it is not operational, then we have already marked it as dead - * else if it is the last working disks, ignore the error, let the - * next level up know. + * else if it is the last working disks with "fail_last_dev == false", + * ignore the error, let the next level up know. * else mark the drive as failed */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) + if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev && !enough(conf, rdev->raid_disk)) { /* * Don't fail the drive, just return an IO error. diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3de4e13bde98..223e97ab27e6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2526,7 +2526,8 @@ static void raid5_end_read_request(struct bio * bi) int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); - atomic_inc(&rdev->read_errors); + if (!(bi->bi_status == BLK_STS_PROTECTION)) + atomic_inc(&rdev->read_errors); if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) pr_warn_ratelimited( "md/raid:%s: read error on replacement device (sector %llu on %s).\n", @@ -2549,16 +2550,24 @@ static void raid5_end_read_request(struct bio * bi) (unsigned long long)s, bdn); } else if (atomic_read(&rdev->read_errors) - > conf->max_nr_stripes) - pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", - mdname(conf->mddev), bdn); - else + > conf->max_nr_stripes) { + if (!test_bit(Faulty, &rdev->flags)) { + pr_warn("md/raid:%s: %d read_errors > %d stripes\n", + mdname(conf->mddev), + atomic_read(&rdev->read_errors), + conf->max_nr_stripes); + pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", + mdname(conf->mddev), bdn); + } + } else retry = 1; if (set_bad && test_bit(In_sync, &rdev->flags) && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) retry = 1; if (retry) - if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { + if (sh->qd_idx >= 0 && sh->pd_idx == i) + set_bit(R5_ReadError, &sh->dev[i].flags); + else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { set_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); } else @@ -4612,7 +4621,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, (1 << STRIPE_FULL_WRITE) | (1 << STRIPE_BIOFILL_RUN) | (1 << STRIPE_COMPUTE_RUN) | - (1 << STRIPE_OPS_REQ_PENDING) | (1 << STRIPE_DISCARD) | (1 << STRIPE_BATCH_READY) | (1 << STRIPE_BATCH_ERR) | @@ -5491,7 +5499,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) return; logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); + last_sector = bio_end_sector(bi); bi->bi_next = NULL; @@ -5718,7 +5726,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = false; } - set_bit(STRIPE_HANDLE, &sh->state); + if (!sh->batch_head) + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && (bi->bi_opf & REQ_SYNC) && diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cf991f13403e..f90e0704bed9 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -357,7 +357,6 @@ enum { STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ STRIPE_BIOFILL_RUN, STRIPE_COMPUTE_RUN, - STRIPE_OPS_REQ_PENDING, STRIPE_ON_UNPLUG_LIST, STRIPE_DISCARD, STRIPE_ON_RELEASE_LIST, @@ -493,9 +492,7 @@ struct disk_info { */ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) { - int sectors = bio_sectors(bio); - - if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) + if (bio_end_sector(bio) < sector + STRIPE_SECTORS) return bio->bi_next; else return NULL; diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index ec43ac9199e2..2b36f052bfb9 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -64,6 +64,7 @@ config NVME_TCP depends on INET depends on BLK_DEV_NVME select NVME_FABRICS + select CRYPTO_CRC32C help This provides support for the NVMe over Fabrics protocol using the TCP transport. This allows you to use remote block devices diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d3d6b7bd6903..1ede1763a5ee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -22,12 +22,12 @@ #include <linux/pm_qos.h> #include <asm/unaligned.h> -#define CREATE_TRACE_POINTS -#include "trace.h" - #include "nvme.h" #include "fabrics.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + #define NVME_MINORS (1U << MINORBITS) unsigned int admin_timeout = 60; @@ -81,7 +81,6 @@ EXPORT_SYMBOL_GPL(nvme_reset_wq); struct workqueue_struct *nvme_delete_wq; EXPORT_SYMBOL_GPL(nvme_delete_wq); -static DEFINE_IDA(nvme_subsystems_ida); static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); @@ -197,9 +196,9 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns) return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); } -static blk_status_t nvme_error_status(struct request *req) +static blk_status_t nvme_error_status(u16 status) { - switch (nvme_req(req)->status & 0x7ff) { + switch (status & 0x7ff) { case NVME_SC_SUCCESS: return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: @@ -226,6 +225,8 @@ static blk_status_t nvme_error_status(struct request *req) return BLK_STS_PROTECTION; case NVME_SC_RESERVATION_CONFLICT: return BLK_STS_NEXUS; + case NVME_SC_HOST_PATH_ERROR: + return BLK_STS_TRANSPORT; default: return BLK_STS_IOERR; } @@ -260,7 +261,7 @@ static void nvme_retry_req(struct request *req) void nvme_complete_rq(struct request *req) { - blk_status_t status = nvme_error_status(req); + blk_status_t status = nvme_error_status(nvme_req(req)->status); trace_nvme_complete_rq(req); @@ -279,6 +280,8 @@ void nvme_complete_rq(struct request *req) return; } } + + nvme_trace_bio_complete(req, status); blk_mq_end_request(req, status); } EXPORT_SYMBOL_GPL(nvme_complete_rq); @@ -288,8 +291,12 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); - nvme_req(req)->status = NVME_SC_ABORT_REQ; - blk_mq_complete_request_sync(req); + /* don't abort one completed request */ + if (blk_mq_request_completed(req)) + return true; + + nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; + blk_mq_complete_request(req); return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); @@ -1088,10 +1095,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n NVME_IDENTIFY_DATA_SIZE); } -static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, - unsigned nsid) +static int nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns **id) { - struct nvme_id_ns *id; struct nvme_command c = { }; int error; @@ -1100,18 +1106,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (!id) - return NULL; + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM; - error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - kfree(id); - return NULL; + kfree(*id); } - return id; + return error; } static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, @@ -1180,7 +1185,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) EXPORT_SYMBOL_GPL(nvme_set_queue_count); #define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ + NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) static void nvme_enable_aen(struct nvme_ctrl *ctrl) { @@ -1195,6 +1201,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) if (status) dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", supported_aens); + + queue_work(nvme_wq, &ctrl->async_event_work); } static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -1594,9 +1602,11 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); } -static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, +static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { + int ret = 0; + memset(ids, 0, sizeof(*ids)); if (ctrl->vs >= NVME_VS(1, 1, 0)) @@ -1607,10 +1617,12 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ctrl, nsid, ids)) + ret = nvme_identify_ns_descs(ctrl, nsid, ids); + if (ret) dev_warn(ctrl->device, - "%s: Identify Descriptors failed\n", __func__); + "Identify Descriptors failed (%d)\n", ret); } + return ret; } static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) @@ -1738,25 +1750,37 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; } - id = nvme_identify_ns(ctrl, ns->head->ns_id); - if (!id) - return -ENODEV; + ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id); + if (ret) + goto out; if (id->ncap == 0) { ret = -ENODEV; - goto out; + goto free_id; } __nvme_revalidate_disk(disk, id); - nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (ret) + goto free_id; + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { dev_err(ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); ret = -ENODEV; } -out: +free_id: kfree(id); +out: + /* + * Only fail the function if we got a fatal error back from the + * device, otherwise ignore the error and just move on. + */ + if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) + ret = 0; + else if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -1952,7 +1976,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) * bits', but doing so may cause the device to complete commands to the * admin queue ... and we don't know what memory that might be pointing at! */ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_disable_ctrl(struct nvme_ctrl *ctrl) { int ret; @@ -1966,20 +1990,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - return nvme_wait_ready(ctrl, cap, false); + return nvme_wait_ready(ctrl, ctrl->cap, false); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { /* * Default to a 4K page size, with the intention to update this * path in the future to accomodate architectures with differing * kernel and IO page sizes. */ - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + unsigned dev_page_min, page_shift = 12; int ret; + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (ret) { + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); + return ret; + } + dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; + if (page_shift < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", @@ -1998,7 +2029,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, cap, true); + return nvme_wait_ready(ctrl, ctrl->cap, true); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); @@ -2332,7 +2363,8 @@ static void nvme_release_subsystem(struct device *dev) struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + if (subsys->instance >= 0) + ida_simple_remove(&nvme_instance_ida, subsys->instance); kfree(subsys); } @@ -2361,6 +2393,17 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) lockdep_assert_held(&nvme_subsystems_lock); + /* + * Fail matches for discovery subsystems. This results + * in each discovery controller bound to a unique subsystem. + * This avoids issues with validating controller values + * that can only be true when there is a single unique subsystem. + * There may be multiple and completely independent entities + * that provide discovery controllers. + */ + if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) + return NULL; + list_for_each_entry(subsys, &nvme_subsystems, entry) { if (strcmp(subsys->subnqn, subsysnqn)) continue; @@ -2461,12 +2504,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); if (!subsys) return -ENOMEM; - ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); - if (ret < 0) { - kfree(subsys); - return ret; - } - subsys->instance = ret; + + subsys->instance = -1; mutex_init(&subsys->lock); kref_init(&subsys->ref); INIT_LIST_HEAD(&subsys->ctrls); @@ -2485,7 +2524,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; subsys->dev.groups = nvme_subsys_attrs_groups; - dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); device_initialize(&subsys->dev); mutex_lock(&nvme_subsystems_lock); @@ -2517,6 +2556,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) goto out_put_subsystem; } + if (!found) + subsys->instance = ctrl->instance; ctrl->subsys = subsys; list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); mutex_unlock(&nvme_subsystems_lock); @@ -2574,7 +2615,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; - u64 cap; int ret, page_shift; u32 max_hw_sectors; bool prev_apst_enabled; @@ -2584,16 +2624,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); return ret; } - - ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); - if (ret) { - dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); - return ret; - } - page_shift = NVME_CAP_MPSMIN(cap) + 12; + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(cap); + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); ret = nvme_identify_ctrl(ctrl, &id); if (ret) { @@ -3184,7 +3219,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ns_id = nsid; kref_init(&head->ref); - nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + if (ret) + goto out_cleanup_srcu; ret = __nvme_check_ids(ctrl->subsys, head); if (ret) { @@ -3209,6 +3246,8 @@ out_ida_remove: out_free_head: kfree(head); out: + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ERR_PTR(ret); } @@ -3232,7 +3271,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, } else { struct nvme_ns_ids ids; - nvme_report_ns_ids(ctrl, nsid, id, &ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (ret) + goto out_unlock; + if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", @@ -3247,6 +3289,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, out_unlock: mutex_unlock(&ctrl->subsys->lock); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3338,11 +3382,9 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - id = nvme_identify_ns(ctrl, nsid); - if (!id) { - ret = -EIO; + ret = nvme_identify_ns(ctrl, nsid, &id); + if (ret) goto out_free_queue; - } if (id->ncap == 0) { ret = -EINVAL; @@ -3404,6 +3446,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3617,6 +3661,33 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret; + + ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name); + if (ret) + return ret; + + if (opts) { + ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_TRSVCID=%s", + opts->trsvcid ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", + opts->host_traddr ?: "none"); + } + return ret; +} + static void nvme_aen_uevent(struct nvme_ctrl *ctrl) { char *envp[2] = { NULL, NULL }; @@ -3723,6 +3794,9 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) queue_work(nvme_wq, &ctrl->ana_work); break; #endif + case NVME_AER_NOTICE_DISC_CHANGED: + ctrl->aen_result = result; + break; default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -3769,10 +3843,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->kato) nvme_start_keep_alive(ctrl); + nvme_enable_aen(ctrl); + if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_enable_aen(ctrl); - queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } } @@ -3792,7 +3866,9 @@ static void nvme_free_ctrl(struct device *dev) container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + if (subsys && ctrl->instance != subsys->instance) + ida_simple_remove(&nvme_instance_ida, ctrl->instance); + kfree(ctrl->effects); nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -3992,6 +4068,9 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) list_for_each_entry(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); up_read(&ctrl->namespaces_rwsem); + + if (ctrl->admin_q) + blk_sync_queue(ctrl->admin_q); } EXPORT_SYMBOL_GPL(nvme_sync_queues); @@ -4050,6 +4129,7 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_class); goto unregister_chrdev; } + nvme_class->dev_uevent = nvme_class_uevent; nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); if (IS_ERR(nvme_subsys_class)) { @@ -4074,7 +4154,6 @@ out: static void __exit nvme_core_exit(void) { - ida_destroy(&nvme_subsystems_ida); class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 1994d5b42f94..74b8818ac9a1 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.fctype = nvme_fabrics_type_property_get; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0) @@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.attrib = 1; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0) @@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.offset = cpu_to_le32(off); cmd.prop_set.value = cpu_to_le64(val); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (unlikely(ret)) dev_err(ctrl->device, @@ -381,8 +381,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) * Set keep-alive timeout in seconds granularity (ms * 1000) * and add a grace period for controller kato enforcement */ - cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : - cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + cmd.connect.kato = ctrl->kato ? + cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0; if (ctrl->opts->disable_sqflow) cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; @@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, data, sizeof(*data), 0, NVME_QID_ANY, 1, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); if (ret) { @@ -611,6 +611,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_DATA_DIGEST, "data_digest" }, { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, + { NVMF_OPT_TOS, "tos=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->duplicate_connect = false; opts->hdr_digest = false; opts->data_digest = false; + opts->tos = -1; /* < 0 == use transport default */ options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -738,13 +740,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); } opts->kato = token; - - if (opts->discovery_nqn && opts->kato) { - pr_err("Discovery controllers cannot accept KATO != 0\n"); - ret = -EINVAL; - goto out; - } - break; case NVMF_OPT_CTRL_LOSS_TMO: if (match_int(args, &token)) { @@ -856,6 +851,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->nr_poll_queues = token; break; + case NVMF_OPT_TOS: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < 0) { + pr_err("Invalid type of service %d\n", token); + ret = -EINVAL; + goto out; + } + if (token > 255) { + pr_warn("Clamping type of service to 255\n"); + token = 255; + } + opts->tos = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -865,7 +876,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } if (opts->discovery_nqn) { - opts->kato = 0; opts->nr_io_queues = 0; opts->nr_write_queues = 0; opts->nr_poll_queues = 0; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 3044d8b99a24..93f08d77c896 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -55,6 +55,7 @@ enum { NVMF_OPT_DATA_DIGEST = 1 << 16, NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, NVMF_OPT_NR_POLL_QUEUES = 1 << 18, + NVMF_OPT_TOS = 1 << 19, }; /** @@ -87,6 +88,7 @@ enum { * @data_digest: generate/verify data digest (TCP) * @nr_write_queues: number of queues for write I/O * @nr_poll_queues: number of queues for polling I/O + * @tos: type of service */ struct nvmf_ctrl_options { unsigned mask; @@ -108,6 +110,7 @@ struct nvmf_ctrl_options { bool data_digest; unsigned int nr_write_queues; unsigned int nr_poll_queues; + int tos; }; /* diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 232d8094091b..265f89e11d8b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1608,9 +1608,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) sizeof(op->rsp_iu), DMA_FROM_DEVICE); if (opstate == FCPOP_STATE_ABORTED) - status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - else if (freq->status) - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + else if (freq->status) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to lldd error %d\n", + ctrl->cnum, freq->status); + } /* * For the linux implementation, if we have an unsuccesful @@ -1637,8 +1641,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) * no payload in the CQE by the transport. */ if (freq->transferred_length != - be32_to_cpu(op->cmd_iu.data_len)) { - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + be32_to_cpu(op->cmd_iu.data_len)) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad transfer " + "length: %d vs expected %d\n", + ctrl->cnum, freq->transferred_length, + be32_to_cpu(op->cmd_iu.data_len)); goto done; } result.u64 = 0; @@ -1655,7 +1664,17 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) freq->transferred_length || op->rsp_iu.status_code || sqe->common.command_id != cqe->command_id)) { - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad NVMe_ERSP: " + "iu len %d, xfr len %d vs %d, status code " + "%d, cmdid %d vs %d\n", + ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len), + be32_to_cpu(op->rsp_iu.xfrd_len), + freq->transferred_length, + op->rsp_iu.status_code, + sqe->common.command_id, + cqe->command_id); goto done; } result = cqe->result; @@ -1663,7 +1682,11 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu " + "len %d\n", + ctrl->cnum, freq->rcv_rsplen); goto done; } @@ -2006,6 +2029,7 @@ nvme_fc_ctrl_free(struct kref *ref) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); kfree(ctrl->queues); @@ -2107,7 +2131,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, struct nvme_fc_fcp_op *op) { struct nvmefc_fcp_req *freq = &op->fcp_req; - enum dma_data_direction dir; int ret; freq->sg_cnt = 0; @@ -2124,9 +2147,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); - dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, - op->nents, dir); + op->nents, rq_dma_dir(rq)); if (unlikely(freq->sg_cnt <= 0)) { sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE); freq->sg_cnt = 0; @@ -2149,8 +2171,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, return; fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, - ((rq_data_dir(rq) == WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE)); + rq_dma_dir(rq)); nvme_cleanup_cmd(rq); @@ -2633,8 +2654,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queue; - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; @@ -2648,23 +2667,15 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * prior connection values */ - ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); - if (ret) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_disconnect_admin_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + ret = nvme_enable_ctrl(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; ctrl->ctrl.max_hw_sectors = (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + ret = nvme_init_identify(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; @@ -2774,6 +2785,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); } /* @@ -2796,6 +2808,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); /* kill the aens as they are a separate path */ nvme_fc_abort_aen_ops(ctrl); @@ -3109,10 +3122,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_free_queues; ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + ret = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_admin_tag_set; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { ret = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_admin_tag_set; + goto out_cleanup_fabrics_q; } /* @@ -3184,6 +3203,8 @@ fail_ctrl: out_cleanup_admin_q: blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); out_free_admin_tag_set: blk_mq_free_tag_set(&ctrl->admin_tag_set); out_free_queues: diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba009d4c9dfa..ec46693f6b64 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -667,11 +667,14 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, return rq; } -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, + void *buf) { + struct nvm_geo *geo = &dev->geo; struct request_queue *q = dev->q; struct nvme_nvm_command *cmd; struct request *rq; + int ret; cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); if (!cmd) @@ -679,8 +682,15 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq = nvme_nvm_alloc_request(q, rqd, cmd); if (IS_ERR(rq)) { - kfree(cmd); - return PTR_ERR(rq); + ret = PTR_ERR(rq); + goto err_free_cmd; + } + + if (buf) { + ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas, + GFP_KERNEL); + if (ret) + goto err_free_cmd; } rq->end_io_data = rqd; @@ -688,33 +698,9 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); return 0; -} - -static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - struct request_queue *q = dev->q; - struct request *rq; - struct nvme_nvm_command cmd; - int ret = 0; - - memset(&cmd, 0, sizeof(struct nvme_nvm_command)); - - rq = nvme_nvm_alloc_request(q, rqd, &cmd); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - /* I/Os can fail and the error is signaled through rqd. Callers must - * handle the error accordingly. - */ - blk_execute_rq(q, NULL, rq, 0); - if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - - rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); - rqd->error = nvme_req(rq)->status; - - blk_mq_free_request(rq); +err_free_cmd: + kfree(cmd); return ret; } @@ -754,7 +740,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .get_chk_meta = nvme_nvm_get_chk_meta, .submit_io = nvme_nvm_submit_io, - .submit_io_sync = nvme_nvm_submit_io_sync, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index af831d3d15d0..30de7efef003 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -509,14 +509,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, down_write(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) + unsigned nsid = le32_to_cpu(desc->nsids[n]); + + if (ns->head->ns_id < nsid) continue; - nvme_update_ns_ana_state(desc, ns); + if (ns->head->ns_id == nsid) + nvme_update_ns_ana_state(desc, ns); if (++n == nr_nsids) break; } up_write(&ctrl->namespaces_rwsem); - WARN_ON_ONCE(n < nr_nsids); return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2d678fb968c7..b5013c101b35 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -16,6 +16,8 @@ #include <linux/fault-inject.h> #include <linux/rcupdate.h> +#include <trace/events/block.h> + extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) @@ -97,6 +99,21 @@ enum nvme_quirks { * Force simple suspend/resume path. */ NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10), + + /* + * Use only one interrupt vector for all queues + */ + NVME_QUIRK_SINGLE_VECTOR = (1 << 11), + + /* + * Use non-standard 128 bytes SQEs. + */ + NVME_QUIRK_128_BYTES_SQES = (1 << 12), + + /* + * Prevent tag overlap between queues + */ + NVME_QUIRK_SHARED_TAGS = (1 << 13), }; /* @@ -169,6 +186,7 @@ struct nvme_ctrl { const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; struct request_queue *connect_q; + struct request_queue *fabrics_q; struct device *dev; int instance; int numa_node; @@ -431,8 +449,8 @@ void nvme_complete_rq(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); -int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); @@ -520,6 +538,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } +static inline void nvme_trace_bio_complete(struct request *req, + blk_status_t status) +{ + struct nvme_ns *ns = req->q->queuedata; + + if (req->cmd_flags & REQ_NVME_MPATH) + trace_block_bio_complete(ns->head->disk->queue, + req->bio, status); +} + extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; @@ -567,6 +595,10 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } +static inline void nvme_trace_bio_complete(struct request *req, + blk_status_t status) +{ +} static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 732d5b63ec05..6b4d7b064b38 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -28,8 +28,8 @@ #include "trace.h" #include "nvme.h" -#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) -#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) +#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) @@ -100,6 +100,7 @@ struct nvme_dev { unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; int q_depth; + int io_sqes; u32 db_stride; void __iomem *bar; unsigned long bar_mapped_size; @@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) struct nvme_queue { struct nvme_dev *dev; spinlock_t sq_lock; - struct nvme_command *sq_cmds; + void *sq_cmds; /* only used for poll queues: */ spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; @@ -178,6 +179,7 @@ struct nvme_queue { u16 last_cq_head; u16 qid; u8 cq_phase; + u8 sqes; unsigned long flags; #define NVMEQ_ENABLED 0 #define NVMEQ_SQ_CMB 1 @@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, bool write_sq) { spin_lock(&nvmeq->sq_lock); - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); + memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), + cmd, sizeof(*cmd)); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; nvme_write_sq_db(nvmeq, write_sq); @@ -534,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - enum dma_data_direction dma_dir = rq_data_dir(req) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE; const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; if (iod->dma_len) { - dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir); + dma_unmap_page(dev->dev, dma_addr, iod->dma_len, + rq_dma_dir(req)); return; } @@ -1344,16 +1346,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) static void nvme_free_queue(struct nvme_queue *nvmeq) { - dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth), + dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); if (!nvmeq->sq_cmds) return; if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev), - nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); + nvmeq->sq_cmds, SQ_SIZE(nvmeq)); } else { - dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth), + dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq), nvmeq->sq_cmds, nvmeq->sq_dma_addr); } } @@ -1403,7 +1405,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else - nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + nvme_disable_ctrl(&dev->ctrl); nvme_poll_irqdisable(nvmeq, -1); } @@ -1433,12 +1435,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, } static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, - int qid, int depth) + int qid) { struct pci_dev *pdev = to_pci_dev(dev->dev); if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { - nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); + nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq)); if (nvmeq->sq_cmds) { nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds); @@ -1447,11 +1449,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } - pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq)); } } - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq), &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) return -ENOMEM; @@ -1465,12 +1467,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) if (dev->ctrl.queue_count > qid) return 0; - nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth), + nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES; + nvmeq->q_depth = depth; + nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) + if (nvme_alloc_sq_cmds(dev, nvmeq, qid)) goto free_cqdma; nvmeq->dev = dev; @@ -1479,15 +1483,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - nvmeq->q_depth = depth; nvmeq->qid = qid; dev->ctrl.queue_count++; return 0; free_cqdma: - dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, - nvmeq->cq_dma_addr); + dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); free_nvmeq: return -ENOMEM; } @@ -1515,7 +1518,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; wmb(); /* ensure the first interrupt sees the initialization */ @@ -1552,7 +1555,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) nvme_init_queue(nvmeq, qid); if (!polled) { - nvmeq->cq_vector = vector; result = queue_request_irq(nvmeq); if (result < 0) goto release_sq; @@ -1679,7 +1681,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + result = nvme_disable_ctrl(&dev->ctrl); if (result < 0) return result; @@ -1695,7 +1697,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); + result = nvme_enable_ctrl(&dev->ctrl); if (result) return result; @@ -2077,6 +2079,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) dev->io_queues[HCTX_TYPE_DEFAULT] = 1; dev->io_queues[HCTX_TYPE_READ] = 0; + /* + * Some Apple controllers require all queues to use the + * first vector. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR) + irq_queues = 1; + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } @@ -2095,6 +2104,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) unsigned long size; nr_io_queues = max_io_queues(); + + /* + * If tags are shared with admin queue (Apple bug), then + * make sure we only use one IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + nr_io_queues = 1; + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; @@ -2265,6 +2282,14 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; + /* + * Some Apple controllers requires tags to be unique + * across admin and IO queue, so reserve the first 32 + * tags of the IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + dev->tagset.reserved_tags = NVME_AQ_DEPTH; + ret = blk_mq_alloc_tag_set(&dev->tagset); if (ret) { dev_warn(dev->ctrl.device, @@ -2314,10 +2339,21 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; /* + * Some Apple controllers require a non-standard SQE size. + * Interestingly they also seem to ignore the CC:IOSQES register + * so we don't bother updating it here. + */ + if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) + dev->io_sqes = 7; + else + dev->io_sqes = NVME_NVM_IOSQES; + + /* * Temporary fix for the Apple controller found in the MacBook8,1 and * some MacBook7,1 to avoid controller resets and data loss. */ @@ -2334,6 +2370,18 @@ static int nvme_pci_enable(struct nvme_dev *dev) "set queue depth=%u\n", dev->q_depth); } + /* + * Controllers with the shared tags quirk need the IO queue to be + * big enough so that we get 32 tags for the admin queue + */ + if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && + (dev->q_depth < (NVME_AQ_DEPTH + 2))) { + dev->q_depth = NVME_AQ_DEPTH + 2; + dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", + dev->q_depth); + } + + nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); @@ -2401,6 +2449,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_wait_completed_request(&dev->tagset); + blk_mq_tagset_wait_completed_request(&dev->admin_tagset); /* * The driver will not be starting up queues again if shutting down so @@ -3041,6 +3091,10 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_128_BYTES_SQES | + NVME_QUIRK_SHARED_TAGS }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 1a6449bc547b..dfa07bb9dfeb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -757,6 +757,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, { if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); } if (ctrl->async_event_sqe.data) { @@ -798,10 +799,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_async_qe; } + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + error = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_tagset; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } } @@ -809,24 +816,15 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_cleanup_queue; - error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP, - &ctrl->ctrl.cap); - if (error) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_stop_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + error = nvme_enable_ctrl(&ctrl->ctrl); if (error) goto out_stop_queue; ctrl->ctrl.max_hw_sectors = (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + error = nvme_init_identify(&ctrl->ctrl); if (error) goto out_stop_queue; @@ -838,6 +836,9 @@ out_stop_queue: out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.fabrics_q); out_free_tagset: if (new) blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); @@ -907,10 +908,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, { blk_mq_quiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); - if (ctrl->ctrl.admin_tagset) + if (ctrl->ctrl.admin_tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset, nvme_cancel_request, &ctrl->ctrl); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); + } + if (remove) + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -920,9 +924,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); - if (ctrl->ctrl.tagset) + if (ctrl->ctrl.tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.tagset, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset); + } if (remove) nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); @@ -1059,6 +1065,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1145,9 +1152,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, req->mr = NULL; } - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, - req->nents, rq_data_dir(rq) == - WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); nvme_cleanup_cmd(rq); sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); @@ -1273,7 +1278,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, - rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + rq_dma_dir(rq)); if (unlikely(count <= 0)) { ret = -EIO; goto out_free_table; @@ -1302,9 +1307,7 @@ out: return 0; out_unmap_sg: - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, - req->nents, rq_data_dir(rq) == - WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); out_free_table: sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); return ret; @@ -1547,16 +1550,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) { + struct nvme_ctrl *ctrl = &queue->ctrl->ctrl; int ret; ret = nvme_rdma_create_queue_ib(queue); if (ret) return ret; + if (ctrl->opts->tos >= 0) + rdma_set_service_type(queue->cm_id, ctrl->opts->tos); ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { - dev_err(queue->ctrl->ctrl.device, - "rdma_resolve_route failed (%d).\n", + dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n", queue->cm_error); goto out_destroy_queue; } @@ -1869,10 +1874,11 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&ctrl->reconnect_work); nvme_rdma_teardown_io_queues(ctrl, shutdown); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else - nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + nvme_disable_ctrl(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, shutdown); } @@ -2051,7 +2057,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | - NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS, .create_ctrl = nvme_rdma_create_ctrl, }; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 606b13d35d16..4ffd5957637a 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -13,6 +13,7 @@ #include <net/tcp.h> #include <linux/blk-mq.h> #include <crypto/hash.h> +#include <net/busy_poll.h> #include "nvme.h" #include "fabrics.h" @@ -72,6 +73,7 @@ struct nvme_tcp_queue { int pdu_offset; size_t data_remaining; size_t ddgst_remaining; + unsigned int nr_cqe; /* send state */ struct nvme_tcp_request *request; @@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, } nvme_end_request(rq, cqe->status, cqe->result); + queue->nr_cqe++; return 0; } @@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, switch (hdr->type) { case nvme_tcp_c2h_data: - ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); case nvme_tcp_rsp: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_comp(queue, (void *)queue->pdu); case nvme_tcp_r2t: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: dev_err(queue->ctrl->ctrl.device, "unsupported pdu type (%d)\n", hdr->type); return -EINVAL; } - - return ret; } static inline void nvme_tcp_end_request(struct request *rq, u16 status) @@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { - if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + queue->nr_cqe++; + } nvme_tcp_init_recv_ctx(queue); } } @@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, pdu->command_id); nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + queue->nr_cqe++; } nvme_tcp_init_recv_ctx(queue); @@ -841,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR); + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR); } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) @@ -1023,14 +1024,16 @@ done: static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) { - struct sock *sk = queue->sock->sk; + struct socket *sock = queue->sock; + struct sock *sk = sock->sk; read_descriptor_t rd_desc; int consumed; rd_desc.arg.data = queue; rd_desc.count = 1; lock_sock(sk); - consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + queue->nr_cqe = 0; + consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); release_sock(sk); return consumed; } @@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->queue_size = queue_size; if (qid > 0) - queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + queue->cmnd_capsule_len = nctrl->ioccsz * 16; else queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; @@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &queue->sock); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to create socket: %d\n", ret); return ret; } @@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, (char *)&opt, sizeof(opt)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set TCP_SYNCNT sock opt %d\n", ret); goto err_sock; } @@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set TCP_NODELAY sock opt %d\n", ret); goto err_sock; } @@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, (char *)&sol, sizeof(sol)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set SO_LINGER sock opt %d\n", ret); goto err_sock; } + /* Set socket type of service */ + if (nctrl->opts->tos >= 0) { + opt = nctrl->opts->tos; + ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, + (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(nctrl->device, + "failed to set IP_TOS sock opt %d\n", ret); + goto err_sock; + } + } + queue->sock->sk->sk_allocation = GFP_ATOMIC; if (!qid) n = 0; @@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->pdu_offset = 0; sk_set_memalloc(queue->sock->sk); - if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, sizeof(ctrl->src_addr)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to bind queue %d socket %d\n", qid, ret); goto err_sock; @@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, if (queue->hdr_digest || queue->data_digest) { ret = nvme_tcp_alloc_crypto(queue); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to allocate queue %d crypto\n", qid); goto err_sock; } @@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_crypto; } - dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", + dev_dbg(nctrl->device, "connecting queue %d\n", nvme_tcp_queue_id(queue)); ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, sizeof(ctrl->addr), 0); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to connect socket: %d\n", ret); goto err_rcv_pdu; } @@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; queue->sock->sk->sk_state_change = nvme_tcp_state_change; queue->sock->sk->sk_write_space = nvme_tcp_write_space; + queue->sock->sk->sk_ll_usec = 1; write_unlock_bh(&queue->sock->sk->sk_callback_lock); return 0; @@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; - set->nr_maps = 2 /* default + read */; + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; } ret = blk_mq_alloc_tag_set(set); @@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); return nr_io_queues; } @@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, min(opts->nr_io_queues, nr_io_queues); nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(opts->nr_poll_queues, nr_io_queues); + } } static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) @@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) nvme_tcp_stop_queue(ctrl, 0); if (remove) { blk_cleanup_queue(ctrl->admin_q); + blk_cleanup_queue(ctrl->fabrics_q); blk_mq_free_tag_set(ctrl->admin_tagset); } nvme_tcp_free_admin_queue(ctrl); @@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) goto out_free_queue; } + ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->fabrics_q)) { + error = PTR_ERR(ctrl->fabrics_q); + goto out_free_tagset; + } + ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); if (IS_ERR(ctrl->admin_q)) { error = PTR_ERR(ctrl->admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } } @@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_cleanup_queue; - error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); - if (error) { - dev_err(ctrl->device, - "prop_get NVME_REG_CAP failed\n"); - goto out_stop_queue; - } - - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); - - error = nvme_enable_ctrl(ctrl, ctrl->cap); + error = nvme_enable_ctrl(ctrl); if (error) goto out_stop_queue; + blk_mq_unquiesce_queue(ctrl->admin_q); + error = nvme_init_identify(ctrl); if (error) goto out_stop_queue; @@ -1735,6 +1758,9 @@ out_stop_queue: out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->fabrics_q); out_free_tagset: if (new) blk_mq_free_tag_set(ctrl->admin_tagset); @@ -1748,10 +1774,13 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, { blk_mq_quiesce_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); - if (ctrl->admin_tagset) + if (ctrl->admin_tagset) { blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); - blk_mq_unquiesce_queue(ctrl->admin_q); + blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); + } + if (remove) + blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1762,9 +1791,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, return; nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); - if (ctrl->tagset) + if (ctrl->tagset) { blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->tagset); + } if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); @@ -1793,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; - int ret = -EINVAL; + int ret; ret = nvme_tcp_configure_admin_queue(ctrl, new); if (ret) @@ -1876,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); + blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1892,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); nvme_tcp_teardown_io_queues(ctrl, shutdown); + blk_mq_quiesce_queue(ctrl->admin_q); if (shutdown) nvme_shutdown_ctrl(ctrl); else - nvme_disable_ctrl(ctrl, ctrl->cap); + nvme_disable_ctrl(ctrl); nvme_tcp_teardown_admin_queue(ctrl, shutdown); } @@ -2151,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->io_queues[HCTX_TYPE_POLL]; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + dev_info(ctrl->ctrl.device, - "mapped %d/%d default/read queues.\n", + "mapped %d/%d/%d default/read/poll queues.\n", ctrl->io_queues[HCTX_TYPE_DEFAULT], - ctrl->io_queues[HCTX_TYPE_READ]); + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); return 0; } +static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + struct sock *sk = queue->sock->sk; + + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue)) + sk_busy_loop(sk, true); + nvme_tcp_try_recv(queue); + return queue->nr_cqe; +} + static struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, .complete = nvme_complete_rq, @@ -2167,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = { .init_hctx = nvme_tcp_init_hctx, .timeout = nvme_tcp_timeout, .map_queues = nvme_tcp_map_queues, + .poll = nvme_tcp_poll, }; static struct blk_mq_ops nvme_tcp_admin_mq_ops = { @@ -2220,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); ctrl->ctrl.opts = opts; - ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -2314,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = { .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | - NVMF_OPT_NR_WRITE_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS, .create_ctrl = nvme_tcp_create_ctrl, }; diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 9778eb0406b3..5c3cb6928f3c 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p, return ret; } +static const char *nvme_trace_get_lba_status(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 mndw = get_unaligned_le32(cdw10 + 8); + u16 rl = get_unaligned_le16(cdw10 + 12); + u8 atype = cdw10[15]; + + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", + slba, mndw, rl, atype); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_admin_identify(p, cdw10); case nvme_admin_get_features: return nvme_trace_admin_get_features(p, cdw10); + case nvme_admin_get_lba_status: + return nvme_trace_get_lba_status(p, cdw10); default: return nvme_trace_common(p, cdw10); } diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 4dc12ea52f23..831a062d27cb 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -37,7 +37,6 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) static void nvmet_execute_get_log_page_error(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; - u16 status = NVME_SC_SUCCESS; unsigned long flags; off_t offset = 0; u64 slot; @@ -47,9 +46,8 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req) slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS; for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) { - status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], - sizeof(struct nvme_error_slot)); - if (status) + if (nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], + sizeof(struct nvme_error_slot))) break; if (slot == 0) @@ -59,7 +57,7 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req) offset += sizeof(struct nvme_error_slot); } spin_unlock_irqrestore(&ctrl->error_lock, flags); - nvmet_req_complete(req, status); + nvmet_req_complete(req, 0); } static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, @@ -81,9 +79,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, goto out; host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); - data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); + data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, + sectors[READ]), 1000); host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); - data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, + sectors[WRITE]), 1000); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -111,11 +111,11 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, if (!ns->bdev) continue; host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); - data_units_read += - part_stat_read(ns->bdev->bd_part, sectors[READ]); + data_units_read += DIV_ROUND_UP( + part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000); host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); - data_units_written += - part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + data_units_written += DIV_ROUND_UP( + part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); } rcu_read_unlock(); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 8efca26b4776..3764a8900850 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -381,9 +381,7 @@ int __init nvmet_init_discovery(void) { nvmet_disc_subsys = nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); - if (IS_ERR(nvmet_disc_subsys)) - return PTR_ERR(nvmet_disc_subsys); - return 0; + return PTR_ERR_OR_ZERO(nvmet_disc_subsys); } void nvmet_exit_discovery(void) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 0940c5024a34..748a39fca771 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -253,6 +253,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); } @@ -357,10 +358,16 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) goto out_free_sq; ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + error = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_tagset; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } error = nvmf_connect_admin_queue(&ctrl->ctrl); @@ -369,23 +376,15 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); - if (error) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_cleanup_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + error = nvme_enable_ctrl(&ctrl->ctrl); if (error) goto out_cleanup_queue; ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + error = nvme_init_identify(&ctrl->ctrl); if (error) goto out_cleanup_queue; @@ -394,6 +393,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) out_cleanup_queue: blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); out_free_tagset: blk_mq_free_tag_set(&ctrl->admin_tag_set); out_free_sq: @@ -407,16 +408,17 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); nvme_loop_destroy_io_queues(ctrl); } + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); nvme_loop_destroy_admin_queue(ctrl); } diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 69b83fa0c76c..bf4f03474e89 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -348,7 +348,8 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) return 0; err: - sgl_free(cmd->req.sg); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); return NVME_SC_INTERNAL; } @@ -553,7 +554,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd) if (queue->nvme_sq.sqhd_disabled) { kfree(cmd->iov); - sgl_free(cmd->req.sg); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); } return 1; @@ -584,7 +586,8 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, return -EAGAIN; kfree(cmd->iov); - sgl_free(cmd->req.sg); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); cmd->queue->snd_cmd = NULL; nvmet_tcp_put_cmd(cmd); return 1; @@ -1306,7 +1309,9 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) { nvmet_req_uninit(&cmd->req); nvmet_tcp_unmap_pdu_iovec(cmd); - sgl_free(cmd->req.sg); + kfree(cmd->iov); + if (cmd->req.sg_cnt) + sgl_free(cmd->req.sg); } static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) @@ -1410,6 +1415,7 @@ done: static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; + struct inet_sock *inet = inet_sk(sock->sk); struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret; @@ -1433,6 +1439,16 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) if (ret) return ret; + /* Set socket type of service */ + if (inet->rcv_tos > 0) { + int tos = inet->rcv_tos; + + ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, + (char *)&tos, sizeof(tos)); + if (ret) + return ret; + } + write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = queue; queue->data_ready = sock->sk->sk_data_ready; diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 6af11d493271..1373a3c67962 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -33,6 +33,22 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p, return ret; } +static const char *nvmet_trace_get_lba_status(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 mndw = get_unaligned_le32(cdw10 + 8); + u16 rl = get_unaligned_le16(cdw10 + 12); + u8 atype = cdw10[15]; + + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", + slba, mndw, rl, atype); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -80,6 +96,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, return nvmet_trace_admin_identify(p, cdw10); case nvme_admin_get_features: return nvmet_trace_admin_get_features(p, cdw10); + case nvme_admin_get_lba_status: + return nvmet_trace_get_lba_status(p, cdw10); default: return nvmet_trace_common(p, cdw10); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 11e64b50497f..4e88d7e9cf9a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1089,6 +1089,18 @@ static void scsi_initialize_rq(struct request *rq) cmd->retries = 0; } +/* + * Only called when the request isn't completed by SCSI, and not freed by + * SCSI + */ +static void scsi_cleanup_rq(struct request *rq) +{ + if (rq->rq_flags & RQF_DONTPREP) { + scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq)); + rq->rq_flags &= ~RQF_DONTPREP; + } +} + /* Add a command to the list used by the aacraid and dpt_i2o drivers */ void scsi_add_cmd_to_list(struct scsi_cmnd *cmd) { @@ -1821,6 +1833,7 @@ static const struct blk_mq_ops scsi_mq_ops = { .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .initialize_rq_fn = scsi_initialize_rq, + .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, }; diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c index 74ded5f3c236..3717eea37ecb 100644 --- a/drivers/scsi/scsi_pm.c +++ b/drivers/scsi/scsi_pm.c @@ -94,8 +94,7 @@ static int scsi_dev_type_resume(struct device *dev, if (!err && scsi_is_sdev_device(dev)) { struct scsi_device *sdev = to_scsi_device(dev); - if (sdev->request_queue->dev) - blk_set_runtime_active(sdev->request_queue); + blk_set_runtime_active(sdev->request_queue); } } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 149d406aacc9..4b925552458f 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1293,7 +1293,9 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) case REQ_OP_WRITE: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: - return sd_zbc_setup_reset_cmnd(cmd); + return sd_zbc_setup_reset_cmnd(cmd, false); + case REQ_OP_ZONE_RESET_ALL: + return sd_zbc_setup_reset_cmnd(cmd, true); default: WARN_ON_ONCE(1); return BLK_STS_NOTSUPP; @@ -1959,6 +1961,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) case REQ_OP_WRITE_ZEROES: case REQ_OP_WRITE_SAME: case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: if (!result) { good_bytes = blk_rq_bytes(req); scsi_set_resid(SCpnt, 0); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 38c50946fc42..1eab779f812b 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -209,7 +209,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); extern void sd_zbc_print_zones(struct scsi_disk *sdkp); -extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); +extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd, bool all); extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, @@ -225,7 +225,8 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {} -static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) +static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd, + bool all) { return BLK_STS_TARGET; } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 5d6ff3931632..de4019dc0f0b 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -209,10 +209,11 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) /** * sd_zbc_setup_reset_cmnd - Prepare a RESET WRITE POINTER scsi command. * @cmd: the command to setup + * @all: Reset all zones control. * * Called from sd_init_command() for a REQ_OP_ZONE_RESET request. */ -blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) +blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd, bool all) { struct request *rq = cmd->request; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); @@ -234,7 +235,10 @@ blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) memset(cmd->cmnd, 0, cmd->cmd_len); cmd->cmnd[0] = ZBC_OUT; cmd->cmnd[1] = ZO_RESET_WRITE_POINTER; - put_unaligned_be64(block, &cmd->cmnd[2]); + if (all) + cmd->cmnd[14] = 0x1; + else + put_unaligned_be64(block, &cmd->cmnd[2]); rq->timeout = SD_TIMEOUT; cmd->sc_data_direction = DMA_NONE; @@ -261,6 +265,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, switch (req_op(rq)) { case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: if (result && sshdr->sense_key == ILLEGAL_REQUEST && @@ -487,6 +492,9 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_chunk_sectors(sdkp->disk->queue, logical_to_sectors(sdkp->device, zone_blocks)); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue); + blk_queue_required_elevator_features(sdkp->disk->queue, + ELEVATOR_F_ZBD_SEQ_WRITE); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ |