summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 02:57:47 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-18 02:57:47 +0300
commit7ad67ca5534ee7c958559c4ad610f05c4578e361 (patch)
treedc6b6a8a6b70b5f25b07bcdc06d8e77e705f6822 /drivers
parent5260c2b863ef1152445ce93476c95d8c8a727eef (diff)
parent9c7eddf1b080f98fed1aadb74fe784f29bf77a08 (diff)
downloadlinux-7ad67ca5534ee7c958559c4ad610f05c4578e361.tar.xz
Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: - Two NVMe pull requests: - ana log parse fix from Anton - nvme quirks support for Apple devices from Ben - fix missing bio completion tracing for multipath stack devices from Hannes and Mikhail - IP TOS settings for nvme rdma and tcp transports from Israel - rq_dma_dir cleanups from Israel - tracing for Get LBA Status command from Minwoo - Some nvme-tcp cleanups from Minwoo, Potnuri and Myself - Some consolidation between the fabrics transports for handling the CAP register - reset race with ns scanning fix for fabrics (move fabrics commands to a dedicated request queue with a different lifetime from the admin request queue)." - controller reset and namespace scan races fixes - nvme discovery log change uevent support - naming improvements from Keith - multiple discovery controllers reject fix from James - some regular cleanups from various people - Series fixing (and re-fixing) null_blk debug printing and nr_devices checks (André) - A few pull requests from Song, with fixes from Andy, Guoqing, Guilherme, Neil, Nigel, and Yufen. - REQ_OP_ZONE_RESET_ALL support (Chaitanya) - Bio merge handling unification (Christoph) - Pick default elevator correctly for devices with special needs (Damien) - Block stats fixes (Hou) - Timeout and support devices nbd fixes (Mike) - Series fixing races around elevator switching and device add/remove (Ming) - sed-opal cleanups (Revanth) - Per device weight support for BFQ (Fam) - Support for blk-iocost, a new model that can properly account cost of IO workloads. (Tejun) - blk-cgroup writeback fixes (Tejun) - paride queue init fixes (zhengbin) - blk_set_runtime_active() cleanup (Stanley) - Block segment mapping optimizations (Bart) - lightnvm fixes (Hans/Minwoo/YueHaibing) - Various little fixes and cleanups * tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits) null_blk: format pr_* logs with pr_fmt null_blk: match the type of parameter nr_devices null_blk: do not fail the module load with zero devices block: also check RQF_STATS in blk_mq_need_time_stamp() block: make rq sector size accessible for block stats bfq: Fix bfq linkage error raid5: use bio_end_sector in r5_next_bio raid5: remove STRIPE_OPS_REQ_PENDING md: add feature flag MD_FEATURE_RAID0_LAYOUT md/raid0: avoid RAID0 data corruption due to layout confusion. raid5: don't set STRIPE_HANDLE to stripe which is in batch list raid5: don't increment read_errors on EILSEQ return nvmet: fix a wrong error status returned in error log page nvme: send discovery log page change events to userspace nvme: add uevent variables for controller devices nvme: enable aen regardless of the presence of I/O queues nvme-fabrics: allow discovery subsystems accept a kato nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery() nvme: Remove redundant assignment of cq vector nvme: Assign subsys instance from first ctrl ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/floppy.c4
-rw-r--r--drivers/block/loop.c1
-rw-r--r--drivers/block/nbd.c127
-rw-r--r--drivers/block/null_blk.h18
-rw-r--r--drivers/block/null_blk_main.c183
-rw-r--r--drivers/block/null_blk_zoned.c59
-rw-r--r--drivers/block/paride/pcd.c12
-rw-r--r--drivers/block/paride/pf.c2
-rw-r--r--drivers/lightnvm/core.c97
-rw-r--r--drivers/lightnvm/pblk-core.c116
-rw-r--r--drivers/lightnvm/pblk-gc.c19
-rw-r--r--drivers/lightnvm/pblk-init.c38
-rw-r--r--drivers/lightnvm/pblk-read.c26
-rw-r--r--drivers/lightnvm/pblk-recovery.c42
-rw-r--r--drivers/lightnvm/pblk-write.c20
-rw-r--r--drivers/lightnvm/pblk.h31
-rw-r--r--drivers/md/bcache/closure.c10
-rw-r--r--drivers/md/bcache/debug.c5
-rw-r--r--drivers/md/bcache/sysfs.c1
-rw-r--r--drivers/md/dm-rq.c3
-rw-r--r--drivers/md/md-linear.c5
-rw-r--r--drivers/md/md.c96
-rw-r--r--drivers/md/md.h20
-rw-r--r--drivers/md/raid0.c41
-rw-r--r--drivers/md/raid0.h14
-rw-r--r--drivers/md/raid1.c89
-rw-r--r--drivers/md/raid10.c32
-rw-r--r--drivers/md/raid5.c27
-rw-r--r--drivers/md/raid5.h5
-rw-r--r--drivers/nvme/host/Kconfig1
-rw-r--r--drivers/nvme/host/core.c201
-rw-r--r--drivers/nvme/host/fabrics.c38
-rw-r--r--drivers/nvme/host/fabrics.h3
-rw-r--r--drivers/nvme/host/fc.c73
-rw-r--r--drivers/nvme/host/lightnvm.c45
-rw-r--r--drivers/nvme/host/multipath.c8
-rw-r--r--drivers/nvme/host/nvme.h36
-rw-r--r--drivers/nvme/host/pci.c102
-rw-r--r--drivers/nvme/host/rdma.c61
-rw-r--r--drivers/nvme/host/tcp.c144
-rw-r--r--drivers/nvme/host/trace.c18
-rw-r--r--drivers/nvme/target/admin-cmd.c22
-rw-r--r--drivers/nvme/target/discovery.c4
-rw-r--r--drivers/nvme/target/loop.c30
-rw-r--r--drivers/nvme/target/tcp.c24
-rw-r--r--drivers/nvme/target/trace.c18
-rw-r--r--drivers/scsi/scsi_lib.c13
-rw-r--r--drivers/scsi/scsi_pm.c3
-rw-r--r--drivers/scsi/sd.c5
-rw-r--r--drivers/scsi/sd.h5
-rw-r--r--drivers/scsi/sd_zbc.c12
51 files changed, 1205 insertions, 804 deletions
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 0469aceaa230..485865fd0412 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3780,7 +3780,7 @@ static int compat_getdrvprm(int drive,
v.native_format = UDP->native_format;
mutex_unlock(&floppy_mutex);
- if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_params)))
+ if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_params)))
return -EFAULT;
return 0;
}
@@ -3816,7 +3816,7 @@ static int compat_getdrvstat(int drive, bool poll,
v.bufblocks = UDRS->bufblocks;
mutex_unlock(&floppy_mutex);
- if (copy_from_user(arg, &v, sizeof(struct compat_floppy_drive_struct)))
+ if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_struct)))
return -EFAULT;
return 0;
Eintr:
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ab7ca5989097..1410fa893653 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1755,6 +1755,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
case LOOP_SET_FD:
case LOOP_CHANGE_FD:
case LOOP_SET_BLOCK_SIZE:
+ case LOOP_SET_DIRECT_IO:
err = lo_ioctl(bdev, mode, cmd, arg);
break;
default:
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index e21d2ded732b..a8e3815295fe 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -108,6 +108,7 @@ struct nbd_device {
struct nbd_config *config;
struct mutex config_lock;
struct gendisk *disk;
+ struct workqueue_struct *recv_workq;
struct list_head list;
struct task_struct *task_recv;
@@ -121,6 +122,7 @@ struct nbd_cmd {
struct mutex lock;
int index;
int cookie;
+ int retries;
blk_status_t status;
unsigned long flags;
u32 cmd_cookie;
@@ -138,7 +140,6 @@ static struct dentry *nbd_dbg_dir;
static unsigned int nbds_max = 16;
static int max_part = 16;
-static struct workqueue_struct *recv_workqueue;
static int part_shift;
static int nbd_dev_dbg_init(struct nbd_device *nbd);
@@ -344,6 +345,22 @@ static void sock_shutdown(struct nbd_device *nbd)
dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
}
+static u32 req_to_nbd_cmd_type(struct request *req)
+{
+ switch (req_op(req)) {
+ case REQ_OP_DISCARD:
+ return NBD_CMD_TRIM;
+ case REQ_OP_FLUSH:
+ return NBD_CMD_FLUSH;
+ case REQ_OP_WRITE:
+ return NBD_CMD_WRITE;
+ case REQ_OP_READ:
+ return NBD_CMD_READ;
+ default:
+ return U32_MAX;
+ }
+}
+
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
bool reserved)
{
@@ -357,8 +374,10 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
}
config = nbd->config;
- if (!mutex_trylock(&cmd->lock))
+ if (!mutex_trylock(&cmd->lock)) {
+ nbd_config_put(nbd);
return BLK_EH_RESET_TIMER;
+ }
if (config->num_connections > 1) {
dev_err_ratelimited(nbd_to_dev(nbd),
@@ -389,10 +408,25 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
nbd_config_put(nbd);
return BLK_EH_DONE;
}
- } else {
- dev_err_ratelimited(nbd_to_dev(nbd),
- "Connection timed out\n");
}
+
+ if (!nbd->tag_set.timeout) {
+ /*
+ * Userspace sets timeout=0 to disable socket disconnection,
+ * so just warn and reset the timer.
+ */
+ cmd->retries++;
+ dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
+ req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
+ (unsigned long long)blk_rq_pos(req) << 9,
+ blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
+
+ mutex_unlock(&cmd->lock);
+ nbd_config_put(nbd);
+ return BLK_EH_RESET_TIMER;
+ }
+
+ dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
set_bit(NBD_TIMEDOUT, &config->runtime_flags);
cmd->status = BLK_STS_IOERR;
mutex_unlock(&cmd->lock);
@@ -480,22 +514,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
- switch (req_op(req)) {
- case REQ_OP_DISCARD:
- type = NBD_CMD_TRIM;
- break;
- case REQ_OP_FLUSH:
- type = NBD_CMD_FLUSH;
- break;
- case REQ_OP_WRITE:
- type = NBD_CMD_WRITE;
- break;
- case REQ_OP_READ:
- type = NBD_CMD_READ;
- break;
- default:
+ type = req_to_nbd_cmd_type(req);
+ if (type == U32_MAX)
return -EIO;
- }
if (rq_data_dir(req) == WRITE &&
(config->flags & NBD_FLAG_READ_ONLY)) {
@@ -526,6 +547,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
cmd->index = index;
cmd->cookie = nsock->cookie;
+ cmd->retries = 0;
request.type = htonl(type | nbd_cmd_flags);
if (type != NBD_CMD_FLUSH) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
@@ -1036,7 +1058,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
/* We take the tx_mutex in an error path in the recv_work, so we
* need to queue_work outside of the tx_mutex.
*/
- queue_work(recv_workqueue, &args->work);
+ queue_work(nbd->recv_workq, &args->work);
atomic_inc(&config->live_connections);
wake_up(&config->conn_wait);
@@ -1137,6 +1159,10 @@ static void nbd_config_put(struct nbd_device *nbd)
kfree(nbd->config);
nbd->config = NULL;
+ if (nbd->recv_workq)
+ destroy_workqueue(nbd->recv_workq);
+ nbd->recv_workq = NULL;
+
nbd->tag_set.timeout = 0;
nbd->disk->queue->limits.discard_granularity = 0;
nbd->disk->queue->limits.discard_alignment = 0;
@@ -1165,6 +1191,14 @@ static int nbd_start_device(struct nbd_device *nbd)
return -EINVAL;
}
+ nbd->recv_workq = alloc_workqueue("knbd%d-recv",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_UNBOUND, 0, nbd->index);
+ if (!nbd->recv_workq) {
+ dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
+ return -ENOMEM;
+ }
+
blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
nbd->task_recv = current;
@@ -1195,7 +1229,7 @@ static int nbd_start_device(struct nbd_device *nbd)
INIT_WORK(&args->work, recv_work);
args->nbd = nbd;
args->index = i;
- queue_work(recv_workqueue, &args->work);
+ queue_work(nbd->recv_workq, &args->work);
}
nbd_size_update(nbd);
return error;
@@ -1215,8 +1249,10 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
mutex_unlock(&nbd->config_lock);
ret = wait_event_interruptible(config->recv_wq,
atomic_read(&config->recv_threads) == 0);
- if (ret)
+ if (ret) {
sock_shutdown(nbd);
+ flush_workqueue(nbd->recv_workq);
+ }
mutex_lock(&nbd->config_lock);
nbd_bdev_reset(bdev);
/* user requested, ignore socket errors */
@@ -1246,6 +1282,13 @@ static bool nbd_is_valid_blksize(unsigned long blksize)
return true;
}
+static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
+{
+ nbd->tag_set.timeout = timeout * HZ;
+ if (timeout)
+ blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+}
+
/* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
@@ -1276,10 +1319,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
nbd_size_set(nbd, config->blksize, arg);
return 0;
case NBD_SET_TIMEOUT:
- if (arg) {
- nbd->tag_set.timeout = arg * HZ;
- blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
- }
+ nbd_set_cmd_timeout(nbd, arg);
return 0;
case NBD_SET_FLAGS:
@@ -1799,11 +1839,9 @@ again:
if (ret)
goto out;
- if (info->attrs[NBD_ATTR_TIMEOUT]) {
- u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
- nbd->tag_set.timeout = timeout * HZ;
- blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
- }
+ if (info->attrs[NBD_ATTR_TIMEOUT])
+ nbd_set_cmd_timeout(nbd,
+ nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
config->dead_conn_timeout =
nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
@@ -1875,6 +1913,12 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
nbd_disconnect(nbd);
nbd_clear_sock(nbd);
mutex_unlock(&nbd->config_lock);
+ /*
+ * Make sure recv thread has finished, so it does not drop the last
+ * config ref and try to destroy the workqueue from inside the work
+ * queue.
+ */
+ flush_workqueue(nbd->recv_workq);
if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
nbd_config_put(nbd);
@@ -1971,11 +2015,9 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
if (ret)
goto out;
- if (info->attrs[NBD_ATTR_TIMEOUT]) {
- u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
- nbd->tag_set.timeout = timeout * HZ;
- blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
- }
+ if (info->attrs[NBD_ATTR_TIMEOUT])
+ nbd_set_cmd_timeout(nbd,
+ nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
config->dead_conn_timeout =
nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
@@ -2261,20 +2303,12 @@ static int __init nbd_init(void)
if (nbds_max > 1UL << (MINORBITS - part_shift))
return -EINVAL;
- recv_workqueue = alloc_workqueue("knbd-recv",
- WQ_MEM_RECLAIM | WQ_HIGHPRI |
- WQ_UNBOUND, 0);
- if (!recv_workqueue)
- return -ENOMEM;
- if (register_blkdev(NBD_MAJOR, "nbd")) {
- destroy_workqueue(recv_workqueue);
+ if (register_blkdev(NBD_MAJOR, "nbd"))
return -EIO;
- }
if (genl_register_family(&nbd_genl_family)) {
unregister_blkdev(NBD_MAJOR, "nbd");
- destroy_workqueue(recv_workqueue);
return -EINVAL;
}
nbd_dbg_init();
@@ -2316,7 +2350,6 @@ static void __exit nbd_cleanup(void)
idr_destroy(&nbd_index_idr);
genl_unregister_family(&nbd_genl_family);
- destroy_workqueue(recv_workqueue);
unregister_blkdev(NBD_MAJOR, "nbd");
}
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index a1b9929bd911..a235c45e22a7 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -2,6 +2,9 @@
#ifndef __BLK_NULL_BLK_H
#define __BLK_NULL_BLK_H
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
@@ -90,13 +93,13 @@ int null_zone_init(struct nullb_device *dev);
void null_zone_exit(struct nullb_device *dev);
int null_zone_report(struct gendisk *disk, sector_t sector,
struct blk_zone *zones, unsigned int *nr_zones);
-void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
- unsigned int nr_sectors);
-void null_zone_reset(struct nullb_cmd *cmd, sector_t sector);
+blk_status_t null_handle_zoned(struct nullb_cmd *cmd,
+ enum req_opf op, sector_t sector,
+ sector_t nr_sectors);
#else
static inline int null_zone_init(struct nullb_device *dev)
{
- pr_err("null_blk: CONFIG_BLK_DEV_ZONED not enabled\n");
+ pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
return -EINVAL;
}
static inline void null_zone_exit(struct nullb_device *dev) {}
@@ -106,10 +109,11 @@ static inline int null_zone_report(struct gendisk *disk, sector_t sector,
{
return -EOPNOTSUPP;
}
-static inline void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
- unsigned int nr_sectors)
+static inline blk_status_t null_handle_zoned(struct nullb_cmd *cmd,
+ enum req_opf op, sector_t sector,
+ sector_t nr_sectors)
{
+ return BLK_STS_NOTSUPP;
}
-static inline void null_zone_reset(struct nullb_cmd *cmd, sector_t sector) {}
#endif /* CONFIG_BLK_DEV_ZONED */
#endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 99328ded60d1..0e7da5015ccd 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -141,8 +141,8 @@ static int g_bs = 512;
module_param_named(bs, g_bs, int, 0444);
MODULE_PARM_DESC(bs, "Block size (in bytes)");
-static int nr_devices = 1;
-module_param(nr_devices, int, 0444);
+static unsigned int nr_devices = 1;
+module_param(nr_devices, uint, 0444);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
static bool g_blocking;
@@ -1133,93 +1133,61 @@ static void null_restart_queue_async(struct nullb *nullb)
blk_mq_start_stopped_hw_queues(q, true);
}
-static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
+static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
{
struct nullb_device *dev = cmd->nq->dev;
struct nullb *nullb = dev->nullb;
- int err = 0;
+ blk_status_t sts = BLK_STS_OK;
+ struct request *rq = cmd->rq;
- if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
- struct request *rq = cmd->rq;
-
- if (!hrtimer_active(&nullb->bw_timer))
- hrtimer_restart(&nullb->bw_timer);
-
- if (atomic_long_sub_return(blk_rq_bytes(rq),
- &nullb->cur_bytes) < 0) {
- null_stop_queue(nullb);
- /* race with timer */
- if (atomic_long_read(&nullb->cur_bytes) > 0)
- null_restart_queue_async(nullb);
- /* requeue request */
- return BLK_STS_DEV_RESOURCE;
- }
- }
+ if (!hrtimer_active(&nullb->bw_timer))
+ hrtimer_restart(&nullb->bw_timer);
- if (nullb->dev->badblocks.shift != -1) {
- int bad_sectors;
- sector_t sector, size, first_bad;
- bool is_flush = true;
-
- if (dev->queue_mode == NULL_Q_BIO &&
- bio_op(cmd->bio) != REQ_OP_FLUSH) {
- is_flush = false;
- sector = cmd->bio->bi_iter.bi_sector;
- size = bio_sectors(cmd->bio);
- }
- if (dev->queue_mode != NULL_Q_BIO &&
- req_op(cmd->rq) != REQ_OP_FLUSH) {
- is_flush = false;
- sector = blk_rq_pos(cmd->rq);
- size = blk_rq_sectors(cmd->rq);
- }
- if (!is_flush && badblocks_check(&nullb->dev->badblocks, sector,
- size, &first_bad, &bad_sectors)) {
- cmd->error = BLK_STS_IOERR;
- goto out;
- }
+ if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
+ null_stop_queue(nullb);
+ /* race with timer */
+ if (atomic_long_read(&nullb->cur_bytes) > 0)
+ null_restart_queue_async(nullb);
+ /* requeue request */
+ sts = BLK_STS_DEV_RESOURCE;
}
+ return sts;
+}
- if (dev->memory_backed) {
- if (dev->queue_mode == NULL_Q_BIO) {
- if (bio_op(cmd->bio) == REQ_OP_FLUSH)
- err = null_handle_flush(nullb);
- else
- err = null_handle_bio(cmd);
- } else {
- if (req_op(cmd->rq) == REQ_OP_FLUSH)
- err = null_handle_flush(nullb);
- else
- err = null_handle_rq(cmd);
- }
- }
- cmd->error = errno_to_blk_status(err);
+static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
+ sector_t sector,
+ sector_t nr_sectors)
+{
+ struct badblocks *bb = &cmd->nq->dev->badblocks;
+ sector_t first_bad;
+ int bad_sectors;
- if (!cmd->error && dev->zoned) {
- sector_t sector;
- unsigned int nr_sectors;
- enum req_opf op;
+ if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
+ return BLK_STS_IOERR;
- if (dev->queue_mode == NULL_Q_BIO) {
- op = bio_op(cmd->bio);
- sector = cmd->bio->bi_iter.bi_sector;
- nr_sectors = cmd->bio->bi_iter.bi_size >> 9;
- } else {
- op = req_op(cmd->rq);
- sector = blk_rq_pos(cmd->rq);
- nr_sectors = blk_rq_sectors(cmd->rq);
- }
+ return BLK_STS_OK;
+}
- if (op == REQ_OP_WRITE)
- null_zone_write(cmd, sector, nr_sectors);
- else if (op == REQ_OP_ZONE_RESET)
- null_zone_reset(cmd, sector);
- }
-out:
+static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
+ enum req_opf op)
+{
+ struct nullb_device *dev = cmd->nq->dev;
+ int err;
+
+ if (dev->queue_mode == NULL_Q_BIO)
+ err = null_handle_bio(cmd);
+ else
+ err = null_handle_rq(cmd);
+
+ return errno_to_blk_status(err);
+}
+
+static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
+{
/* Complete IO by inline, softirq or timer */
- switch (dev->irqmode) {
+ switch (cmd->nq->dev->irqmode) {
case NULL_IRQ_SOFTIRQ:
- switch (dev->queue_mode) {
+ switch (cmd->nq->dev->queue_mode) {
case NULL_Q_MQ:
blk_mq_complete_request(cmd->rq);
break;
@@ -1238,6 +1206,40 @@ out:
null_cmd_end_timer(cmd);
break;
}
+}
+
+static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
+ sector_t nr_sectors, enum req_opf op)
+{
+ struct nullb_device *dev = cmd->nq->dev;
+ struct nullb *nullb = dev->nullb;
+ blk_status_t sts;
+
+ if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
+ sts = null_handle_throttled(cmd);
+ if (sts != BLK_STS_OK)
+ return sts;
+ }
+
+ if (op == REQ_OP_FLUSH) {
+ cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+ goto out;
+ }
+
+ if (nullb->dev->badblocks.shift != -1) {
+ cmd->error = null_handle_badblocks(cmd, sector, nr_sectors);
+ if (cmd->error != BLK_STS_OK)
+ goto out;
+ }
+
+ if (dev->memory_backed)
+ cmd->error = null_handle_memory_backed(cmd, op);
+
+ if (!cmd->error && dev->zoned)
+ cmd->error = null_handle_zoned(cmd, op, sector, nr_sectors);
+
+out:
+ nullb_complete_cmd(cmd);
return BLK_STS_OK;
}
@@ -1280,6 +1282,8 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
{
+ sector_t sector = bio->bi_iter.bi_sector;
+ sector_t nr_sectors = bio_sectors(bio);
struct nullb *nullb = q->queuedata;
struct nullb_queue *nq = nullb_to_queue(nullb);
struct nullb_cmd *cmd;
@@ -1287,7 +1291,7 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
cmd = alloc_cmd(nq, 1);
cmd->bio = bio;
- null_handle_cmd(cmd);
+ null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio));
return BLK_QC_T_NONE;
}
@@ -1311,7 +1315,7 @@ static bool should_requeue_request(struct request *rq)
static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
{
- pr_info("null: rq %p timed out\n", rq);
+ pr_info("rq %p timed out\n", rq);
blk_mq_complete_request(rq);
return BLK_EH_DONE;
}
@@ -1321,6 +1325,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
{
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
struct nullb_queue *nq = hctx->driver_data;
+ sector_t nr_sectors = blk_rq_sectors(bd->rq);
+ sector_t sector = blk_rq_pos(bd->rq);
might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
@@ -1349,7 +1355,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
if (should_timeout_request(bd->rq))
return BLK_STS_OK;
- return null_handle_cmd(cmd);
+ return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
}
static const struct blk_mq_ops null_mq_ops = {
@@ -1688,6 +1694,9 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
nullb->q->limits.zoned = BLK_ZONED_HM;
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q);
+ blk_queue_required_elevator_features(nullb->q,
+ ELEVATOR_F_ZBD_SEQ_WRITE);
}
nullb->q->queuedata = nullb;
@@ -1739,28 +1748,28 @@ static int __init null_init(void)
struct nullb_device *dev;
if (g_bs > PAGE_SIZE) {
- pr_warn("null_blk: invalid block size\n");
- pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
+ pr_warn("invalid block size\n");
+ pr_warn("defaults block size to %lu\n", PAGE_SIZE);
g_bs = PAGE_SIZE;
}
if (!is_power_of_2(g_zone_size)) {
- pr_err("null_blk: zone_size must be power-of-two\n");
+ pr_err("zone_size must be power-of-two\n");
return -EINVAL;
}
if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
- pr_err("null_blk: invalid home_node value\n");
+ pr_err("invalid home_node value\n");
g_home_node = NUMA_NO_NODE;
}
if (g_queue_mode == NULL_Q_RQ) {
- pr_err("null_blk: legacy IO path no longer available\n");
+ pr_err("legacy IO path no longer available\n");
return -EINVAL;
}
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) {
- pr_warn("null_blk: submit_queues param is set to %u.\n",
+ pr_warn("submit_queues param is set to %u.\n",
nr_online_nodes);
g_submit_queues = nr_online_nodes;
}
@@ -1803,7 +1812,7 @@ static int __init null_init(void)
}
}
- pr_info("null: module loaded\n");
+ pr_info("module loaded\n");
return 0;
err_dev:
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index cb28d93f2bd1..eabc116832a7 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -17,7 +17,7 @@ int null_zone_init(struct nullb_device *dev)
unsigned int i;
if (!is_power_of_2(dev->zone_size)) {
- pr_err("null_blk: zone_size must be power-of-two\n");
+ pr_err("zone_size must be power-of-two\n");
return -EINVAL;
}
@@ -31,7 +31,7 @@ int null_zone_init(struct nullb_device *dev)
if (dev->zone_nr_conv >= dev->nr_zones) {
dev->zone_nr_conv = dev->nr_zones - 1;
- pr_info("null_blk: changed the number of conventional zones to %u",
+ pr_info("changed the number of conventional zones to %u",
dev->zone_nr_conv);
}
@@ -84,7 +84,7 @@ int null_zone_report(struct gendisk *disk, sector_t sector,
return 0;
}
-void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
+static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
unsigned int nr_sectors)
{
struct nullb_device *dev = cmd->nq->dev;
@@ -95,14 +95,12 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
case BLK_ZONE_COND_FULL:
/* Cannot write to a full zone */
cmd->error = BLK_STS_IOERR;
- break;
+ return BLK_STS_IOERR;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_IMP_OPEN:
/* Writes must be at the write pointer position */
- if (sector != zone->wp) {
- cmd->error = BLK_STS_IOERR;
- break;
- }
+ if (sector != zone->wp)
+ return BLK_STS_IOERR;
if (zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
@@ -115,22 +113,51 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
break;
default:
/* Invalid zone condition */
- cmd->error = BLK_STS_IOERR;
- break;
+ return BLK_STS_IOERR;
}
+ return BLK_STS_OK;
}
-void null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
+static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
{
struct nullb_device *dev = cmd->nq->dev;
unsigned int zno = null_zone_no(dev, sector);
struct blk_zone *zone = &dev->zones[zno];
+ size_t i;
+
+ switch (req_op(cmd->rq)) {
+ case REQ_OP_ZONE_RESET_ALL:
+ for (i = 0; i < dev->nr_zones; i++) {
+ if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
+ continue;
+ zone[i].cond = BLK_ZONE_COND_EMPTY;
+ zone[i].wp = zone[i].start;
+ }
+ break;
+ case REQ_OP_ZONE_RESET:
+ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+ return BLK_STS_IOERR;
- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
- cmd->error = BLK_STS_IOERR;
- return;
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ break;
+ default:
+ cmd->error = BLK_STS_NOTSUPP;
+ break;
}
+ return BLK_STS_OK;
+}
- zone->cond = BLK_ZONE_COND_EMPTY;
- zone->wp = zone->start;
+blk_status_t null_handle_zoned(struct nullb_cmd *cmd, enum req_opf op,
+ sector_t sector, sector_t nr_sectors)
+{
+ switch (op) {
+ case REQ_OP_WRITE:
+ return null_zone_write(cmd, sector, nr_sectors);
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
+ return null_zone_reset(cmd, sector);
+ default:
+ return BLK_STS_OK;
+ }
}
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 001dbdcbf355..636bfea2de6f 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -314,8 +314,8 @@ static void pcd_init_units(void)
disk->queue = blk_mq_init_sq_queue(&cd->tag_set, &pcd_mq_ops,
1, BLK_MQ_F_SHOULD_MERGE);
if (IS_ERR(disk->queue)) {
- put_disk(disk);
disk->queue = NULL;
+ put_disk(disk);
continue;
}
@@ -723,9 +723,9 @@ static int pcd_detect(void)
k = 0;
if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
cd = pcd;
- if (pi_init(cd->pi, 1, -1, -1, -1, -1, -1, pcd_buffer,
- PI_PCD, verbose, cd->name)) {
- if (!pcd_probe(cd, -1, id) && cd->disk) {
+ if (cd->disk && pi_init(cd->pi, 1, -1, -1, -1, -1, -1,
+ pcd_buffer, PI_PCD, verbose, cd->name)) {
+ if (!pcd_probe(cd, -1, id)) {
cd->present = 1;
k++;
} else
@@ -736,11 +736,13 @@ static int pcd_detect(void)
int *conf = *drives[unit];
if (!conf[D_PRT])
continue;
+ if (!cd->disk)
+ continue;
if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD],
conf[D_UNI], conf[D_PRO], conf[D_DLY],
pcd_buffer, PI_PCD, verbose, cd->name))
continue;
- if (!pcd_probe(cd, conf[D_SLV], id) && cd->disk) {
+ if (!pcd_probe(cd, conf[D_SLV], id)) {
cd->present = 1;
k++;
} else
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 1e9c50a7256c..6b7d4cab3687 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -300,8 +300,8 @@ static void __init pf_init_units(void)
disk->queue = blk_mq_init_sq_queue(&pf->tag_set, &pf_mq_ops,
1, BLK_MQ_F_SHOULD_MERGE);
if (IS_ERR(disk->queue)) {
- put_disk(disk);
disk->queue = NULL;
+ put_disk(disk);
continue;
}
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index a600934fdd9c..7543e395a2c6 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -4,6 +4,8 @@
* Initial release: Matias Bjorling <m@bjorling.me>
*/
+#define pr_fmt(fmt) "nvm: " fmt
+
#include <linux/list.h>
#include <linux/types.h>
#include <linux/sem.h>
@@ -74,7 +76,7 @@ static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
for (i = lun_begin; i <= lun_end; i++) {
if (test_and_set_bit(i, dev->lun_map)) {
- pr_err("nvm: lun %d already allocated\n", i);
+ pr_err("lun %d already allocated\n", i);
goto err;
}
}
@@ -264,7 +266,7 @@ static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
int lun_end)
{
if (lun_begin > lun_end || lun_end >= geo->all_luns) {
- pr_err("nvm: lun out of bound (%u:%u > %u)\n",
+ pr_err("lun out of bound (%u:%u > %u)\n",
lun_begin, lun_end, geo->all_luns - 1);
return -EINVAL;
}
@@ -297,7 +299,7 @@ static int __nvm_config_extended(struct nvm_dev *dev,
if (e->op == 0xFFFF) {
e->op = NVM_TARGET_DEFAULT_OP;
} else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) {
- pr_err("nvm: invalid over provisioning value\n");
+ pr_err("invalid over provisioning value\n");
return -EINVAL;
}
@@ -334,23 +336,23 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
e = create->conf.e;
break;
default:
- pr_err("nvm: config type not valid\n");
+ pr_err("config type not valid\n");
return -EINVAL;
}
tt = nvm_find_target_type(create->tgttype);
if (!tt) {
- pr_err("nvm: target type %s not found\n", create->tgttype);
+ pr_err("target type %s not found\n", create->tgttype);
return -EINVAL;
}
if ((tt->flags & NVM_TGT_F_HOST_L2P) != (dev->geo.dom & NVM_RSP_L2P)) {
- pr_err("nvm: device is incompatible with target L2P type.\n");
+ pr_err("device is incompatible with target L2P type.\n");
return -EINVAL;
}
if (nvm_target_exists(create->tgtname)) {
- pr_err("nvm: target name already exists (%s)\n",
+ pr_err("target name already exists (%s)\n",
create->tgtname);
return -EINVAL;
}
@@ -367,7 +369,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
if (!tgt_dev) {
- pr_err("nvm: could not create target device\n");
+ pr_err("could not create target device\n");
ret = -ENOMEM;
goto err_t;
}
@@ -493,8 +495,11 @@ static int nvm_remove_tgt(struct nvm_ioctl_remove *remove)
}
up_read(&nvm_lock);
- if (!t)
+ if (!t) {
+ pr_err("failed to remove target %s\n",
+ remove->tgtname);
return 1;
+ }
__nvm_remove_target(t, true);
kref_put(&dev->ref, nvm_free);
@@ -686,7 +691,7 @@ static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
rqd->nr_ppas = nr_ppas;
rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
if (!rqd->ppa_list) {
- pr_err("nvm: failed to allocate dma memory\n");
+ pr_err("failed to allocate dma memory\n");
return -ENOMEM;
}
@@ -731,7 +736,7 @@ static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd)
return flags;
}
-int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf)
{
struct nvm_dev *dev = tgt_dev->parent;
int ret;
@@ -745,19 +750,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd);
/* In case of error, fail with right address format */
- ret = dev->ops->submit_io(dev, rqd);
+ ret = dev->ops->submit_io(dev, rqd, buf);
if (ret)
nvm_rq_dev_to_tgt(tgt_dev, rqd);
return ret;
}
EXPORT_SYMBOL(nvm_submit_io);
-int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
+static void nvm_sync_end_io(struct nvm_rq *rqd)
+{
+ struct completion *waiting = rqd->private;
+
+ complete(waiting);
+}
+
+static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd,
+ void *buf)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ int ret = 0;
+
+ rqd->end_io = nvm_sync_end_io;
+ rqd->private = &wait;
+
+ ret = dev->ops->submit_io(dev, rqd, buf);
+ if (ret)
+ return ret;
+
+ wait_for_completion_io(&wait);
+
+ return 0;
+}
+
+int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
+ void *buf)
{
struct nvm_dev *dev = tgt_dev->parent;
int ret;
- if (!dev->ops->submit_io_sync)
+ if (!dev->ops->submit_io)
return -ENODEV;
nvm_rq_tgt_to_dev(tgt_dev, rqd);
@@ -765,9 +796,7 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
rqd->dev = tgt_dev;
rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd);
- /* In case of error, fail with right address format */
- ret = dev->ops->submit_io_sync(dev, rqd);
- nvm_rq_dev_to_tgt(tgt_dev, rqd);
+ ret = nvm_submit_io_wait(dev, rqd, buf);
return ret;
}
@@ -788,12 +817,13 @@ EXPORT_SYMBOL(nvm_end_io);
static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd)
{
- if (!dev->ops->submit_io_sync)
+ if (!dev->ops->submit_io)
return -ENODEV;
+ rqd->dev = NULL;
rqd->flags = nvm_set_flags(&dev->geo, rqd);
- return dev->ops->submit_io_sync(dev, rqd);
+ return nvm_submit_io_wait(dev, rqd, NULL);
}
static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa)
@@ -1048,7 +1078,7 @@ int nvm_set_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
return 0;
if (nr_ppas > NVM_MAX_VLBA) {
- pr_err("nvm: unable to update all blocks atomically\n");
+ pr_err("unable to update all blocks atomically\n");
return -EINVAL;
}
@@ -1111,27 +1141,26 @@ static int nvm_init(struct nvm_dev *dev)
int ret = -EINVAL;
if (dev->ops->identity(dev)) {
- pr_err("nvm: device could not be identified\n");
+ pr_err("device could not be identified\n");
goto err;
}
- pr_debug("nvm: ver:%u.%u nvm_vendor:%x\n",
- geo->major_ver_id, geo->minor_ver_id,
- geo->vmnt);
+ pr_debug("ver:%u.%u nvm_vendor:%x\n", geo->major_ver_id,
+ geo->minor_ver_id, geo->vmnt);
ret = nvm_core_init(dev);
if (ret) {
- pr_err("nvm: could not initialize core structures.\n");
+ pr_err("could not initialize core structures.\n");
goto err;
}
- pr_info("nvm: registered %s [%u/%u/%u/%u/%u]\n",
+ pr_info("registered %s [%u/%u/%u/%u/%u]\n",
dev->name, dev->geo.ws_min, dev->geo.ws_opt,
dev->geo.num_chk, dev->geo.all_luns,
dev->geo.num_ch);
return 0;
err:
- pr_err("nvm: failed to initialize nvm\n");
+ pr_err("failed to initialize nvm\n");
return ret;
}
@@ -1169,7 +1198,7 @@ int nvm_register(struct nvm_dev *dev)
dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist",
exp_pool_size);
if (!dev->dma_pool) {
- pr_err("nvm: could not create dma pool\n");
+ pr_err("could not create dma pool\n");
kref_put(&dev->ref, nvm_free);
return -ENOMEM;
}
@@ -1214,7 +1243,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
up_write(&nvm_lock);
if (!dev) {
- pr_err("nvm: device not found\n");
+ pr_err("device not found\n");
return -EINVAL;
}
@@ -1288,7 +1317,7 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
i++;
if (i > 31) {
- pr_err("nvm: max 31 devices can be reported.\n");
+ pr_err("max 31 devices can be reported.\n");
break;
}
}
@@ -1315,7 +1344,7 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
create.conf.e.rsv != 0) {
- pr_err("nvm: reserved config field in use\n");
+ pr_err("reserved config field in use\n");
return -EINVAL;
}
@@ -1331,7 +1360,7 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
flags &= ~NVM_TARGET_FACTORY;
if (flags) {
- pr_err("nvm: flag not supported\n");
+ pr_err("flag not supported\n");
return -EINVAL;
}
}
@@ -1349,7 +1378,7 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
remove.tgtname[DISK_NAME_LEN - 1] = '\0';
if (remove.flags != 0) {
- pr_err("nvm: no flags supported\n");
+ pr_err("no flags supported\n");
return -EINVAL;
}
@@ -1365,7 +1394,7 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
return -EFAULT;
if (init.flags != 0) {
- pr_err("nvm: no flags supported\n");
+ pr_err("no flags supported\n");
return -EINVAL;
}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index f546e6f28b8a..b413bafe93fd 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -507,7 +507,7 @@ void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write)
pblk->sec_per_write = sec_per_write;
}
-int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf)
{
struct nvm_tgt_dev *dev = pblk->dev;
@@ -518,7 +518,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
return NVM_IO_ERR;
#endif
- return nvm_submit_io(dev, rqd);
+ return nvm_submit_io(dev, rqd, buf);
}
void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
@@ -541,7 +541,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
}
}
-int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
+int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf)
{
struct nvm_tgt_dev *dev = pblk->dev;
int ret;
@@ -553,7 +553,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
return NVM_IO_ERR;
#endif
- ret = nvm_submit_io_sync(dev, rqd);
+ ret = nvm_submit_io_sync(dev, rqd, buf);
if (trace_pblk_chunk_state_enabled() && !ret &&
rqd->opcode == NVM_OP_PWRITE)
@@ -562,65 +562,19 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
return ret;
}
-int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd)
+static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd,
+ void *buf)
{
struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
int ret;
pblk_down_chunk(pblk, ppa_list[0]);
- ret = pblk_submit_io_sync(pblk, rqd);
+ ret = pblk_submit_io_sync(pblk, rqd, buf);
pblk_up_chunk(pblk, ppa_list[0]);
return ret;
}
-static void pblk_bio_map_addr_endio(struct bio *bio)
-{
- bio_put(bio);
-}
-
-struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
- unsigned int nr_secs, unsigned int len,
- int alloc_type, gfp_t gfp_mask)
-{
- struct nvm_tgt_dev *dev = pblk->dev;
- void *kaddr = data;
- struct page *page;
- struct bio *bio;
- int i, ret;
-
- if (alloc_type == PBLK_KMALLOC_META)
- return bio_map_kern(dev->q, kaddr, len, gfp_mask);
-
- bio = bio_kmalloc(gfp_mask, nr_secs);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- for (i = 0; i < nr_secs; i++) {
- page = vmalloc_to_page(kaddr);
- if (!page) {
- pblk_err(pblk, "could not map vmalloc bio\n");
- bio_put(bio);
- bio = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
- pblk_err(pblk, "could not add page to bio\n");
- bio_put(bio);
- bio = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- kaddr += PAGE_SIZE;
- }
-
- bio->bi_end_io = pblk_bio_map_addr_endio;
-out:
- return bio;
-}
-
int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
unsigned long secs_to_flush, bool skip_meta)
{
@@ -722,9 +676,7 @@ u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
{
- struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_line_meta *lm = &pblk->lm;
- struct bio *bio;
struct ppa_addr *ppa_list;
struct nvm_rq rqd;
u64 paddr = pblk_line_smeta_start(pblk, line);
@@ -736,16 +688,6 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
if (ret)
return ret;
- bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- goto clear_rqd;
- }
-
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
-
- rqd.bio = bio;
rqd.opcode = NVM_OP_PREAD;
rqd.nr_ppas = lm->smeta_sec;
rqd.is_seq = 1;
@@ -754,10 +696,9 @@ int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
for (i = 0; i < lm->smeta_sec; i++, paddr++)
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
- ret = pblk_submit_io_sync(pblk, &rqd);
+ ret = pblk_submit_io_sync(pblk, &rqd, line->smeta);
if (ret) {
pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
- bio_put(bio);
goto clear_rqd;
}
@@ -776,9 +717,7 @@ clear_rqd:
static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
u64 paddr)
{
- struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_line_meta *lm = &pblk->lm;
- struct bio *bio;
struct ppa_addr *ppa_list;
struct nvm_rq rqd;
__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
@@ -791,16 +730,6 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
if (ret)
return ret;
- bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- goto clear_rqd;
- }
-
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
- rqd.bio = bio;
rqd.opcode = NVM_OP_PWRITE;
rqd.nr_ppas = lm->smeta_sec;
rqd.is_seq = 1;
@@ -814,10 +743,9 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
meta->lba = lba_list[paddr] = addr_empty;
}
- ret = pblk_submit_io_sync_sem(pblk, &rqd);
+ ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta);
if (ret) {
pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
- bio_put(bio);
goto clear_rqd;
}
@@ -838,10 +766,8 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
- struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
void *ppa_list_buf, *meta_list;
- struct bio *bio;
struct ppa_addr *ppa_list;
struct nvm_rq rqd;
u64 paddr = line->emeta_ssec;
@@ -867,17 +793,6 @@ next_rq:
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
rq_len = rq_ppas * geo->csecs;
- bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
- l_mg->emeta_alloc_type, GFP_KERNEL);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- goto free_rqd_dma;
- }
-
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
-
- rqd.bio = bio;
rqd.meta_list = meta_list;
rqd.ppa_list = ppa_list_buf;
rqd.dma_meta_list = dma_meta_list;
@@ -896,7 +811,6 @@ next_rq:
while (test_bit(pos, line->blk_bitmap)) {
paddr += min;
if (pblk_boundary_paddr_checks(pblk, paddr)) {
- bio_put(bio);
ret = -EINTR;
goto free_rqd_dma;
}
@@ -906,7 +820,6 @@ next_rq:
}
if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
- bio_put(bio);
ret = -EINTR;
goto free_rqd_dma;
}
@@ -915,10 +828,9 @@ next_rq:
ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id);
}
- ret = pblk_submit_io_sync(pblk, &rqd);
+ ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf);
if (ret) {
pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
- bio_put(bio);
goto free_rqd_dma;
}
@@ -963,7 +875,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
/* The write thread schedules erases so that it minimizes disturbances
* with writes. Thus, there is no need to take the LUN semaphore.
*/
- ret = pblk_submit_io_sync(pblk, &rqd);
+ ret = pblk_submit_io_sync(pblk, &rqd, NULL);
rqd.private = pblk;
__pblk_end_io_erase(pblk, &rqd);
@@ -1792,7 +1704,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
/* The write thread schedules erases so that it minimizes disturbances
* with writes. Thus, there is no need to take the LUN semaphore.
*/
- err = pblk_submit_io(pblk, rqd);
+ err = pblk_submit_io(pblk, rqd, NULL);
if (err) {
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
@@ -1923,13 +1835,11 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
{
struct pblk_line_meta *lm = &pblk->lm;
- struct pblk_line_mgmt *l_mg = &pblk->l_mg;
unsigned int lba_list_size = lm->emeta_len[2];
struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
struct pblk_emeta *emeta = line->emeta;
- w_err_gc->lba_list = pblk_malloc(lba_list_size,
- l_mg->emeta_alloc_type, GFP_KERNEL);
+ w_err_gc->lba_list = kvmalloc(lba_list_size, GFP_KERNEL);
memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
lba_list_size);
}
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 63ee205b41c4..2581eebcfc41 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -132,14 +132,12 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
struct pblk_line *line)
{
struct line_emeta *emeta_buf;
- struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
unsigned int lba_list_size = lm->emeta_len[2];
__le64 *lba_list;
int ret;
- emeta_buf = pblk_malloc(lm->emeta_len[0],
- l_mg->emeta_alloc_type, GFP_KERNEL);
+ emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL);
if (!emeta_buf)
return NULL;
@@ -147,7 +145,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
if (ret) {
pblk_err(pblk, "line %d read emeta failed (%d)\n",
line->id, ret);
- pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+ kvfree(emeta_buf);
return NULL;
}
@@ -161,16 +159,16 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
if (ret) {
pblk_err(pblk, "inconsistent emeta (line %d)\n",
line->id);
- pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+ kvfree(emeta_buf);
return NULL;
}
- lba_list = pblk_malloc(lba_list_size,
- l_mg->emeta_alloc_type, GFP_KERNEL);
+ lba_list = kvmalloc(lba_list_size, GFP_KERNEL);
+
if (lba_list)
memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size);
- pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+ kvfree(emeta_buf);
return lba_list;
}
@@ -181,7 +179,6 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
ws);
struct pblk *pblk = line_ws->pblk;
struct pblk_line *line = line_ws->line;
- struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
@@ -272,7 +269,7 @@ next_rq:
goto next_rq;
out:
- pblk_mfree(lba_list, l_mg->emeta_alloc_type);
+ kvfree(lba_list);
kfree(line_ws);
kfree(invalid_bitmap);
@@ -286,7 +283,7 @@ fail_free_gc_data:
fail_free_gc_rq:
kfree(gc_rq);
fail_free_lba_list:
- pblk_mfree(lba_list, l_mg->emeta_alloc_type);
+ kvfree(lba_list);
fail_free_invalid_bitmap:
kfree(invalid_bitmap);
fail_free_ws:
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index b351c7f002de..9a967a2e83dd 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -543,7 +543,7 @@ static void pblk_line_mg_free(struct pblk *pblk)
for (i = 0; i < PBLK_DATA_LINES; i++) {
kfree(l_mg->sline_meta[i]);
- pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
+ kvfree(l_mg->eline_meta[i]->buf);
kfree(l_mg->eline_meta[i]);
}
@@ -560,7 +560,7 @@ static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
kfree(line->erase_bitmap);
kfree(line->chks);
- pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
+ kvfree(w_err_gc->lba_list);
kfree(w_err_gc);
}
@@ -890,29 +890,14 @@ static int pblk_line_mg_init(struct pblk *pblk)
if (!emeta)
goto fail_free_emeta;
- if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) {
- l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
-
- emeta->buf = vmalloc(lm->emeta_len[0]);
- if (!emeta->buf) {
- kfree(emeta);
- goto fail_free_emeta;
- }
-
- emeta->nr_entries = lm->emeta_sec[0];
- l_mg->eline_meta[i] = emeta;
- } else {
- l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
-
- emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL);
- if (!emeta->buf) {
- kfree(emeta);
- goto fail_free_emeta;
- }
-
- emeta->nr_entries = lm->emeta_sec[0];
- l_mg->eline_meta[i] = emeta;
+ emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL);
+ if (!emeta->buf) {
+ kfree(emeta);
+ goto fail_free_emeta;
}
+
+ emeta->nr_entries = lm->emeta_sec[0];
+ l_mg->eline_meta[i] = emeta;
}
for (i = 0; i < l_mg->nr_lines; i++)
@@ -926,10 +911,7 @@ static int pblk_line_mg_init(struct pblk *pblk)
fail_free_emeta:
while (--i >= 0) {
- if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
- vfree(l_mg->eline_meta[i]->buf);
- else
- kfree(l_mg->eline_meta[i]->buf);
+ kvfree(l_mg->eline_meta[i]->buf);
kfree(l_mg->eline_meta[i]);
}
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index d98ea392fe33..8efd14e683dc 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -342,7 +342,7 @@ split_retry:
bio_put(int_bio);
int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
goto split_retry;
- } else if (pblk_submit_io(pblk, rqd)) {
+ } else if (pblk_submit_io(pblk, rqd, NULL)) {
/* Submitting IO to drive failed, let's report an error */
rqd->error = -ENODEV;
pblk_end_io_read(rqd);
@@ -417,11 +417,7 @@ out:
int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
{
- struct nvm_tgt_dev *dev = pblk->dev;
- struct nvm_geo *geo = &dev->geo;
- struct bio *bio;
struct nvm_rq rqd;
- int data_len;
int ret = NVM_IO_OK;
memset(&rqd, 0, sizeof(struct nvm_rq));
@@ -446,26 +442,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
if (!(gc_rq->secs_to_gc))
goto out;
- data_len = (gc_rq->secs_to_gc) * geo->csecs;
- bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
- PBLK_VMALLOC_META, GFP_KERNEL);
- if (IS_ERR(bio)) {
- pblk_err(pblk, "could not allocate GC bio (%lu)\n",
- PTR_ERR(bio));
- ret = PTR_ERR(bio);
- goto err_free_dma;
- }
-
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
-
rqd.opcode = NVM_OP_PREAD;
rqd.nr_ppas = gc_rq->secs_to_gc;
- rqd.bio = bio;
- if (pblk_submit_io_sync(pblk, &rqd)) {
+ if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) {
ret = -EIO;
- goto err_free_bio;
+ goto err_free_dma;
}
pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs);
@@ -489,8 +471,6 @@ out:
pblk_free_rqd_meta(pblk, &rqd);
return ret;
-err_free_bio:
- bio_put(bio);
err_free_dma:
pblk_free_rqd_meta(pblk, &rqd);
return ret;
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index e6dda04de144..299ef47a17b2 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -178,12 +178,11 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
void *meta_list;
struct pblk_pad_rq *pad_rq;
struct nvm_rq *rqd;
- struct bio *bio;
struct ppa_addr *ppa_list;
void *data;
__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
u64 w_ptr = line->cur_sec;
- int left_line_ppas, rq_ppas, rq_len;
+ int left_line_ppas, rq_ppas;
int i, j;
int ret = 0;
@@ -212,28 +211,15 @@ next_pad_rq:
goto fail_complete;
}
- rq_len = rq_ppas * geo->csecs;
-
- bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
- PBLK_VMALLOC_META, GFP_KERNEL);
- if (IS_ERR(bio)) {
- ret = PTR_ERR(bio);
- goto fail_complete;
- }
-
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
ret = pblk_alloc_rqd_meta(pblk, rqd);
if (ret) {
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
- bio_put(bio);
goto fail_complete;
}
- rqd->bio = bio;
+ rqd->bio = NULL;
rqd->opcode = NVM_OP_PWRITE;
rqd->is_seq = 1;
rqd->nr_ppas = rq_ppas;
@@ -275,13 +261,12 @@ next_pad_rq:
kref_get(&pad_rq->ref);
pblk_down_chunk(pblk, ppa_list[0]);
- ret = pblk_submit_io(pblk, rqd);
+ ret = pblk_submit_io(pblk, rqd, data);
if (ret) {
pblk_err(pblk, "I/O submission failed: %d\n", ret);
pblk_up_chunk(pblk, ppa_list[0]);
kref_put(&pad_rq->ref, pblk_recov_complete);
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
- bio_put(bio);
goto fail_complete;
}
@@ -375,13 +360,12 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
struct ppa_addr *ppa_list;
void *meta_list;
struct nvm_rq *rqd;
- struct bio *bio;
void *data;
dma_addr_t dma_ppa_list, dma_meta_list;
__le64 *lba_list;
u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
bool padded = false;
- int rq_ppas, rq_len;
+ int rq_ppas;
int i, j;
int ret;
u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
@@ -404,18 +388,9 @@ next_rq:
rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
if (!rq_ppas)
rq_ppas = pblk->min_write_pgs;
- rq_len = rq_ppas * geo->csecs;
retry_rq:
- bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bio_get(bio);
-
- rqd->bio = bio;
+ rqd->bio = NULL;
rqd->opcode = NVM_OP_PREAD;
rqd->meta_list = meta_list;
rqd->nr_ppas = rq_ppas;
@@ -445,10 +420,9 @@ retry_rq:
addr_to_gen_ppa(pblk, paddr + j, line->id);
}
- ret = pblk_submit_io_sync(pblk, rqd);
+ ret = pblk_submit_io_sync(pblk, rqd, data);
if (ret) {
pblk_err(pblk, "I/O submission failed: %d\n", ret);
- bio_put(bio);
return ret;
}
@@ -460,24 +434,20 @@ retry_rq:
if (padded) {
pblk_log_read_err(pblk, rqd);
- bio_put(bio);
return -EINTR;
}
pad_distance = pblk_pad_distance(pblk, line);
ret = pblk_recov_pad_line(pblk, line, pad_distance);
if (ret) {
- bio_put(bio);
return ret;
}
padded = true;
- bio_put(bio);
goto retry_rq;
}
pblk_get_packed_meta(pblk, rqd);
- bio_put(bio);
for (i = 0; i < rqd->nr_ppas; i++) {
struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 4e63f9b5954c..b9a2aeba95ab 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -373,7 +373,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
struct pblk_emeta *emeta = meta_line->emeta;
struct ppa_addr *ppa_list;
struct pblk_g_ctx *m_ctx;
- struct bio *bio;
struct nvm_rq *rqd;
void *data;
u64 paddr;
@@ -391,20 +390,9 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
rq_len = rq_ppas * geo->csecs;
data = ((void *)emeta->buf) + emeta->mem;
- bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
- l_mg->emeta_alloc_type, GFP_KERNEL);
- if (IS_ERR(bio)) {
- pblk_err(pblk, "failed to map emeta io");
- ret = PTR_ERR(bio);
- goto fail_free_rqd;
- }
- bio->bi_iter.bi_sector = 0; /* internal bio */
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- rqd->bio = bio;
-
ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta);
if (ret)
- goto fail_free_bio;
+ goto fail_free_rqd;
ppa_list = nvm_rq_to_ppa_list(rqd);
for (i = 0; i < rqd->nr_ppas; ) {
@@ -423,7 +411,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
pblk_down_chunk(pblk, ppa_list[0]);
- ret = pblk_submit_io(pblk, rqd);
+ ret = pblk_submit_io(pblk, rqd, data);
if (ret) {
pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
goto fail_rollback;
@@ -437,8 +425,6 @@ fail_rollback:
pblk_dealloc_page(pblk, meta_line, rq_ppas);
list_add(&meta_line->list, &meta_line->list);
spin_unlock(&l_mg->close_lock);
-fail_free_bio:
- bio_put(bio);
fail_free_rqd:
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
return ret;
@@ -523,7 +509,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
meta_line = pblk_should_submit_meta_io(pblk, rqd);
/* Submit data write for current data line */
- err = pblk_submit_io(pblk, rqd);
+ err = pblk_submit_io(pblk, rqd, NULL);
if (err) {
pblk_err(pblk, "data I/O submission failed: %d\n", err);
return NVM_IO_ERR;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index a67855387f53..86ffa875bfe1 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -482,11 +482,6 @@ struct pblk_line {
#define PBLK_DATA_LINES 4
enum {
- PBLK_KMALLOC_META = 1,
- PBLK_VMALLOC_META = 2,
-};
-
-enum {
PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */
PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */
PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */
@@ -521,9 +516,6 @@ struct pblk_line_mgmt {
__le32 *vsc_list; /* Valid sector counts for all lines */
- /* Metadata allocation type: VMALLOC | KMALLOC */
- int emeta_alloc_type;
-
/* Pre-allocated metadata for data lines */
struct pblk_smeta *sline_meta[PBLK_DATA_LINES];
struct pblk_emeta *eline_meta[PBLK_DATA_LINES];
@@ -783,14 +775,10 @@ struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk,
struct ppa_addr ppa);
void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
-int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
-int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd);
-int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf);
+int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf);
int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line);
void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd);
-struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
- unsigned int nr_secs, unsigned int len,
- int alloc_type, gfp_t gfp_mask);
struct pblk_line *pblk_line_get(struct pblk *pblk);
struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
@@ -938,21 +926,6 @@ void pblk_rl_werr_line_out(struct pblk_rl *rl);
int pblk_sysfs_init(struct gendisk *tdisk);
void pblk_sysfs_exit(struct gendisk *tdisk);
-static inline void *pblk_malloc(size_t size, int type, gfp_t flags)
-{
- if (type == PBLK_KMALLOC_META)
- return kmalloc(size, flags);
- return vmalloc(size);
-}
-
-static inline void pblk_mfree(void *ptr, int type)
-{
- if (type == PBLK_KMALLOC_META)
- kfree(ptr);
- else
- vfree(ptr);
-}
-
static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx)
{
return c_ctx - sizeof(struct nvm_rq);
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 73f5319295bc..c12cd809ab19 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -105,8 +105,14 @@ struct closure_syncer {
static void closure_sync_fn(struct closure *cl)
{
- cl->s->done = 1;
- wake_up_process(cl->s->task);
+ struct closure_syncer *s = cl->s;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = READ_ONCE(s->task);
+ s->done = 1;
+ wake_up_process(p);
+ rcu_read_unlock();
}
void __sched __closure_sync(struct closure *cl)
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8b123be05254..336f43910383 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
while (size) {
struct keybuf_key *w;
unsigned int bytes = min(i->bytes, size);
- int err = copy_to_user(buf, i->buf, bytes);
- if (err)
- return err;
+ if (copy_to_user(buf, i->buf, bytes))
+ return -EFAULT;
ret += bytes;
buf += bytes;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e2059af90791..627dcea0f5b6 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -964,6 +964,7 @@ KTYPE(bch_cache_set_internal);
static int __bch_cache_cmp(const void *l, const void *r)
{
+ cond_resched();
return *((uint16_t *)r) - *((uint16_t *)l);
}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index c9e44ac1f9a6..3f8577e2c13b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -408,6 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
ret = dm_dispatch_clone_request(clone, rq);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_rq_unprep_clone(clone);
+ blk_mq_cleanup_rq(clone);
tio->ti->type->release_clone_rq(clone, &tio->info);
tio->clone = NULL;
return DM_MAPIO_REQUEUE;
@@ -562,7 +563,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
if (err)
goto out_kfree_tag_set;
- q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+ q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true);
if (IS_ERR(q)) {
err = PTR_ERR(q);
goto out_tag_set;
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 7354466ddc90..c766c559d36d 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -258,6 +258,11 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio_sector < start_sector))
goto out_of_bounds;
+ if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
struct bio *split = bio_split(bio, end_sector - bio_sector,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 24638ccedce4..1be7abeb24fd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -376,6 +376,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
struct mddev *mddev = q->queuedata;
unsigned int sectors;
+ if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
+
blk_queue_split(q, &bio);
if (mddev == NULL || mddev->pers == NULL) {
@@ -1232,6 +1237,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0)
+ mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
@@ -1647,6 +1654,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
+ sb->level != 0)
+ return -EINVAL;
+
if (!refdev) {
ret = 1;
} else {
@@ -1757,6 +1768,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0 &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
+ mddev->layout = -1;
+
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
@@ -1826,8 +1841,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
- } else
- set_bit(In_sync, &rdev->flags);
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
rdev->raid_disk = role;
break;
}
@@ -3664,11 +3686,7 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return -EINVAL;
if (decimals < 0)
decimals = 0;
- while (decimals < scale) {
- result *= 10;
- decimals ++;
- }
- *res = result;
+ *res = result * int_pow(10, scale - decimals);
return 0;
}
@@ -4155,12 +4173,17 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
* active-idle
* like active, but no writes have been seen for a while (100msec).
*
+ * broken
+ * RAID0/LINEAR-only: same as clean, but array is missing a member.
+ * It's useful because RAID0/LINEAR mounted-arrays aren't stopped
+ * when a member is gone, so this state will at least alert the
+ * user that something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
- write_pending, active_idle, bad_word};
+ write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
- "write-pending", "active-idle", NULL };
+ "write-pending", "active-idle", "broken", NULL };
static int match_word(const char *word, char **list)
{
@@ -4176,7 +4199,7 @@ array_state_show(struct mddev *mddev, char *page)
{
enum array_state st = inactive;
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
case 1:
st = readonly;
@@ -4196,7 +4219,10 @@ array_state_show(struct mddev *mddev, char *page)
st = active;
spin_unlock(&mddev->lock);
}
- else {
+
+ if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
+ st = broken;
+ } else {
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)
@@ -4310,6 +4336,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case write_pending:
case active_idle:
+ case broken:
/* these cannot be set */
break;
}
@@ -5182,6 +5209,34 @@ static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
consistency_policy_store);
+static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->fail_last_dev);
+}
+
+/*
+ * Setting fail_last_dev to true to allow last device to be forcibly removed
+ * from RAID1/RAID10.
+ */
+static ssize_t
+fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int ret;
+ bool value;
+
+ ret = kstrtobool(buf, &value);
+ if (ret)
+ return ret;
+
+ if (value != mddev->fail_last_dev)
+ mddev->fail_last_dev = value;
+
+ return len;
+}
+static struct md_sysfs_entry md_fail_last_dev =
+__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
+ fail_last_dev_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -5198,6 +5253,7 @@ static struct attribute *md_default_attrs[] = {
&md_array_size.attr,
&max_corr_read_errors.attr,
&md_consistency_policy.attr,
+ &md_fail_last_dev.attr,
NULL,
};
@@ -5744,9 +5800,6 @@ int md_run(struct mddev *mddev)
md_update_sb(mddev, 0);
md_new_event(mddev);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
bitmap_abort:
@@ -5767,6 +5820,7 @@ static int do_md_run(struct mddev *mddev)
{
int err;
+ set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;
@@ -5787,9 +5841,14 @@ static int do_md_run(struct mddev *mddev)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
+ clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}
@@ -6849,6 +6908,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->external = 0;
mddev->layout = info->layout;
+ if (mddev->level == 0)
+ /* Cannot trust RAID0 layout info here */
+ mddev->layout = -1;
mddev->chunk_sectors = info->chunk_size >> 9;
if (mddev->persistent) {
@@ -8900,6 +8962,7 @@ void md_check_recovery(struct mddev *mddev)
if (mddev_trylock(mddev)) {
int spares = 0;
+ bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
@@ -8945,7 +9008,7 @@ void md_check_recovery(struct mddev *mddev)
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);
@@ -9043,7 +9106,8 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 10f98200e2f8..c5e3ff398b59 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -248,6 +248,12 @@ enum mddev_flags {
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
+ MD_NOT_READY, /* do_md_run() is active, so 'array_state'
+ * must not report that array is ready yet
+ */
+ MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
+ * I/O in case an array member is gone/failed.
+ */
};
enum mddev_sb_flags {
@@ -487,6 +493,7 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */
bool has_superblocks:1;
+ bool fail_last_dev:1;
};
enum recovery_flags {
@@ -735,6 +742,19 @@ extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
+static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
+{
+ int flags = rdev->bdev->bd_disk->flags;
+
+ if (!(flags & GENHD_FL_UP)) {
+ if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
+ pr_warn("md: %s: %s array has a missing/failed member\n",
+ mdname(rdev->mddev), md_type);
+ return true;
+ }
+ return false;
+}
+
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
int faulty = test_bit(Faulty, &rdev->flags);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index bf5cf184a260..f61693e59684 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -19,6 +19,9 @@
#include "raid0.h"
#include "raid5.h"
+static int default_layout = 0;
+module_param(default_layout, int, 0644);
+
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
@@ -139,6 +142,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+
+ if (conf->nr_strip_zones == 1) {
+ conf->layout = RAID0_ORIG_LAYOUT;
+ } else if (mddev->layout == RAID0_ORIG_LAYOUT ||
+ mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = mddev->layout;
+ } else if (default_layout == RAID0_ORIG_LAYOUT ||
+ default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = default_layout;
+ } else {
+ pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
+ mdname(mddev));
+ pr_err("md/raid0: please set raid.default_layout to 1 or 2\n");
+ err = -ENOTSUPP;
+ goto abort;
+ }
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
@@ -547,10 +566,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
+ struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
sector_t bio_sector;
sector_t sector;
+ sector_t orig_sector;
unsigned chunk_sects;
unsigned sectors;
@@ -584,8 +605,26 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ orig_sector = sector;
zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
+ switch (conf->layout) {
+ case RAID0_ORIG_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+ break;
+ case RAID0_ALT_MULTIZONE_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ break;
+ default:
+ WARN("md/raid0:%s: Invalid layout\n", mdname(mddev));
+ bio_io_error(bio);
+ return true;
+ }
+
+ if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 540e65d92642..3816e5477db1 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -8,11 +8,25 @@ struct strip_zone {
int nb_dev; /* # of devices attached to the zone */
};
+/* Linux 3.14 (20d0189b101) made an unintended change to
+ * the RAID0 layout for multi-zone arrays (where devices aren't all
+ * the same size.
+ * RAID0_ORIG_LAYOUT restores the original layout
+ * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout
+ * The layouts are identical when there is only one zone (all
+ * devices the same size).
+ */
+
+enum r0layout {
+ RAID0_ORIG_LAYOUT = 1,
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
+};
struct r0conf {
struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
+ enum r0layout layout;
};
#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 34e26834ad28..0466ee2453b4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -447,19 +447,21 @@ static void raid1_end_write_request(struct bio *bio)
/* We never try FailFast to WriteMostly devices */
!test_bit(WriteMostly, &rdev->flags)) {
md_error(r1_bio->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R1BIO_WriteError, &r1_bio->state);
- else {
- /* Finished with this branch */
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
+ else {
+ /* Finished with this branch */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ }
} else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
@@ -872,8 +874,11 @@ static void flush_pending_writes(struct r1conf *conf)
* backgroup IO calls must call raise_barrier. Once that returns
* there is no normal IO happeing. It must arrange to call
* lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * Otherwise, returns 0.
*/
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
int idx = sector_to_idx(sector_nr);
@@ -1612,12 +1617,12 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& (conf->raid_disks - mddev->degraded) == 1) {
/*
* Don't fail the drive, act as though we were just a
@@ -1901,6 +1906,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
} while (sectors_to_go > 0);
}
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ struct mddev *mddev = r1_bio->mddev;
+ int s = r1_bio->sectors;
+
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
+ }
+}
+
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_status;
@@ -1927,16 +1948,7 @@ static void end_sync_write(struct bio *bio)
)
set_bit(R1BIO_MadeGood, &r1_bio->state);
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
- }
- }
+ put_sync_write_buf(r1_bio, uptodate);
}
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2219,17 +2231,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
generic_make_request(wbio);
}
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- /* if we're here, all write(s) have completed, so clean up */
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, 1);
- }
- }
+ put_sync_write_buf(r1_bio, 1);
}
/*
@@ -3127,6 +3129,13 @@ static int raid1_run(struct mddev *mddev)
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
mddev->degraded++;
+ /*
+ * RAID1 needs at least one disk in active
+ */
+ if (conf->raid_disks - mddev->degraded < 1) {
+ ret = -EINVAL;
+ goto abort;
+ }
if (conf->raid_disks - mddev->degraded == 1)
mddev->recovery_cp = MaxSector;
@@ -3160,8 +3169,12 @@ static int raid1_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
- raid1_free(mddev, conf);
+ goto abort;
}
+ return 0;
+
+abort:
+ raid1_free(mddev, conf);
return ret;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8a1354a08a1a..299c7b1c9718 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -465,19 +465,21 @@ static void raid10_end_write_request(struct bio *bio)
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) {
md_error(rdev->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R10BIO_WriteError, &r10_bio->state);
- else {
- r10_bio->devs[slot].bio = NULL;
- to_put = bio;
- dec_rdev = 1;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, it is not necessary to
+ * handle write error.
+ * For failfast, this is the only remaining device,
+ * We need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state);
+ else {
+ r10_bio->devs[slot].bio = NULL;
+ to_put = bio;
+ dec_rdev = 1;
+ }
}
} else {
/*
@@ -1638,12 +1640,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& !enough(conf, rdev->raid_disk)) {
/*
* Don't fail the drive, just return an IO error.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3de4e13bde98..223e97ab27e6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2526,7 +2526,8 @@ static void raid5_end_read_request(struct bio * bi)
int set_bad = 0;
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&rdev->read_errors);
+ if (!(bi->bi_status == BLK_STS_PROTECTION))
+ atomic_inc(&rdev->read_errors);
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
pr_warn_ratelimited(
"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
@@ -2549,16 +2550,24 @@ static void raid5_end_read_request(struct bio * bi)
(unsigned long long)s,
bdn);
} else if (atomic_read(&rdev->read_errors)
- > conf->max_nr_stripes)
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
- mdname(conf->mddev), bdn);
- else
+ > conf->max_nr_stripes) {
+ if (!test_bit(Faulty, &rdev->flags)) {
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
+ mdname(conf->mddev),
+ atomic_read(&rdev->read_errors),
+ conf->max_nr_stripes);
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
+ }
+ } else
retry = 1;
if (set_bad && test_bit(In_sync, &rdev->flags)
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
retry = 1;
if (retry)
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ if (sh->qd_idx >= 0 && sh->pd_idx == i)
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
@@ -4612,7 +4621,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_FULL_WRITE) |
(1 << STRIPE_BIOFILL_RUN) |
(1 << STRIPE_COMPUTE_RUN) |
- (1 << STRIPE_OPS_REQ_PENDING) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
(1 << STRIPE_BATCH_ERR) |
@@ -5491,7 +5499,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
return;
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
+ last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
@@ -5718,7 +5726,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (!sh->batch_head)
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cf991f13403e..f90e0704bed9 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -357,7 +357,6 @@ enum {
STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
STRIPE_BIOFILL_RUN,
STRIPE_COMPUTE_RUN,
- STRIPE_OPS_REQ_PENDING,
STRIPE_ON_UNPLUG_LIST,
STRIPE_DISCARD,
STRIPE_ON_RELEASE_LIST,
@@ -493,9 +492,7 @@ struct disk_info {
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
- int sectors = bio_sectors(bio);
-
- if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+ if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index ec43ac9199e2..2b36f052bfb9 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -64,6 +64,7 @@ config NVME_TCP
depends on INET
depends on BLK_DEV_NVME
select NVME_FABRICS
+ select CRYPTO_CRC32C
help
This provides support for the NVMe over Fabrics protocol using
the TCP transport. This allows you to use remote block devices
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d3d6b7bd6903..1ede1763a5ee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -22,12 +22,12 @@
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
#include "nvme.h"
#include "fabrics.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
#define NVME_MINORS (1U << MINORBITS)
unsigned int admin_timeout = 60;
@@ -81,7 +81,6 @@ EXPORT_SYMBOL_GPL(nvme_reset_wq);
struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
-static DEFINE_IDA(nvme_subsystems_ida);
static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);
@@ -197,9 +196,9 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
}
-static blk_status_t nvme_error_status(struct request *req)
+static blk_status_t nvme_error_status(u16 status)
{
- switch (nvme_req(req)->status & 0x7ff) {
+ switch (status & 0x7ff) {
case NVME_SC_SUCCESS:
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
@@ -226,6 +225,8 @@ static blk_status_t nvme_error_status(struct request *req)
return BLK_STS_PROTECTION;
case NVME_SC_RESERVATION_CONFLICT:
return BLK_STS_NEXUS;
+ case NVME_SC_HOST_PATH_ERROR:
+ return BLK_STS_TRANSPORT;
default:
return BLK_STS_IOERR;
}
@@ -260,7 +261,7 @@ static void nvme_retry_req(struct request *req)
void nvme_complete_rq(struct request *req)
{
- blk_status_t status = nvme_error_status(req);
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
trace_nvme_complete_rq(req);
@@ -279,6 +280,8 @@ void nvme_complete_rq(struct request *req)
return;
}
}
+
+ nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -288,8 +291,12 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
- nvme_req(req)->status = NVME_SC_ABORT_REQ;
- blk_mq_complete_request_sync(req);
+ /* don't abort one completed request */
+ if (blk_mq_request_completed(req))
+ return true;
+
+ nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
+ blk_mq_complete_request(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -1088,10 +1095,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
NVME_IDENTIFY_DATA_SIZE);
}
-static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
- unsigned nsid)
+static int nvme_identify_ns(struct nvme_ctrl *ctrl,
+ unsigned nsid, struct nvme_id_ns **id)
{
- struct nvme_id_ns *id;
struct nvme_command c = { };
int error;
@@ -1100,18 +1106,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
c.identify.nsid = cpu_to_le32(nsid);
c.identify.cns = NVME_ID_CNS_NS;
- id = kmalloc(sizeof(*id), GFP_KERNEL);
- if (!id)
- return NULL;
+ *id = kmalloc(sizeof(**id), GFP_KERNEL);
+ if (!*id)
+ return -ENOMEM;
- error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+ error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
- kfree(id);
- return NULL;
+ kfree(*id);
}
- return id;
+ return error;
}
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
@@ -1180,7 +1185,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
#define NVME_AEN_SUPPORTED \
- (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
+ NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
@@ -1195,6 +1201,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
if (status)
dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
supported_aens);
+
+ queue_work(nvme_wq, &ctrl->async_event_work);
}
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
@@ -1594,9 +1602,11 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
}
-static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
+static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
+ int ret = 0;
+
memset(ids, 0, sizeof(*ids));
if (ctrl->vs >= NVME_VS(1, 1, 0))
@@ -1607,10 +1617,12 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
/* Don't treat error as fatal we potentially
* already have a NGUID or EUI-64
*/
- if (nvme_identify_ns_descs(ctrl, nsid, ids))
+ ret = nvme_identify_ns_descs(ctrl, nsid, ids);
+ if (ret)
dev_warn(ctrl->device,
- "%s: Identify Descriptors failed\n", __func__);
+ "Identify Descriptors failed (%d)\n", ret);
}
+ return ret;
}
static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
@@ -1738,25 +1750,37 @@ static int nvme_revalidate_disk(struct gendisk *disk)
return -ENODEV;
}
- id = nvme_identify_ns(ctrl, ns->head->ns_id);
- if (!id)
- return -ENODEV;
+ ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
+ if (ret)
+ goto out;
if (id->ncap == 0) {
ret = -ENODEV;
- goto out;
+ goto free_id;
}
__nvme_revalidate_disk(disk, id);
- nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ if (ret)
+ goto free_id;
+
if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
dev_err(ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
ret = -ENODEV;
}
-out:
+free_id:
kfree(id);
+out:
+ /*
+ * Only fail the function if we got a fatal error back from the
+ * device, otherwise ignore the error and just move on.
+ */
+ if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
+ ret = 0;
+ else if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -1952,7 +1976,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
* bits', but doing so may cause the device to complete commands to the
* admin queue ... and we don't know what memory that might be pointing at!
*/
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
{
int ret;
@@ -1966,20 +1990,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
- return nvme_wait_ready(ctrl, cap, false);
+ return nvme_wait_ready(ctrl, ctrl->cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accomodate architectures with differing
* kernel and IO page sizes.
*/
- unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
+ unsigned dev_page_min, page_shift = 12;
int ret;
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+ if (ret) {
+ dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
+ return ret;
+ }
+ dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
if (page_shift < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
@@ -1998,7 +2029,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
- return nvme_wait_ready(ctrl, cap, true);
+ return nvme_wait_ready(ctrl, ctrl->cap, true);
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
@@ -2332,7 +2363,8 @@ static void nvme_release_subsystem(struct device *dev)
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
- ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
+ if (subsys->instance >= 0)
+ ida_simple_remove(&nvme_instance_ida, subsys->instance);
kfree(subsys);
}
@@ -2361,6 +2393,17 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
lockdep_assert_held(&nvme_subsystems_lock);
+ /*
+ * Fail matches for discovery subsystems. This results
+ * in each discovery controller bound to a unique subsystem.
+ * This avoids issues with validating controller values
+ * that can only be true when there is a single unique subsystem.
+ * There may be multiple and completely independent entities
+ * that provide discovery controllers.
+ */
+ if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
+ return NULL;
+
list_for_each_entry(subsys, &nvme_subsystems, entry) {
if (strcmp(subsys->subnqn, subsysnqn))
continue;
@@ -2461,12 +2504,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
if (!subsys)
return -ENOMEM;
- ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
- if (ret < 0) {
- kfree(subsys);
- return ret;
- }
- subsys->instance = ret;
+
+ subsys->instance = -1;
mutex_init(&subsys->lock);
kref_init(&subsys->ref);
INIT_LIST_HEAD(&subsys->ctrls);
@@ -2485,7 +2524,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->dev.class = nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
- dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
+ dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
device_initialize(&subsys->dev);
mutex_lock(&nvme_subsystems_lock);
@@ -2517,6 +2556,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
goto out_put_subsystem;
}
+ if (!found)
+ subsys->instance = ctrl->instance;
ctrl->subsys = subsys;
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
mutex_unlock(&nvme_subsystems_lock);
@@ -2574,7 +2615,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
- u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
bool prev_apst_enabled;
@@ -2584,16 +2624,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
}
-
- ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
- if (ret) {
- dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
- return ret;
- }
- page_shift = NVME_CAP_MPSMIN(cap) + 12;
+ page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+ ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
- ctrl->subsystem = NVME_CAP_NSSRC(cap);
+ ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
ret = nvme_identify_ctrl(ctrl, &id);
if (ret) {
@@ -3184,7 +3219,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = nsid;
kref_init(&head->ref);
- nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+ ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+ if (ret)
+ goto out_cleanup_srcu;
ret = __nvme_check_ids(ctrl->subsys, head);
if (ret) {
@@ -3209,6 +3246,8 @@ out_ida_remove:
out_free_head:
kfree(head);
out:
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ERR_PTR(ret);
}
@@ -3232,7 +3271,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
} else {
struct nvme_ns_ids ids;
- nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ if (ret)
+ goto out_unlock;
+
if (!nvme_ns_ids_equal(&head->ids, &ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
@@ -3247,6 +3289,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
out_unlock:
mutex_unlock(&ctrl->subsys->lock);
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -3338,11 +3382,9 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
- id = nvme_identify_ns(ctrl, nsid);
- if (!id) {
- ret = -EIO;
+ ret = nvme_identify_ns(ctrl, nsid, &id);
+ if (ret)
goto out_free_queue;
- }
if (id->ncap == 0) {
ret = -EINVAL;
@@ -3404,6 +3446,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_cleanup_queue(ns->queue);
out_free_ns:
kfree(ns);
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -3617,6 +3661,33 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
+static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(dev, struct nvme_ctrl, ctrl_device);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ret;
+
+ ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
+ if (ret)
+ return ret;
+
+ if (opts) {
+ ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
+ if (ret)
+ return ret;
+
+ ret = add_uevent_var(env, "NVME_TRSVCID=%s",
+ opts->trsvcid ?: "none");
+ if (ret)
+ return ret;
+
+ ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
+ opts->host_traddr ?: "none");
+ }
+ return ret;
+}
+
static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
{
char *envp[2] = { NULL, NULL };
@@ -3723,6 +3794,9 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
queue_work(nvme_wq, &ctrl->ana_work);
break;
#endif
+ case NVME_AER_NOTICE_DISC_CHANGED:
+ ctrl->aen_result = result;
+ break;
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
@@ -3769,10 +3843,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
if (ctrl->kato)
nvme_start_keep_alive(ctrl);
+ nvme_enable_aen(ctrl);
+
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
- nvme_enable_aen(ctrl);
- queue_work(nvme_wq, &ctrl->async_event_work);
nvme_start_queues(ctrl);
}
}
@@ -3792,7 +3866,9 @@ static void nvme_free_ctrl(struct device *dev)
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
- ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+ if (subsys && ctrl->instance != subsys->instance)
+ ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+
kfree(ctrl->effects);
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
@@ -3992,6 +4068,9 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_sync_queue(ns->queue);
up_read(&ctrl->namespaces_rwsem);
+
+ if (ctrl->admin_q)
+ blk_sync_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
@@ -4050,6 +4129,7 @@ static int __init nvme_core_init(void)
result = PTR_ERR(nvme_class);
goto unregister_chrdev;
}
+ nvme_class->dev_uevent = nvme_class_uevent;
nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
if (IS_ERR(nvme_subsys_class)) {
@@ -4074,7 +4154,6 @@ out:
static void __exit nvme_core_exit(void)
{
- ida_destroy(&nvme_subsystems_ida);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 1994d5b42f94..74b8818ac9a1 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
@@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.attrib = 1;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
@@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.offset = cpu_to_le32(off);
cmd.prop_set.value = cpu_to_le64(val);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
NVME_QID_ANY, 0, 0, false);
if (unlikely(ret))
dev_err(ctrl->device,
@@ -381,8 +381,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
* Set keep-alive timeout in seconds granularity (ms * 1000)
* and add a grace period for controller kato enforcement
*/
- cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
- cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
+ cmd.connect.kato = ctrl->kato ?
+ cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0;
if (ctrl->opts->disable_sqflow)
cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
@@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
data, sizeof(*data), 0, NVME_QID_ANY, 1,
BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
if (ret) {
@@ -611,6 +611,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_DATA_DIGEST, "data_digest" },
{ NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
{ NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
+ { NVMF_OPT_TOS, "tos=%d" },
{ NVMF_OPT_ERR, NULL }
};
@@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
opts->duplicate_connect = false;
opts->hdr_digest = false;
opts->data_digest = false;
+ opts->tos = -1; /* < 0 == use transport default */
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -738,13 +740,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n");
}
opts->kato = token;
-
- if (opts->discovery_nqn && opts->kato) {
- pr_err("Discovery controllers cannot accept KATO != 0\n");
- ret = -EINVAL;
- goto out;
- }
-
break;
case NVMF_OPT_CTRL_LOSS_TMO:
if (match_int(args, &token)) {
@@ -856,6 +851,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->nr_poll_queues = token;
break;
+ case NVMF_OPT_TOS:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token < 0) {
+ pr_err("Invalid type of service %d\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token > 255) {
+ pr_warn("Clamping type of service to 255\n");
+ token = 255;
+ }
+ opts->tos = token;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -865,7 +876,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
if (opts->discovery_nqn) {
- opts->kato = 0;
opts->nr_io_queues = 0;
opts->nr_write_queues = 0;
opts->nr_poll_queues = 0;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 3044d8b99a24..93f08d77c896 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -55,6 +55,7 @@ enum {
NVMF_OPT_DATA_DIGEST = 1 << 16,
NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
+ NVMF_OPT_TOS = 1 << 19,
};
/**
@@ -87,6 +88,7 @@ enum {
* @data_digest: generate/verify data digest (TCP)
* @nr_write_queues: number of queues for write I/O
* @nr_poll_queues: number of queues for polling I/O
+ * @tos: type of service
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -108,6 +110,7 @@ struct nvmf_ctrl_options {
bool data_digest;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+ int tos;
};
/*
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 232d8094091b..265f89e11d8b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1608,9 +1608,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
if (opstate == FCPOP_STATE_ABORTED)
- status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
- else if (freq->status)
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ else if (freq->status) {
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to lldd error %d\n",
+ ctrl->cnum, freq->status);
+ }
/*
* For the linux implementation, if we have an unsuccesful
@@ -1637,8 +1641,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
* no payload in the CQE by the transport.
*/
if (freq->transferred_length !=
- be32_to_cpu(op->cmd_iu.data_len)) {
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ be32_to_cpu(op->cmd_iu.data_len)) {
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to bad transfer "
+ "length: %d vs expected %d\n",
+ ctrl->cnum, freq->transferred_length,
+ be32_to_cpu(op->cmd_iu.data_len));
goto done;
}
result.u64 = 0;
@@ -1655,7 +1664,17 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
freq->transferred_length ||
op->rsp_iu.status_code ||
sqe->common.command_id != cqe->command_id)) {
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to bad NVMe_ERSP: "
+ "iu len %d, xfr len %d vs %d, status code "
+ "%d, cmdid %d vs %d\n",
+ ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len),
+ be32_to_cpu(op->rsp_iu.xfrd_len),
+ freq->transferred_length,
+ op->rsp_iu.status_code,
+ sqe->common.command_id,
+ cqe->command_id);
goto done;
}
result = cqe->result;
@@ -1663,7 +1682,11 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
break;
default:
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu "
+ "len %d\n",
+ ctrl->cnum, freq->rcv_rsplen);
goto done;
}
@@ -2006,6 +2029,7 @@ nvme_fc_ctrl_free(struct kref *ref)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.admin_q);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
kfree(ctrl->queues);
@@ -2107,7 +2131,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
struct nvme_fc_fcp_op *op)
{
struct nvmefc_fcp_req *freq = &op->fcp_req;
- enum dma_data_direction dir;
int ret;
freq->sg_cnt = 0;
@@ -2124,9 +2147,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
WARN_ON(op->nents > blk_rq_nr_phys_segments(rq));
- dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
- op->nents, dir);
+ op->nents, rq_dma_dir(rq));
if (unlikely(freq->sg_cnt <= 0)) {
sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
freq->sg_cnt = 0;
@@ -2149,8 +2171,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
return;
fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
- ((rq_data_dir(rq) == WRITE) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE));
+ rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
@@ -2633,8 +2654,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_delete_hw_queue;
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
@@ -2648,23 +2667,15 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
* prior connection values
*/
- ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
- if (ret) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_disconnect_admin_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ ret = nvme_enable_ctrl(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
ret = nvme_init_identify(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
@@ -2774,6 +2785,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
}
/*
@@ -2796,6 +2808,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
/* kill the aens as they are a separate path */
nvme_fc_abort_aen_ops(ctrl);
@@ -3109,10 +3122,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_free_queues;
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ ret = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_admin_tag_set;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
ret = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_admin_tag_set;
+ goto out_cleanup_fabrics_q;
}
/*
@@ -3184,6 +3203,8 @@ fail_ctrl:
out_cleanup_admin_q:
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_admin_tag_set:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_queues:
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ba009d4c9dfa..ec46693f6b64 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -667,11 +667,14 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
return rq;
}
-static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
+ void *buf)
{
+ struct nvm_geo *geo = &dev->geo;
struct request_queue *q = dev->q;
struct nvme_nvm_command *cmd;
struct request *rq;
+ int ret;
cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
if (!cmd)
@@ -679,8 +682,15 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
rq = nvme_nvm_alloc_request(q, rqd, cmd);
if (IS_ERR(rq)) {
- kfree(cmd);
- return PTR_ERR(rq);
+ ret = PTR_ERR(rq);
+ goto err_free_cmd;
+ }
+
+ if (buf) {
+ ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas,
+ GFP_KERNEL);
+ if (ret)
+ goto err_free_cmd;
}
rq->end_io_data = rqd;
@@ -688,33 +698,9 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
return 0;
-}
-
-static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- struct request_queue *q = dev->q;
- struct request *rq;
- struct nvme_nvm_command cmd;
- int ret = 0;
-
- memset(&cmd, 0, sizeof(struct nvme_nvm_command));
-
- rq = nvme_nvm_alloc_request(q, rqd, &cmd);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
-
- /* I/Os can fail and the error is signaled through rqd. Callers must
- * handle the error accordingly.
- */
- blk_execute_rq(q, NULL, rq, 0);
- if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
- ret = -EINTR;
-
- rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
- rqd->error = nvme_req(rq)->status;
-
- blk_mq_free_request(rq);
+err_free_cmd:
+ kfree(cmd);
return ret;
}
@@ -754,7 +740,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
.get_chk_meta = nvme_nvm_get_chk_meta,
.submit_io = nvme_nvm_submit_io,
- .submit_io_sync = nvme_nvm_submit_io_sync,
.create_dma_pool = nvme_nvm_create_dma_pool,
.destroy_dma_pool = nvme_nvm_destroy_dma_pool,
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index af831d3d15d0..30de7efef003 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -509,14 +509,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
- if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+ unsigned nsid = le32_to_cpu(desc->nsids[n]);
+
+ if (ns->head->ns_id < nsid)
continue;
- nvme_update_ns_ana_state(desc, ns);
+ if (ns->head->ns_id == nsid)
+ nvme_update_ns_ana_state(desc, ns);
if (++n == nr_nsids)
break;
}
up_write(&ctrl->namespaces_rwsem);
- WARN_ON_ONCE(n < nr_nsids);
return 0;
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2d678fb968c7..b5013c101b35 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,8 @@
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
+#include <trace/events/block.h>
+
extern unsigned int nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
@@ -97,6 +99,21 @@ enum nvme_quirks {
* Force simple suspend/resume path.
*/
NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10),
+
+ /*
+ * Use only one interrupt vector for all queues
+ */
+ NVME_QUIRK_SINGLE_VECTOR = (1 << 11),
+
+ /*
+ * Use non-standard 128 bytes SQEs.
+ */
+ NVME_QUIRK_128_BYTES_SQES = (1 << 12),
+
+ /*
+ * Prevent tag overlap between queues
+ */
+ NVME_QUIRK_SHARED_TAGS = (1 << 13),
};
/*
@@ -169,6 +186,7 @@ struct nvme_ctrl {
const struct nvme_ctrl_ops *ops;
struct request_queue *admin_q;
struct request_queue *connect_q;
+ struct request_queue *fabrics_q;
struct device *dev;
int instance;
int numa_node;
@@ -431,8 +449,8 @@ void nvme_complete_rq(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
@@ -520,6 +538,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
+static inline void nvme_trace_bio_complete(struct request *req,
+ blk_status_t status)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+
+ if (req->cmd_flags & REQ_NVME_MPATH)
+ trace_block_bio_complete(ns->head->disk->queue,
+ req->bio, status);
+}
+
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
@@ -567,6 +595,10 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
+static inline void nvme_trace_bio_complete(struct request *req,
+ blk_status_t status)
+{
+}
static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
struct nvme_id_ctrl *id)
{
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 732d5b63ec05..6b4d7b064b38 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -28,8 +28,8 @@
#include "trace.h"
#include "nvme.h"
-#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
-#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
+#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
+#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
@@ -100,6 +100,7 @@ struct nvme_dev {
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
+ int io_sqes;
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
@@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
- struct nvme_command *sq_cmds;
+ void *sq_cmds;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
@@ -178,6 +179,7 @@ struct nvme_queue {
u16 last_cq_head;
u16 qid;
u8 cq_phase;
+ u8 sqes;
unsigned long flags;
#define NVMEQ_ENABLED 0
#define NVMEQ_SQ_CMB 1
@@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
- memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
+ memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
+ cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
nvme_write_sq_db(nvmeq, write_sq);
@@ -534,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- enum dma_data_direction dma_dir = rq_data_dir(req) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;
if (iod->dma_len) {
- dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
+ dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
+ rq_dma_dir(req));
return;
}
@@ -1344,16 +1346,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
- dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
+ dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (!nvmeq->sq_cmds)
return;
if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
- nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
+ nvmeq->sq_cmds, SQ_SIZE(nvmeq));
} else {
- dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
+ dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
}
}
@@ -1403,7 +1405,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
- nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ nvme_disable_ctrl(&dev->ctrl);
nvme_poll_irqdisable(nvmeq, -1);
}
@@ -1433,12 +1435,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
}
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
- int qid, int depth)
+ int qid)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
- nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
+ nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
if (nvmeq->sq_cmds) {
nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
nvmeq->sq_cmds);
@@ -1447,11 +1449,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
return 0;
}
- pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth));
+ pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
}
}
- nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+ nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
&nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
@@ -1465,12 +1467,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
if (dev->ctrl.queue_count > qid)
return 0;
- nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth),
+ nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
+ nvmeq->q_depth = depth;
+ nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
&nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
- if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
+ if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
goto free_cqdma;
nvmeq->dev = dev;
@@ -1479,15 +1483,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- nvmeq->q_depth = depth;
nvmeq->qid = qid;
dev->ctrl.queue_count++;
return 0;
free_cqdma:
- dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
- nvmeq->cq_dma_addr);
+ dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
+ nvmeq->cq_dma_addr);
free_nvmeq:
return -ENOMEM;
}
@@ -1515,7 +1518,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+ memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
wmb(); /* ensure the first interrupt sees the initialization */
@@ -1552,7 +1555,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
nvme_init_queue(nvmeq, qid);
if (!polled) {
- nvmeq->cq_vector = vector;
result = queue_request_irq(nvmeq);
if (result < 0)
goto release_sq;
@@ -1679,7 +1681,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
- result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ result = nvme_disable_ctrl(&dev->ctrl);
if (result < 0)
return result;
@@ -1695,7 +1697,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
- result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ result = nvme_enable_ctrl(&dev->ctrl);
if (result)
return result;
@@ -2077,6 +2079,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
dev->io_queues[HCTX_TYPE_READ] = 0;
+ /*
+ * Some Apple controllers require all queues to use the
+ * first vector.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
+ irq_queues = 1;
+
return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
@@ -2095,6 +2104,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
unsigned long size;
nr_io_queues = max_io_queues();
+
+ /*
+ * If tags are shared with admin queue (Apple bug), then
+ * make sure we only use one IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ nr_io_queues = 1;
+
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
@@ -2265,6 +2282,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
+ /*
+ * Some Apple controllers requires tags to be unique
+ * across admin and IO queue, so reserve the first 32
+ * tags of the IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+
ret = blk_mq_alloc_tag_set(&dev->tagset);
if (ret) {
dev_warn(dev->ctrl.device,
@@ -2314,10 +2339,21 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
+ dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;
/*
+ * Some Apple controllers require a non-standard SQE size.
+ * Interestingly they also seem to ignore the CC:IOSQES register
+ * so we don't bother updating it here.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
+ dev->io_sqes = 7;
+ else
+ dev->io_sqes = NVME_NVM_IOSQES;
+
+ /*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
@@ -2334,6 +2370,18 @@ static int nvme_pci_enable(struct nvme_dev *dev)
"set queue depth=%u\n", dev->q_depth);
}
+ /*
+ * Controllers with the shared tags quirk need the IO queue to be
+ * big enough so that we get 32 tags for the admin queue
+ */
+ if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
+ (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
+ dev->q_depth = NVME_AQ_DEPTH + 2;
+ dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
+ dev->q_depth);
+ }
+
+
nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
@@ -2401,6 +2449,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+ blk_mq_tagset_wait_completed_request(&dev->tagset);
+ blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
/*
* The driver will not be starting up queues again if shutting down so
@@ -3041,6 +3091,10 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
+ { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
+ .driver_data = NVME_QUIRK_SINGLE_VECTOR |
+ NVME_QUIRK_128_BYTES_SQES |
+ NVME_QUIRK_SHARED_TAGS },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 1a6449bc547b..dfa07bb9dfeb 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -757,6 +757,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
@@ -798,10 +799,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_async_qe;
}
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ error = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_tagset;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
}
}
@@ -809,24 +816,15 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error)
goto out_cleanup_queue;
- error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
- &ctrl->ctrl.cap);
- if (error) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_stop_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_stop_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
error = nvme_init_identify(&ctrl->ctrl);
if (error)
goto out_stop_queue;
@@ -838,6 +836,9 @@ out_stop_queue:
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ if (new)
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
@@ -907,10 +908,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
- if (ctrl->ctrl.admin_tagset)
+ if (ctrl->ctrl.admin_tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
nvme_cancel_request, &ctrl->ctrl);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
+ }
+ if (remove)
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
@@ -920,9 +924,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
- if (ctrl->ctrl.tagset)
+ if (ctrl->ctrl.tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
+ }
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
@@ -1059,6 +1065,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1145,9 +1152,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
req->mr = NULL;
}
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
@@ -1273,7 +1278,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
- rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rq_dma_dir(rq));
if (unlikely(count <= 0)) {
ret = -EIO;
goto out_free_table;
@@ -1302,9 +1307,7 @@ out:
return 0;
out_unmap_sg:
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
out_free_table:
sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
return ret;
@@ -1547,16 +1550,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
{
+ struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
int ret;
ret = nvme_rdma_create_queue_ib(queue);
if (ret)
return ret;
+ if (ctrl->opts->tos >= 0)
+ rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
- dev_err(queue->ctrl->ctrl.device,
- "rdma_resolve_route failed (%d).\n",
+ dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);
goto out_destroy_queue;
}
@@ -1869,10 +1874,11 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&ctrl->reconnect_work);
nvme_rdma_teardown_io_queues(ctrl, shutdown);
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
- nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ nvme_disable_ctrl(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
@@ -2051,7 +2057,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
- NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+ NVMF_OPT_TOS,
.create_ctrl = nvme_rdma_create_ctrl,
};
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 606b13d35d16..4ffd5957637a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -13,6 +13,7 @@
#include <net/tcp.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
+#include <net/busy_poll.h>
#include "nvme.h"
#include "fabrics.h"
@@ -72,6 +73,7 @@ struct nvme_tcp_queue {
int pdu_offset;
size_t data_remaining;
size_t ddgst_remaining;
+ unsigned int nr_cqe;
/* send state */
struct nvme_tcp_request *request;
@@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
}
nvme_end_request(rq, cqe->status, cqe->result);
+ queue->nr_cqe++;
return 0;
}
@@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
switch (hdr->type) {
case nvme_tcp_c2h_data:
- ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
- break;
+ return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
case nvme_tcp_rsp:
nvme_tcp_init_recv_ctx(queue);
- ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
- break;
+ return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
case nvme_tcp_r2t:
nvme_tcp_init_recv_ctx(queue);
- ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
- break;
+ return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
default:
dev_err(queue->ctrl->ctrl.device,
"unsupported pdu type (%d)\n", hdr->type);
return -EINVAL;
}
-
- return ret;
}
static inline void nvme_tcp_end_request(struct request *rq, u16 status)
@@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
} else {
- if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
+ if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+ queue->nr_cqe++;
+ }
nvme_tcp_init_recv_ctx(queue);
}
}
@@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
pdu->command_id);
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+ queue->nr_cqe++;
}
nvme_tcp_init_recv_ctx(queue);
@@ -841,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
- nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR);
+ nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
}
static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
@@ -1023,14 +1024,16 @@ done:
static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
{
- struct sock *sk = queue->sock->sk;
+ struct socket *sock = queue->sock;
+ struct sock *sk = sock->sk;
read_descriptor_t rd_desc;
int consumed;
rd_desc.arg.data = queue;
rd_desc.count = 1;
lock_sock(sk);
- consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+ queue->nr_cqe = 0;
+ consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
return consumed;
}
@@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->queue_size = queue_size;
if (qid > 0)
- queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+ queue->cmnd_capsule_len = nctrl->ioccsz * 16;
else
queue->cmnd_capsule_len = sizeof(struct nvme_command) +
NVME_TCP_ADMIN_CCSZ;
@@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
IPPROTO_TCP, &queue->sock);
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to create socket: %d\n", ret);
return ret;
}
@@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
(char *)&opt, sizeof(opt));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to set TCP_SYNCNT sock opt %d\n", ret);
goto err_sock;
}
@@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
TCP_NODELAY, (char *)&opt, sizeof(opt));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to set TCP_NODELAY sock opt %d\n", ret);
goto err_sock;
}
@@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
(char *)&sol, sizeof(sol));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to set SO_LINGER sock opt %d\n", ret);
goto err_sock;
}
+ /* Set socket type of service */
+ if (nctrl->opts->tos >= 0) {
+ opt = nctrl->opts->tos;
+ ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
+ (char *)&opt, sizeof(opt));
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to set IP_TOS sock opt %d\n", ret);
+ goto err_sock;
+ }
+ }
+
queue->sock->sk->sk_allocation = GFP_ATOMIC;
if (!qid)
n = 0;
@@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->pdu_offset = 0;
sk_set_memalloc(queue->sock->sk);
- if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+ if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
sizeof(ctrl->src_addr));
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to bind queue %d socket %d\n",
qid, ret);
goto err_sock;
@@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
if (queue->hdr_digest || queue->data_digest) {
ret = nvme_tcp_alloc_crypto(queue);
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to allocate queue %d crypto\n", qid);
goto err_sock;
}
@@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
goto err_crypto;
}
- dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+ dev_dbg(nctrl->device, "connecting queue %d\n",
nvme_tcp_queue_id(queue));
ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
sizeof(ctrl->addr), 0);
if (ret) {
- dev_err(ctrl->ctrl.device,
+ dev_err(nctrl->device,
"failed to connect socket: %d\n", ret);
goto err_rcv_pdu;
}
@@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
queue->sock->sk->sk_state_change = nvme_tcp_state_change;
queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+ queue->sock->sk->sk_ll_usec = 1;
write_unlock_bh(&queue->sock->sk->sk_callback_lock);
return 0;
@@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
- set->nr_maps = 2 /* default + read */;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
ret = blk_mq_alloc_tag_set(set);
@@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+ nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
return nr_io_queues;
}
@@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
min(opts->nr_io_queues, nr_io_queues);
nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
}
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(opts->nr_poll_queues, nr_io_queues);
+ }
}
static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
@@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
nvme_tcp_stop_queue(ctrl, 0);
if (remove) {
blk_cleanup_queue(ctrl->admin_q);
+ blk_cleanup_queue(ctrl->fabrics_q);
blk_mq_free_tag_set(ctrl->admin_tagset);
}
nvme_tcp_free_admin_queue(ctrl);
@@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
goto out_free_queue;
}
+ ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
+ if (IS_ERR(ctrl->fabrics_q)) {
+ error = PTR_ERR(ctrl->fabrics_q);
+ goto out_free_tagset;
+ }
+
ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->admin_q)) {
error = PTR_ERR(ctrl->admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
}
}
@@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (error)
goto out_cleanup_queue;
- error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
- if (error) {
- dev_err(ctrl->device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_stop_queue;
- }
-
- ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
-
- error = nvme_enable_ctrl(ctrl, ctrl->cap);
+ error = nvme_enable_ctrl(ctrl);
if (error)
goto out_stop_queue;
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+
error = nvme_init_identify(ctrl);
if (error)
goto out_stop_queue;
@@ -1735,6 +1758,9 @@ out_stop_queue:
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->admin_q);
+out_cleanup_fabrics_q:
+ if (new)
+ blk_cleanup_queue(ctrl->fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->admin_tagset);
@@ -1748,10 +1774,13 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
{
blk_mq_quiesce_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
- if (ctrl->admin_tagset)
+ if (ctrl->admin_tagset) {
blk_mq_tagset_busy_iter(ctrl->admin_tagset,
nvme_cancel_request, ctrl);
- blk_mq_unquiesce_queue(ctrl->admin_q);
+ blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+ }
+ if (remove)
+ blk_mq_unquiesce_queue(ctrl->admin_q);
nvme_tcp_destroy_admin_queue(ctrl, remove);
}
@@ -1762,9 +1791,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
return;
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
- if (ctrl->tagset)
+ if (ctrl->tagset) {
blk_mq_tagset_busy_iter(ctrl->tagset,
nvme_cancel_request, ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->tagset);
+ }
if (remove)
nvme_start_queues(ctrl);
nvme_tcp_destroy_io_queues(ctrl, remove);
@@ -1793,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
struct nvmf_ctrl_options *opts = ctrl->opts;
- int ret = -EINVAL;
+ int ret;
ret = nvme_tcp_configure_admin_queue(ctrl, new);
if (ret)
@@ -1876,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
/* unquiesce to fail fast pending requests */
nvme_start_queues(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, false);
+ blk_mq_unquiesce_queue(ctrl->admin_q);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1892,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown);
+ blk_mq_quiesce_queue(ctrl->admin_q);
if (shutdown)
nvme_shutdown_ctrl(ctrl);
else
- nvme_disable_ctrl(ctrl, ctrl->cap);
+ nvme_disable_ctrl(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}
@@ -2151,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+ /* map dedicated poll queues only if we have queues left */
+ set->map[HCTX_TYPE_POLL].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_POLL];
+ set->map[HCTX_TYPE_POLL].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+ }
+
dev_info(ctrl->ctrl.device,
- "mapped %d/%d default/read queues.\n",
+ "mapped %d/%d/%d default/read/poll queues.\n",
ctrl->io_queues[HCTX_TYPE_DEFAULT],
- ctrl->io_queues[HCTX_TYPE_READ]);
+ ctrl->io_queues[HCTX_TYPE_READ],
+ ctrl->io_queues[HCTX_TYPE_POLL]);
return 0;
}
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_tcp_queue *queue = hctx->driver_data;
+ struct sock *sk = queue->sock->sk;
+
+ if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
+ sk_busy_loop(sk, true);
+ nvme_tcp_try_recv(queue);
+ return queue->nr_cqe;
+}
+
static struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.complete = nvme_complete_rq,
@@ -2167,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
.init_hctx = nvme_tcp_init_hctx,
.timeout = nvme_tcp_timeout,
.map_queues = nvme_tcp_map_queues,
+ .poll = nvme_tcp_poll,
};
static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
@@ -2220,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
INIT_LIST_HEAD(&ctrl->list);
ctrl->ctrl.opts = opts;
- ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+ opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
@@ -2314,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
- NVMF_OPT_NR_WRITE_QUEUES,
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+ NVMF_OPT_TOS,
.create_ctrl = nvme_tcp_create_ctrl,
};
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 9778eb0406b3..5c3cb6928f3c 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p,
return ret;
}
+static const char *nvme_trace_get_lba_status(struct trace_seq *p,
+ u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u64 slba = get_unaligned_le64(cdw10);
+ u32 mndw = get_unaligned_le32(cdw10 + 8);
+ u16 rl = get_unaligned_le16(cdw10 + 12);
+ u8 atype = cdw10[15];
+
+ trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
+ slba, mndw, rl, atype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
return nvme_trace_admin_identify(p, cdw10);
case nvme_admin_get_features:
return nvme_trace_admin_get_features(p, cdw10);
+ case nvme_admin_get_lba_status:
+ return nvme_trace_get_lba_status(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 4dc12ea52f23..831a062d27cb 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -37,7 +37,6 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
- u16 status = NVME_SC_SUCCESS;
unsigned long flags;
off_t offset = 0;
u64 slot;
@@ -47,9 +46,8 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS;
for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) {
- status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
- sizeof(struct nvme_error_slot));
- if (status)
+ if (nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
+ sizeof(struct nvme_error_slot)))
break;
if (slot == 0)
@@ -59,7 +57,7 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
offset += sizeof(struct nvme_error_slot);
}
spin_unlock_irqrestore(&ctrl->error_lock, flags);
- nvmet_req_complete(req, status);
+ nvmet_req_complete(req, 0);
}
static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
@@ -81,9 +79,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
goto out;
host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
- data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]);
+ data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
+ sectors[READ]), 1000);
host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]);
- data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
+ data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part,
+ sectors[WRITE]), 1000);
put_unaligned_le64(host_reads, &slog->host_reads[0]);
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
@@ -111,11 +111,11 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
if (!ns->bdev)
continue;
host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]);
- data_units_read +=
- part_stat_read(ns->bdev->bd_part, sectors[READ]);
+ data_units_read += DIV_ROUND_UP(
+ part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000);
host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
- data_units_written +=
- part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
+ data_units_written += DIV_ROUND_UP(
+ part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
}
rcu_read_unlock();
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 8efca26b4776..3764a8900850 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -381,9 +381,7 @@ int __init nvmet_init_discovery(void)
{
nvmet_disc_subsys =
nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC);
- if (IS_ERR(nvmet_disc_subsys))
- return PTR_ERR(nvmet_disc_subsys);
- return 0;
+ return PTR_ERR_OR_ZERO(nvmet_disc_subsys);
}
void nvmet_exit_discovery(void)
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0940c5024a34..748a39fca771 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -253,6 +253,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
blk_cleanup_queue(ctrl->ctrl.admin_q);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
}
@@ -357,10 +358,16 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
goto out_free_sq;
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ error = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_tagset;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
}
error = nvmf_connect_admin_queue(&ctrl->ctrl);
@@ -369,23 +376,15 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
- error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
- if (error) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_cleanup_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_cleanup_queue;
ctrl->ctrl.max_hw_sectors =
(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
error = nvme_init_identify(&ctrl->ctrl);
if (error)
goto out_cleanup_queue;
@@ -394,6 +393,8 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
out_cleanup_queue:
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_sq:
@@ -407,16 +408,17 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
nvme_loop_destroy_io_queues(ctrl);
}
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
nvme_shutdown_ctrl(&ctrl->ctrl);
- blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
nvme_loop_destroy_admin_queue(ctrl);
}
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 69b83fa0c76c..bf4f03474e89 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -348,7 +348,8 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
return 0;
err:
- sgl_free(cmd->req.sg);
+ if (cmd->req.sg_cnt)
+ sgl_free(cmd->req.sg);
return NVME_SC_INTERNAL;
}
@@ -553,7 +554,8 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
if (queue->nvme_sq.sqhd_disabled) {
kfree(cmd->iov);
- sgl_free(cmd->req.sg);
+ if (cmd->req.sg_cnt)
+ sgl_free(cmd->req.sg);
}
return 1;
@@ -584,7 +586,8 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
return -EAGAIN;
kfree(cmd->iov);
- sgl_free(cmd->req.sg);
+ if (cmd->req.sg_cnt)
+ sgl_free(cmd->req.sg);
cmd->queue->snd_cmd = NULL;
nvmet_tcp_put_cmd(cmd);
return 1;
@@ -1306,7 +1309,9 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
{
nvmet_req_uninit(&cmd->req);
nvmet_tcp_unmap_pdu_iovec(cmd);
- sgl_free(cmd->req.sg);
+ kfree(cmd->iov);
+ if (cmd->req.sg_cnt)
+ sgl_free(cmd->req.sg);
}
static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
@@ -1410,6 +1415,7 @@ done:
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
{
struct socket *sock = queue->sock;
+ struct inet_sock *inet = inet_sk(sock->sk);
struct linger sol = { .l_onoff = 1, .l_linger = 0 };
int ret;
@@ -1433,6 +1439,16 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
if (ret)
return ret;
+ /* Set socket type of service */
+ if (inet->rcv_tos > 0) {
+ int tos = inet->rcv_tos;
+
+ ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
+ (char *)&tos, sizeof(tos));
+ if (ret)
+ return ret;
+ }
+
write_lock_bh(&sock->sk->sk_callback_lock);
sock->sk->sk_user_data = queue;
queue->data_ready = sock->sk->sk_data_ready;
diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c
index 6af11d493271..1373a3c67962 100644
--- a/drivers/nvme/target/trace.c
+++ b/drivers/nvme/target/trace.c
@@ -33,6 +33,22 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p,
return ret;
}
+static const char *nvmet_trace_get_lba_status(struct trace_seq *p,
+ u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u64 slba = get_unaligned_le64(cdw10);
+ u32 mndw = get_unaligned_le32(cdw10 + 8);
+ u16 rl = get_unaligned_le16(cdw10 + 12);
+ u8 atype = cdw10[15];
+
+ trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
+ slba, mndw, rl, atype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -80,6 +96,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p,
return nvmet_trace_admin_identify(p, cdw10);
case nvme_admin_get_features:
return nvmet_trace_admin_get_features(p, cdw10);
+ case nvme_admin_get_lba_status:
+ return nvmet_trace_get_lba_status(p, cdw10);
default:
return nvmet_trace_common(p, cdw10);
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 11e64b50497f..4e88d7e9cf9a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1089,6 +1089,18 @@ static void scsi_initialize_rq(struct request *rq)
cmd->retries = 0;
}
+/*
+ * Only called when the request isn't completed by SCSI, and not freed by
+ * SCSI
+ */
+static void scsi_cleanup_rq(struct request *rq)
+{
+ if (rq->rq_flags & RQF_DONTPREP) {
+ scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq));
+ rq->rq_flags &= ~RQF_DONTPREP;
+ }
+}
+
/* Add a command to the list used by the aacraid and dpt_i2o drivers */
void scsi_add_cmd_to_list(struct scsi_cmnd *cmd)
{
@@ -1821,6 +1833,7 @@ static const struct blk_mq_ops scsi_mq_ops = {
.init_request = scsi_mq_init_request,
.exit_request = scsi_mq_exit_request,
.initialize_rq_fn = scsi_initialize_rq,
+ .cleanup_rq = scsi_cleanup_rq,
.busy = scsi_mq_lld_busy,
.map_queues = scsi_map_queues,
};
diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c
index 74ded5f3c236..3717eea37ecb 100644
--- a/drivers/scsi/scsi_pm.c
+++ b/drivers/scsi/scsi_pm.c
@@ -94,8 +94,7 @@ static int scsi_dev_type_resume(struct device *dev,
if (!err && scsi_is_sdev_device(dev)) {
struct scsi_device *sdev = to_scsi_device(dev);
- if (sdev->request_queue->dev)
- blk_set_runtime_active(sdev->request_queue);
+ blk_set_runtime_active(sdev->request_queue);
}
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 149d406aacc9..4b925552458f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1293,7 +1293,9 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
case REQ_OP_WRITE:
return sd_setup_read_write_cmnd(cmd);
case REQ_OP_ZONE_RESET:
- return sd_zbc_setup_reset_cmnd(cmd);
+ return sd_zbc_setup_reset_cmnd(cmd, false);
+ case REQ_OP_ZONE_RESET_ALL:
+ return sd_zbc_setup_reset_cmnd(cmd, true);
default:
WARN_ON_ONCE(1);
return BLK_STS_NOTSUPP;
@@ -1959,6 +1961,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
if (!result) {
good_bytes = blk_rq_bytes(req);
scsi_set_resid(SCpnt, 0);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 38c50946fc42..1eab779f812b 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -209,7 +209,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
-extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
+extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd, bool all);
extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
struct scsi_sense_hdr *sshdr);
extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
@@ -225,7 +225,8 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {}
-static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd,
+ bool all)
{
return BLK_STS_TARGET;
}
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 5d6ff3931632..de4019dc0f0b 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -209,10 +209,11 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
/**
* sd_zbc_setup_reset_cmnd - Prepare a RESET WRITE POINTER scsi command.
* @cmd: the command to setup
+ * @all: Reset all zones control.
*
* Called from sd_init_command() for a REQ_OP_ZONE_RESET request.
*/
-blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd, bool all)
{
struct request *rq = cmd->request;
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
@@ -234,7 +235,10 @@ blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
memset(cmd->cmnd, 0, cmd->cmd_len);
cmd->cmnd[0] = ZBC_OUT;
cmd->cmnd[1] = ZO_RESET_WRITE_POINTER;
- put_unaligned_be64(block, &cmd->cmnd[2]);
+ if (all)
+ cmd->cmnd[14] = 0x1;
+ else
+ put_unaligned_be64(block, &cmd->cmnd[2]);
rq->timeout = SD_TIMEOUT;
cmd->sc_data_direction = DMA_NONE;
@@ -261,6 +265,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
switch (req_op(rq)) {
case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
if (result &&
sshdr->sense_key == ILLEGAL_REQUEST &&
@@ -487,6 +492,9 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
/* The drive satisfies the kernel restrictions: set it up */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, zone_blocks));
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
+ blk_queue_required_elevator_features(sdkp->disk->queue,
+ ELEVATOR_F_ZBD_SEQ_WRITE);
nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
/* READ16/WRITE16 is mandatory for ZBC disks */