diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-07 19:19:14 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-07 19:19:14 +0300 |
commit | 513389809e138ae903b6ef43c1d5d2ffaf4dca17 (patch) | |
tree | c71e478fab1568da4706868b14eb67a75c148a8b /drivers/block | |
parent | 0a78a376ef3c2f3d397df48909f00cd75f92137a (diff) | |
parent | 30514bd2dd4e86a3ecfd6a93a3eadf7b9ea164a0 (diff) | |
download | linux-513389809e138ae903b6ef43c1d5d2ffaf4dca17.tar.xz |
Merge tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- NVMe pull requests via Christoph:
- handle number of queue changes in the TCP and RDMA drivers
(Daniel Wagner)
- allow changing the number of queues in nvmet (Daniel Wagner)
- also consider host_iface when checking ip options (Daniel
Wagner)
- don't map pages which can't come from HIGHMEM (Fabio M. De
Francesco)
- avoid unnecessary flush bios in nvmet (Guixin Liu)
- shrink and better pack the nvme_iod structure (Keith Busch)
- add comment for unaligned "fake" nqn (Linjun Bao)
- print actual source IP address through sysfs "address" attr
(Martin Belanger)
- various cleanups (Jackie Liu, Wolfram Sang, Genjian Zhang)
- handle effects after freeing the request (Keith Busch)
- copy firmware_rev on each init (Keith Busch)
- restrict management ioctls to admin (Keith Busch)
- ensure subsystem reset is single threaded (Keith Busch)
- report the actual number of tagset maps in nvme-pci (Keith
Busch)
- small fabrics authentication fixups (Christoph Hellwig)
- add common code for tagset allocation and freeing (Christoph
Hellwig)
- stop using the request_queue in nvmet (Christoph Hellwig)
- set min_align_mask before calculating max_hw_sectors (Rishabh
Bhatnagar)
- send a rediscover uevent when a persistent discovery controller
reconnects (Sagi Grimberg)
- misc nvmet-tcp fixes (Varun Prakash, zhenwei pi)
- MD pull request via Song:
- Various raid5 fix and clean up, by Logan Gunthorpe and David
Sloan.
- Raid10 performance optimization, by Yu Kuai.
- sbitmap wakeup hang fixes (Hugh, Keith, Jan, Yu)
- IO scheduler switching quisce fix (Keith)
- s390/dasd block driver updates (Stefan)
- support for recovery for the ublk driver (ZiyangZhang)
- rnbd drivers fixes and updates (Guoqing, Santosh, ye, Christoph)
- blk-mq and null_blk map fixes (Bart)
- various bcache fixes (Coly, Jilin, Jules)
- nbd signal hang fix (Shigeru)
- block writeback throttling fix (Yu)
- optimize the passthrough mapping handling (me)
- prepare block cgroups to being gendisk based (Christoph)
- get rid of an old PSI hack in the block layer, moving it to the
callers instead where it belongs (Christoph)
- blk-throttle fixes and cleanups (Yu)
- misc fixes and cleanups (Liu Shixin, Liu Song, Miaohe, Pankaj,
Ping-Xiang, Wolfram, Saurabh, Li Jinlin, Li Lei, Lin, Li zeming,
Miaohe, Bart, Coly, Gaosheng
* tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux: (162 commits)
sbitmap: fix lockup while swapping
block: add rationale for not using blk_mq_plug() when applicable
block: adapt blk_mq_plug() to not plug for writes that require a zone lock
s390/dasd: use blk_mq_alloc_disk
blk-cgroup: don't update the blkg lookup hint in blkg_conf_prep
nvmet: don't look at the request_queue in nvmet_bdev_set_limits
nvmet: don't look at the request_queue in nvmet_bdev_zone_mgmt_emulate_all
blk-mq: use quiesced elevator switch when reinitializing queues
block: replace blk_queue_nowait with bdev_nowait
nvme: remove nvme_ctrl_init_connect_q
nvme-loop: use the tagset alloc/free helpers
nvme-loop: store the generic nvme_ctrl in set->driver_data
nvme-loop: initialize sqsize later
nvme-fc: use the tagset alloc/free helpers
nvme-fc: store the generic nvme_ctrl in set->driver_data
nvme-fc: keep ctrl->sqsize in sync with opts->queue_size
nvme-rdma: use the tagset alloc/free helpers
nvme-rdma: store the generic nvme_ctrl in set->driver_data
nvme-tcp: use the tagset alloc/free helpers
nvme-tcp: store the generic nvme_ctrl in set->driver_data
...
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/aoe/aoeblk.c | 15 | ||||
-rw-r--r-- | drivers/block/brd.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 3 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 2 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 12 | ||||
-rw-r--r-- | drivers/block/nbd.c | 6 | ||||
-rw-r--r-- | drivers/block/null_blk/main.c | 8 | ||||
-rw-r--r-- | drivers/block/ps3vram.c | 2 | ||||
-rw-r--r-- | drivers/block/rnbd/Makefile | 6 | ||||
-rw-r--r-- | drivers/block/rnbd/rnbd-clt.c | 8 | ||||
-rw-r--r-- | drivers/block/rnbd/rnbd-srv-dev.c | 43 | ||||
-rw-r--r-- | drivers/block/rnbd/rnbd-srv-dev.h | 64 | ||||
-rw-r--r-- | drivers/block/rnbd/rnbd-srv-trace.c | 17 | ||||
-rw-r--r-- | drivers/block/rnbd/rnbd-srv-trace.h | 207 | ||||
-rw-r--r-- | drivers/block/rnbd/rnbd-srv.c | 123 | ||||
-rw-r--r-- | drivers/block/rnbd/rnbd-srv.h | 2 | ||||
-rw-r--r-- | drivers/block/ublk_drv.c | 302 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 4 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.c | 6 |
21 files changed, 598 insertions, 237 deletions
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 12b3ca8f6f4a..128722cf6c3c 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -108,7 +108,7 @@ static ssize_t aoedisk_show_payload(struct device *dev, return sysfs_emit(page, "%lu\n", d->maxbcnt); } -static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) +static int aoe_debugfs_show(struct seq_file *s, void *ignored) { struct aoedev *d; struct aoetgt **t, **te; @@ -151,11 +151,7 @@ static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) return 0; } - -static int aoe_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, aoedisk_debugfs_show, inode->i_private); -} +DEFINE_SHOW_ATTRIBUTE(aoe_debugfs); static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL); static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL); @@ -184,13 +180,6 @@ static const struct attribute_group *aoe_attr_groups[] = { NULL, }; -static const struct file_operations aoe_debugfs_fops = { - .open = aoe_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static void aoedisk_add_debugfs(struct aoedev *d) { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 859499cd1ff8..20acc4a1fd6d 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -397,7 +397,7 @@ static int brd_alloc(int i) disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; - strlcpy(disk->disk_name, buf, DISK_NAME_LEN); + strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); /* diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f15f2f041596..4d661282ff41 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1529,7 +1529,6 @@ extern int w_send_read_req(struct drbd_work *, int); extern int w_e_reissue(struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int); -extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(struct timer_list *t); extern void start_resync_timer_fn(struct timer_list *t); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 013d355a2033..864c98e74875 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -4752,7 +4752,7 @@ void notify_helper(enum drbd_notification_type type, struct drbd_genlmsghdr *dh; int err; - strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); + strscpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); helper_info.helper_status = status; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index af4c7d65490b..c897c4572036 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2113,9 +2113,6 @@ static int receive_DataReply(struct drbd_connection *connection, struct packet_i if (unlikely(!req)) return -EIO; - /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid - * special casing it there for the various failure cases. - * still no race with drbd_fail_pending_reads */ err = recv_dless_read(peer_device, req, sector, pi->size); if (!err) req_mod(req, DATA_RECEIVED); diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 511f39a08de4..6237fa1dcb0e 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -266,8 +266,6 @@ struct bio_and_error { extern void start_new_tl_epoch(struct drbd_connection *connection); extern void drbd_req_destroy(struct kref *kref); -extern void _req_may_be_done(struct drbd_request *req, - struct bio_and_error *m); extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m); extern void complete_master_bio(struct drbd_device *device, diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 562725d222a7..815d77ba6381 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1397,15 +1397,15 @@ static void mtip_dump_identify(struct mtip_port *port) if (!port->identify_valid) return; - strlcpy(cbuf, (char *)(port->identify+10), 21); + strscpy(cbuf, (char *)(port->identify + 10), 21); dev_info(&port->dd->pdev->dev, "Serial No.: %s\n", cbuf); - strlcpy(cbuf, (char *)(port->identify+23), 9); + strscpy(cbuf, (char *)(port->identify + 23), 9); dev_info(&port->dd->pdev->dev, "Firmware Ver.: %s\n", cbuf); - strlcpy(cbuf, (char *)(port->identify+27), 41); + strscpy(cbuf, (char *)(port->identify + 27), 41); dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", @@ -1421,13 +1421,13 @@ static void mtip_dump_identify(struct mtip_port *port) pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); switch (revid & 0xFF) { case 0x1: - strlcpy(cbuf, "A0", 3); + strscpy(cbuf, "A0", 3); break; case 0x3: - strlcpy(cbuf, "A2", 3); + strscpy(cbuf, "A2", 3); break; default: - strlcpy(cbuf, "?", 2); + strscpy(cbuf, "?", 2); break; } dev_info(&port->dd->pdev->dev, diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6cec9ce23fd3..4762a03e1ffe 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1413,10 +1413,12 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd) mutex_unlock(&nbd->config_lock); ret = wait_event_interruptible(config->recv_wq, atomic_read(&config->recv_threads) == 0); - if (ret) + if (ret) { sock_shutdown(nbd); - flush_workqueue(nbd->recv_workq); + nbd_clear_que(nbd); + } + flush_workqueue(nbd->recv_workq); mutex_lock(&nbd->config_lock); nbd_bdev_reset(nbd); /* user requested, ignore socket errors */ diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index c451c477978f..1f154f92f4c2 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1528,7 +1528,7 @@ static bool should_requeue_request(struct request *rq) return false; } -static int null_map_queues(struct blk_mq_tag_set *set) +static void null_map_queues(struct blk_mq_tag_set *set) { struct nullb *nullb = set->driver_data; int i, qoff; @@ -1555,7 +1555,9 @@ static int null_map_queues(struct blk_mq_tag_set *set) } else { pr_warn("tag set has unexpected nr_hw_queues: %d\n", set->nr_hw_queues); - return -EINVAL; + WARN_ON_ONCE(true); + submit_queues = 1; + poll_queues = 0; } } @@ -1577,8 +1579,6 @@ static int null_map_queues(struct blk_mq_tag_set *set) qoff += map->nr_queues; blk_mq_map_queues(map); } - - return 0; } static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index e1d080f680ed..c76e0148eada 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -745,7 +745,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) gendisk->flags |= GENHD_FL_NO_PART; gendisk->fops = &ps3vram_fops; gendisk->private_data = dev; - strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); + strscpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS); blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE); diff --git a/drivers/block/rnbd/Makefile b/drivers/block/rnbd/Makefile index 5bb1a7ad1ada..40b31630822c 100644 --- a/drivers/block/rnbd/Makefile +++ b/drivers/block/rnbd/Makefile @@ -6,10 +6,12 @@ rnbd-client-y := rnbd-clt.o \ rnbd-clt-sysfs.o \ rnbd-common.o +CFLAGS_rnbd-srv-trace.o = -I$(src) + rnbd-server-y := rnbd-common.o \ rnbd-srv.o \ - rnbd-srv-dev.o \ - rnbd-srv-sysfs.o + rnbd-srv-sysfs.o \ + rnbd-srv-trace.o obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 04da33a22ef4..78334da74d8b 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1159,13 +1159,11 @@ static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct rnbd_queue *q = hctx->driver_data; struct rnbd_clt_dev *dev = q->dev; - int cnt; - cnt = rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); - return cnt; + return rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); } -static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) +static void rnbd_rdma_map_queues(struct blk_mq_tag_set *set) { struct rnbd_clt_session *sess = set->driver_data; @@ -1194,8 +1192,6 @@ static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) set->map[HCTX_TYPE_DEFAULT].nr_queues, set->map[HCTX_TYPE_READ].nr_queues); } - - return 0; } static struct blk_mq_ops rnbd_mq_ops = { diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c deleted file mode 100644 index c63017f6e421..000000000000 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * RDMA Network Block Driver - * - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. - */ -#undef pr_fmt -#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt - -#include "rnbd-srv-dev.h" -#include "rnbd-log.h" - -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags) -{ - struct rnbd_dev *dev; - int ret; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return ERR_PTR(-ENOMEM); - - dev->blk_open_flags = flags; - dev->bdev = blkdev_get_by_path(path, flags, THIS_MODULE); - ret = PTR_ERR_OR_ZERO(dev->bdev); - if (ret) - goto err; - - dev->blk_open_flags = flags; - - return dev; - -err: - kfree(dev); - return ERR_PTR(ret); -} - -void rnbd_dev_close(struct rnbd_dev *dev) -{ - blkdev_put(dev->bdev, dev->blk_open_flags); - kfree(dev); -} diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h deleted file mode 100644 index 8407d12f70af..000000000000 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * RDMA Network Block Driver - * - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. - */ -#ifndef RNBD_SRV_DEV_H -#define RNBD_SRV_DEV_H - -#include <linux/fs.h> -#include "rnbd-proto.h" - -struct rnbd_dev { - struct block_device *bdev; - fmode_t blk_open_flags; -}; - -/** - * rnbd_dev_open() - Open a device - * @path: path to open - * @flags: open flags - */ -struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags); - -/** - * rnbd_dev_close() - Close a device - */ -void rnbd_dev_close(struct rnbd_dev *dev); - -void rnbd_endio(void *priv, int error); - -static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) -{ - return queue_max_segments(bdev_get_queue(dev->bdev)); -} - -static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev) -{ - return queue_max_hw_sectors(bdev_get_queue(dev->bdev)); -} - -static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev) -{ - return bdev_max_secure_erase_sectors(dev->bdev); -} - -static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) -{ - return bdev_max_discard_sectors(dev->bdev); -} - -static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) -{ - return bdev_get_queue(dev->bdev)->limits.discard_granularity; -} - -static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) -{ - return bdev_discard_alignment(dev->bdev); -} - -#endif /* RNBD_SRV_DEV_H */ diff --git a/drivers/block/rnbd/rnbd-srv-trace.c b/drivers/block/rnbd/rnbd-srv-trace.c new file mode 100644 index 000000000000..30f0895c18f5 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-trace.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. + */ +#include "rtrs.h" +#include "rtrs-srv.h" +#include "rnbd-srv.h" +#include "rnbd-proto.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "rnbd-srv-trace.h" diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h new file mode 100644 index 000000000000..8dedf73bdd28 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2022 1&1 IONOS SE. All rights reserved. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rnbd_srv + +#if !defined(_TRACE_RNBD_SRV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RNBD_SRV_H + +#include <linux/tracepoint.h> + +struct rnbd_srv_session; +struct rtrs_srv_op; + +DECLARE_EVENT_CLASS(rnbd_srv_link_class, + TP_PROTO(struct rnbd_srv_session *srv), + + TP_ARGS(srv), + + TP_STRUCT__entry( + __field(int, qdepth) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->qdepth = srv->queue_depth; + __assign_str(sessname, srv->sessname); + ), + + TP_printk("sessname: %s qdepth: %d", + __get_str(sessname), + __entry->qdepth + ) +); + +#define DEFINE_LINK_EVENT(name) \ +DEFINE_EVENT(rnbd_srv_link_class, name, \ + TP_PROTO(struct rnbd_srv_session *srv), \ + TP_ARGS(srv)) + +DEFINE_LINK_EVENT(create_sess); +DEFINE_LINK_EVENT(destroy_sess); + +TRACE_DEFINE_ENUM(RNBD_OP_READ); +TRACE_DEFINE_ENUM(RNBD_OP_WRITE); +TRACE_DEFINE_ENUM(RNBD_OP_FLUSH); +TRACE_DEFINE_ENUM(RNBD_OP_DISCARD); +TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE); +TRACE_DEFINE_ENUM(RNBD_F_SYNC); +TRACE_DEFINE_ENUM(RNBD_F_FUA); + +#define show_rnbd_rw_flags(x) \ + __print_flags(x, "|", \ + { RNBD_OP_READ, "READ" }, \ + { RNBD_OP_WRITE, "WRITE" }, \ + { RNBD_OP_FLUSH, "FLUSH" }, \ + { RNBD_OP_DISCARD, "DISCARD" }, \ + { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \ + { RNBD_F_SYNC, "SYNC" }, \ + { RNBD_F_FUA, "FUA" }) + +TRACE_EVENT(process_rdma, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_io *msg, + struct rtrs_srv_op *id, + u32 datalen, + size_t usrlen), + + TP_ARGS(srv, msg, id, datalen, usrlen), + + TP_STRUCT__entry( + __string(sessname, srv->sessname) + __field(u8, dir) + __field(u8, ver) + __field(u32, device_id) + __field(u64, sector) + __field(u32, flags) + __field(u32, bi_size) + __field(u16, ioprio) + __field(u32, datalen) + __field(size_t, usrlen) + ), + + TP_fast_assign( + __assign_str(sessname, srv->sessname); + __entry->dir = id->dir; + __entry->ver = srv->ver; + __entry->device_id = le32_to_cpu(msg->device_id); + __entry->sector = le64_to_cpu(msg->sector); + __entry->bi_size = le32_to_cpu(msg->bi_size); + __entry->flags = le32_to_cpu(msg->rw); + __entry->ioprio = le16_to_cpu(msg->prio); + __entry->datalen = datalen; + __entry->usrlen = usrlen; + ), + + TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu", + __get_str(sessname), + __print_symbolic(__entry->dir, + { READ, "READ" }, + { WRITE, "WRITE" }), + __entry->ver, + __entry->device_id, + __entry->sector, + __entry->bi_size, + show_rnbd_rw_flags(__entry->flags), + __entry->ioprio, + __entry->datalen, + __entry->usrlen + ) +); + +TRACE_EVENT(process_msg_sess_info, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_sess_info *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u8, proto_ver) + __field(u8, clt_ver) + __field(u8, srv_ver) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->proto_ver = srv->ver; + __entry->clt_ver = msg->ver; + __entry->srv_ver = RNBD_PROTO_VER_MAJOR; + __assign_str(sessname, srv->sessname); + ), + + TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)", + __get_str(sessname), + __entry->proto_ver, + __entry->clt_ver, + __entry->srv_ver + ) +); + +TRACE_DEFINE_ENUM(RNBD_ACCESS_RO); +TRACE_DEFINE_ENUM(RNBD_ACCESS_RW); +TRACE_DEFINE_ENUM(RNBD_ACCESS_MIGRATION); + +#define show_rnbd_access_mode(x) \ + __print_symbolic(x, \ + { RNBD_ACCESS_RO, "RO" }, \ + { RNBD_ACCESS_RW, "RW" }, \ + { RNBD_ACCESS_MIGRATION, "MIGRATION" }) + +TRACE_EVENT(process_msg_open, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_open *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u8, access_mode) + __string(sessname, srv->sessname) + __string(dev_name, msg->dev_name) + ), + + TP_fast_assign( + __entry->access_mode = msg->access_mode; + __assign_str(sessname, srv->sessname); + __assign_str(dev_name, msg->dev_name); + ), + + TP_printk("Open message received: session='%s' path='%s' access_mode=%s", + __get_str(sessname), + __get_str(dev_name), + show_rnbd_access_mode(__entry->access_mode) + ) +); + +TRACE_EVENT(process_msg_close, + TP_PROTO(struct rnbd_srv_session *srv, + const struct rnbd_msg_close *msg), + + TP_ARGS(srv, msg), + + TP_STRUCT__entry( + __field(u32, device_id) + __string(sessname, srv->sessname) + ), + + TP_fast_assign( + __entry->device_id = le32_to_cpu(msg->device_id); + __assign_str(sessname, srv->sessname); + ), + + TP_printk("Close message received: session='%s' device id='%d'", + __get_str(sessname), + __entry->device_id + ) +); + +#endif /* _TRACE_RNBD_SRV_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE rnbd-srv-trace +#include <trace/define_trace.h> + diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 5e08da277ddf..08b041159cd3 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -13,7 +13,7 @@ #include <linux/blkdev.h> #include "rnbd-srv.h" -#include "rnbd-srv-dev.h" +#include "rnbd-srv-trace.h" MODULE_DESCRIPTION("RDMA Network Block Device Server"); MODULE_LICENSE("GPL"); @@ -84,18 +84,6 @@ static inline void rnbd_put_sess_dev(struct rnbd_srv_sess_dev *sess_dev) kref_put(&sess_dev->kref, rnbd_sess_dev_release); } -void rnbd_endio(void *priv, int error) -{ - struct rnbd_io_private *rnbd_priv = priv; - struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; - - rnbd_put_sess_dev(sess_dev); - - rtrs_srv_resp_rdma(rnbd_priv->id, error); - - kfree(priv); -} - static struct rnbd_srv_sess_dev * rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) { @@ -116,7 +104,13 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) static void rnbd_dev_bi_end_io(struct bio *bio) { - rnbd_endio(bio->bi_private, blk_status_to_errno(bio->bi_status)); + struct rnbd_io_private *rnbd_priv = bio->bi_private; + struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; + + rnbd_put_sess_dev(sess_dev); + rtrs_srv_resp_rdma(rnbd_priv->id, blk_status_to_errno(bio->bi_status)); + + kfree(rnbd_priv); bio_put(bio); } @@ -132,6 +126,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, struct bio *bio; short prio; + trace_process_rdma(srv_sess, msg, id, datalen, usrlen); + priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -149,7 +145,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(sess_dev->rnbd_dev->bdev, 1, + bio = bio_alloc(sess_dev->bdev, 1, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); if (bio_add_page(bio, virt_to_page(data), datalen, offset_in_page(data)) != datalen) { @@ -223,7 +219,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ - rnbd_dev_close(sess_dev->rnbd_dev); + blkdev_put(sess_dev->bdev, sess_dev->open_flags); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); if (sess_dev->open_flags & FMODE_WRITE) @@ -244,6 +240,8 @@ static void destroy_sess(struct rnbd_srv_session *srv_sess) if (xa_empty(&srv_sess->index_idr)) goto out; + trace_destroy_sess(srv_sess); + mutex_lock(&srv_sess->lock); xa_for_each(&srv_sess->index_idr, index, sess_dev) rnbd_srv_destroy_dev_session_sysfs(sess_dev); @@ -290,6 +288,8 @@ static int create_sess(struct rtrs_srv_sess *rtrs) rtrs_srv_set_sess_priv(rtrs, srv_sess); + trace_create_sess(srv_sess); + return 0; } @@ -332,23 +332,24 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, mutex_unlock(&sess->lock); } -static int process_msg_close(struct rnbd_srv_session *srv_sess, +static void process_msg_close(struct rnbd_srv_session *srv_sess, void *data, size_t datalen, const void *usr, size_t usrlen) { const struct rnbd_msg_close *close_msg = usr; struct rnbd_srv_sess_dev *sess_dev; + trace_process_msg_close(srv_sess, close_msg); + sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id), srv_sess); if (IS_ERR(sess_dev)) - return 0; + return; rnbd_put_sess_dev(sess_dev); mutex_lock(&srv_sess->lock); rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&srv_sess->lock); - return 0; } static int process_msg_open(struct rnbd_srv_session *srv_sess, @@ -378,7 +379,7 @@ static int rnbd_srv_rdma_ev(void *priv, case RNBD_MSG_IO: return process_rdma(srv_sess, id, data, datalen, usr, usrlen); case RNBD_MSG_CLOSE: - ret = process_msg_close(srv_sess, data, datalen, usr, usrlen); + process_msg_close(srv_sess, data, datalen, usr, usrlen); break; case RNBD_MSG_OPEN: ret = process_msg_open(srv_sess, usr, usrlen, data, datalen); @@ -393,6 +394,11 @@ static int rnbd_srv_rdma_ev(void *priv, return -EINVAL; } + /* + * Since ret is passed to rtrs to handle the failure case, we + * just return 0 at the end otherwise callers in rtrs would call + * send_io_resp_imm again to print redundant err message. + */ rtrs_srv_resp_rdma(id, ret); return 0; } @@ -504,14 +510,14 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, } static struct rnbd_srv_dev * -rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, +rnbd_srv_get_or_create_srv_dev(struct block_device *bdev, struct rnbd_srv_session *srv_sess, enum rnbd_access_mode access_mode) { int ret; struct rnbd_srv_dev *new_dev, *dev; - new_dev = rnbd_srv_init_srv_dev(rnbd_dev->bdev); + new_dev = rnbd_srv_init_srv_dev(bdev); if (IS_ERR(new_dev)) return new_dev; @@ -531,41 +537,32 @@ rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, struct rnbd_srv_sess_dev *sess_dev) { - struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; + struct block_device *bdev = sess_dev->bdev; rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); - rsp->device_id = - cpu_to_le32(sess_dev->device_id); - rsp->nsectors = - cpu_to_le64(get_capacity(rnbd_dev->bdev->bd_disk)); - rsp->logical_block_size = - cpu_to_le16(bdev_logical_block_size(rnbd_dev->bdev)); - rsp->physical_block_size = - cpu_to_le16(bdev_physical_block_size(rnbd_dev->bdev)); - rsp->max_segments = - cpu_to_le16(rnbd_dev_get_max_segs(rnbd_dev)); + rsp->device_id = cpu_to_le32(sess_dev->device_id); + rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev)); + rsp->logical_block_size = cpu_to_le16(bdev_logical_block_size(bdev)); + rsp->physical_block_size = cpu_to_le16(bdev_physical_block_size(bdev)); + rsp->max_segments = cpu_to_le16(bdev_max_segments(bdev)); rsp->max_hw_sectors = - cpu_to_le32(rnbd_dev_get_max_hw_sects(rnbd_dev)); + cpu_to_le32(queue_max_hw_sectors(bdev_get_queue(bdev))); rsp->max_write_same_sectors = 0; - rsp->max_discard_sectors = - cpu_to_le32(rnbd_dev_get_max_discard_sects(rnbd_dev)); - rsp->discard_granularity = - cpu_to_le32(rnbd_dev_get_discard_granularity(rnbd_dev)); - rsp->discard_alignment = - cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); - rsp->secure_discard = - cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); + rsp->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev)); + rsp->discard_granularity = cpu_to_le32(bdev_discard_granularity(bdev)); + rsp->discard_alignment = cpu_to_le32(bdev_discard_alignment(bdev)); + rsp->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev)); rsp->cache_policy = 0; - if (bdev_write_cache(rnbd_dev->bdev)) + if (bdev_write_cache(bdev)) rsp->cache_policy |= RNBD_WRITEBACK; - if (bdev_fua(rnbd_dev->bdev)) + if (bdev_fua(bdev)) rsp->cache_policy |= RNBD_FUA; } static struct rnbd_srv_sess_dev * rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, const struct rnbd_msg_open *open_msg, - struct rnbd_dev *rnbd_dev, fmode_t open_flags, + struct block_device *bdev, fmode_t open_flags, struct rnbd_srv_dev *srv_dev) { struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); @@ -577,7 +574,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); - sdev->rnbd_dev = rnbd_dev; + sdev->bdev = bdev; sdev->sess = srv_sess; sdev->dev = srv_dev; sdev->open_flags = open_flags; @@ -643,9 +640,8 @@ static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, struct rnbd_msg_sess_info_rsp *rsp = data; srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); - pr_debug("Session %s using protocol version %d (client version: %d, server version: %d)\n", - srv_sess->sessname, srv_sess->ver, - sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); + + trace_process_msg_sess_info(srv_sess, sess_info_msg); rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->ver = srv_sess->ver; @@ -685,14 +681,13 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, struct rnbd_srv_dev *srv_dev; struct rnbd_srv_sess_dev *srv_sess_dev; const struct rnbd_msg_open *open_msg = msg; + struct block_device *bdev; fmode_t open_flags; char *full_path; - struct rnbd_dev *rnbd_dev; struct rnbd_msg_open_rsp *rsp = data; - pr_debug("Open message received: session='%s' path='%s' access_mode=%d\n", - srv_sess->sessname, open_msg->dev_name, - open_msg->access_mode); + trace_process_msg_open(srv_sess, open_msg); + open_flags = FMODE_READ; if (open_msg->access_mode != RNBD_ACCESS_RO) open_flags |= FMODE_WRITE; @@ -725,25 +720,25 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - rnbd_dev = rnbd_dev_open(full_path, open_flags); - if (IS_ERR(rnbd_dev)) { - pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n", - full_path, srv_sess->sessname, PTR_ERR(rnbd_dev)); - ret = PTR_ERR(rnbd_dev); + bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n", + full_path, srv_sess->sessname, ret); goto free_path; } - srv_dev = rnbd_srv_get_or_create_srv_dev(rnbd_dev, srv_sess, + srv_dev = rnbd_srv_get_or_create_srv_dev(bdev, srv_sess, open_msg->access_mode); if (IS_ERR(srv_dev)) { pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n", full_path, srv_sess->sessname, PTR_ERR(srv_dev)); ret = PTR_ERR(srv_dev); - goto rnbd_dev_close; + goto blkdev_put; } srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, - rnbd_dev, open_flags, + bdev, open_flags, srv_dev); if (IS_ERR(srv_sess_dev)) { pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", @@ -758,7 +753,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, */ mutex_lock(&srv_dev->lock); if (!srv_dev->dev_kobj.state_in_sysfs) { - ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev); + ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev); if (ret) { mutex_unlock(&srv_dev->lock); rnbd_srv_err(srv_sess_dev, @@ -800,8 +795,8 @@ srv_dev_put: mutex_unlock(&srv_dev->lock); } rnbd_put_srv_dev(srv_dev); -rnbd_dev_close: - rnbd_dev_close(rnbd_dev); +blkdev_put: + blkdev_put(bdev, open_flags); free_path: kfree(full_path); reject: diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 081bceaf4ae9..f5962fd31d62 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -46,7 +46,7 @@ struct rnbd_srv_dev { struct rnbd_srv_sess_dev { /* Entry inside rnbd_srv_dev struct */ struct list_head dev_list; - struct rnbd_dev *rnbd_dev; + struct block_device *bdev; struct rnbd_srv_session *sess; struct rnbd_srv_dev *dev; struct kobject kobj; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6a4a94b4cdf4..2651bf41dde3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -49,7 +49,9 @@ /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ - | UBLK_F_NEED_GET_DATA) + | UBLK_F_NEED_GET_DATA \ + | UBLK_F_USER_RECOVERY \ + | UBLK_F_USER_RECOVERY_REISSUE) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) @@ -119,7 +121,7 @@ struct ublk_queue { unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; - bool abort_work_pending; + bool force_abort; unsigned short nr_io_ready; /* how many ios setup */ struct ublk_device *dev; struct ublk_io ios[0]; @@ -161,6 +163,7 @@ struct ublk_device { * monitor each queue's daemon periodically */ struct delayed_work monitor_work; + struct work_struct quiesce_work; struct work_struct stop_work; }; @@ -323,6 +326,30 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) PAGE_SIZE); } +static inline bool ublk_queue_can_use_recovery_reissue( + struct ublk_queue *ubq) +{ + if ((ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) + return true; + return false; +} + +static inline bool ublk_queue_can_use_recovery( + struct ublk_queue *ubq) +{ + if (ubq->flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + +static inline bool ublk_can_use_recovery(struct ublk_device *ub) +{ + if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) + return true; + return false; +} + static void ublk_free_disk(struct gendisk *disk) { struct ublk_device *ub = disk->private_data; @@ -612,13 +639,17 @@ static void ublk_complete_rq(struct request *req) * Also aborting may not be started yet, keep in mind that one failed * request may be issued by block layer again. */ -static void __ublk_fail_req(struct ublk_io *io, struct request *req) +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, + struct request *req) { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { io->flags |= UBLK_IO_FLAG_ABORTED; - blk_mq_end_request(req, BLK_STS_IOERR); + if (ublk_queue_can_use_recovery_reissue(ubq)) + blk_mq_requeue_request(req, false); + else + blk_mq_end_request(req, BLK_STS_IOERR); } } @@ -639,22 +670,40 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res) #define UBLK_REQUEUE_DELAY_MS 3 +static inline void __ublk_abort_rq(struct ublk_queue *ubq, + struct request *rq) +{ + /* We cannot process this rq so just requeue it. */ + if (ublk_queue_can_use_recovery(ubq)) + blk_mq_requeue_request(rq, false); + else + blk_mq_end_request(rq, BLK_STS_IOERR); + + mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); +} + static inline void __ublk_rq_task_work(struct request *req) { struct ublk_queue *ubq = req->mq_hctx->driver_data; - struct ublk_device *ub = ubq->dev; int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; - bool task_exiting = current != ubq->ubq_daemon || ubq_daemon_is_dying(ubq); unsigned int mapped_bytes; pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, ublk_get_iod(ubq, req->tag)->addr); - if (unlikely(task_exiting)) { - blk_mq_end_request(req, BLK_STS_IOERR); - mod_delayed_work(system_wq, &ub->monitor_work, 0); + /* + * Task is exiting if either: + * + * (1) current != ubq_daemon. + * io_uring_cmd_complete_in_task() tries to run task_work + * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING. + * + * (2) current->flags & PF_EXITING. + */ + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) { + __ublk_abort_rq(ubq, req); return; } @@ -739,13 +788,24 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, res = ublk_setup_iod(ubq, rq); if (unlikely(res != BLK_STS_OK)) return BLK_STS_IOERR; + /* With recovery feature enabled, force_abort is set in + * ublk_stop_dev() before calling del_gendisk(). We have to + * abort all requeued and new rqs here to let del_gendisk() + * move on. Besides, we cannot not call io_uring_cmd_complete_in_task() + * to avoid UAF on io_uring ctx. + * + * Note: force_abort is guaranteed to be seen because it is set + * before request queue is unqiuesced. + */ + if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort)) + return BLK_STS_IOERR; blk_mq_start_request(bd->rq); if (unlikely(ubq_daemon_is_dying(ubq))) { fail: - mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); - return BLK_STS_IOERR; + __ublk_abort_rq(ubq, rq); + return BLK_STS_OK; } if (ublk_can_use_task_work(ubq)) { @@ -916,7 +976,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); if (rq) - __ublk_fail_req(io, rq); + __ublk_fail_req(ubq, io, rq); } } ublk_put_device(ub); @@ -932,7 +992,10 @@ static void ublk_daemon_monitor_work(struct work_struct *work) struct ublk_queue *ubq = ublk_get_queue(ub, i); if (ubq_daemon_is_dying(ubq)) { - schedule_work(&ub->stop_work); + if (ublk_queue_can_use_recovery(ubq)) + schedule_work(&ub->quiesce_work); + else + schedule_work(&ub->stop_work); /* abort queue is for making forward progress */ ublk_abort_queue(ub, ubq); @@ -940,12 +1003,13 @@ static void ublk_daemon_monitor_work(struct work_struct *work) } /* - * We can't schedule monitor work after ublk_remove() is started. + * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE. + * after ublk_remove() or __ublk_quiesce_dev() is started. * * No need ub->mutex, monitor work are canceled after state is marked - * as DEAD, so DEAD state is observed reliably. + * as not LIVE, so new state is observed reliably. */ - if (ub->dev_info.state != UBLK_S_DEV_DEAD) + if (ub->dev_info.state == UBLK_S_DEV_LIVE) schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); } @@ -982,12 +1046,97 @@ static void ublk_cancel_dev(struct ublk_device *ub) ublk_cancel_queue(ublk_get_queue(ub, i)); } -static void ublk_stop_dev(struct ublk_device *ub) +static bool ublk_check_inflight_rq(struct request *rq, void *data) +{ + bool *idle = data; + + if (blk_mq_request_started(rq)) { + *idle = false; + return false; + } + return true; +} + +static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) { + bool idle; + + WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); + while (true) { + idle = true; + blk_mq_tagset_busy_iter(&ub->tag_set, + ublk_check_inflight_rq, &idle); + if (idle) + break; + msleep(UBLK_REQUEUE_DELAY_MS); + } +} + +static void __ublk_quiesce_dev(struct ublk_device *ub) +{ + pr_devel("%s: quiesce ub: dev_id %d state %s\n", + __func__, ub->dev_info.dev_id, + ub->dev_info.state == UBLK_S_DEV_LIVE ? + "LIVE" : "QUIESCED"); + blk_mq_quiesce_queue(ub->ub_disk->queue); + ublk_wait_tagset_rqs_idle(ub); + ub->dev_info.state = UBLK_S_DEV_QUIESCED; + ublk_cancel_dev(ub); + /* we are going to release task_struct of ubq_daemon and resets + * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF. + * Besides, monitor_work is not necessary in QUIESCED state since we have + * already scheduled quiesce_work and quiesced all ubqs. + * + * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel + * it here and re-schedule it in END_USER_RECOVERY to avoid UAF. + */ + cancel_delayed_work_sync(&ub->monitor_work); +} + +static void ublk_quiesce_work_fn(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, quiesce_work); + mutex_lock(&ub->mutex); if (ub->dev_info.state != UBLK_S_DEV_LIVE) goto unlock; + __ublk_quiesce_dev(ub); + unlock: + mutex_unlock(&ub->mutex); +} + +static void ublk_unquiesce_dev(struct ublk_device *ub) +{ + int i; + + pr_devel("%s: unquiesce ub: dev_id %d state %s\n", + __func__, ub->dev_info.dev_id, + ub->dev_info.state == UBLK_S_DEV_LIVE ? + "LIVE" : "QUIESCED"); + /* quiesce_work has run. We let requeued rqs be aborted + * before running fallback_wq. "force_abort" must be seen + * after request queue is unqiuesced. Then del_gendisk() + * can move on. + */ + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->force_abort = true; + + blk_mq_unquiesce_queue(ub->ub_disk->queue); + /* We may have requeued some rqs in ublk_quiesce_queue() */ + blk_mq_kick_requeue_list(ub->ub_disk->queue); +} +static void ublk_stop_dev(struct ublk_device *ub) +{ + mutex_lock(&ub->mutex); + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + goto unlock; + if (ublk_can_use_recovery(ub)) { + if (ub->dev_info.state == UBLK_S_DEV_LIVE) + __ublk_quiesce_dev(ub); + ublk_unquiesce_dev(ub); + } del_gendisk(ub->ub_disk); ub->dev_info.state = UBLK_S_DEV_DEAD; ub->dev_info.ublksrv_pid = -1; @@ -1311,6 +1460,7 @@ static void ublk_remove(struct ublk_device *ub) { ublk_stop_dev(ub); cancel_work_sync(&ub->stop_work); + cancel_work_sync(&ub->quiesce_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); put_device(&ub->cdev_dev); } @@ -1487,6 +1637,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->mm_lock); + INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn); INIT_WORK(&ub->stop_work, ublk_stop_work_fn); INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); @@ -1607,6 +1758,7 @@ static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) ublk_stop_dev(ub); cancel_work_sync(&ub->stop_work); + cancel_work_sync(&ub->quiesce_work); ublk_put_device(ub); return 0; @@ -1709,6 +1861,116 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) return ret; } +static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq))); + /* All old ioucmds have to be completed */ + WARN_ON_ONCE(ubq->nr_io_ready); + /* old daemon is PF_EXITING, put it now */ + put_task_struct(ubq->ubq_daemon); + /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ + ubq->ubq_daemon = NULL; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + /* forget everything now and be ready for new FETCH_REQ */ + io->flags = 0; + io->cmd = NULL; + io->addr = 0; + } +} + +static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub; + int ret = -EINVAL; + int i; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + /* + * START_RECOVERY is only allowd after: + * + * (1) UB_STATE_OPEN is not set, which means the dying process is exited + * and related io_uring ctx is freed so file struct of /dev/ublkcX is + * released. + * + * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: + * (a)has quiesced request queue + * (b)has requeued every inflight rqs whose io_flags is ACTIVE + * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE + * (d)has completed/camceled all ioucmds owned by ther dying process + */ + if (test_bit(UB_STATE_OPEN, &ub->state) || + ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_queue_reinit(ub, ublk_get_queue(ub, i)); + /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ + ub->mm = NULL; + ub->nr_queues_ready = 0; + init_completion(&ub->completion); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ublksrv_pid = (int)header->data[0]; + struct ublk_device *ub; + int ret = -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return ret; + + pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + /* wait until new ubq_daemon sending all FETCH_REQ */ + wait_for_completion_interruptible(&ub->completion); + pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n", + __func__, ub->dev_info.nr_hw_queues, header->dev_id); + + mutex_lock(&ub->mutex); + if (!ublk_can_use_recovery(ub)) + goto out_unlock; + + if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) { + ret = -EBUSY; + goto out_unlock; + } + ub->dev_info.ublksrv_pid = ublksrv_pid; + pr_devel("%s: new ublksrv_pid %d, dev id %d\n", + __func__, ublksrv_pid, header->dev_id); + blk_mq_unquiesce_queue(ub->ub_disk->queue); + pr_devel("%s: queue unquiesced, dev id %d.\n", + __func__, header->dev_id); + blk_mq_kick_requeue_list(ub->ub_disk->queue); + ub->dev_info.state = UBLK_S_DEV_LIVE; + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + ret = 0; + out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -1750,6 +2012,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_SET_PARAMS: ret = ublk_ctrl_set_params(cmd); break; + case UBLK_CMD_START_USER_RECOVERY: + ret = ublk_ctrl_start_recovery(cmd); + break; + case UBLK_CMD_END_USER_RECOVERY: + ret = ublk_ctrl_end_recovery(cmd); + break; default: break; } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dd9a05174726..3f4739d52268 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -801,7 +801,7 @@ static const struct attribute_group *virtblk_attr_groups[] = { NULL, }; -static int virtblk_map_queues(struct blk_mq_tag_set *set) +static void virtblk_map_queues(struct blk_mq_tag_set *set) { struct virtio_blk *vblk = set->driver_data; int i, qoff; @@ -826,8 +826,6 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set) else blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); } - - return 0; } static void virtblk_complete_batch(struct io_comp_batch *iob) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 226ea76cc819..e551433cd107 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -499,7 +499,7 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - strlcpy(file_name, buf, PATH_MAX); + strscpy(file_name, buf, PATH_MAX); /* ignore trailing newline */ sz = strlen(file_name); if (sz > 0 && file_name[sz - 1] == '\n') @@ -1031,7 +1031,7 @@ static ssize_t comp_algorithm_store(struct device *dev, char compressor[ARRAY_SIZE(zram->compressor)]; size_t sz; - strlcpy(compressor, buf, sizeof(compressor)); + strscpy(compressor, buf, sizeof(compressor)); /* ignore trailing newline */ sz = strlen(compressor); if (sz > 0 && compressor[sz - 1] == '\n') @@ -1974,7 +1974,7 @@ static int zram_add(void) if (ret) goto out_cleanup_disk; - strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + strscpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); |