From 979c6342f9c0a48696a6420f14f9dd409591657f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 15 Nov 2024 13:41:21 -0800 Subject: nvme-pci: add support for sgl metadata Supporting this mode allows creating and merging multi-segment metadata requests that wouldn't be possible otherwise. It also allows directly using user space requests that straddle physically discontiguous pages. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0a6e22038ce3..5873ce859cc8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -389,6 +389,7 @@ enum { NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, + NVME_CTRL_SGLS_MSDS = 1 << 19, }; struct nvme_lbaf { -- cgit v1.2.3 From 6399a0db8cd61eedbfb4b7809a4f4699157a9bf8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Nov 2024 13:57:04 -0800 Subject: nvme: define the remaining used sgls constants This provides a little more context when reading the code than hardcoded magic numbers. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 3 ++- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/target/admin-cmd.c | 7 ++++--- include/linux/nvme.h | 4 ++++ 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5ef284a376cc..611b02c8a8b3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1123,7 +1123,8 @@ static inline void nvme_start_request(struct request *rq) static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) { - return ctrl->sgls & ((1 << 0) | (1 << 1)); + return ctrl->sgls & (NVME_CTRL_SGLS_BYTE_ALIGNED | + NVME_CTRL_SGLS_DWORD_ALIGNED); } static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 24a2759798d0..baf7d2490152 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1019,7 +1019,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) goto destroy_admin; } - if (!(ctrl->ctrl.sgls & (1 << 2))) { + if (!(ctrl->ctrl.sgls & NVME_CTRL_SGLS_KSDBDS)) { ret = -EOPNOTSUPP; dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported!\n"); @@ -1051,7 +1051,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; } - if (ctrl->ctrl.sgls & (1 << 20)) + if (ctrl->ctrl.sgls & NVME_CTRL_SGLS_SAOS) ctrl->use_inline_data = true; if (ctrl->ctrl.queue_count > 1) { diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 934b401fbc2f..4fa8496a4d96 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -601,11 +601,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->awun = 0; id->awupf = 0; - id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + /* we always support SGLs */ + id->sgls = cpu_to_le32(NVME_CTRL_SGLS_BYTE_ALIGNED); if (ctrl->ops->flags & NVMF_KEYED_SGLS) - id->sgls |= cpu_to_le32(1 << 2); + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_KSDBDS); if (req->port->inline_data_size) - id->sgls |= cpu_to_le32(1 << 20); + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_SAOS); strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5873ce859cc8..2baf9a80b470 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -389,7 +389,11 @@ enum { NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, + NVME_CTRL_SGLS_BYTE_ALIGNED = 1, + NVME_CTRL_SGLS_DWORD_ALIGNED = 2, + NVME_CTRL_SGLS_KSDBDS = 1 << 2, NVME_CTRL_SGLS_MSDS = 1 << 19, + NVME_CTRL_SGLS_SAOS = 1 << 20, }; struct nvme_lbaf { -- cgit v1.2.3 From 46fd48ab3ea3eb3bb215684bd66ea3d260b091a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 08:26:02 +0100 Subject: block: return unsigned int from bdev_io_min The underlying limit is defined as an unsigned int, so return that from bdev_io_min as well. Fixes: ac481c20ef8f ("block: Topology ioctls") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: John Garry Link: https://lore.kernel.org/r/20241119072602.1059488-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a1fd0ddce5cf..195db38fda16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1261,7 +1261,7 @@ static inline unsigned int queue_io_min(const struct request_queue *q) return q->limits.io_min; } -static inline int bdev_io_min(struct block_device *bdev) +static inline unsigned int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } -- cgit v1.2.3 From d7f36dc446e894e0f57b5f05c5628f03c5f9e2d2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 18 Nov 2024 10:50:15 +0000 Subject: block: Support atomic writes limits for stacked devices Allow stacked devices to support atomic writes by aggregating the minimum capability of all bottom devices. Flag BLK_FEAT_ATOMIC_WRITES_STACKED is set for stacked devices which have been enabled to support atomic writes. Some things to note on the implementation: - For simplicity, all bottom devices must have same atomic write boundary value (if any) - The atomic write boundary must be a power-of-2 already, but this restriction could be relaxed. Furthermore, it is now required that the chunk sectors for a top device must be aligned with this boundary. - If a bottom device atomic write unit min/max are not aligned with the top device chunk sectors, the top device atomic write unit min/max are reduced to a value which works for the chunk sectors. Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 4 ++ 2 files changed, 119 insertions(+) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 03c67620f98a..8f09e33f41f6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -501,6 +501,119 @@ static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lb return sectors; } +/* Check if second and later bottom devices are compliant */ +static bool blk_stack_atomic_writes_tail(struct queue_limits *t, + struct queue_limits *b) +{ + /* We're not going to support different boundary sizes.. yet */ + if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary) + return false; + + /* Can't support this */ + if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max) + return false; + + /* Or this */ + if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min) + return false; + + t->atomic_write_hw_max = min(t->atomic_write_hw_max, + b->atomic_write_hw_max); + t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min, + b->atomic_write_hw_unit_min); + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, + b->atomic_write_hw_unit_max); + return true; +} + +/* Check for valid boundary of first bottom device */ +static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, + struct queue_limits *b) +{ + /* + * Ensure atomic write boundary is aligned with chunk sectors. Stacked + * devices store chunk sectors in t->io_min. + */ + if (b->atomic_write_hw_boundary > t->io_min && + b->atomic_write_hw_boundary % t->io_min) + return false; + if (t->io_min > b->atomic_write_hw_boundary && + t->io_min % b->atomic_write_hw_boundary) + return false; + + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; + return true; +} + + +/* Check stacking of first bottom device */ +static bool blk_stack_atomic_writes_head(struct queue_limits *t, + struct queue_limits *b) +{ + if (b->atomic_write_hw_boundary && + !blk_stack_atomic_writes_boundary_head(t, b)) + return false; + + if (t->io_min <= SECTOR_SIZE) { + /* No chunk sectors, so use bottom device values directly */ + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; + t->atomic_write_hw_max = b->atomic_write_hw_max; + return true; + } + + /* + * Find values for limits which work for chunk size. + * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk + * size (t->io_min), as chunk size is not restricted to a power-of-2. + * So we need to find highest power-of-2 which works for the chunk + * size. + * As an example scenario, we could have b->unit_max = 16K and + * t->io_min = 24K. For this case, reduce t->unit_max to a value + * aligned with both limits, i.e. 8K in this example. + */ + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + while (t->io_min % t->atomic_write_hw_unit_max) + t->atomic_write_hw_unit_max /= 2; + + t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, + t->atomic_write_hw_unit_max); + t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); + + return true; +} + +static void blk_stack_atomic_writes_limits(struct queue_limits *t, + struct queue_limits *b) +{ + if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED)) + goto unsupported; + + if (!b->atomic_write_unit_min) + goto unsupported; + + /* + * If atomic_write_hw_max is set, we have already stacked 1x bottom + * device, so check for compliance. + */ + if (t->atomic_write_hw_max) { + if (!blk_stack_atomic_writes_tail(t, b)) + goto unsupported; + return; + } + + if (!blk_stack_atomic_writes_head(t, b)) + goto unsupported; + return; + +unsupported: + t->atomic_write_hw_max = 0; + t->atomic_write_hw_unit_max = 0; + t->atomic_write_hw_unit_min = 0; + t->atomic_write_hw_boundary = 0; + t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED; +} + /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) @@ -661,6 +774,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } + blk_stack_atomic_writes_limits(t, b); + return ret; } EXPORT_SYMBOL(blk_stack_limits); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 195db38fda16..31867de88213 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -333,6 +333,10 @@ typedef unsigned int __bitwise blk_features_t; #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) +/* stacked device can/does support atomic writes */ +#define BLK_FEAT_ATOMIC_WRITES_STACKED \ + ((__force blk_features_t)(1u << 16)) + /* * Flags automatically inherited when stacking limits. */ -- cgit v1.2.3 From 5a9d1b83e5334915c651604648c20a9fc64d47a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:18 +0100 Subject: block: return unsigned int from bdev_io_opt The underlying limit is defined as an unsigned int, so return that from bdev_io_opt as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31867de88213..97c89c0c1398 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1275,7 +1275,7 @@ static inline unsigned int queue_io_opt(const struct request_queue *q) return q->limits.io_opt; } -static inline int bdev_io_opt(struct block_device *bdev) +static inline unsigned int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } -- cgit v1.2.3 From ed5db174cf39374215934f21b04639a7a1513023 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:19 +0100 Subject: block: return unsigned int from queue_dma_alignment The underlying limit is defined as an unsigned int, so return that from queue_dma_alignment as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 97c89c0c1398..d8de261c2b99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1421,7 +1421,7 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) return is_seq; } -static inline int queue_dma_alignment(const struct request_queue *q) +static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; } -- cgit v1.2.3 From e769489a54401d0c89555f7ad8672038b5c2b767 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:20 +0100 Subject: block: return unsigned int from blk_lim_dma_alignment_and_pad The underlying limits are defined as unsigned int, so return that from blk_lim_dma_alignment_and_pad as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d8de261c2b99..d0d8190429c8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,7 +1462,8 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev, bdev_logical_block_size(bdev) - 1); } -static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) +static inline unsigned int +blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { return lim->dma_alignment | lim->dma_pad_mask; } -- cgit v1.2.3 From da77d9b23700708d0d22a4407d32a8755a3596e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:21 +0100 Subject: block: return bool from blk_rq_aligned blk_rq_aligned returns a boolean condition, don't mascquerade it as int. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0d8190429c8..5270117d54ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1468,7 +1468,7 @@ blk_lim_dma_alignment_and_pad(struct queue_limits *lim) return lim->dma_alignment | lim->dma_pad_mask; } -static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, +static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); -- cgit v1.2.3 From e888810bc4f471f85989a0991aff28d2ac9f783b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:22 +0100 Subject: block: remove a duplicate definition for bdev_read_only bdev_read_only is already defined as an inline function in blkdev.h. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5270117d54ac..8b4e4692e7fb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1586,7 +1586,6 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } -int bdev_read_only(struct block_device *bdev); int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); -- cgit v1.2.3 From 766a71ef65bb217ed8bf1c068ac14c7d3c15d487 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:23 +0100 Subject: block: return bool from get_disk_ro and bdev_read_only get_disk_ro and bdev_read_only return boolean conditions, don't masquerade them as int. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8b4e4692e7fb..08a727b40816 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -779,13 +779,13 @@ static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) atomic_andnot(flag, &bdev->__bd_flags); } -static inline int get_disk_ro(struct gendisk *disk) +static inline bool get_disk_ro(struct gendisk *disk) { return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } -static inline int bdev_read_only(struct block_device *bdev) +static inline bool bdev_read_only(struct block_device *bdev) { return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } -- cgit v1.2.3 From 7d2f9f870f26798699585f96fa7a2a2cab38dc02 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Nov 2024 15:10:07 +0800 Subject: nvme: introduce change ptpl and iekey definition This is for the next tuning pr code more readble patch, make linux/nvme.h's changes separately. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2baf9a80b470..13377dde4527 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2171,4 +2171,13 @@ enum nvme_pr_release_action { NVME_PR_RELEASE_ACT_CLEAR = 1, }; +enum nvme_pr_change_ptpl { + NVME_PR_CPTPL_NO_CHANGE = 0, + NVME_PR_CPTPL_RESV = 1 << 30, + NVME_PR_CPTPL_CLEARED = 2 << 30, + NVME_PR_CPTPL_PERSIST = 3 << 30, +}; + +#define NVME_PR_IGNORE_KEY (1 << 3) + #endif /* _LINUX_NVME_H */ -- cgit v1.2.3