From ebc4176551cdd021d02f4d2ed734e7b65e44442a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Feb 2025 22:00:35 -0800 Subject: blk-crypto: add basic hardware-wrapped key support To prevent keys from being compromised if an attacker acquires read access to kernel memory, some inline encryption hardware can accept keys which are wrapped by a per-boot hardware-internal key. This avoids needing to keep the raw keys in kernel memory, without limiting the number of keys that can be used. Such hardware also supports deriving a "software secret" for cryptographic tasks that can't be handled by inline encryption; this is needed for fscrypt to work properly. To support this hardware, allow struct blk_crypto_key to represent a hardware-wrapped key as an alternative to a raw key, and make drivers set flags in struct blk_crypto_profile to indicate which types of keys they support. Also add the ->derive_sw_secret() low-level operation, which drivers supporting wrapped keys must implement. For more information, see the detailed documentation which this patch adds to Documentation/block/inline-encryption.rst. Signed-off-by: Eric Biggers Tested-by: Bartosz Golaszewski # sm8650 Link: https://lore.kernel.org/r/20250204060041.409950-2-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-crypto-profile.h | 20 +++++++++++ include/linux/blk-crypto.h | 72 ++++++++++++++++++++++++++++++++++---- 2 files changed, 85 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 90ab33cb5d0e..7764b4f7b45b 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -57,6 +57,20 @@ struct blk_crypto_ll_ops { int (*keyslot_evict)(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot); + + /** + * @derive_sw_secret: Derive the software secret from a hardware-wrapped + * key in ephemerally-wrapped form. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * Must return 0 on success, -EBADMSG if the key is invalid, or another + * -errno code on other errors. + */ + int (*derive_sw_secret)(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); }; /** @@ -84,6 +98,12 @@ struct blk_crypto_profile { */ unsigned int max_dun_bytes_supported; + /** + * @key_types_supported: A bitmask of the supported key types: + * BLK_CRYPTO_KEY_TYPE_RAW and/or BLK_CRYPTO_KEY_TYPE_HW_WRAPPED. + */ + unsigned int key_types_supported; + /** * @modes_supported: Array of bitmasks that specifies whether each * combination of crypto mode and data unit size is supported. diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 5e5822c18ee4..81f932b3ea37 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -6,6 +6,7 @@ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H +#include #include enum blk_crypto_mode_num { @@ -17,7 +18,55 @@ enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_MAX, }; -#define BLK_CRYPTO_MAX_KEY_SIZE 64 +/* + * Supported types of keys. Must be bitflags due to their use in + * blk_crypto_profile::key_types_supported. + */ +enum blk_crypto_key_type { + /* + * Raw keys (i.e. "software keys"). These keys are simply kept in raw, + * plaintext form in kernel memory. + */ + BLK_CRYPTO_KEY_TYPE_RAW = 0x1, + + /* + * Hardware-wrapped keys. 
These keys are only present in kernel memory + * in ephemerally-wrapped form, and they can only be unwrapped by + * dedicated hardware. For details, see the "Hardware-wrapped keys" + * section of Documentation/block/inline-encryption.rst. + */ + BLK_CRYPTO_KEY_TYPE_HW_WRAPPED = 0x2, +}; + +/* + * Currently the maximum raw key size is 64 bytes, as that is the key size of + * BLK_ENCRYPTION_MODE_AES_256_XTS which takes the longest key. + * + * The maximum hardware-wrapped key size depends on the hardware's key wrapping + * algorithm, which is a hardware implementation detail, so it isn't precisely + * specified. But currently 128 bytes is plenty in practice. Implementations + * are recommended to wrap a 32-byte key for the hardware KDF with AES-256-GCM, + * which should result in a size closer to 64 bytes than 128. + * + * Both of these values can trivially be increased if ever needed. + */ +#define BLK_CRYPTO_MAX_RAW_KEY_SIZE 64 +#define BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE 128 + +#define BLK_CRYPTO_MAX_ANY_KEY_SIZE \ + MAX(BLK_CRYPTO_MAX_RAW_KEY_SIZE, BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + +/* + * Size of the "software secret" which can be derived from a hardware-wrapped + * key. This is currently always 32 bytes. Note, the choice of 32 bytes + * assumes that the software secret is only used directly for algorithms that + * don't require more than a 256-bit key to get the desired security strength. + * If it were to be used e.g. directly as an AES-256-XTS key, then this would + * need to be increased (which is possible if hardware supports it, but care + * would need to be taken to avoid breaking users who need exactly 32 bytes). + */ +#define BLK_CRYPTO_SW_SECRET_SIZE 32 + /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for @@ -26,20 +75,23 @@ enum blk_crypto_mode_num { * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key + * @key_type: the type of this key -- either raw or hardware-wrapped */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; + enum blk_crypto_key_type key_type; }; /** * struct blk_crypto_key - an inline encryption key - * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this - * key + * @crypto_cfg: the crypto mode, data unit size, key type, and other + * characteristics of this key and how it will be used * @data_unit_size_bits: log2 of data_unit_size - * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) - * @raw: the raw bytes of this key. Only the first @size bytes are used. + * @size: size of this key in bytes. The size of a raw key is fixed for a given + * crypto mode, but the size of a hardware-wrapped key can vary. + * @bytes: the bytes of this key. Only the first @size bytes are significant. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. 
It must not be freed until all bios using it have completed @@ -49,7 +101,7 @@ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; - u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; + u8 bytes[BLK_CRYPTO_MAX_ANY_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 @@ -87,7 +139,9 @@ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *key_bytes, size_t key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); @@ -103,6 +157,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) -- cgit v1.2.3 From 1ebd4a3c095cd538d3c1c7c12738ef47d8e71f96 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Feb 2025 22:00:37 -0800 Subject: blk-crypto: add ioctls to create and prepare hardware-wrapped keys Until this point, the kernel can use hardware-wrapped keys to do encryption if userspace provides one -- specifically a key in ephemerally-wrapped form. However, no generic way has been provided for userspace to get such a key in the first place. Getting such a key is a two-step process. First, the key needs to be imported from a raw key or generated by the hardware, producing a key in long-term wrapped form. This happens once in the whole lifetime of the key. Second, the long-term wrapped key needs to be converted into ephemerally-wrapped form. This happens each time the key is "unlocked". In Android, these operations are supported in a generic way through KeyMint, a userspace abstraction layer. However, that method is Android-specific and can't be used on other Linux systems, may rely on proprietary libraries, and also misleads people into supporting KeyMint features like rollback resistance that make sense for other KeyMint keys but don't make sense for hardware-wrapped inline encryption keys. Therefore, this patch provides a generic kernel interface for these operations by introducing new block device ioctls: - BLKCRYPTOIMPORTKEY: convert a raw key to long-term wrapped form. - BLKCRYPTOGENERATEKEY: have the hardware generate a new key, then return it in long-term wrapped form. - BLKCRYPTOPREPAREKEY: convert a key from long-term wrapped form to ephemerally-wrapped form. These ioctls are implemented using new operations in blk_crypto_ll_ops. 
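As a rough illustration, the one-time import step might be driven from userspace like the following minimal sketch (built only on the uapi structs added by this patch; the 128-byte buffer bound mirrors the kernel's current BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE, which is not exported to userspace, and the fixed 32-byte raw key size is an assumption):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/blk-crypto.h>

#define LT_BLOB_MAX 128	/* assumed bound; the kernel reports the actual size */

/* Import a 32-byte raw key once; the long-term blob is what gets stored. */
static int import_key_once(int bdev_fd, const __u8 raw_key[32],
			   __u8 lt_blob[LT_BLOB_MAX], __u64 *lt_size)
{
	struct blk_crypto_import_key_arg arg;

	memset(&arg, 0, sizeof(arg));		/* reserved[] must be zero */
	arg.raw_key_ptr = (uintptr_t)raw_key;
	arg.raw_key_size = 32;
	arg.lt_key_ptr = (uintptr_t)lt_blob;
	arg.lt_key_size = LT_BLOB_MAX;

	if (ioctl(bdev_fd, BLKCRYPTOIMPORTKEY, &arg) != 0)
		return -1;			/* errno: EOPNOTSUPP, EOVERFLOW, ... */

	*lt_size = arg.lt_key_size;		/* updated to the actual blob size */
	return 0;
}

BLKCRYPTOGENERATEKEY is driven the same way, just without the raw-key input.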
Signed-off-by: Eric Biggers Tested-by: Bartosz Golaszewski # sm8650 Link: https://lore.kernel.org/r/20250204060041.409950-4-ebiggers@kernel.org Signed-off-by: Jens Axboe --- Documentation/block/inline-encryption.rst | 36 ++++++ Documentation/userspace-api/ioctl/ioctl-number.rst | 2 + block/blk-crypto-internal.h | 9 ++ block/blk-crypto-profile.c | 55 ++++++++ block/blk-crypto.c | 143 +++++++++++++++++++++ block/ioctl.c | 5 + include/linux/blk-crypto-profile.h | 53 ++++++++ include/linux/blk-crypto.h | 1 + include/uapi/linux/blk-crypto.h | 44 +++++++ include/uapi/linux/fs.h | 6 +- 10 files changed, 350 insertions(+), 4 deletions(-) create mode 100644 include/uapi/linux/blk-crypto.h (limited to 'include/linux') diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index f03bd5b090d8..6380e6ab492b 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -492,6 +492,42 @@ when hardware support is available. This works in the following way: blk-crypto-fallback doesn't support hardware-wrapped keys. Therefore, hardware-wrapped keys can only be used with actual inline encryption hardware. +All the above deals with hardware-wrapped keys in ephemerally-wrapped form only. +To get such keys in the first place, new block device ioctls have been added to +provide a generic interface to creating and preparing such keys: + +- ``BLKCRYPTOIMPORTKEY`` converts a raw key to long-term wrapped form. It takes + in a pointer to a ``struct blk_crypto_import_key_arg``. The caller must set + ``raw_key_ptr`` and ``raw_key_size`` to the pointer and size (in bytes) of the + raw key to import. On success, ``BLKCRYPTOIMPORTKEY`` returns 0 and writes + the resulting long-term wrapped key blob to the buffer pointed to by + ``lt_key_ptr``, which is of maximum size ``lt_key_size``. It also updates + ``lt_key_size`` to be the actual size of the key. On failure, it returns -1 + and sets errno. An errno of ``EOPNOTSUPP`` indicates that the block device + does not support hardware-wrapped keys. An errno of ``EOVERFLOW`` indicates + that the output buffer did not have enough space for the key blob. + +- ``BLKCRYPTOGENERATEKEY`` is like ``BLKCRYPTOIMPORTKEY``, but it has the + hardware generate the key instead of importing one. It takes in a pointer to + a ``struct blk_crypto_generate_key_arg``. + +- ``BLKCRYPTOPREPAREKEY`` converts a key from long-term wrapped form to + ephemerally-wrapped form. It takes in a pointer to a ``struct + blk_crypto_prepare_key_arg``. The caller must set ``lt_key_ptr`` and + ``lt_key_size`` to the pointer and size (in bytes) of the long-term wrapped + key blob to convert. On success, ``BLKCRYPTOPREPAREKEY`` returns 0 and writes + the resulting ephemerally-wrapped key blob to the buffer pointed to by + ``eph_key_ptr``, which is of maximum size ``eph_key_size``. It also updates + ``eph_key_size`` to be the actual size of the key. On failure, it returns -1 + and sets errno. Errno values of ``EOPNOTSUPP`` and ``EOVERFLOW`` mean the + same as they do for ``BLKCRYPTOIMPORTKEY``. An errno of ``EBADMSG`` indicates + that the long-term wrapped key is invalid. + +Userspace needs to use either ``BLKCRYPTOIMPORTKEY`` or ``BLKCRYPTOGENERATEKEY`` +once to create a key, and then ``BLKCRYPTOPREPAREKEY`` each time the key is +unlocked and added to the kernel. Note that these ioctls have no relevance for +raw keys; they are only for hardware-wrapped keys. 
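For example, the per-unlock step could look roughly like the sketch below (error handling abbreviated; the 128-byte ephemeral buffer is an assumed upper bound matching the kernel's current BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE):

#include <err.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/blk-crypto.h>

/* Convert a stored long-term key blob to ephemerally-wrapped form.
 * Returns the actual size of the ephemeral blob written to eph_blob. */
static __u64 prepare_key(int bdev_fd, const __u8 *lt_blob, __u64 lt_size,
			 __u8 eph_blob[128])
{
	struct blk_crypto_prepare_key_arg arg = {0};	/* reserved[] must be zero */

	arg.lt_key_ptr = (uintptr_t)lt_blob;
	arg.lt_key_size = lt_size;		/* size saved from the import step */
	arg.eph_key_ptr = (uintptr_t)eph_blob;
	arg.eph_key_size = 128;

	if (ioctl(bdev_fd, BLKCRYPTOPREPAREKEY, &arg) != 0)
		err(1, "BLKCRYPTOPREPAREKEY");	/* e.g. EBADMSG: invalid blob */

	return arg.eph_key_size;
}

The ephemerally-wrapped blob returned here is what userspace then passes to the kernel for the current boot, e.g. when adding an fscrypt key.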
+ Testability ----------- diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 6d1465315df3..d448c010d307 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -85,6 +85,8 @@ Code Seq# Include File Comments 0x10 20-2F arch/s390/include/uapi/asm/hypfs.h 0x12 all linux/fs.h BLK* ioctls linux/blkpg.h + linux/blkzoned.h + linux/blk-crypto.h 0x15 all linux/fs.h FS_IOC_* ioctls 0x1b all InfiniBand Subsystem diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 1893df9a8f06..ccf6dff6ff6b 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -83,6 +83,9 @@ int __blk_crypto_evict_key(struct blk_crypto_profile *profile, bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -130,6 +133,12 @@ static inline bool blk_crypto_rq_has_keyslot(struct request *rq) return false; } +static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index a990d9026c83..94a155912bf1 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -502,6 +502,61 @@ int blk_crypto_derive_sw_secret(struct block_device *bdev, return err; } +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.import_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.import_key(profile, raw_key, raw_key_size, + lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.generate_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.generate_key(profile, lt_key); + blk_crypto_hw_exit(profile); + return ret; +} + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + int ret; + + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.prepare_key) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + ret = profile->ll_ops.prepare_key(profile, lt_key, lt_key_size, + eph_key); + blk_crypto_hw_exit(profile); + return ret; +} + /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 72975a980fbc..4b1ad84d1b5a 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -469,3 +469,146 @@ void blk_crypto_evict_key(struct block_device *bdev, pr_warn_ratelimited("%pg: 
error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); + +static int blk_crypto_ioctl_import_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_import_key_arg arg; + u8 raw_key[BLK_CRYPTO_MAX_RAW_KEY_SIZE]; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.raw_key_size < 16 || arg.raw_key_size > sizeof(raw_key)) + return -EINVAL; + + if (copy_from_user(raw_key, u64_to_user_ptr(arg.raw_key_ptr), + arg.raw_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_import_key(profile, raw_key, arg.raw_key_size, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(raw_key, sizeof(raw_key)); + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_generate_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_generate_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + ret = blk_crypto_generate_key(profile, lt_key); + if (ret < 0) + goto out; + if (ret > arg.lt_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.lt_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.lt_key_ptr), lt_key, + arg.lt_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + return ret; +} + +static int blk_crypto_ioctl_prepare_key(struct blk_crypto_profile *profile, + void __user *argp) +{ + struct blk_crypto_prepare_key_arg arg; + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]; + int ret; + + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + + if (memchr_inv(arg.reserved, 0, sizeof(arg.reserved))) + return -EINVAL; + + if (arg.lt_key_size > sizeof(lt_key)) + return -EINVAL; + + if (copy_from_user(lt_key, u64_to_user_ptr(arg.lt_key_ptr), + arg.lt_key_size)) { + ret = -EFAULT; + goto out; + } + ret = blk_crypto_prepare_key(profile, lt_key, arg.lt_key_size, eph_key); + if (ret < 0) + goto out; + if (ret > arg.eph_key_size) { + ret = -EOVERFLOW; + goto out; + } + arg.eph_key_size = ret; + if (copy_to_user(u64_to_user_ptr(arg.eph_key_ptr), eph_key, + arg.eph_key_size) || + copy_to_user(argp, &arg, sizeof(arg))) { + ret = -EFAULT; + goto out; + } + ret = 0; + +out: + memzero_explicit(lt_key, sizeof(lt_key)); + memzero_explicit(eph_key, sizeof(eph_key)); + return ret; +} + +int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, + void __user *argp) +{ + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + + if (!profile) + return -EOPNOTSUPP; + + switch (cmd) { + case BLKCRYPTOIMPORTKEY: + return blk_crypto_ioctl_import_key(profile, argp); + case BLKCRYPTOGENERATEKEY: + return blk_crypto_ioctl_generate_key(profile, argp); + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl_prepare_key(profile, argp); + default: + return -ENOTTY; + } +} diff --git a/block/ioctl.c b/block/ioctl.c index 
6554b728bae6..faa40f383e27 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -15,6 +15,7 @@ #include #include #include "blk.h" +#include "blk-crypto-internal.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) @@ -620,6 +621,10 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKCRYPTOIMPORTKEY: + case BLKCRYPTOGENERATEKEY: + case BLKCRYPTOPREPAREKEY: + return blk_crypto_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 7764b4f7b45b..4f39e9cd7576 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -71,6 +71,48 @@ struct blk_crypto_ll_ops { int (*derive_sw_secret)(struct blk_crypto_profile *profile, const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + + /** + * @import_key: Create a hardware-wrapped key by importing a raw key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. + */ + int (*import_key)(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @generate_key: Generate a hardware-wrapped key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. + */ + int (*generate_key)(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @prepare_key: Prepare a hardware-wrapped key to be used. + * + * Prepare a hardware-wrapped key to be used by converting it from + * long-term wrapped form to ephemerally-wrapped form. This only needs + * to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED is supported. + * + * On success, must write the key in ephemerally-wrapped form to + * @eph_key and return its size in bytes. On failure, must return + * -EBADMSG if the key is invalid, or another -errno on other error. 
+ */ + int (*prepare_key)(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); }; /** @@ -163,6 +205,17 @@ void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, const struct blk_crypto_profile *child); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 81f932b3ea37..58b0c5254a67 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -8,6 +8,7 @@ #include #include +#include enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, diff --git a/include/uapi/linux/blk-crypto.h b/include/uapi/linux/blk-crypto.h new file mode 100644 index 000000000000..97302c6eb6af --- /dev/null +++ b/include/uapi/linux/blk-crypto.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_BLK_CRYPTO_H +#define _UAPI_LINUX_BLK_CRYPTO_H + +#include +#include + +struct blk_crypto_import_key_arg { + /* Raw key (input) */ + __u64 raw_key_ptr; + __u64 raw_key_size; + /* Long-term wrapped key blob (output) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + __u64 reserved[4]; +}; + +struct blk_crypto_generate_key_arg { + /* Long-term wrapped key blob (output) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + __u64 reserved[4]; +}; + +struct blk_crypto_prepare_key_arg { + /* Long-term wrapped key blob (input) */ + __u64 lt_key_ptr; + __u64 lt_key_size; + /* Ephemerally-wrapped key blob (output) */ + __u64 eph_key_ptr; + __u64 eph_key_size; + __u64 reserved[4]; +}; + +/* + * These ioctls share the block device ioctl space; see uapi/linux/fs.h. + * 140-141 are reserved for future blk-crypto ioctls; any more than that would + * require an additional allocation from the block device ioctl space. 
+ */ +#define BLKCRYPTOIMPORTKEY _IOWR(0x12, 137, struct blk_crypto_import_key_arg) +#define BLKCRYPTOGENERATEKEY _IOWR(0x12, 138, struct blk_crypto_generate_key_arg) +#define BLKCRYPTOPREPAREKEY _IOWR(0x12, 139, struct blk_crypto_prepare_key_arg) + +#endif /* _UAPI_LINUX_BLK_CRYPTO_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 2bbe00cf1248..e762e1af650c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -212,10 +212,8 @@ struct fsxattr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) #define BLKGETDISKSEQ _IOR(0x12,128,__u64) -/* - * A jump here: 130-136 are reserved for zoned block devices - * (see uapi/linux/blkzoned.h) - */ +/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ +/* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From 36d03cb3277e29beedb87b8efb1e4da02b26e0c0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sat, 8 Feb 2025 17:04:15 +0800 Subject: block: introduce init_wait_func() There is already a macro DEFINE_WAIT_FUNC() to declare a wait_queue_entry with a specified waking function. But there is not a counterpart for initializing one wait_queue_entry with a specified waking function. So introducing init_wait_func() for this, which also could be used in iocost and rq-qos. Using default_wake_function() in rq_qos_wait() to wake up waiters, which could remove ->task field from rq_qos_wait_data. Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Muchun Song Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250208090416.38642-1-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 +-- block/blk-rq-qos.c | 14 +++++++------- include/linux/wait.h | 6 ++++-- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 65a1d4427ccf..6be46e28459b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2718,8 +2718,7 @@ retry_lock: * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ - init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); - wait.wait.private = current; + init_wait_func(&wait.wait, iocg_wake_fn); wait.bio = bio; wait.abs_cost = abs_cost; wait.committed = false; /* will be set true by waker */ diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d4d4f4dc0e23..0ed3c81723bb 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -196,7 +196,6 @@ bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) struct rq_qos_wait_data { struct wait_queue_entry wq; - struct task_struct *task; struct rq_wait *rqw; acquire_inflight_cb_t *cb; void *private_data; @@ -218,7 +217,12 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; - wake_up_process(data->task); + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). 
+ */ + default_wake_function(curr, mode, wake_flags, key); list_del_init_careful(&curr->entry); return 1; } @@ -244,11 +248,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, cleanup_cb_t *cleanup_cb) { struct rq_qos_wait_data data = { - .wq = { - .func = rq_qos_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, .rqw = rqw, .cb = acquire_inflight_cb, .private_data = private_data, @@ -259,6 +258,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; + init_wait_func(&data.wq, rq_qos_wake_function); has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); do { diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..2bdc8f47963b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1207,14 +1207,16 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) -#define init_wait(wait) \ +#define init_wait_func(wait, function) \ do { \ (wait)->private = current; \ - (wait)->func = autoremove_wake_function; \ + (wait)->func = function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) +#define init_wait(wait) init_wait_func(wait, autoremove_wake_function) + typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); -- cgit v1.2.3 From 105ca2a2c2ff2c8df0e334d6913d62eec1973dd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:33 -0800 Subject: block: split struct bio_integrity_payload Many of the fields in struct bio_integrity_payload are only needed for the default integrity buffer in the block layer, and the variable sized array at the end of the structure makes it very hard to embed into caller allocated structures. Reduce struct bio_integrity_payload to the minimal structure needed in common code and create two separate containing structures for the automatically generated payload and the caller allocated payload. The latter is a simple wrapper for struct bio_integrity_payload and the bvecs, while the former contains the additional fields moved out of struct bio_integrity_payload. Always use a dedicated mempool for automatic integrity metadata instead of depending on bio_set that is submitter controlled and thus often doesn't have the mempool initialized and stop using mempools for the submitter buffers as they aren't in the NOIO I/O submission path where we need to guarantee forward progress. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Tested-by: Anuj Gupta Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250225154449.422989-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 75 +++++++++++++++++-------- block/bio-integrity.c | 107 ++++++++---------------------------- block/bio.c | 6 -- block/blk.h | 2 +- block/t10-pi.c | 6 +- drivers/md/dm-integrity.c | 12 ---- drivers/md/dm-table.c | 6 -- drivers/md/md.c | 13 ----- drivers/target/target_core_iblock.c | 12 ---- include/linux/bio-integrity.h | 25 +-------- include/linux/bio.h | 4 -- 11 files changed, 80 insertions(+), 188 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 357241fa0f20..e524c609be50 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -12,18 +12,34 @@ #include #include "blk.h" +struct bio_integrity_data { + struct bio *bio; + struct bvec_iter saved_bio_iter; + struct work_struct work; + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *bid_slab; +static mempool_t bid_pool; static struct workqueue_struct *kintegrityd_wq; -static void bio_integrity_verify_fn(struct work_struct *work) +static void bio_integrity_finish(struct bio_integrity_data *bid) { - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; + bid->bio->bi_integrity = NULL; + bid->bio->bi_opf &= ~REQ_INTEGRITY; + kfree(bvec_virt(bid->bip.bip_vec)); + mempool_free(bid, &bid_pool); +} - blk_integrity_verify(bio); +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_data *bid = + container_of(work, struct bio_integrity_data, work); + struct bio *bio = bid->bio; - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); + blk_integrity_verify_iter(bio, &bid->saved_bio_iter); + bio_integrity_finish(bid); bio_endio(bio); } @@ -40,15 +56,16 @@ bool __bio_integrity_endio(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_data *bid = + container_of(bip, struct bio_integrity_data, bip); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); + INIT_WORK(&bid->work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bid->work); return false; } - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); + bio_integrity_finish(bid); return true; } @@ -65,8 +82,8 @@ bool __bio_integrity_endio(struct bio *bio) */ bool bio_integrity_prep(struct bio *bio) { - struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_data *bid; gfp_t gfp = GFP_NOIO; unsigned int len; void *buf; @@ -102,27 +119,30 @@ bool bio_integrity_prep(struct bio *bio) return true; } + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return true; + /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (!buf) goto err_end_io; + bid = mempool_alloc(&bid_pool, GFP_NOIO); + if (!bid) + goto err_free_buf; + bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); - bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (IS_ERR(bip)) { - kfree(buf); - goto err_end_io; - } + bid->bio = bio; - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, 
bio->bi_iter.bi_sector); + bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bip->bip_flags |= BIP_IP_CHECKSUM; + bid->bip.bip_flags |= BIP_IP_CHECKSUM; if (bi->csum_type) - bip->bip_flags |= BIP_CHECK_GUARD; + bid->bip.bip_flags |= BIP_CHECK_GUARD; if (bi->flags & BLK_INTEGRITY_REF_TAG) - bip->bip_flags |= BIP_CHECK_REFTAG; + bid->bip.bip_flags |= BIP_CHECK_REFTAG; if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) @@ -132,9 +152,11 @@ bool bio_integrity_prep(struct bio *bio) if (bio_data_dir(bio) == WRITE) blk_integrity_generate(bio); else - bip->bio_iter = bio->bi_iter; + bid->saved_bio_iter = bio->bi_iter; return true; +err_free_buf: + kfree(buf); err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); @@ -149,6 +171,13 @@ void blk_flush_integrity(void) static int __init blk_integrity_auto_init(void) { + bid_slab = kmem_cache_create("bio_integrity_data", + sizeof(struct bio_integrity_data), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) + panic("bio: can't create integrity pool\n"); + /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. diff --git a/block/bio-integrity.c b/block/bio-integrity.c index aa9f96612319..608594a154a5 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,13 +7,12 @@ */ #include -#include -#include -#include -#include #include "blk.h" -static struct kmem_cache *bip_slab; +struct bio_integrity_alloc { + struct bio_integrity_payload bip; + struct bio_vec bvecs[]; +}; /** * bio_integrity_free - Free bio integrity payload @@ -23,21 +22,23 @@ static struct kmem_cache *bip_slab; */ void bio_integrity_free(struct bio *bio) { - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - - if (bs && mempool_initialized(&bs->bio_integrity_pool)) { - if (bip->bip_vec) - bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_max_vcnt); - mempool_free(bip, &bs->bio_integrity_pool); - } else { - kfree(bip); - } + kfree(bio_integrity(bio)); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs) +{ + memset(bip, 0, sizeof(*bip)); + bip->bip_max_vcnt = nr_vecs; + if (nr_vecs) + bip->bip_vec = bvecs; + + bio->bi_integrity = bip; + bio->bi_opf |= REQ_INTEGRITY; +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -52,48 +53,16 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned inline_vecs; + struct bio_integrity_alloc *bia; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); - if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { - bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!bip)) + bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask); + if (unlikely(!bia)) return ERR_PTR(-ENOMEM); - - memset(bip, 0, sizeof(*bip)); - - /* always report as many vecs as asked explicitly, not inline vecs */ - bip->bip_max_vcnt = 
nr_vecs; - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, - &bip->bip_max_vcnt, gfp_mask); - if (!bip->bip_vec) - goto err; - } else if (nr_vecs) { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_bio = bio; - bio->bi_integrity = bip; - bio->bi_opf |= REQ_INTEGRITY; - - return bip; -err: - if (bs && mempool_initialized(&bs->bio_integrity_pool)) - mempool_free(bip, &bs->bio_integrity_pool); - else - kfree(bip); - return ERR_PTR(-ENOMEM); + bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs); + return &bia->bip; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -467,35 +436,3 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (mempool_initialized(&bs->bio_integrity_pool)) - return 0; - - if (mempool_init_slab_pool(&bs->bio_integrity_pool, - pool_size, bip_slab)) - return -1; - - if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { - mempool_exit(&bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - mempool_exit(&bs->bio_integrity_pool); - mempool_exit(&bs->bvec_integrity_pool); -} - -void __init bio_integrity_init(void) -{ - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIO_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} diff --git a/block/bio.c b/block/bio.c index f0c416e5931d..dabc1a6c41b1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1657,7 +1657,6 @@ void bioset_exit(struct bio_set *bs) mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); - bioset_integrity_free(bs); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; @@ -1737,8 +1736,6 @@ static int __init init_bio(void) BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); - bio_integrity_init(); - for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; @@ -1754,9 +1751,6 @@ static int __init init_bio(void) BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); - if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/block/blk.h b/block/blk.h index 90fa5f28ccab..8f5554a6989e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -710,7 +710,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void blk_integrity_generate(struct bio *bio); -void blk_integrity_verify(struct bio *bio); +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); diff --git a/block/t10-pi.c b/block/t10-pi.c index 2d05421f0fa5..de172d56b1f3 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -404,7 +404,7 @@ void blk_integrity_generate(struct bio *bio) } } -void blk_integrity_verify(struct bio *bio) +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); @@ -418,9 +418,9 @@ void blk_integrity_verify(struct bio *bio) */ iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; - iter.seed = bip->bio_iter.bi_sector; + iter.seed = saved_iter->bi_sector; 
iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) { + __bio_for_each_segment(bv, bio, bviter, *saved_iter) { void *kaddr = bvec_kmap_local(&bv); blk_status_t ret = BLK_STS_OK; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee9f7cecd78e..e743657379f7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4808,23 +4808,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS); if (r) { ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recalc_bios, 1); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bf9a61191e9a..453803f1edf5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1081,15 +1081,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->io_bs, pool_size)) - goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->bs, pool_size)) - goto out_free_pools; t->mempools = pools; return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index 30b3dbbce2d2..79cabe4be77d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2359,19 +2359,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. 
- */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index c8dc92a7d63e..73564efd11d2 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -167,18 +167,6 @@ static int iblock_configure_device(struct se_device *dev) break; } - if (dev->dev_attrib.pi_prot_type) { - struct bio_set *bs = &ib_dev->ibd_bio_set; - - if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { - pr_err("Unable to allocate bioset for PI\n"); - ret = -ENOMEM; - goto out_blkdev_put; - } - pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - &bs->bio_integrity_pool); - } - dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; return 0; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 802f52e38efd..0a25716820fe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -16,8 +16,6 @@ enum bip_flags { }; struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ @@ -25,12 +23,7 @@ struct bio_integrity_payload { unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ @@ -74,6 +67,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, bip->bip_iter.bi_sector = seed; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, @@ -85,9 +80,6 @@ bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); -int bioset_integrity_create(struct bio_set *bs, int pool_size); -void bioset_integrity_free(struct bio_set *bs); -void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ @@ -96,15 +88,6 @@ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) return NULL; } -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free(struct bio_set *bs) -{ -} - static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; @@ -139,10 +122,6 @@ static inline void bio_integrity_trim(struct bio *bio) { } -static inline void bio_integrity_init(void) -{ -} - static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b79bf50f4f0..cafc7c215de8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -625,10 +625,6 @@ struct bio_set { mempool_t bio_pool; mempool_t bvec_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t bio_integrity_pool; - mempool_t bvec_integrity_pool; -#endif unsigned int back_pad; /* -- cgit v1.2.3 From c8775aefba959cdfbaa25408a84d3dd15bbeb991 Mon Sep 17 00:00:00 
2001 From: Zheng Qixing Date: Thu, 27 Feb 2025 15:55:05 +0800 Subject: badblocks: return boolean from badblocks_set() and badblocks_clear() Change the return type of badblocks_set() and badblocks_clear() from int to bool, indicating success or failure. Specifically: - _badblocks_set() and _badblocks_clear() functions now return true for success and false for failure. - All calls to these functions are updated to handle the new boolean return type. - This change improves code clarity and ensures a more consistent handling of success and failure states. Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Acked-by: Coly Li Acked-by: Ira Weiny Link: https://lore.kernel.org/r/20250227075507.151331-11-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 41 ++++++++++++++++++++--------------------- drivers/block/null_blk/main.c | 14 +++++++------- drivers/md/md.c | 35 ++++++++++++++++++----------------- drivers/nvdimm/badrange.c | 2 +- include/linux/badblocks.h | 6 +++--- 5 files changed, 49 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/block/badblocks.c b/block/badblocks.c index b66d5f12a766..e326a16fd056 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -836,8 +836,8 @@ static bool try_adjacent_combine(struct badblocks *bb, int prev) } /* Do exact work to set bad block range into the bad block table */ -static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +static bool _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) { int len = 0, added = 0; struct badblocks_context bad; @@ -847,11 +847,11 @@ static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { /* round the start down, and the end up */ @@ -977,7 +977,7 @@ out: write_sequnlock_irqrestore(&bb->lock, flags); - return sectors; + return sectors == 0; } /* @@ -1048,21 +1048,20 @@ static int front_splitting_clear(struct badblocks *bb, int prev, } /* Do the exact work to clear bad block range from the bad block table */ -static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static bool _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) { struct badblocks_context bad; int prev = -1, hint = -1; int len = 0, cleared = 0; - int rv = 0; u64 *p; if (bb->shift < 0) /* badblocks are disabled */ - return 1; + return false; if (sectors == 0) /* Invalid sectors number */ - return 1; + return false; if (bb->shift) { sector_t target; @@ -1182,9 +1181,9 @@ update_sectors: write_sequnlock_irq(&bb->lock); if (!cleared) - rv = 1; + return false; - return rv; + return true; } /* Do the exact work to check bad blocks range from the bad block table */ @@ -1338,12 +1337,12 @@ EXPORT_SYMBOL_GPL(badblocks_check); * decide how best to handle it. * * Return: - * 0: success - * other: failed to set badblocks (out of space). Parital setting will be + * true: success + * false: failed to set badblocks (out of space). Parital setting will be * treated as failure. */ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) { return _badblocks_set(bb, s, sectors, acknowledged); } @@ -1360,10 +1359,10 @@ EXPORT_SYMBOL_GPL(badblocks_set); * drop the remove request. 
* * Return: - * 0: success - * 1: failed to clear badblocks + * true: success + * false: failed to clear badblocks */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors) { return _badblocks_clear(bb, s, sectors); } @@ -1485,10 +1484,10 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, return -EINVAL; } - if (badblocks_set(bb, sector, length, !unack)) + if (!badblocks_set(bb, sector, length, !unack)) return -ENOSPC; - else - return len; + + return len; } EXPORT_SYMBOL_GPL(badblocks_store); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 31d44cef6841..8f6025efc543 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -561,14 +561,14 @@ static ssize_t nullb_device_badblocks_store(struct config_item *item, goto out; /* enable badblocks */ cmpxchg(&t_dev->badblocks.shift, -1, 0); - if (buf[0] == '+') - ret = badblocks_set(&t_dev->badblocks, start, - end - start + 1, 1); - else - ret = badblocks_clear(&t_dev->badblocks, start, - end - start + 1); - if (ret == 0) + if (buf[0] == '+') { + if (badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1)) + ret = count; + } else if (badblocks_clear(&t_dev->badblocks, start, + end - start + 1)) { ret = count; + } out: kfree(orig); return ret; diff --git a/drivers/md/md.c b/drivers/md/md.c index 79cabe4be77d..95ceb6052360 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1748,7 +1748,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (badblocks_set(&rdev->badblocks, sector, count, 1)) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -9833,7 +9833,6 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { struct mddev *mddev = rdev->mddev; - int rv; /* * Recording new badblocks for faulty rdev will force unnecessary @@ -9849,33 +9848,35 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_set(&rdev->badblocks, s, sectors, 0); - if (rv == 0) { - /* Make sure they get written out promptly */ - if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->sb_flags, 0, - BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); - md_wakeup_thread(rdev->mddev->thread); - return 1; - } else + + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) return 0; + + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return 1; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_clear(&rdev->badblocks, s, sectors); - if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + + if (!badblocks_clear(&rdev->badblocks, s, sectors)) + return 0; + + if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return rv; + return 1; } 
EXPORT_SYMBOL_GPL(rdev_clear_badblocks); diff --git a/drivers/nvdimm/badrange.c b/drivers/nvdimm/badrange.c index a002ea6fdd84..ee478ccde7c6 100644 --- a/drivers/nvdimm/badrange.c +++ b/drivers/nvdimm/badrange.c @@ -167,7 +167,7 @@ static void set_badblock(struct badblocks *bb, sector_t s, int num) dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n", (u64) s * 512, (u64) num * 512); /* this isn't an error as the hardware will still throw an exception */ - if (badblocks_set(bb, s, num, 1)) + if (!badblocks_set(bb, s, num, 1)) dev_info_once(bb->dev, "%s: failed for sector %llx\n", __func__, (u64) s); } diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 670f2dae692f..8764bed9ff16 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -50,9 +50,9 @@ struct badblocks_context { int badblocks_check(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors); -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors); void ack_all_badblocks(struct badblocks *bb); ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, -- cgit v1.2.3 From d301f164c3fbff611bd71f57dfa553b9219f0f5e Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Thu, 27 Feb 2025 15:55:07 +0800 Subject: badblocks: use sector_t instead of int to avoid truncation of badblocks length There is a truncation of badblocks length issue when set badblocks as follow: echo "2055 4294967299" > bad_blocks cat bad_blocks 2055 3 Change 'sectors' argument type from 'int' to 'sector_t'. This change avoids truncation of badblocks length for large sectors by replacing 'int' with 'sector_t' (u64), enabling proper handling of larger disk sizes and ensuring compatibility with 64-bit sector addressing. Fixes: 9e0e252a048b ("badblocks: Add core badblock management code") Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-13-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/badblocks.c | 20 ++++++++------------ drivers/block/null_blk/main.c | 3 +-- drivers/md/md.h | 6 +++--- drivers/md/raid1-10.c | 2 +- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 8 ++++---- drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/pfn_devs.c | 7 ++++--- drivers/nvdimm/pmem.c | 2 +- include/linux/badblocks.h | 8 ++++---- 10 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/block/badblocks.c b/block/badblocks.c index e326a16fd056..673ef068423a 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -836,7 +836,7 @@ static bool try_adjacent_combine(struct badblocks *bb, int prev) } /* Do exact work to set bad block range into the bad block table */ -static bool _badblocks_set(struct badblocks *bb, sector_t s, int sectors, +static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, int acknowledged) { int len = 0, added = 0; @@ -956,8 +956,6 @@ update_sectors: if (sectors > 0) goto re_insert; - WARN_ON(sectors < 0); - /* * Check whether the following already set range can be * merged. 
(prev < 0) condition is not handled here, @@ -1048,7 +1046,7 @@ static int front_splitting_clear(struct badblocks *bb, int prev, } /* Do the exact work to clear bad block range from the bad block table */ -static bool _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { struct badblocks_context bad; int prev = -1, hint = -1; @@ -1171,8 +1169,6 @@ update_sectors: if (sectors > 0) goto re_clear; - WARN_ON(sectors < 0); - if (cleared) { badblocks_update_acked(bb); set_changed(bb); @@ -1187,8 +1183,8 @@ update_sectors: } /* Do the exact work to check bad blocks range from the bad block table */ -static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { int prev = -1, hint = -1, set = 0; struct badblocks_context bad; @@ -1298,8 +1294,8 @@ update_sectors: * -1: there are bad blocks which have not yet been acknowledged in metadata. * plus the start/length of the first bad section we overlap. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { unsigned int seq; int rv; @@ -1341,7 +1337,7 @@ EXPORT_SYMBOL_GPL(badblocks_check); * false: failed to set badblocks (out of space). Partial setting will be * treated as failure. */ -bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, int acknowledged) { return _badblocks_set(bb, s, sectors, acknowledged); @@ -1362,7 +1358,7 @@ EXPORT_SYMBOL_GPL(badblocks_set); * true: success * false: failed to clear badblocks */ -bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors) { return _badblocks_clear(bb, s, sectors); } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 8f6025efc543..4a5cd288dc07 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1339,8 +1339,7 @@ blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, struct badblocks *bb = &cmd->nq->dev->badblocks; struct nullb_device *dev = cmd->nq->dev; unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT; - sector_t first_bad; - int bad_sectors; + sector_t first_bad, bad_sectors; unsigned int partial_io_sectors = 0; if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors)) diff --git a/drivers/md/md.h b/drivers/md/md.h index 923a0ef51efe..6edc0f71b7d4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -266,8 +266,8 @@ enum flag_bits { Nonrot, /* non-rotational device (SSD) */ }; -static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, @@ -284,7 +284,7 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } diff --git
a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 4378d3250bd7..62b980b12f93 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, sector_t this_sector, int *len) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; /* no bad block overlap */ if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8beb8cccc6af..0b2839105857 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1537,7 +1537,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&rdev->nr_pending); if (test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, @@ -2886,7 +2886,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { /* may need to read from here */ sector_t first_bad = MaxSector; - int bad_sectors; + sector_t bad_sectors; if (is_badblock(rdev, sector_nr, good_sectors, &first_bad, &bad_sectors)) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7ed933181712..a8664e29aada 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -747,7 +747,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, for (slot = 0; slot < conf->copies ; slot++) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; sector_t dev_sector; unsigned int pending; bool nonrot; @@ -1438,7 +1438,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, dev_sector, max_sectors, @@ -3413,7 +3413,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t from_addr, to_addr; struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t sector, first_bad; - int bad_sectors; + sector_t bad_sectors; if (!rdev || !test_bit(In_sync, &rdev->flags)) continue; @@ -3609,7 +3609,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; - int bad_sectors; + sector_t bad_sectors; struct md_rdev *rdev; if (r10_bio->devs[i].repl_bio) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 5ca06e9a2d29..cc5c8f3f81e8 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -673,7 +673,7 @@ static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, { if (bb->count) { sector_t first_bad; - int num_bad; + sector_t num_bad; return !!badblocks_check(bb, sector, len / 512, &first_bad, &num_bad); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index cfdfe0eaa512..8f3e816e805d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -367,9 +367,10 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) struct nd_namespace_common *ndns = nd_pfn->ndns; void *zero_page = page_address(ZERO_PAGE(0)); struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; - int num_bad, meta_num, rc, bb_present; + int meta_num, rc, bb_present; sector_t first_bad, meta_start; struct nd_namespace_io *nsio; + sector_t num_bad; if (nd_pfn->mode != PFN_MODE_PMEM) return 0; @@ -394,7 +395,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) bb_present = 
badblocks_check(&nd_region->bb, meta_start, meta_num, &first_bad, &num_bad); if (bb_present) { - dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n", + dev_dbg(&nd_pfn->dev, "meta: %llx badblocks at %llx\n", num_bad, first_bad); nsoff = ALIGN_DOWN((nd_region->ndr_start + (first_bad << 9)) - nsio->res.start, @@ -413,7 +414,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) } if (rc) { dev_err(&nd_pfn->dev, - "error clearing %x badblocks at %llx\n", + "error clearing %llx badblocks at %llx\n", num_bad, first_bad); return rc; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d81faa9d89c9..43156e1576c9 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -249,7 +249,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, unsigned int num = PFN_PHYS(nr_pages) >> SECTOR_SHIFT; struct badblocks *bb = &pmem->bb; sector_t first_bad; - int num_bad; + sector_t num_bad; if (kaddr) *kaddr = pmem->virt_addr + offset; diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 8764bed9ff16..996493917f36 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -48,11 +48,11 @@ struct badblocks_context { int ack; }; -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); -bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors); +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, int acknowledged); -bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors); void ack_all_badblocks(struct badblocks *bb); ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, -- cgit v1.2.3 From 1bf70d08cc3b55abd1763e6dff5855cb8dd8318b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:33 +0530 Subject: block: introduce a dedicated lock for protecting queue elevator updates A queue's elevator can be updated either when modifying nr_hw_queues or through the sysfs scheduler attribute. Currently, elevator switching/updating is protected using q->sysfs_lock, but this has led to lockdep splats[1] due to inconsistent lock ordering between q->sysfs_lock and the freeze-lock in multiple block layer call sites. As the scope of q->sysfs_lock is not well-defined, its (mis)use has resulted in numerous lockdep warnings. To address this, introduce a new q->elevator_lock, dedicated specifically to protecting elevator switches/updates, and use it instead of q->sysfs_lock. While at it, make elv_iosched_load_module() a static function, as it is only called from elv_iosched_store(). Also, remove redundant parameters from the elv_iosched_load_module() function signature.
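To make the new ordering concrete, an elevator update now nests the lock inside the queue freeze, roughly as follows (illustrative sketch only; the real call sites are in the hunks below):

	unsigned int memflags;

	memflags = blk_mq_freeze_queue(q);
	mutex_lock(&q->elevator_lock);
	/* switch, update, or tear down q->elevator here */
	mutex_unlock(&q->elevator_lock);
	blk_mq_unfreeze_queue(q, memflags);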
[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-5-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-mq.c | 15 +++++++-------- block/blk-sysfs.c | 32 ++++++++++++++++++++++---------- block/elevator.c | 35 ++++++++++++++++------------------- block/elevator.h | 2 -- block/genhd.c | 9 ++++++--- include/linux/blkdev.h | 8 ++++++++ 7 files changed, 60 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index d6c4fa3943b5..362d0a55b07a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -429,6 +429,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); + mutex_init(&q->elevator_lock); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); diff --git a/block/blk-mq.c b/block/blk-mq.c index 40490ac88045..5a2d63927525 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4467,7 +4467,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, unsigned long i, j; /* protect against switching io scheduler */ - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); @@ -4500,7 +4500,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); @@ -4933,10 +4933,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, if (!qe) return false; - /* q->elevator needs protection from ->sysfs_lock */ - mutex_lock(&q->sysfs_lock); + /* Accessing q->elevator needs protection from ->elevator_lock. */ + mutex_lock(&q->elevator_lock); - /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); goto unlock; @@ -4950,7 +4949,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, list_add(&qe->node, head); elevator_disable(q); unlock: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return true; } @@ -4980,11 +4979,11 @@ static void blk_mq_elv_switch_back(struct list_head *head, list_del(&qe->node); kfree(qe); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index bc641ac71cde..1562e22877e1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -693,10 +693,15 @@ static struct attribute *blk_mq_queue_attrs[] = { * Attributes which are protected with q->sysfs_lock. */ &queue_requests_entry.attr, - &elv_iosched_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif + /* + * Attributes which require some form of locking other than + * q->sysfs_lock. + */ + &elv_iosched_entry.attr, + /* * Attributes which don't require locking. 
*/ @@ -865,15 +870,19 @@ int blk_register_queue(struct gendisk *disk) if (ret) goto out_debugfs_remove; + ret = blk_crypto_sysfs_register(disk); + if (ret) + goto out_unregister_ia_ranges; + + mutex_lock(&q->elevator_lock); if (q->elevator) { ret = elv_register_queue(q, false); - if (ret) - goto out_unregister_ia_ranges; + if (ret) { + mutex_unlock(&q->elevator_lock); + goto out_crypto_sysfs_unregister; + } } - - ret = blk_crypto_sysfs_register(disk); - if (ret) - goto out_elv_unregister; + mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); @@ -898,8 +907,8 @@ int blk_register_queue(struct gendisk *disk) return ret; -out_elv_unregister: - elv_unregister_queue(q); +out_crypto_sysfs_unregister: + blk_crypto_sysfs_unregister(disk); out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); out_debugfs_remove: @@ -945,8 +954,11 @@ void blk_unregister_queue(struct gendisk *disk) blk_mq_sysfs_unregister(disk); blk_crypto_sysfs_unregister(disk); - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elv_unregister_queue(q); + mutex_unlock(&q->elevator_lock); + + mutex_lock(&q->sysfs_lock); disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); diff --git a/block/elevator.c b/block/elevator.c index 041f1d983bc7..b4d08026b02c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -457,7 +457,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) struct elevator_queue *e = q->elevator; int error; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { @@ -481,7 +481,7 @@ void elv_unregister_queue(struct request_queue *q) { struct elevator_queue *e = q->elevator; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); @@ -618,7 +618,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) unsigned int memflags; int ret; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -655,7 +655,7 @@ void elevator_disable(struct request_queue *q) { unsigned int memflags; - lockdep_assert_held(&q->sysfs_lock); + lockdep_assert_held(&q->elevator_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -700,28 +700,23 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return ret; } -void elv_iosched_load_module(struct gendisk *disk, const char *buf, - size_t count) +static void elv_iosched_load_module(char *elevator_name) { - char elevator_name[ELV_NAME_MAX]; struct elevator_type *found; - const char *name; - - strscpy(elevator_name, buf, sizeof(elevator_name)); - name = strstrip(elevator_name); spin_lock(&elv_list_lock); - found = __elevator_find(name); + found = __elevator_find(elevator_name); spin_unlock(&elv_list_lock); if (!found) - request_module("%s-iosched", name); + request_module("%s-iosched", elevator_name); } ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; + char *name; int ret; unsigned int memflags; struct request_queue *q = disk->queue; @@ -731,16 +726,18 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, * queue to ensure that the module file can be read when the request * queue is the one for the device storing the 
module file. */ - elv_iosched_load_module(disk, buf, count); strscpy(elevator_name, buf, sizeof(elevator_name)); + name = strstrip(elevator_name); + + elv_iosched_load_module(name); - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); - ret = elevator_change(q, strstrip(elevator_name)); + mutex_lock(&q->elevator_lock); + ret = elevator_change(q, name); if (!ret) ret = count; + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -751,7 +748,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { @@ -769,7 +766,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return len; } diff --git a/block/elevator.h b/block/elevator.h index e526662c5dbb..e4e44dfac503 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -148,8 +148,6 @@ extern void elv_unregister(struct elevator_type *); * io scheduler sysfs switching */ ssize_t elv_iosched_show(struct gendisk *disk, char *page); -void elv_iosched_load_module(struct gendisk *disk, const char *page, - size_t count); ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); diff --git a/block/genhd.c b/block/genhd.c index e9375e20d866..c2bd86cd09de 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -565,8 +565,11 @@ out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out_exit_elevator: - if (disk->queue->elevator) + if (disk->queue->elevator) { + mutex_lock(&disk->queue->elevator_lock); elevator_exit(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); + } return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); @@ -742,9 +745,9 @@ void del_gendisk(struct gendisk *disk) blk_mq_quiesce_queue(q); if (q->elevator) { - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); elevator_exit(q); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); } rq_qos_exit(q); blk_mq_unquiesce_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..31b1b635c710 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -560,6 +560,14 @@ struct request_queue { struct blk_flush_queue *fq; struct list_head flush_list; + /* + * Protects against I/O scheduler switching, specifically when + * updating q->elevator. To ensure proper locking order during + * an elevator update, first freeze the queue, then acquire + * ->elevator_lock. + */ + struct mutex elevator_lock; + struct mutex sysfs_lock; struct mutex limits_lock; -- cgit v1.2.3 From 3efe7571c3ae2b6481253a2616c2bb3fbadd503b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:34 +0530 Subject: block: protect nr_requests update using q->elevator_lock The sysfs attribute nr_requests could be simultaneously updated from the elevator switch/update or nr_hw_queues update code paths. The update to nr_requests for each of those code paths runs holding q->elevator_lock. So we should protect access to the sysfs attribute nr_requests using q->elevator_lock instead of q->sysfs_lock.
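Concretely, the store path becomes a freeze-then-lock sequence mirroring the elevator update path (condensed sketch of the hunk below; bounds checking omitted):

	memflags = blk_mq_freeze_queue(q);
	mutex_lock(&q->elevator_lock);
	err = blk_mq_update_nr_requests(disk->queue, nr);
	mutex_unlock(&q->elevator_lock);
	blk_mq_unfreeze_queue(q, memflags);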
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-6-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 10 +++++----- include/linux/blkdev.h | 10 ++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1562e22877e1..f1fa57de29ed 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -55,9 +55,9 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) { ssize_t ret; - mutex_lock(&disk->queue->sysfs_lock); + mutex_lock(&disk->queue->elevator_lock); ret = queue_var_show(disk->queue->nr_requests, page); - mutex_unlock(&disk->queue->sysfs_lock); + mutex_unlock(&disk->queue->elevator_lock); return ret; } @@ -76,16 +76,16 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) if (ret < 0) return ret; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; err = blk_mq_update_nr_requests(disk->queue, nr); if (err) ret = err; + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -692,7 +692,6 @@ static struct attribute *blk_mq_queue_attrs[] = { /* * Attributes which are protected with q->sysfs_lock. */ - &queue_requests_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif @@ -701,6 +700,7 @@ static struct attribute *blk_mq_queue_attrs[] = { * q->sysfs_lock. */ &elv_iosched_entry.attr, + &queue_requests_entry.attr, /* * Attributes which don't require locking. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31b1b635c710..3e66ad016a23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,10 +561,12 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, specifically when - * updating q->elevator. To ensure proper locking order during - * an elevator update, first freeze the queue, then acquire - * ->elevator_lock. + * Protects against I/O scheduler switching, particularly when + * updating q->elevator. Since the elevator update code path may + * also modify q->nr_requests, this lock also protects the sysfs + * attribute nr_requests. + * To ensure proper locking order during an elevator update, first + * freeze the queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From 245618f8e45ff4f79327627b474b563da71c2c75 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:35 +0530 Subject: block: protect wbt_lat_usec using q->elevator_lock The wbt latency and state could be updated while initializing the elevator or exiting the elevator. It could also be updated while configuring IO latency QoS parameters using cgroup. The elevator code path is now protected with q->elevator_lock. So we should protect the access to the sysfs attribute wbt_lat_usec using q->elevator_lock instead of q->sysfs_lock. While we're at it, also protect ioc_qos_write(), which configures wbt parameters via cgroup, using q->elevator_lock.
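The cgroup path then takes the same locks in the same order, with the queue quiesce nested innermost (condensed sketch of the ioc_qos_write() hunk below):

	memflags = blk_mq_freeze_queue(disk->queue);
	mutex_lock(&disk->queue->elevator_lock);
	blk_mq_quiesce_queue(disk->queue);
	/* ... update QoS/wbt parameters under ioc->lock ... */
	blk_mq_unquiesce_queue(disk->queue);
	mutex_unlock(&disk->queue->elevator_lock);
	blk_mq_unfreeze_queue(disk->queue, memflags);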
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-7-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 ++ block/blk-sysfs.c | 20 ++++++++------------ include/linux/blkdev.h | 4 ++-- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6be46e28459b..38e7bf3c3b4f 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3248,6 +3248,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, } memflags = blk_mq_freeze_queue(disk->queue); + mutex_lock(&disk->queue->elevator_lock); blk_mq_quiesce_queue(disk->queue); spin_lock_irq(&ioc->lock); @@ -3355,6 +3356,7 @@ einval: spin_unlock_irq(&ioc->lock); blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&disk->queue->elevator_lock); blk_mq_unfreeze_queue(disk->queue, memflags); ret = -EINVAL; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f1fa57de29ed..223da196a548 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -557,7 +557,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ssize_t ret; struct request_queue *q = disk->queue; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); if (!wbt_rq_qos(q)) { ret = -EINVAL; goto out; @@ -570,7 +570,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page) ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); out: - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return ret; } @@ -589,8 +589,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; - mutex_lock(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); rqos = wbt_rq_qos(q); if (!rqos) { @@ -619,8 +619,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, blk_mq_unquiesce_queue(q); out: + mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -689,19 +689,15 @@ static struct attribute *queue_attrs[] = { /* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { - /* - * Attributes which are protected with q->sysfs_lock. - */ -#ifdef CONFIG_BLK_WBT - &queue_wb_lat_entry.attr, -#endif /* * Attributes which require some form of locking other than * q->sysfs_lock. */ &elv_iosched_entry.attr, &queue_requests_entry.attr, - +#ifdef CONFIG_BLK_WBT + &queue_wb_lat_entry.attr, +#endif /* * Attributes which don't require locking. */ @@ -882,10 +878,10 @@ int blk_register_queue(struct gendisk *disk) goto out_crypto_sysfs_unregister; } } + wbt_enable_default(disk); mutex_unlock(&q->elevator_lock); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3e66ad016a23..0ee3b5c9388e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -563,8 +563,8 @@ struct request_queue { /* * Protects against I/O scheduler switching, particularly when * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests, this lock also protects the sysfs - * attribute nr_requests. 
+ * also modify q->nr_requests and wbt latency, this lock also + * protects the sysfs attributes nr_requests and wbt_lat_usec. * To ensure proper locking order during an elevator update, first * freeze the queue, then acquire ->elevator_lock. */ -- cgit v1.2.3 From 5e40f4452dc9a3fb44d13bb6bc7032f3911a2675 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:36 +0530 Subject: block: protect read_ahead_kb using q->limits_lock The bdi->ra_pages could be updated under q->limits_lock because it's usually calculated from the queue limits by queue_limits_commit_update. So protect reading/writing the sysfs attribute read_ahead_kb using q->limits_lock instead of q->sysfs_lock. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-8-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 16 ++++++++++------ include/linux/blkdev.h | 3 +++ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 223da196a548..d584461a1d84 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -93,9 +93,9 @@ static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; - mutex_lock(&disk->queue->sysfs_lock); + mutex_lock(&disk->queue->limits_lock); ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page); - mutex_unlock(&disk->queue->sysfs_lock); + mutex_unlock(&disk->queue->limits_lock); return ret; } @@ -111,12 +111,15 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - mutex_lock(&q->sysfs_lock); + /* + * ->ra_pages is protected by ->limits_lock because it is usually + * calculated from the queue limits by queue_limits_commit_update. + */ + mutex_lock(&q->limits_lock); memflags = blk_mq_freeze_queue(q); disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + mutex_unlock(&q->limits_lock); blk_mq_unfreeze_queue(q, memflags); - mutex_unlock(&q->sysfs_lock); return ret; } @@ -670,7 +673,8 @@ static struct attribute *queue_attrs[] = { &queue_dma_alignment_entry.attr, /* - * Attributes which are protected with q->sysfs_lock. + * Attributes which require some form of locking other than + * q->sysfs_lock. */ &queue_ra_entry.attr, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ee3b5c9388e..3bee1b4858b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,9 @@ struct request_queue { struct mutex elevator_lock; struct mutex sysfs_lock; + /* + * Protects queue limits and also sysfs attribute read_ahead_kb. + */ struct mutex limits_lock; /* -- cgit v1.2.3 From 5abba4cebec0a591ca7e7f55701e42cd5dc059af Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 6 Mar 2025 15:09:53 +0530 Subject: block: protect hctx attributes/params using q->elevator_lock Currently, hctx attributes (nr_tags, nr_reserved_tags, and cpu_list) are protected using `q->sysfs_lock`. However, these attributes can be updated in multiple scenarios: - During the driver's probe method. - When updating nr_hw_queues. - When writing to the sysfs attribute nr_requests, which can modify nr_tags. The nr_requests attribute is already protected using q->elevator_lock, but none of the update paths actually use q->sysfs_lock to protect hctx attributes. So to ensure proper synchronization, replace q->sysfs_lock with q->elevator_lock when reading hctx attributes through sysfs. 
Additionally, blk_mq_update_nr_hw_queues allocates and updates hctx. The allocation of hctx is protected using q->elevator_lock; however, updating hctx params happens without any protection, so safeguard the hctx param update path by also using q->elevator_lock. Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250306093956.2818808-1-nilay@linux.ibm.com [axboe: wrap comment at 80 chars] Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 4 ++-- block/blk-mq.c | 4 ++++ include/linux/blkdev.h | 14 ++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 3feeeccf8a99..24656980f443 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -61,9 +61,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, if (!entry->show) return -EIO; - mutex_lock(&q->sysfs_lock); + mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); return res; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 5a2d63927525..b9550a127c8e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4094,6 +4094,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&q->elevator_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -4198,6 +4200,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + + mutex_unlock(&q->elevator_lock); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3bee1b4858b6..dcf8fce15e23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,12 +561,14 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, particularly when - * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests and wbt latency, this lock also - * protects the sysfs attributes nr_requests and wbt_lat_usec. - * To ensure proper locking order during an elevator update, first - * freeze the queue, then acquire ->elevator_lock. + * Protects against I/O scheduler switching, particularly when updating + * q->elevator. Since the elevator update code path may also modify q-> + * nr_requests and wbt latency, this lock also protects the sysfs attrs + * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update + * may modify hctx tags, reserved-tags and cpumask, so this lock also + * helps protect the hctx attrs. To ensure proper locking order during + * an elevator or nr_hw_queue update, first freeze the queue, then + * acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From 75618ac6e98faee6ed1f17ae64875cc2d7784204 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 13 Mar 2025 09:23:18 +0530 Subject: block: remove unused 'q' parameter in __blk_rq_map_sg() request_queue param is no longer used by blk_rq_map_sg and __blk_rq_map_sg. Remove it.
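Callers simply drop the queue argument; for example (sgl here stands in for whatever scatterlist the caller already owns):

	/* before */
	nents = blk_rq_map_sg(req->q, req, sgl);
	/* after */
	nents = blk_rq_map_sg(req, sgl);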
Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313035322.243239-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 4 ++-- block/bsg-lib.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/block/sunvdc.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/memstick/core/ms_block.c | 2 +- drivers/memstick/core/mspro_block.c | 4 +--- drivers/mmc/core/queue.c | 2 +- drivers/mtd/ubi/block.c | 2 +- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 3 +-- drivers/nvme/target/loop.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 9 ++++----- 18 files changed, 22 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 15cd231d560c..8bfe54f23e5e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -551,8 +551,8 @@ static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, * Map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries. */ -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg) { struct req_iterator iter = { .bio = rq->bio, diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 93523d8f8195..9ceb5d0832f5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -219,7 +219,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); - buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->sg_cnt = blk_rq_map_sg(req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 95361099a2dc..0d619df03fa9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2056,7 +2056,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, unsigned int nents; /* Map the scatter list for DMA access */ - nents = blk_rq_map_sg(hctx->queue, rq, command->sg); + nents = blk_rq_map_sg(rq, command->sg); nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); prefetch(&port->flags); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 82467ecde7ec..15627417f12e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1010,7 +1010,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, * See queue limits. 
*/ if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES)) - sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl); + sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl); if (sg_cnt == 0) sg_mark_end(&iu->sgt.sgl[0]); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 282f81616a78..2b33fb5b949b 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -485,7 +485,7 @@ static int __send_request(struct request *req) } sg_init_table(sg, port->ring_cookies); - nsg = blk_rq_map_sg(req->q, req, sg); + nsg = blk_rq_map_sg(req, sg); len = 0; for (i = 0; i < nsg; i++) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6a61ec35f426..a3df4d49bd46 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -226,7 +226,7 @@ static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, if (unlikely(err)) return -ENOMEM; - return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); + return blk_rq_map_sg(req, vbr->sg_table.sgl); } static void virtblk_cleanup_cmd(struct request *req) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index edcd08a9dcef..5babe575c288 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -751,7 +751,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri id = blkif_ring_get_request(rinfo, req, &final_ring_req); ring_req = &rinfo->shadow[id].req; - num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); + num_sg = blk_rq_map_sg(req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 5b617c1f6789..f4398383ae06 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1904,7 +1904,7 @@ static void msb_io_work(struct work_struct *work) /* process the request */ dbg_verbose("IO: processing new request"); - blk_rq_map_sg(msb->queue, req, sg); + blk_rq_map_sg(req, sg); lba = blk_rq_pos(req); diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 634d343b6bdb..c9853d887d28 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -627,9 +627,7 @@ static int mspro_block_issue_req(struct memstick_dev *card) while (true) { msb->current_page = 0; msb->current_seg = 0; - msb->seg_count = blk_rq_map_sg(msb->block_req->q, - msb->block_req, - msb->req_sg); + msb->seg_count = blk_rq_map_sg(msb->block_req, msb->req_sg); if (!msb->seg_count) { unsigned int bytes = blk_rq_cur_bytes(msb->block_req); diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index ab662f502fe7..3ba62f825b84 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -523,5 +523,5 @@ unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq) { struct request *req = mmc_queue_req_to_req(mqrq); - return blk_rq_map_sg(mq->queue, req, mqrq->sg); + return blk_rq_map_sg(req, mqrq->sg); } diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 2836905f0152..39cc0a6a4d37 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -199,7 +199,7 @@ static blk_status_t ubiblock_read(struct request *req) * and ubi_read_sg() will check that limit. 
*/ ubi_sgl_init(&pdu->usgl); - blk_rq_map_sg(req->q, req, pdu->usgl.sg); + blk_rq_map_sg(req, pdu->usgl.sg); while (bytes_left) { /* diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 1de11b722f04..fe2f9b143c9f 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -525,7 +525,7 @@ static blk_status_t apple_nvme_map_data(struct apple_nvme *anv, if (!iod->sg) return BLK_STS_RESOURCE; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + iod->nents = blk_rq_map_sg(req, iod->sg); if (!iod->nents) goto out_free_sg; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4f1866fbd5b..7de29dae8e74 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2620,7 +2620,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, if (ret) return -ENOMEM; - op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); + op->nents = blk_rq_map_sg(rq, freq->sg_table.sgl); WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, rq_dma_dir(rq)); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9197a5b173fd..a65978b6cdd8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -812,7 +812,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (!iod->sgt.sgl) return BLK_STS_RESOURCE; sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); + iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl); if (!iod->sgt.orig_nents) goto out_free_sg; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 86a2891d9bcc..b5a0295b5bf4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1476,8 +1476,7 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, if (ret) return -ENOMEM; - req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, - req->data_sgl.sg_table.sgl); + req->data_sgl.nents = blk_rq_map_sg(rq, req->data_sgl.sg_table.sgl); *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a9d112d34d4f..a5c41144667c 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -162,7 +162,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, } iod->req.sg = iod->sg_table.sgl; - iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.sg_cnt = blk_rq_map_sg(req, iod->sg_table.sgl); iod->req.transfer_len = blk_rq_payload_bytes(req); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index be0890e4e706..02576c98a833 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1149,7 +1149,7 @@ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) * Next, walk the list, and fill in the addresses and sizes of * each segment. 
*/ - count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + count = __blk_rq_map_sg(rq, cmd->sdb.table.sgl, &last_sg); if (blk_rq_bytes(rq) & rq->q->limits.dma_pad_mask) { unsigned int pad_len = diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cd..d99024423355 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1155,14 +1155,13 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) return max_t(unsigned short, rq->nr_phys_segments, 1); } -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg); -static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg); +static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; - return __blk_rq_map_sg(q, rq, sglist, &last_sg); + return __blk_rq_map_sg(rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); -- cgit v1.2.3 From a3996d11f3ab743e6cc4e3529ce9459c2cd27139 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Mar 2025 17:21:50 +0530 Subject: block: protect debugfs attrs using elevator_lock instead of sysfs_lock Currently, the block debugfs attributes (tags, tags_bitmap, sched_tags, and sched_tags_bitmap) are protected using q->sysfs_lock. However, these attributes are updated in multiple scenarios: - During driver probe method - During an elevator switch/update - During an nr_hw_queues update - When writing to the sysfs attribute nr_requests All these update paths (except driver probe method, which doesn't require any protection) are already protected using q->elevator_lock. To ensure consistency and proper synchronization, replace q->sysfs_lock with q->elevator_lock for protecting these debugfs attributes. 
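All four show handlers follow the same pattern, taking the lock interruptibly so that a reader stuck behind a slow elevator or nr_hw_queues update can still be signalled (as in the hctx_tags_show() hunk below):

	res = mutex_lock_interruptible(&q->elevator_lock);
	if (res)
		goto out;
	if (hctx->tags)
		blk_mq_debugfs_tags_show(m, hctx->tags);
	mutex_unlock(&q->elevator_lock);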
Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313115235.3707600-2-nilay@linux.ibm.com [axboe: some commit message rewording/fixes] Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 16 ++++++++-------- include/linux/blkdev.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index adf5f0697b6b..62775b132d4c 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -400,12 +400,12 @@ static int hctx_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -417,12 +417,12 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -434,12 +434,12 @@ static int hctx_sched_tags_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; @@ -451,12 +451,12 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) struct request_queue *q = hctx->queue; int res; - res = mutex_lock_interruptible(&q->sysfs_lock); + res = mutex_lock_interruptible(&q->elevator_lock); if (res) goto out; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); - mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->elevator_lock); out: return res; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dcf8fce15e23..8d072042790e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -566,9 +566,9 @@ struct request_queue { * nr_requests and wbt latency, this lock also protects the sysfs attrs * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update * may modify hctx tags, reserved-tags and cpumask, so this lock also - * helps protect the hctx attrs. To ensure proper locking order during - * an elevator or nr_hw_queue update, first freeze the queue, then - * acquire ->elevator_lock. + * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking + * order during an elevator or nr_hw_queue update, first freeze the + * queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From 5c12a9cdb5ad54621f1b7c02df7993a4a1b86a46 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:10 +0100 Subject: nvme: add nvme_auth_generate_psk() Add a function to generate a NVMe PSK from the shared credentials negotiated by DH-HMAC-CHAP. 
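A hypothetical caller (the local names here are illustrative, not part of this patch) would look roughly like:

	u8 *psk;
	size_t psk_len;
	int ret;

	/* generated PSK = HMAC(KS, C1 || C2), sized by the hash behind hmac_id */
	ret = nvme_auth_generate_psk(hmac_id, session_key, session_key_len,
				     challenge1, challenge2, hash_len,
				     &psk, &psk_len);
	if (ret)
		return ret;
	/* ... use the generated PSK ... */
	kfree_sensitive(psk);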
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-auth.h | 3 ++ 2 files changed, 90 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 9b7126e1a19d..76e4a12f005e 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -471,5 +472,91 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) } EXPORT_SYMBOL_GPL(nvme_auth_generate_key); +/** + * nvme_auth_generate_psk - Generate a PSK for TLS + * @hmac_id: Hash function identifier + * @skey: Session key + * @skey_len: Length of @skey + * @c1: Value of challenge C1 + * @c2: Value of challenge C2 + * @hash_len: Hash length of the hash algorithm + * @ret_psk: Pointer to the resulting generated PSK + * @ret_len: length of @ret_psk + * + * Generate a PSK for TLS as specified in NVMe base specification, section + * 8.13.5.9: Generated PSK for TLS + * + * The generated PSK for TLS shall be computed applying the HMAC function + * using the hash function H( ) selected by the HashID parameter in the + * DH-HMAC-CHAP_Challenge message with the session key KS as key to the + * concatenation of the two challenges C1 and C2 (i.e., generated + * PSK = HMAC(KS, C1 || C2)). + * + * Returns 0 on success with a valid generated PSK pointer in @ret_psk and + * the length of @ret_psk in @ret_len, or a negative error number otherwise. + */ +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *psk; + const char *hmac_name; + int ret, psk_len; + + if (!c1 || !c2) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + psk_len = crypto_shash_digestsize(tfm); + psk = kzalloc(psk_len, GFP_KERNEL); + if (!psk) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, skey, skey_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c1, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c2, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_final(shash, psk); + if (!ret) { + *ret_psk = psk; + *ret_len = psk_len; + } + +out_free_psk: + if (ret) + kfree_sensitive(psk); +out_free_tfm: + crypto_free_shash(tfm); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); + MODULE_DESCRIPTION("NVMe Authentication framework"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index c1d0bc5d9624..b13884b04dfd 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -40,5 +40,8 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len); +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, + u8 **ret_psk, size_t *ret_len); #endif /* _NVME_AUTH_H */ -- cgit v1.2.3 From 71972b9ffe1efe183a87d76d094236f9ec30656e Mon Sep
17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:11 +0100 Subject: nvme: add nvme_auth_generate_digest() Add a function to calculate the PSK digest as specified in TP8018. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 134 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-auth.h | 2 + 2 files changed, 136 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 76e4a12f005e..54b75d177101 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -558,5 +558,139 @@ out_free_tfm: } EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); +/** + * nvme_auth_generate_digest - Generate TLS PSK digest + * @hmac_id: Hash function identifier + * @psk: Generated input PSK + * @psk_len: Length of @psk + * @subsysnqn: NQN of the subsystem + * @hostnqn: NQN of the host + * @ret_digest: Pointer to the returned digest + * + * Generate a TLS PSK digest as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The PSK digest shall be computed by encoding in Base64 (refer to RFC + * 4648) the result of the application of the HMAC function using the hash + * function specified in item 4 above (ie the hash function of the cipher + * suite associated with the PSK identity) with the PSK as HMAC key to the + * concatenation of: + * - the NQN of the host (i.e., NQNh) not including the null terminator; + * - a space character; + * - the NQN of the NVM subsystem (i.e., NQNc) not including the null + * terminator; + * - a space character; and + * - the seventeen ASCII characters "NVMe-over-Fabrics" + * (i.e., = Base64(HMAC(PSK, NQNh || " " || NQNc || " " || + * "NVMe-over-Fabrics"))). + * The length of the PSK digest depends on the hash function used to compute + * it as follows: + * - If the SHA-256 hash function is used, the resulting PSK digest is 44 + * characters long; or + * - If the SHA-384 hash function is used, the resulting PSK digest is 64 + * characters long. + * + * Returns 0 on success with a valid digest pointer in @ret_digest, or a + * negative error number on failure. 
+ */ +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *digest, *enc; + const char *hmac_name; + size_t digest_len, hmac_len; + int ret; + + if (WARN_ON(!subsysnqn || !hostnqn)) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + switch (nvme_auth_hmac_hash_len(hmac_id)) { + case 32: + hmac_len = 44; + break; + case 48: + hmac_len = 64; + break; + default: + pr_warn("%s: invalid hash algorithm '%s'\n", + __func__, hmac_name); + return -EINVAL; + } + + enc = kzalloc(hmac_len + 1, GFP_KERNEL); + if (!enc) + return -ENOMEM; + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out_free_enc; + } + + digest_len = crypto_shash_digestsize(tfm); + digest = kzalloc(digest_len, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, psk, psk_len); + if (ret) + goto out_free_digest; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " ", 1); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18); + if (ret) + goto out_free_digest; + + ret = crypto_shash_final(shash, digest); + if (ret) + goto out_free_digest; + + ret = base64_encode(digest, digest_len, enc); + if (ret < hmac_len) { + ret = -ENOKEY; + goto out_free_digest; + } + *ret_digest = enc; + ret = 0; + +out_free_digest: + kfree_sensitive(digest); +out_free_tfm: + crypto_free_shash(tfm); +out_free_enc: + if (ret) + kfree_sensitive(enc); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); + MODULE_DESCRIPTION("NVMe Authentication framework"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index b13884b04dfd..998f06bf10fd 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -43,5 +43,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest); #endif /* _NVME_AUTH_H */ -- cgit v1.2.3 From 9d5c0fffee266f61ccc745faa6298dafe2b8c5bf Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:12 +0100 Subject: nvme: add nvme_auth_derive_tls_psk() Add a function to derive the TLS PSK as specified in TP8018.
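A hypothetical caller sketch (illustrative names; the derived TLS PSK has the same length as the input PSK):

	u8 *tls_psk;
	int ret;

	ret = nvme_auth_derive_tls_psk(hmac_id, psk, psk_len,
				       psk_digest, &tls_psk);
	if (ret)
		return ret;
	/* ... hand the psk_len-byte tls_psk to the TLS layer ... */
	kfree_sensitive(tls_psk);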
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/Kconfig | 1 + drivers/nvme/common/auth.c | 116 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-auth.h | 2 + 3 files changed, 119 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index 244432e0b73d..da963e4f3f1f 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -12,3 +12,4 @@ config NVME_AUTH select CRYPTO_SHA512 select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS + select CRYPTO_HKDF diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 54b75d177101..2c092ec8c0a9 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -15,6 +15,8 @@ #include #include +#define HKDF_MAX_HASHLEN 64 + static u32 nvme_dhchap_seqnum; static DEFINE_MUTEX(nvme_dhchap_mutex); @@ -692,5 +694,119 @@ out_free_enc: } EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); +/** + * nvme_auth_derive_tls_psk - Derive TLS PSK + * @hmac_id: Hash function identifier + * @psk: generated input PSK + * @psk_len: size of @psk + * @psk_digest: TLS PSK digest + * @ret_psk: Pointer to the resulting TLS PSK + * + * Derive a TLS PSK as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The TLS PSK shall be derived as follows from an input PSK + * (i.e., either a retained PSK or a generated PSK) and a PSK + * identity using the HKDF-Extract and HKDF-Expand-Label operations + * (refer to RFC 5869 and RFC 8446) where the hash function is the + * one specified by the hash specifier of the PSK identity: + * 1. PRK = HKDF-Extract(0, Input PSK); and + * 2. TLS PSK = HKDF-Expand-Label(PRK, "nvme-tls-psk", PskIdentityContext, L), + * where PskIdentityContext is the hash identifier indicated in + * the PSK identity concatenated to a space character and to the + * Base64 PSK digest (i.e., " ") and L is the + * output size in bytes of the hash function (i.e., 32 for SHA-256 + * and 48 for SHA-384). + * + * Returns 0 on success with a valid psk pointer in @ret_psk or a negative + * error number otherwise. + */ +int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, + u8 *psk_digest, u8 **ret_psk) +{ + struct crypto_shash *hmac_tfm; + const char *hmac_name; + const char *psk_prefix = "tls13 nvme-tls-psk"; + static const char default_salt[HKDF_MAX_HASHLEN]; + size_t info_len, prk_len; + char *info; + unsigned char *prk, *tls_key; + int ret; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + if (hmac_id == NVME_AUTH_HASH_SHA512) { + pr_warn("%s: unsupported hash algorithm %s\n", + __func__, hmac_name); + return -EINVAL; + } + + hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(hmac_tfm)) + return PTR_ERR(hmac_tfm); + + prk_len = crypto_shash_digestsize(hmac_tfm); + prk = kzalloc(prk_len, GFP_KERNEL); + if (!prk) { + ret = -ENOMEM; + goto out_free_shash; + } + + if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) { + ret = -EINVAL; + goto out_free_prk; + } + ret = hkdf_extract(hmac_tfm, psk, psk_len, + default_salt, prk_len, prk); + if (ret) + goto out_free_prk; + + ret = crypto_shash_setkey(hmac_tfm, prk, prk_len); + if (ret) + goto out_free_prk; + + /* + * 2 additional bytes for the length field from HKDF-Expand-Label, + * 2 additional bytes for the HMAC ID, and one byte for the space + * separator.
From 62eb89323cb08f1d6a3b41b84972ff4f373a1960 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 24 Feb 2025 13:38:13 +0100
Subject: nvme-keyring: add nvme_tls_psk_refresh()

Add a function to refresh a generated PSK in the specified keyring.
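For illustration, a sketch of how the host side is expected to call this
helper once secure channel concatenation has produced a TLS PSK (the NQNs
and key material below are placeholders, not values from this patch):

	struct key *key;

	/* A NULL keyring selects the default NVMe keyring. */
	key = nvme_tls_psk_refresh(NULL,
				   "nqn.2014-08.com.example:host",
				   "nqn.2014-08.com.example:subsys",
				   NVME_AUTH_HASH_SHA256,
				   tls_psk, tls_psk_len, digest);
	if (IS_ERR(key))
		return PTR_ERR(key);
	/* The key is filed under "NVMe1G<hh> <hostnqn> <subnqn> <digest>". */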
Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/common/keyring.c | 65 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/tcp.c       |  1 -
 drivers/nvme/target/tcp.c     |  1 -
 include/linux/nvme-keyring.h  | 12 +++++++-
 4 files changed, 75 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index ed5167f942d8..32d16c53133b 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -5,7 +5,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
@@ -124,6 +123,70 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
 	return key_ref_to_ptr(keyref);
 }
 
+/**
+ * nvme_tls_psk_refresh - Refresh TLS PSK
+ * @keyring: Keyring holding the TLS PSK
+ * @hostnqn: Host NQN to use
+ * @subnqn: Subsystem NQN to use
+ * @hmac_id: Hash function identifier
+ * @data: TLS PSK key material
+ * @data_len: Length of @data
+ * @digest: TLS PSK digest
+ *
+ * Refresh a generated version 1 TLS PSK with the identity generated
+ * from @hmac_id, @hostnqn, @subnqn, and @digest in the keyring given
+ * by @keyring.
+ *
+ * Returns the updated key on success or an error pointer otherwise.
+ */
+struct key *nvme_tls_psk_refresh(struct key *keyring,
+		const char *hostnqn, const char *subnqn, u8 hmac_id,
+		u8 *data, size_t data_len, const char *digest)
+{
+	key_perm_t keyperm =
+		KEY_POS_SEARCH | KEY_POS_VIEW | KEY_POS_READ |
+		KEY_POS_WRITE | KEY_POS_LINK | KEY_POS_SETATTR |
+		KEY_USR_SEARCH | KEY_USR_VIEW | KEY_USR_READ;
+	char *identity;
+	key_ref_t keyref;
+	key_serial_t keyring_id;
+	struct key *key;
+
+	if (!hostnqn || !subnqn || !data || !data_len)
+		return ERR_PTR(-EINVAL);
+
+	identity = kasprintf(GFP_KERNEL, "NVMe1G%02d %s %s %s",
+			     hmac_id, hostnqn, subnqn, digest);
+	if (!identity)
+		return ERR_PTR(-ENOMEM);
+
+	if (!keyring)
+		keyring = nvme_keyring;
+	keyring_id = key_serial(keyring);
+	pr_debug("keyring %x refresh tls psk '%s'\n",
+		 keyring_id, identity);
+	keyref = key_create_or_update(make_key_ref(keyring, true),
+				      "psk", identity, data, data_len,
+				      keyperm, KEY_ALLOC_NOT_IN_QUOTA |
+				      KEY_ALLOC_BUILT_IN |
+				      KEY_ALLOC_BYPASS_RESTRICTION);
+	if (IS_ERR(keyref)) {
+		pr_debug("refresh tls psk '%s' failed, error %ld\n",
+			 identity, PTR_ERR(keyref));
+		kfree(identity);
+		return ERR_PTR(-ENOKEY);
+	}
+	kfree(identity);
+	/*
+	 * Set the default timeout to 1 hour
+	 * as suggested in TP8018.
+	 */
+	key = key_ref_to_ptr(keyref);
+	key_set_timeout(key, 3600);
+	return key;
+}
+EXPORT_SYMBOL_GPL(nvme_tls_psk_refresh);
+
 /*
  * NVMe PSK priority list
  *
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 841238f38fdd..b50972257e49 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 7c51c2a8c109..fa59a7996efa 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index 19d2b256180f..ab8971afa973 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -6,15 +6,25 @@
 #ifndef _NVME_KEYRING_H
 #define _NVME_KEYRING_H
 
+#include
+
 #if IS_ENABLED(CONFIG_NVME_KEYRING)
 
+struct key *nvme_tls_psk_refresh(struct key *keyring,
+		const char *hostnqn, const char *subnqn, u8 hmac_id,
+		u8 *data, size_t data_len, const char *digest);
 key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn);
 
 key_serial_t nvme_keyring_id(void);
 struct key *nvme_tls_key_lookup(key_serial_t key_id);
 #else
-
+static inline struct key *nvme_tls_psk_refresh(struct key *keyring,
+		const char *hostnqn, const char *subnqn, u8 hmac_id,
+		u8 *data, size_t data_len, const char *digest)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
 static inline key_serial_t nvme_tls_psk_default(struct key *keyring,
 		const char *hostnqn, const char *subnqn)
 {
-- cgit v1.2.3


From e88a7595b57f2a04f1be796419444b4a14a55d18 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Mon, 24 Feb 2025 13:38:14 +0100
Subject: nvme-tcp: request secure channel concatenation

Add a fabrics option 'concat' to request secure channel concatenation as
specified in the NVMe Base Specification v2.1, section 8.3.4.3: Secure
Channel Concatenation.
When secure channel concatenation is enabled, a 'generated PSK' is
inserted into the keyring such that it's available after reset.
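The resulting flow, condensed from the nvme_auth_secure_concat() added
below (a sketch only; error handling and cleanup are omitted):

	/* 1. Derive a generated PSK from the DH-HMAC-CHAP session key */
	nvme_auth_generate_psk(chap->hash_id, chap->sess_key, chap->sess_key_len,
			       chap->c1, chap->c2, chap->hash_len, &psk, &psk_len);
	/* 2. Hash host NQN and subsystem NQN into the Base64 PSK digest */
	nvme_auth_generate_digest(chap->hash_id, psk, psk_len,
				  ctrl->opts->subsysnqn, ctrl->opts->host->nqn,
				  &digest);
	/* 3. Derive the TLS PSK per TP8018 (HKDF-Extract + HKDF-Expand-Label) */
	nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len, digest, &tls_psk);
	/* 4. Insert the TLS PSK into the keyring for the post-reset connection */
	tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring, ctrl->opts->host->nqn,
				       ctrl->opts->subsysnqn, chap->hash_id,
				       tls_psk, psk_len, digest);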
Signed-off-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/Kconfig   |   2 +-
 drivers/nvme/host/auth.c    | 115 ++++++++++++++++++++++++++++++++++++++++++--
 drivers/nvme/host/fabrics.c |  34 +++++++++++--
 drivers/nvme/host/fabrics.h |   3 ++
 drivers/nvme/host/nvme.h    |   2 +
 drivers/nvme/host/sysfs.c   |   4 +-
 drivers/nvme/host/tcp.c     |  53 +++++++++++++++++---
 include/linux/nvme.h        |   7 +++
 8 files changed, 205 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 486afe598184..10e453b2436e 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -109,7 +109,7 @@ config NVME_HOST_AUTH
 	bool "NVMe over Fabrics In-Band Authentication in host side"
 	depends on NVME_CORE
 	select NVME_AUTH
-	select NVME_KEYRING if NVME_TCP_TLS
+	select NVME_KEYRING
 	help
 	  This provides support for NVMe over Fabrics In-Band Authentication in
 	  host side.
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 5ea0e21709da..6115fef74c1e 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -12,6 +12,7 @@
 #include "nvme.h"
 #include "fabrics.h"
 #include
+#include
 
 #define CHAP_BUF_SIZE 4096
 static struct kmem_cache *nvme_chap_buf_cache;
@@ -131,7 +132,13 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
 	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
 	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
 	data->t_id = cpu_to_le16(chap->transaction);
-	data->sc_c = 0; /* No secure channel concatenation */
+	if (ctrl->opts->concat && chap->qid == 0) {
+		if (ctrl->opts->tls_key)
+			data->sc_c = NVME_AUTH_SECP_REPLACETLSPSK;
+		else
+			data->sc_c = NVME_AUTH_SECP_NEWTLSPSK;
+	} else
+		data->sc_c = NVME_AUTH_SECP_NOSC;
 	data->napd = 1;
 	data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
 	data->auth_protocol[0].dhchap.halen = 3;
@@ -311,8 +318,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 	data->hl = chap->hash_len;
 	data->dhvlen = cpu_to_le16(chap->host_key_len);
 	memcpy(data->rval, chap->response, chap->hash_len);
-	if (ctrl->ctrl_key) {
+	if (ctrl->ctrl_key)
 		chap->bi_directional = true;
+	if (ctrl->ctrl_key || ctrl->opts->concat) {
 		get_random_bytes(chap->c2, chap->hash_len);
 		data->cvalid = 1;
 		memcpy(data->rval + chap->hash_len, chap->c2,
@@ -322,7 +330,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 	} else {
 		memset(chap->c2, 0, chap->hash_len);
 	}
-	chap->s2 = nvme_auth_get_seqnum();
+	if (ctrl->opts->concat)
+		chap->s2 = 0;
+	else
+		chap->s2 = nvme_auth_get_seqnum();
 	data->seqnum = cpu_to_le32(chap->s2);
 	if (chap->host_key_len) {
 		dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
@@ -677,6 +688,92 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
 		crypto_free_kpp(chap->dh_tfm);
 }
 
+void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl)
+{
+	dev_dbg(ctrl->device, "Wipe generated TLS PSK %08x\n",
+		key_serial(ctrl->opts->tls_key));
+	key_revoke(ctrl->opts->tls_key);
+	key_put(ctrl->opts->tls_key);
+	ctrl->opts->tls_key = NULL;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key);
+
+static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
+				   struct nvme_dhchap_queue_context *chap)
+{
+	u8 *psk, *digest, *tls_psk;
+	struct key *tls_key;
+	size_t psk_len;
+	int ret = 0;
+
+	if (!chap->sess_key) {
+		dev_warn(ctrl->device,
+			 "%s: qid %d no session key negotiated\n",
+			 __func__, chap->qid);
+		return -ENOKEY;
+	}
+
+	if (chap->qid) {
+		dev_warn(ctrl->device,
+			 "qid %d: secure concatenation not supported on I/O queues\n",
+			 chap->qid);
+		return -EINVAL;
+	}
+	ret = nvme_auth_generate_psk(chap->hash_id, chap->sess_key,
+				     chap->sess_key_len,
+				     chap->c1, chap->c2,
+				     chap->hash_len, &psk, &psk_len);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "%s: qid %d failed to generate PSK, error %d\n",
+			 __func__, chap->qid, ret);
+		return ret;
+	}
+	dev_dbg(ctrl->device,
+		"%s: generated psk %*ph\n", __func__, (int)psk_len, psk);
+
+	ret = nvme_auth_generate_digest(chap->hash_id, psk, psk_len,
+					ctrl->opts->subsysnqn,
+					ctrl->opts->host->nqn, &digest);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "%s: qid %d failed to generate digest, error %d\n",
+			 __func__, chap->qid, ret);
+		goto out_free_psk;
+	}
+	dev_dbg(ctrl->device, "%s: generated digest %s\n",
+		__func__, digest);
+	ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len,
+				       digest, &tls_psk);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "%s: qid %d failed to derive TLS psk, error %d\n",
+			 __func__, chap->qid, ret);
+		goto out_free_digest;
+	}
+
+	tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring,
+				       ctrl->opts->host->nqn,
+				       ctrl->opts->subsysnqn, chap->hash_id,
+				       tls_psk, psk_len, digest);
+	if (IS_ERR(tls_key)) {
+		ret = PTR_ERR(tls_key);
+		dev_warn(ctrl->device,
+			 "%s: qid %d failed to insert generated key, error %d\n",
+			 __func__, chap->qid, ret);
+		tls_key = NULL;
+	}
+	kfree_sensitive(tls_psk);
+	if (ctrl->opts->tls_key)
+		nvme_auth_revoke_tls_key(ctrl);
+	ctrl->opts->tls_key = tls_key;
+out_free_digest:
+	kfree_sensitive(digest);
+out_free_psk:
+	kfree_sensitive(psk);
+	return ret;
+}
+
 static void nvme_queue_auth_work(struct work_struct *work)
 {
 	struct nvme_dhchap_queue_context *chap =
@@ -833,6 +930,13 @@ static void nvme_queue_auth_work(struct work_struct *work)
 	}
 	if (!ret) {
 		chap->error = 0;
+		if (ctrl->opts->concat &&
+		    (ret = nvme_auth_secure_concat(ctrl, chap))) {
+			dev_warn(ctrl->device,
+				 "%s: qid %d failed to enable secure concatenation\n",
+				 __func__, chap->qid);
+			chap->error = ret;
+		}
 		return;
 	}
 
@@ -912,6 +1016,11 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
 			 "qid 0: authentication failed\n");
 		return;
 	}
+	/*
+	 * Only run authentication on the admin queue for secure concatenation.
+	 */
+	if (ctrl->opts->concat)
+		return;
 
 	for (q = 1; q < ctrl->queue_count; q++) {
 		ret = nvme_auth_negotiate(ctrl, q);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 432efcbf9e2f..93e9041b9657 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -472,8 +472,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	result = le32_to_cpu(res.u32);
 	ctrl->cntlid = result & 0xFFFF;
 	if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) {
-		/* Secure concatenation is not implemented */
-		if (result & NVME_CONNECT_AUTHREQ_ASCR) {
+		/* Check for secure concatenation */
+		if ((result & NVME_CONNECT_AUTHREQ_ASCR) &&
+		    !ctrl->opts->concat) {
 			dev_warn(ctrl->device,
 				 "qid 0: secure concatenation is not supported\n");
 			ret = -EOPNOTSUPP;
@@ -550,7 +551,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 		/* Secure concatenation is not implemented */
 		if (result & NVME_CONNECT_AUTHREQ_ASCR) {
 			dev_warn(ctrl->device,
-				 "qid 0: secure concatenation is not supported\n");
+				 "qid %d: secure concatenation is not supported\n", qid);
 			ret = -EOPNOTSUPP;
 			goto out_free_data;
 		}
@@ -706,6 +707,7 @@ static const match_table_t opt_tokens = {
 #endif
 #ifdef CONFIG_NVME_TCP_TLS
 	{ NVMF_OPT_TLS,		"tls"			},
+	{ NVMF_OPT_CONCAT,	"concat"		},
 #endif
 	{ NVMF_OPT_ERR,		NULL			}
 };
@@ -735,6 +737,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	opts->tls = false;
 	opts->tls_key = NULL;
 	opts->keyring = NULL;
+	opts->concat = false;
 
 	options = o = kstrdup(buf, GFP_KERNEL);
 	if (!options)
@@ -1053,6 +1056,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			}
 			opts->tls = true;
 			break;
+		case NVMF_OPT_CONCAT:
+			if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) {
+				pr_err("TLS is not supported\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			opts->concat = true;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
@@ -1079,6 +1090,23 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n",
 			opts->fast_io_fail_tmo, ctrl_loss_tmo);
 	}
+	if (opts->concat) {
+		if (opts->tls) {
+			pr_err("Secure concatenation over TLS is not supported\n");
+			ret = -EINVAL;
+			goto out;
+		}
+		if (opts->tls_key) {
+			pr_err("Cannot specify a TLS key for secure concatenation\n");
+			ret = -EINVAL;
+			goto out;
+		}
+		if (!opts->dhchap_secret) {
+			pr_err("Need to enable DH-CHAP for secure concatenation\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
 
 	opts->host = nvmf_host_add(hostnqn, &hostid);
 	if (IS_ERR(opts->host)) {
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 21d75dc4a3a0..9cf5b020adba 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -66,6 +66,7 @@ enum {
 	NVMF_OPT_TLS		= 1 << 25,
 	NVMF_OPT_KEYRING	= 1 << 26,
 	NVMF_OPT_TLS_KEY	= 1 << 27,
+	NVMF_OPT_CONCAT		= 1 << 28,
 };
 
 /**
@@ -101,6 +102,7 @@ enum {
  * @keyring: Keyring to use for key lookups
  * @tls_key: TLS key for encrypted connections (TCP)
  * @tls: Start TLS encrypted connections (TCP)
+ * @concat: Enable secure channel concatenation (TCP)
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
@@ -130,6 +132,7 @@ struct nvmf_ctrl_options {
 	struct key	*keyring;
 	struct key	*tls_key;
 	bool		tls;
+	bool		concat;
 	bool		disable_sqflow;
 	bool		hdr_digest;
 	bool		data_digest;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7be92d07430e..daa4c91e4848 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1147,6 +1147,7 @@ void nvme_auth_stop(struct nvme_ctrl *ctrl);
 int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
 int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
 void nvme_auth_free(struct nvme_ctrl *ctrl);
+void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl);
 #else
 static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
 {
@@ -1169,6 +1170,7 @@ static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
 	return -EPROTONOSUPPORT;
 }
 static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
+static inline void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) {};
 #endif
 
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 3a41b9ab0f13..52683943be15 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -780,10 +780,10 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
 		return 0;
 
 	if (a == &dev_attr_tls_key.attr &&
-	    !ctrl->opts->tls)
+	    !ctrl->opts->tls && !ctrl->opts->concat)
 		return 0;
 	if (a == &dev_attr_tls_configured_key.attr &&
-	    !ctrl->opts->tls_key)
+	    (!ctrl->opts->tls_key || ctrl->opts->concat))
 		return 0;
 	if (a == &dev_attr_tls_keyring.attr &&
 	    !ctrl->opts->keyring)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b50972257e49..196d36318853 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -235,7 +235,7 @@ static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl)
 	if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
 		return 0;
 
-	return ctrl->opts->tls;
+	return ctrl->opts->tls || ctrl->opts->concat;
 }
 
 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
@@ -1987,7 +1987,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
 	if (nvme_tcp_tls_configured(ctrl)) {
 		if (ctrl->opts->tls_key)
 			pskid = key_serial(ctrl->opts->tls_key);
-		else {
+		else if (ctrl->opts->tls) {
 			pskid = nvme_tls_psk_default(ctrl->opts->keyring,
 						     ctrl->opts->host->nqn,
 						     ctrl->opts->subsysnqn);
@@ -2017,9 +2017,25 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 {
 	int i, ret;
 
-	if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) {
-		dev_err(ctrl->device, "no PSK negotiated\n");
-		return -ENOKEY;
+	if (nvme_tcp_tls_configured(ctrl)) {
+		if (ctrl->opts->concat) {
+			/*
+			 * The generated PSK is stored in the
+			 * fabric options
+			 */
+			if (!ctrl->opts->tls_key) {
+				dev_err(ctrl->device, "no PSK generated\n");
+				return -ENOKEY;
+			}
+			if (ctrl->tls_pskid &&
+			    ctrl->tls_pskid != key_serial(ctrl->opts->tls_key)) {
+				dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid);
+				ctrl->tls_pskid = 0;
+			}
+		} else if (!ctrl->tls_pskid) {
+			dev_err(ctrl->device, "no PSK negotiated\n");
+			return -ENOKEY;
+		}
 	}
 
 	for (i = 1; i < ctrl->queue_count; i++) {
@@ -2237,6 +2253,27 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
 	}
 }
 
+/*
+ * The TLS key is set by secure concatenation after negotiation has been
+ * completed on the admin queue. We need to revoke the key when:
+ * - concatenation is enabled (otherwise it's a static key set by the user)
+ * and
+ * - the generated key is present in ctrl->opts->tls_key (otherwise there's
+ *   nothing to revoke)
+ * and
+ * - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS
+ *   negotiation has not run).
+ *
+ * We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called
+ * twice during secure concatenation, once on a 'normal' connection to run the
+ * DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set),
+ * and once after the negotiation (which uses the key, so it _must_ be set).
+ */
+static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl)
+{
+	return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid;
+}
+
 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 {
 	struct nvmf_ctrl_options *opts = ctrl->opts;
@@ -2342,6 +2379,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 				struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
 
+	if (nvme_tcp_key_revoke_needed(ctrl))
+		nvme_auth_revoke_tls_key(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
 	nvme_tcp_teardown_io_queues(ctrl, false);
@@ -2382,6 +2421,8 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
 		container_of(work, struct nvme_ctrl, reset_work);
 	int ret;
 
+	if (nvme_tcp_key_revoke_needed(ctrl))
+		nvme_auth_revoke_tls_key(ctrl);
 	nvme_stop_ctrl(ctrl);
 	nvme_tcp_teardown_ctrl(ctrl, false);
 
@@ -2877,7 +2918,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
 			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
 			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
-			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
+			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index fe3b60818fdc..bfb5688363b0 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1746,6 +1746,13 @@ enum {
 	NVME_AUTH_DHGROUP_INVALID	= 0xff,
 };
 
+enum {
+	NVME_AUTH_SECP_NOSC		= 0x00,
+	NVME_AUTH_SECP_SC		= 0x01,
+	NVME_AUTH_SECP_NEWTLSPSK	= 0x02,
+	NVME_AUTH_SECP_REPLACETLSPSK	= 0x03,
+};
+
 union nvmf_auth_protocol {
 	struct nvmf_auth_dhchap_protocol_descriptor dhchap;
 };
-- cgit v1.2.3
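To round this off, one way a host can request secure channel concatenation
from userspace is to write the fabrics options directly to /dev/nvme-fabrics,
which is what the 'concat' token parsed above enables. The program below is a
sketch only: the target address, NQNs, and DH-CHAP secret are placeholders
(nvme-cli would normally perform this write for you):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Placeholder values; substitute a real address, NQNs, secret. */
		const char *opts =
			"transport=tcp,traddr=192.168.0.10,trsvcid=4420,"
			"nqn=nqn.2014-08.com.example:subsys,"
			"hostnqn=nqn.2014-08.com.example:host,"
			"dhchap_secret=DHHC-1:00:<base64 secret>:,concat";
		int fd = open("/dev/nvme-fabrics", O_RDWR);

		if (fd < 0) {
			perror("open /dev/nvme-fabrics");
			return 1;
		}
		if (write(fd, opts, strlen(opts)) < 0)
			perror("connect");
		close(fd);
		return 0;
	}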