diff options
Diffstat (limited to 'drivers/nvme')
37 files changed, 2680 insertions, 956 deletions
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index 244432e0b73d..da963e4f3f1f 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -12,3 +12,4 @@ config NVME_AUTH select CRYPTO_SHA512 select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS + select CRYPTO_HKDF diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 9b7126e1a19d..91e273b89fea 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -11,9 +11,12 @@ #include <linux/unaligned.h> #include <crypto/hash.h> #include <crypto/dh.h> +#include <crypto/hkdf.h> #include <linux/nvme.h> #include <linux/nvme-auth.h> +#define HKDF_MAX_HASHLEN 64 + static u32 nvme_dhchap_seqnum; static DEFINE_MUTEX(nvme_dhchap_mutex); @@ -239,7 +242,7 @@ struct nvme_dhchap_key *nvme_auth_transform_key( { const char *hmac_name; struct crypto_shash *key_tfm; - struct shash_desc *shash; + SHASH_DESC_ON_STACK(shash, key_tfm); struct nvme_dhchap_key *transformed_key; int ret, key_len; @@ -264,19 +267,11 @@ struct nvme_dhchap_key *nvme_auth_transform_key( if (IS_ERR(key_tfm)) return ERR_CAST(key_tfm); - shash = kmalloc(sizeof(struct shash_desc) + - crypto_shash_descsize(key_tfm), - GFP_KERNEL); - if (!shash) { - ret = -ENOMEM; - goto out_free_key; - } - key_len = crypto_shash_digestsize(key_tfm); transformed_key = nvme_auth_alloc_key(key_len, key->hash); if (!transformed_key) { ret = -ENOMEM; - goto out_free_shash; + goto out_free_key; } shash->tfm = key_tfm; @@ -296,15 +291,12 @@ struct nvme_dhchap_key *nvme_auth_transform_key( if (ret < 0) goto out_free_transformed_key; - kfree(shash); crypto_free_shash(key_tfm); return transformed_key; out_free_transformed_key: nvme_auth_free_key(transformed_key); -out_free_shash: - kfree(shash); out_free_key: crypto_free_shash(key_tfm); @@ -471,5 +463,339 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) } EXPORT_SYMBOL_GPL(nvme_auth_generate_key); +/** + * nvme_auth_generate_psk - Generate a PSK for TLS + * @hmac_id: Hash function identifier + * @skey: Session key + * @skey_len: Length of @skey + * @c1: Value of challenge C1 + * @c2: Value of challenge C2 + * @hash_len: Hash length of the hash algorithm + * @ret_psk: Pointer to the resulting generated PSK + * @ret_len: length of @ret_psk + * + * Generate a PSK for TLS as specified in NVMe base specification, section + * 8.13.5.9: Generated PSK for TLS + * + * The generated PSK for TLS shall be computed applying the HMAC function + * using the hash function H( ) selected by the HashID parameter in the + * DH-HMAC-CHAP_Challenge message with the session key KS as key to the + * concatenation of the two challenges C1 and C2 (i.e., generated + * PSK = HMAC(KS, C1 || C2)). + * + * Returns 0 on success with a valid generated PSK pointer in @ret_psk and + * the length of @ret_psk in @ret_len, or a negative error number otherwise. + */ +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *psk; + const char *hmac_name; + int ret, psk_len; + + if (!c1 || !c2) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + psk_len = crypto_shash_digestsize(tfm); + psk = kzalloc(psk_len, GFP_KERNEL); + if (!psk) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, skey, skey_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c1, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c2, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_final(shash, psk); + if (!ret) { + *ret_psk = psk; + *ret_len = psk_len; + } + +out_free_psk: + if (ret) + kfree_sensitive(psk); +out_free_tfm: + crypto_free_shash(tfm); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); + +/** + * nvme_auth_generate_digest - Generate TLS PSK digest + * @hmac_id: Hash function identifier + * @psk: Generated input PSK + * @psk_len: Length of @psk + * @subsysnqn: NQN of the subsystem + * @hostnqn: NQN of the host + * @ret_digest: Pointer to the returned digest + * + * Generate a TLS PSK digest as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The PSK digest shall be computed by encoding in Base64 (refer to RFC + * 4648) the result of the application of the HMAC function using the hash + * function specified in item 4 above (ie the hash function of the cipher + * suite associated with the PSK identity) with the PSK as HMAC key to the + * concatenation of: + * - the NQN of the host (i.e., NQNh) not including the null terminator; + * - a space character; + * - the NQN of the NVM subsystem (i.e., NQNc) not including the null + * terminator; + * - a space character; and + * - the seventeen ASCII characters "NVMe-over-Fabrics" + * (i.e., <PSK digest> = Base64(HMAC(PSK, NQNh || " " || NQNc || " " || + * "NVMe-over-Fabrics"))). + * The length of the PSK digest depends on the hash function used to compute + * it as follows: + * - If the SHA-256 hash function is used, the resulting PSK digest is 44 + * characters long; or + * - If the SHA-384 hash function is used, the resulting PSK digest is 64 + * characters long. + * + * Returns 0 on success with a valid digest pointer in @ret_digest, or a + * negative error number on failure. + */ +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *digest, *enc; + const char *hmac_name; + size_t digest_len, hmac_len; + int ret; + + if (WARN_ON(!subsysnqn || !hostnqn)) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + switch (nvme_auth_hmac_hash_len(hmac_id)) { + case 32: + hmac_len = 44; + break; + case 48: + hmac_len = 64; + break; + default: + pr_warn("%s: invalid hash algorithm '%s'\n", + __func__, hmac_name); + return -EINVAL; + } + + enc = kzalloc(hmac_len + 1, GFP_KERNEL); + if (!enc) + return -ENOMEM; + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out_free_enc; + } + + digest_len = crypto_shash_digestsize(tfm); + digest = kzalloc(digest_len, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, psk, psk_len); + if (ret) + goto out_free_digest; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " ", 1); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18); + if (ret) + goto out_free_digest; + + ret = crypto_shash_final(shash, digest); + if (ret) + goto out_free_digest; + + ret = base64_encode(digest, digest_len, enc); + if (ret < hmac_len) { + ret = -ENOKEY; + goto out_free_digest; + } + *ret_digest = enc; + ret = 0; + +out_free_digest: + kfree_sensitive(digest); +out_free_tfm: + crypto_free_shash(tfm); +out_free_enc: + if (ret) + kfree_sensitive(enc); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); + +/** + * nvme_auth_derive_tls_psk - Derive TLS PSK + * @hmac_id: Hash function identifier + * @psk: generated input PSK + * @psk_len: size of @psk + * @psk_digest: TLS PSK digest + * @ret_psk: Pointer to the resulting TLS PSK + * + * Derive a TLS PSK as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The TLS PSK shall be derived as follows from an input PSK + * (i.e., either a retained PSK or a generated PSK) and a PSK + * identity using the HKDF-Extract and HKDF-Expand-Label operations + * (refer to RFC 5869 and RFC 8446) where the hash function is the + * one specified by the hash specifier of the PSK identity: + * 1. PRK = HKDF-Extract(0, Input PSK); and + * 2. TLS PSK = HKDF-Expand-Label(PRK, "nvme-tls-psk", PskIdentityContext, L), + * where PskIdentityContext is the hash identifier indicated in + * the PSK identity concatenated to a space character and to the + * Base64 PSK digest (i.e., "<hash> <PSK digest>") and L is the + * output size in bytes of the hash function (i.e., 32 for SHA-256 + * and 48 for SHA-384). + * + * Returns 0 on success with a valid psk pointer in @ret_psk or a negative + * error number otherwise. + */ +int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, + u8 *psk_digest, u8 **ret_psk) +{ + struct crypto_shash *hmac_tfm; + const char *hmac_name; + const char *psk_prefix = "tls13 nvme-tls-psk"; + static const char default_salt[HKDF_MAX_HASHLEN]; + size_t info_len, prk_len; + char *info; + unsigned char *prk, *tls_key; + int ret; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + if (hmac_id == NVME_AUTH_HASH_SHA512) { + pr_warn("%s: unsupported hash algorithm %s\n", + __func__, hmac_name); + return -EINVAL; + } + + hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(hmac_tfm)) + return PTR_ERR(hmac_tfm); + + prk_len = crypto_shash_digestsize(hmac_tfm); + prk = kzalloc(prk_len, GFP_KERNEL); + if (!prk) { + ret = -ENOMEM; + goto out_free_shash; + } + + if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) { + ret = -EINVAL; + goto out_free_prk; + } + ret = hkdf_extract(hmac_tfm, psk, psk_len, + default_salt, prk_len, prk); + if (ret) + goto out_free_prk; + + ret = crypto_shash_setkey(hmac_tfm, prk, prk_len); + if (ret) + goto out_free_prk; + + /* + * 2 additional bytes for the length field from HDKF-Expand-Label, + * 2 additional bytes for the HMAC ID, and one byte for the space + * separator. + */ + info_len = strlen(psk_digest) + strlen(psk_prefix) + 5; + info = kzalloc(info_len + 1, GFP_KERNEL); + if (!info) { + ret = -ENOMEM; + goto out_free_prk; + } + + put_unaligned_be16(psk_len, info); + memcpy(info + 2, psk_prefix, strlen(psk_prefix)); + sprintf(info + 2 + strlen(psk_prefix), "%02d %s", hmac_id, psk_digest); + + tls_key = kzalloc(psk_len, GFP_KERNEL); + if (!tls_key) { + ret = -ENOMEM; + goto out_free_info; + } + ret = hkdf_expand(hmac_tfm, info, info_len, tls_key, psk_len); + if (ret) { + kfree(tls_key); + goto out_free_info; + } + *ret_psk = tls_key; + +out_free_info: + kfree(info); +out_free_prk: + kfree(prk); +out_free_shash: + crypto_free_shash(hmac_tfm); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk); + MODULE_DESCRIPTION("NVMe Authentication framework"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index ed5167f942d8..32d16c53133b 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -5,7 +5,6 @@ #include <linux/module.h> #include <linux/seq_file.h> -#include <linux/key.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/nvme.h> @@ -124,6 +123,70 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, return key_ref_to_ptr(keyref); } +/** + * nvme_tls_psk_refresh - Refresh TLS PSK + * @keyring: Keyring holding the TLS PSK + * @hostnqn: Host NQN to use + * @subnqn: Subsystem NQN to use + * @hmac_id: Hash function identifier + * @data: TLS PSK key material + * @data_len: Length of @data + * @digest: TLS PSK digest + * + * Refresh a generated version 1 TLS PSK with the identity generated + * from @hmac_id, @hostnqn, @subnqn, and @digest in the keyring given + * by @keyring. + * + * Returns the updated key success or an error pointer otherwise. + */ +struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, const char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest) +{ + key_perm_t keyperm = + KEY_POS_SEARCH | KEY_POS_VIEW | KEY_POS_READ | + KEY_POS_WRITE | KEY_POS_LINK | KEY_POS_SETATTR | + KEY_USR_SEARCH | KEY_USR_VIEW | KEY_USR_READ; + char *identity; + key_ref_t keyref; + key_serial_t keyring_id; + struct key *key; + + if (!hostnqn || !subnqn || !data || !data_len) + return ERR_PTR(-EINVAL); + + identity = kasprintf(GFP_KERNEL, "NVMe1G%02d %s %s %s", + hmac_id, hostnqn, subnqn, digest); + if (!identity) + return ERR_PTR(-ENOMEM); + + if (!keyring) + keyring = nvme_keyring; + keyring_id = key_serial(keyring); + pr_debug("keyring %x refresh tls psk '%s'\n", + keyring_id, identity); + keyref = key_create_or_update(make_key_ref(keyring, true), + "psk", identity, data, data_len, + keyperm, KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(keyref)) { + pr_debug("refresh tls psk '%s' failed, error %ld\n", + identity, PTR_ERR(keyref)); + kfree(identity); + return ERR_PTR(-ENOKEY); + } + kfree(identity); + /* + * Set the default timeout to 1 hour + * as suggested in TP8018. + */ + key = key_ref_to_ptr(keyref); + key_set_timeout(key, 3600); + return key; +} +EXPORT_SYMBOL_GPL(nvme_tls_psk_refresh); + /* * NVMe PSK priority list * diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 486afe598184..31974c7dd20c 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -18,10 +18,15 @@ config NVME_MULTIPATH bool "NVMe multipath support" depends on NVME_CORE help - This option enables support for multipath access to NVMe - subsystems. If this option is enabled only a single - /dev/nvmeXnY device will show up for each NVMe namespace, - even if it is accessible through multiple controllers. + This option controls support for multipath access to NVMe + subsystems. If this option is enabled support for NVMe multipath + access is included in the kernel. If this option is disabled support + for NVMe multipath access is excluded from the kernel. When this + option is disabled each controller/namespace receives its + own /dev/nvmeXnY device entry and NVMe multipath access is + not supported. + + If unsure, say Y. config NVME_VERBOSE_ERRORS bool "NVMe verbose error reporting" @@ -79,9 +84,9 @@ config NVME_TCP tristate "NVM Express over Fabrics TCP host driver" depends on INET depends on BLOCK + select CRC32 + select NET_CRC32C select NVME_FABRICS - select CRYPTO - select CRYPTO_CRC32C help This provides support for the NVMe over Fabrics protocol using the TCP transport. This allows you to use remote block devices @@ -97,10 +102,11 @@ config NVME_TCP_TLS depends on NVME_TCP select NET_HANDSHAKE select KEYS + select TLS help Enables TLS encryption for NVMe TCP using the netlink handshake API. - The TLS handshake daemon is availble at + The TLS handshake daemon is available at https://github.com/oracle/ktls-utils. If unsure, say N. @@ -109,7 +115,7 @@ config NVME_HOST_AUTH bool "NVMe over Fabrics In-Band Authentication in host side" depends on NVME_CORE select NVME_AUTH - select NVME_KEYRING if NVME_TCP_TLS + select NVME_KEYRING help This provides support for NVMe over Fabrics In-Band Authentication in host side. diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 8971aca41e63..b1fddfa33ab9 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -221,7 +221,7 @@ static unsigned int apple_nvme_queue_depth(struct apple_nvme_queue *q) return APPLE_ANS_MAX_QUEUE_DEPTH; } -static void apple_nvme_rtkit_crashed(void *cookie) +static void apple_nvme_rtkit_crashed(void *cookie, const void *crashlog, size_t crashlog_size) { struct apple_nvme *anv = cookie; @@ -525,7 +525,7 @@ static blk_status_t apple_nvme_map_data(struct apple_nvme *anv, if (!iod->sg) return BLK_STS_RESOURCE; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + iod->nents = blk_rq_map_sg(req, iod->sg); if (!iod->nents) goto out_free_sg; diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 5ea0e21709da..f6ddbe553289 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -12,6 +12,7 @@ #include "nvme.h" #include "fabrics.h" #include <linux/nvme-auth.h> +#include <linux/nvme-keyring.h> #define CHAP_BUF_SIZE 4096 static struct kmem_cache *nvme_chap_buf_cache; @@ -30,6 +31,7 @@ struct nvme_dhchap_queue_context { u32 s1; u32 s2; bool bi_directional; + bool authenticated; u16 transaction; u8 status; u8 dhgroup_id; @@ -131,7 +133,13 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, data->auth_type = NVME_AUTH_COMMON_MESSAGES; data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; data->t_id = cpu_to_le16(chap->transaction); - data->sc_c = 0; /* No secure channel concatenation */ + if (ctrl->opts->concat && chap->qid == 0) { + if (ctrl->opts->tls_key) + data->sc_c = NVME_AUTH_SECP_REPLACETLSPSK; + else + data->sc_c = NVME_AUTH_SECP_NEWTLSPSK; + } else + data->sc_c = NVME_AUTH_SECP_NOSC; data->napd = 1; data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID; data->auth_protocol[0].dhchap.halen = 3; @@ -311,8 +319,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, data->hl = chap->hash_len; data->dhvlen = cpu_to_le16(chap->host_key_len); memcpy(data->rval, chap->response, chap->hash_len); - if (ctrl->ctrl_key) { + if (ctrl->ctrl_key) chap->bi_directional = true; + if (ctrl->ctrl_key || ctrl->opts->concat) { get_random_bytes(chap->c2, chap->hash_len); data->cvalid = 1; memcpy(data->rval + chap->hash_len, chap->c2, @@ -322,7 +331,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, } else { memset(chap->c2, 0, chap->hash_len); } - chap->s2 = nvme_auth_get_seqnum(); + if (ctrl->opts->concat) + chap->s2 = 0; + else + chap->s2 = nvme_auth_get_seqnum(); data->seqnum = cpu_to_le32(chap->s2); if (chap->host_key_len) { dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n", @@ -671,12 +683,99 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { nvme_auth_reset_dhchap(chap); + chap->authenticated = false; if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); } +void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) +{ + dev_dbg(ctrl->device, "Wipe generated TLS PSK %08x\n", + key_serial(ctrl->opts->tls_key)); + key_revoke(ctrl->opts->tls_key); + key_put(ctrl->opts->tls_key); + ctrl->opts->tls_key = NULL; +} +EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key); + +static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + u8 *psk, *digest, *tls_psk; + struct key *tls_key; + size_t psk_len; + int ret = 0; + + if (!chap->sess_key) { + dev_warn(ctrl->device, + "%s: qid %d no session key negotiated\n", + __func__, chap->qid); + return -ENOKEY; + } + + if (chap->qid) { + dev_warn(ctrl->device, + "qid %d: secure concatenation not supported on I/O queues\n", + chap->qid); + return -EINVAL; + } + ret = nvme_auth_generate_psk(chap->hash_id, chap->sess_key, + chap->sess_key_len, + chap->c1, chap->c2, + chap->hash_len, &psk, &psk_len); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to generate PSK, error %d\n", + __func__, chap->qid, ret); + return ret; + } + dev_dbg(ctrl->device, + "%s: generated psk %*ph\n", __func__, (int)psk_len, psk); + + ret = nvme_auth_generate_digest(chap->hash_id, psk, psk_len, + ctrl->opts->subsysnqn, + ctrl->opts->host->nqn, &digest); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to generate digest, error %d\n", + __func__, chap->qid, ret); + goto out_free_psk; + }; + dev_dbg(ctrl->device, "%s: generated digest %s\n", + __func__, digest); + ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len, + digest, &tls_psk); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to derive TLS psk, error %d\n", + __func__, chap->qid, ret); + goto out_free_digest; + }; + + tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring, + ctrl->opts->host->nqn, + ctrl->opts->subsysnqn, chap->hash_id, + tls_psk, psk_len, digest); + if (IS_ERR(tls_key)) { + ret = PTR_ERR(tls_key); + dev_warn(ctrl->device, + "%s: qid %d failed to insert generated key, error %d\n", + __func__, chap->qid, ret); + tls_key = NULL; + } + kfree_sensitive(tls_psk); + if (ctrl->opts->tls_key) + nvme_auth_revoke_tls_key(ctrl); + ctrl->opts->tls_key = tls_key; +out_free_digest: + kfree_sensitive(digest); +out_free_psk: + kfree_sensitive(psk); + return ret; +} + static void nvme_queue_auth_work(struct work_struct *work) { struct nvme_dhchap_queue_context *chap = @@ -833,6 +932,15 @@ static void nvme_queue_auth_work(struct work_struct *work) } if (!ret) { chap->error = 0; + chap->authenticated = true; + if (ctrl->opts->concat && + (ret = nvme_auth_secure_concat(ctrl, chap))) { + dev_warn(ctrl->device, + "%s: qid %d failed to enable secure concatenation\n", + __func__, chap->qid); + chap->error = ret; + chap->authenticated = false; + } return; } @@ -912,15 +1020,23 @@ static void nvme_ctrl_auth_work(struct work_struct *work) "qid 0: authentication failed\n"); return; } + /* + * Only run authentication on the admin queue for secure concatenation. + */ + if (ctrl->opts->concat) + return; for (q = 1; q < ctrl->queue_count; q++) { - ret = nvme_auth_negotiate(ctrl, q); - if (ret) { - dev_warn(ctrl->device, - "qid %d: error %d setting up authentication\n", - q, ret); - break; - } + struct nvme_dhchap_queue_context *chap = + &ctrl->dhchap_ctxs[q]; + /* + * Skip re-authentication if the queue had + * not been authenticated initially. + */ + if (!chap->authenticated) + continue; + cancel_work_sync(&chap->auth_work); + queue_work(nvme_auth_wq, &chap->auth_work); } /* @@ -928,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work) * the controller terminates the connection. */ for (q = 1; q < ctrl->queue_count; q++) { - ret = nvme_auth_wait(ctrl, q); + struct nvme_dhchap_queue_context *chap = + &ctrl->dhchap_ctxs[q]; + if (!chap->authenticated) + continue; + flush_work(&chap->auth_work); + ret = chap->error; + nvme_auth_reset_dhchap(chap); if (ret) dev_warn(ctrl->device, "qid %d: authentication failed\n", q); @@ -967,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) chap = &ctrl->dhchap_ctxs[i]; chap->qid = i; chap->ctrl = ctrl; + chap->authenticated = false; INIT_WORK(&chap->auth_work, nvme_queue_auth_work); } diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 2b9e6cfaf2a8..1a0058be5821 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = { [NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes", [NVME_SC_INVALID_PI] = "Invalid Protection Information", [NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range", - [NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported", + [NVME_SC_CMD_SIZE_LIM_EXCEEDED ] = "Command Size Limits Exceeded", [NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error", [NVME_SC_ZONE_FULL] = "Zone Is Full", [NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only", diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8359d0aa0e44..895fb163d48e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -38,6 +38,8 @@ struct nvme_ns_info { u32 nsid; __le32 anagrpid; u8 pi_offset; + u16 endgid; + u64 runs; bool is_shared; bool is_readonly; bool is_ready; @@ -150,6 +152,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, struct nvme_command *cmd); +static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, + u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi); void nvme_queue_scan(struct nvme_ctrl *ctrl) { @@ -286,7 +290,6 @@ static blk_status_t nvme_error_status(u16 status) case NVME_SC_NS_NOT_READY: return BLK_STS_TARGET; case NVME_SC_BAD_ATTRIBUTES: - case NVME_SC_ONCS_NOT_SUPPORTED: case NVME_SC_INVALID_OPCODE: case NVME_SC_INVALID_FIELD: case NVME_SC_INVALID_NS: @@ -378,12 +381,12 @@ static void nvme_log_err_passthru(struct request *req) nr->status & NVME_SC_MASK, /* Status Code */ nr->status & NVME_STATUS_MORE ? "MORE " : "", nr->status & NVME_STATUS_DNR ? "DNR " : "", - nr->cmd->common.cdw10, - nr->cmd->common.cdw11, - nr->cmd->common.cdw12, - nr->cmd->common.cdw13, - nr->cmd->common.cdw14, - nr->cmd->common.cdw14); + le32_to_cpu(nr->cmd->common.cdw10), + le32_to_cpu(nr->cmd->common.cdw11), + le32_to_cpu(nr->cmd->common.cdw12), + le32_to_cpu(nr->cmd->common.cdw13), + le32_to_cpu(nr->cmd->common.cdw14), + le32_to_cpu(nr->cmd->common.cdw15)); } enum nvme_disposition { @@ -664,10 +667,11 @@ static void nvme_free_ns_head(struct kref *ref) struct nvme_ns_head *head = container_of(ref, struct nvme_ns_head, ref); - nvme_mpath_remove_disk(head); + nvme_mpath_put_disk(head); ida_free(&head->subsys->ns_ida, head->instance); cleanup_srcu_struct(&head->srcu); nvme_put_subsystem(head->subsys); + kfree(head->plids); kfree(head); } @@ -760,6 +764,10 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; + + if (!(rq->rq_flags & RQF_DONTPREP)) + nvme_clear_nvme_request(rq); + return nvme_host_path_error(rq); } EXPORT_SYMBOL_GPL(nvme_fail_nonready_command); @@ -991,6 +999,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + if (op == nvme_cmd_write && ns->head->nr_plids) { + u16 write_stream = req->bio->bi_write_stream; + + if (WARN_ON_ONCE(write_stream > ns->head->nr_plids)) + return BLK_STS_INVAL; + + if (write_stream) { + dsmgmt |= ns->head->plids[write_stream - 1] << 16; + control |= NVME_RW_DTYPE_DPLCMT; + } + } + if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req)) return BLK_STS_INVAL; @@ -1010,7 +1030,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (ns->head->ms) { /* - * If formated with metadata, the block layer always provides a + * If formatted with metadata, the block layer always provides a * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else * we enable the PRACT bit for protection information or set the * namespace capacity to zero to prevent any I/O. @@ -1157,7 +1177,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req->cmd_flags &= ~REQ_FAILFAST_DRIVER; if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL); if (ret) goto out; } @@ -1609,6 +1629,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl, info->is_shared = id->nmic & NVME_NS_NMIC_SHARED; info->is_readonly = id->nsattr & NVME_NS_ATTR_RO; info->is_ready = true; + info->endgid = le16_to_cpu(id->endgid); if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { dev_info(ctrl->device, "Ignoring bogus Namespace Identifiers\n"); @@ -1649,6 +1670,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl, info->is_ready = id->nstat & NVME_NSTAT_NRDY; info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL; info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT; + info->endgid = le16_to_cpu(id->endgid); } kfree(id); return ret; @@ -1674,7 +1696,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result) + void *result) { return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, buflen, result); @@ -1683,7 +1705,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features); int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result) + void *result) { return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer, buflen, result); @@ -1997,21 +2019,41 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl, } -static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns, - struct nvme_id_ns *id, struct queue_limits *lim, - u32 bs, u32 atomic_bs) +static u32 nvme_configure_atomic_write(struct nvme_ns *ns, + struct nvme_id_ns *id, struct queue_limits *lim, u32 bs) { - unsigned int boundary = 0; + u32 atomic_bs, boundary = 0; - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) { - if (le16_to_cpu(id->nabspf)) + /* + * We do not support an offset for the atomic boundaries. + */ + if (id->nabo) + return bs; + + if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) { + /* + * Use the per-namespace atomic write unit when available. + */ + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; + if (id->nabspf) boundary = (le16_to_cpu(id->nabspf) + 1) * bs; + } else { + /* + * Use the controller wide atomic write unit. This sucks + * because the limit is defined in terms of logical blocks while + * namespaces can have different formats, and because there is + * no clear language in the specification prohibiting different + * values for different controllers in the subsystem. + */ + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; } + lim->atomic_write_hw_max = atomic_bs; lim->atomic_write_hw_boundary = boundary; lim->atomic_write_hw_unit_min = bs; lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs); lim->features |= BLK_FEAT_ATOMIC_WRITES; + return atomic_bs; } static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) @@ -2049,20 +2091,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, valid = false; } - atomic_bs = phys_bs = bs; - if (id->nabo == 0) { - /* - * Bit 1 indicates whether NAWUPF is defined for this namespace - * and whether it should be used instead of AWUPF. If NAWUPF == - * 0 then AWUPF must be used instead. - */ - if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) - atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; - else - atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; - - nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs); - } + phys_bs = bs; + atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs); if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { /* NPWG = Namespace Preferred Write Granularity */ @@ -2153,6 +2183,148 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, return ret; } +static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl, + struct nvme_ns_info *info, u8 fdp_idx) +{ + struct nvme_fdp_config_log hdr, *h; + struct nvme_fdp_config_desc *desc; + size_t size = sizeof(hdr); + void *log, *end; + int i, n, ret; + + ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0, + NVME_CSI_NVM, &hdr, size, 0, info->endgid); + if (ret) { + dev_warn(ctrl->device, + "FDP configs log header status:0x%x endgid:%d\n", ret, + info->endgid); + return ret; + } + + size = le32_to_cpu(hdr.sze); + if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) { + dev_warn(ctrl->device, "FDP config size too large:%zu\n", + size); + return 0; + } + + h = kvmalloc(size, GFP_KERNEL); + if (!h) + return -ENOMEM; + + ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0, + NVME_CSI_NVM, h, size, 0, info->endgid); + if (ret) { + dev_warn(ctrl->device, + "FDP configs log status:0x%x endgid:%d\n", ret, + info->endgid); + goto out; + } + + n = le16_to_cpu(h->numfdpc) + 1; + if (fdp_idx > n) { + dev_warn(ctrl->device, "FDP index:%d out of range:%d\n", + fdp_idx, n); + /* Proceed without registering FDP streams */ + ret = 0; + goto out; + } + + log = h + 1; + desc = log; + end = log + size - sizeof(*h); + for (i = 0; i < fdp_idx; i++) { + log += le16_to_cpu(desc->dsze); + desc = log; + if (log >= end) { + dev_warn(ctrl->device, + "FDP invalid config descriptor list\n"); + ret = 0; + goto out; + } + } + + if (le32_to_cpu(desc->nrg) > 1) { + dev_warn(ctrl->device, "FDP NRG > 1 not supported\n"); + ret = 0; + goto out; + } + + info->runs = le64_to_cpu(desc->runs); +out: + kvfree(h); + return ret; +} + +static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info) +{ + struct nvme_ns_head *head = ns->head; + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_fdp_ruh_status *ruhs; + struct nvme_fdp_config fdp; + struct nvme_command c = {}; + size_t size; + int i, ret; + + /* + * The FDP configuration is static for the lifetime of the namespace, + * so return immediately if we've already registered this namespace's + * streams. + */ + if (head->nr_plids) + return 0; + + ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0, + &fdp); + if (ret) { + dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret); + return ret; + } + + if (!(fdp.flags & FDPCFG_FDPE)) + return 0; + + ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx); + if (!info->runs) + return ret; + + size = struct_size(ruhs, ruhsd, S8_MAX - 1); + ruhs = kzalloc(size, GFP_KERNEL); + if (!ruhs) + return -ENOMEM; + + c.imr.opcode = nvme_cmd_io_mgmt_recv; + c.imr.nsid = cpu_to_le32(head->ns_id); + c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS; + c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size)); + ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size); + if (ret) { + dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret); + goto free; + } + + head->nr_plids = le16_to_cpu(ruhs->nruhsd); + if (!head->nr_plids) + goto free; + + head->plids = kcalloc(head->nr_plids, sizeof(*head->plids), + GFP_KERNEL); + if (!head->plids) { + dev_warn(ctrl->device, + "failed to allocate %u FDP placement IDs\n", + head->nr_plids); + head->nr_plids = 0; + ret = -ENOMEM; + goto free; + } + + for (i = 0; i < head->nr_plids; i++) + head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid); +free: + kfree(ruhs); + return ret; +} + static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { @@ -2190,6 +2362,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } + if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) { + ret = nvme_query_fdp_info(ns, info); + if (ret < 0) + goto out; + } + lim = queue_limits_start_update(ns->disk->queue); memflags = blk_mq_freeze_queue(ns->disk->queue); @@ -2201,6 +2379,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, nvme_set_chunk_sectors(ns, id, &lim); if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; + nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) @@ -2223,6 +2402,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (!nvme_init_integrity(ns->head, &lim, info)) capacity = 0; + lim.max_write_streams = ns->head->nr_plids; + if (lim.max_write_streams) + lim.write_stream_granularity = min(info->runs, U32_MAX); + else + lim.write_stream_granularity = 0; + ret = queue_limits_commit_update(ns->disk->queue, &lim); if (ret) { blk_mq_unfreeze_queue(ns->disk->queue, memflags); @@ -2326,6 +2511,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) ns->head->disk->flags |= GENHD_FL_HIDDEN; else nvme_init_integrity(ns->head, &lim, info); + lim.max_write_streams = ns_lim->max_write_streams; + lim.write_stream_granularity = ns_lim->write_stream_granularity; ret = queue_limits_commit_update(ns->head->disk->queue, &lim); set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); @@ -3016,6 +3203,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->model, id->mn, sizeof(subsys->model)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; + subsys->awupf = le16_to_cpu(id->awupf); /* Versions prior to 1.4 don't necessarily report a valid type */ if (id->cntrltype == NVME_CTRL_DISC || @@ -3031,7 +3219,6 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) kfree(subsys); return -EINVAL; } - subsys->awupf = le16_to_cpu(id->awupf); nvme_mpath_default_iopolicy(subsys); subsys->dev.class = &nvme_subsys_class; @@ -3084,8 +3271,8 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, - void *log, size_t size, u64 offset) +static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, + u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi) { struct nvme_command c = { }; u32 dwlen = nvme_bytes_to_numd(size); @@ -3099,10 +3286,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); c.get_log_page.csi = csi; + c.get_log_page.lsi = cpu_to_le16(lsi); return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, + void *log, size_t size, u64 offset) +{ + return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size, + offset, 0); +} + static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, struct nvme_effects_log **log) { @@ -3441,7 +3636,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); - out_free: kfree(id); return ret; @@ -3560,7 +3754,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl, */ if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h)) continue; - if (!list_empty(&h->list) && nvme_tryget_ns_head(h)) + if (nvme_tryget_ns_head(h)) return h; } @@ -3804,7 +3998,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) } } else { ret = -EINVAL; - if (!info->is_shared || !head->shared) { + if ((!info->is_shared || !head->shared) && + !list_empty(&head->list)) { dev_err(ctrl->device, "Duplicate unshared namespace %d\n", info->nsid); @@ -3822,13 +4017,17 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) "Found shared namespace %d, but multipathing not supported.\n", info->nsid); dev_warn_once(ctrl->device, - "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n"); + "Shared namespace support requires core_nvme.multipath=Y.\n"); } } list_add_tail_rcu(&ns->siblings, &head->list); ns->head = head; mutex_unlock(&ctrl->subsys->lock); + +#ifdef CONFIG_NVME_MULTIPATH + cancel_delayed_work(&head->remove_work); +#endif return 0; out_put_ns_head: @@ -3873,7 +4072,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) return; } } - list_add(&ns->list, &ns->ctrl->namespaces); + list_add_rcu(&ns->list, &ns->ctrl->namespaces); } static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) @@ -3882,6 +4081,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) struct nvme_ns *ns; struct gendisk *disk; int node = ctrl->numa_node; + bool last_path = false; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) @@ -3974,9 +4174,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); - if (list_empty(&ns->head->list)) + if (list_empty(&ns->head->list)) { list_del_init(&ns->head->entry); + /* + * If multipath is not configured, we still create a namespace + * head (nshead), but head->disk is not initialized in that + * case. As a result, only a single reference to nshead is held + * (via kref_init()) when it is created. Therefore, ensure that + * we do not release the reference to nshead twice if head->disk + * is not present. + */ + if (ns->head->disk) + last_path = true; + } mutex_unlock(&ctrl->subsys->lock); + if (last_path) + nvme_put_ns_head(ns->head); nvme_put_ns_head(ns->head); out_cleanup_disk: put_disk(disk); @@ -4008,7 +4221,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_lock(&ns->ctrl->subsys->lock); list_del_rcu(&ns->siblings); if (list_empty(&ns->head->list)) { - list_del_init(&ns->head->entry); + if (!nvme_mpath_queue_if_no_path(ns->head)) + list_del_init(&ns->head->entry); last_path = true; } mutex_unlock(&ns->ctrl->subsys->lock); @@ -4018,6 +4232,9 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (!nvme_ns_head_multipath(ns->head)) nvme_cdev_del(&ns->cdev, &ns->cdev_device); + + nvme_mpath_remove_sysfs_link(ns); + del_gendisk(ns->disk); mutex_lock(&ns->ctrl->namespaces_lock); @@ -4026,7 +4243,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) synchronize_srcu(&ns->ctrl->srcu); if (last_path) - nvme_mpath_shutdown_disk(ns->head); + nvme_mpath_remove_disk(ns->head); nvme_put_ns(ns); } @@ -4292,6 +4509,15 @@ static void nvme_scan_work(struct work_struct *work) nvme_scan_ns_sequential(ctrl); } mutex_unlock(&ctrl->scan_lock); + + /* Requeue if we have missed AENs */ + if (test_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) + nvme_queue_scan(ctrl); +#ifdef CONFIG_NVME_MULTIPATH + else if (ctrl->ana_log_buf) + /* Re-read the ANA log page to not miss updates */ + queue_work(nvme_wq, &ctrl->ana_work); +#endif } /* @@ -4466,11 +4692,9 @@ static void nvme_fw_act_work(struct work_struct *work) nvme_auth_stop(ctrl); if (ctrl->mtfa) - fw_act_timeout = jiffies + - msecs_to_jiffies(ctrl->mtfa * 100); + fw_act_timeout = jiffies + msecs_to_jiffies(ctrl->mtfa * 100); else - fw_act_timeout = jiffies + - msecs_to_jiffies(admin_timeout * 1000); + fw_act_timeout = jiffies + secs_to_jiffies(admin_timeout); nvme_quiesce_io_queues(ctrl); while (nvme_ctrl_pp_status(ctrl)) { @@ -4483,7 +4707,8 @@ static void nvme_fw_act_work(struct work_struct *work) msleep(100); } - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING) || + !nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; nvme_unquiesce_io_queues(ctrl); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 432efcbf9e2f..2e58a7ce1090 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -472,8 +472,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) result = le32_to_cpu(res.u32); ctrl->cntlid = result & 0xFFFF; if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) { - /* Secure concatenation is not implemented */ - if (result & NVME_CONNECT_AUTHREQ_ASCR) { + /* Check for secure concatenation */ + if ((result & NVME_CONNECT_AUTHREQ_ASCR) && + !ctrl->opts->concat) { dev_warn(ctrl->device, "qid 0: secure concatenation is not supported\n"); ret = -EOPNOTSUPP; @@ -550,7 +551,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) /* Secure concatenation is not implemented */ if (result & NVME_CONNECT_AUTHREQ_ASCR) { dev_warn(ctrl->device, - "qid 0: secure concatenation is not supported\n"); + "qid %d: secure concatenation is not supported\n", qid); ret = -EOPNOTSUPP; goto out_free_data; } @@ -581,7 +582,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); * Do not retry when: * * - the DNR bit is set and the specification states no further connect - * attempts with the same set of paramenters should be attempted. + * attempts with the same set of parameters should be attempted. * * - when the authentication attempt fails, because the key was invalid. * This error code is set on the host side. @@ -706,6 +707,7 @@ static const match_table_t opt_tokens = { #endif #ifdef CONFIG_NVME_TCP_TLS { NVMF_OPT_TLS, "tls" }, + { NVMF_OPT_CONCAT, "concat" }, #endif { NVMF_OPT_ERR, NULL } }; @@ -735,6 +737,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->tls = false; opts->tls_key = NULL; opts->keyring = NULL; + opts->concat = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -1053,6 +1056,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->tls = true; break; + case NVMF_OPT_CONCAT: + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) { + pr_err("TLS is not supported\n"); + ret = -EINVAL; + goto out; + } + opts->concat = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -1079,6 +1090,23 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", opts->fast_io_fail_tmo, ctrl_loss_tmo); } + if (opts->concat) { + if (opts->tls) { + pr_err("Secure concatenation over TLS is not supported\n"); + ret = -EINVAL; + goto out; + } + if (opts->tls_key) { + pr_err("Cannot specify a TLS key for secure concatenation\n"); + ret = -EINVAL; + goto out; + } + if (!opts->dhchap_secret) { + pr_err("Need to enable DH-CHAP for secure concatenation\n"); + ret = -EINVAL; + goto out; + } + } opts->host = nvmf_host_add(hostnqn, &hostid); if (IS_ERR(opts->host)) { diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 21d75dc4a3a0..1b58ee7d0dce 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -66,6 +66,7 @@ enum { NVMF_OPT_TLS = 1 << 25, NVMF_OPT_KEYRING = 1 << 26, NVMF_OPT_TLS_KEY = 1 << 27, + NVMF_OPT_CONCAT = 1 << 28, }; /** @@ -79,7 +80,7 @@ enum { * @transport: Holds the fabric transport "technology name" (for a lack of * better description) that will be used by an NVMe controller * being added. - * @subsysnqn: Hold the fully qualified NQN subystem name (format defined + * @subsysnqn: Hold the fully qualified NQN subsystem name (format defined * in the NVMe specification, "NVMe Qualified Names"). * @traddr: The transport-specific TRADDR field for a port on the * subsystem which is adding a controller. @@ -101,6 +102,7 @@ enum { * @keyring: Keyring to use for key lookups * @tls_key: TLS key for encrypted connections (TCP) * @tls: Start TLS encrypted connections (TCP) + * @concat: Enabled Secure channel concatenation (TCP) * @disable_sqflow: disable controller sq flow control * @hdr_digest: generate/verify header digest (TCP) * @data_digest: generate/verify data digest (TCP) @@ -130,6 +132,7 @@ struct nvmf_ctrl_options { struct key *keyring; struct key *tls_key; bool tls; + bool concat; bool disable_sqflow; bool hdr_digest; bool data_digest; @@ -153,7 +156,7 @@ struct nvmf_ctrl_options { * @create_ctrl(): function pointer that points to a non-NVMe * implementation-specific fabric technology * that would go into starting up that fabric - * for the purpose of conneciton to an NVMe controller + * for the purpose of connection to an NVMe controller * using that fabric technology. * * Notes: @@ -162,7 +165,7 @@ struct nvmf_ctrl_options { * 2. create_ctrl() must be defined (even if it does nothing) * 3. struct nvmf_transport_ops must be statically allocated in the * modules .bss section so that a pure module_get on @module - * prevents the memory from beeing freed. + * prevents the memory from being freed. */ struct nvmf_transport_ops { struct list_head entry; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b9929a5a7f4e..014b387f1e8b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) } static void -nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop) { - struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; struct nvme_fc_rport *rport = lsop->rport; struct nvme_fc_lport *lport = rport->lport; unsigned long flags; @@ -1434,6 +1433,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) } static void +nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +{ + struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; + + nvme_fc_xmt_ls_rsp_free(lsop); +} + +static void nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) { struct nvme_fc_rport *rport = lsop->rport; @@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) dev_warn(lport->dev, "LLDD rejected LS RSP xmt: LS %d status %d\n", w0->ls_cmd, ret); - nvme_fc_xmt_ls_rsp_done(lsop->lsrsp); + nvme_fc_xmt_ls_rsp_free(lsop); return; } } @@ -1948,7 +1955,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) } /* - * For the linux implementation, if we have an unsuccesful + * For the linux implementation, if we have an unsucceesful * status, they blk-mq layer can typically be called with the * non-zero status and the content of the cqe isn't important. */ @@ -2472,7 +2479,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * writing the registers for shutdown and polling (call * nvme_disable_ctrl()). Given a bunch of i/o was potentially * just aborted and we will wait on those contexts, and given - * there was no indication of how live the controlelr is on the + * there was no indication of how live the controller is on the * link, don't send more io to create more contexts for the * shutdown. Let the controller fail via keepalive failure if * its still present. @@ -2571,7 +2578,7 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, if (ret) return -ENOMEM; - op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); + op->nents = blk_rq_map_sg(rq, freq->sg_table.sgl); WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, rq_dma_dir(rq)); @@ -2858,7 +2865,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) unsigned int nr_io_queues; int ret; - nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(), ctrl->lport->ops->max_hw_queues); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { @@ -2912,7 +2919,7 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) unsigned int nr_io_queues; int ret; - nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(), ctrl->lport->ops->max_hw_queues); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index fed6b29098ad..6b3ac8ae3f34 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -114,7 +114,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - struct io_uring_cmd *ioucmd, unsigned int flags) + struct iov_iter *iter, unsigned int flags) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; @@ -128,36 +128,23 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, if (!nvme_ctrl_sgl_supported(ctrl)) dev_warn_once(ctrl->device, "using unchecked data buffer\n"); if (has_metadata) { - if (!supports_metadata) { - ret = -EINVAL; - goto out; - } + if (!supports_metadata) + return -EINVAL; + if (!nvme_ctrl_meta_sgl_supported(ctrl)) dev_warn_once(ctrl->device, "using unchecked metadata buffer\n"); } - if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { - struct iov_iter iter; - - /* fixedbufs is only for non-vectored io */ - if (flags & NVME_IOCTL_VEC) { - ret = -EINVAL; - goto out; - } - ret = io_uring_cmd_import_fixed(ubuffer, bufflen, - rq_data_dir(req), &iter, ioucmd); - if (ret < 0) - goto out; - ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); - } else { + if (iter) + ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL); + else ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 0, rq_data_dir(req)); - } if (ret) - goto out; + return ret; bio = req->bio; if (bdev) @@ -174,8 +161,6 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, out_unmap: if (bio) blk_rq_unmap_user(bio); -out: - blk_mq_free_request(req); return ret; } @@ -200,7 +185,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, meta_len, NULL, flags); if (ret) - return ret; + goto out_free_req; } bio = req->bio; @@ -216,7 +201,10 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (effects) nvme_passthru_end(ctrl, ns, effects, cmd, ret); + return ret; +out_free_req: + blk_mq_free_request(req); return ret; } @@ -441,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * For iopoll, complete it directly. Note that using the uring_cmd - * helper for this is safe only because we check blk_rq_is_poll(). - * As that returns false if we're NOT on a polled queue, then it's - * safe to use the polled completion helper. - * - * Otherwise, move the completion to task work. + * IOPOLL could potentially complete this request directly, but + * if multiple rings are polling on the same queue, then it's possible + * for one ring to find completions for another ring. Punting the + * completion via task_work will always direct it to the right + * location, rather than potentially complete requests for ringA + * under iopoll invocations from ringB. */ - if (blk_rq_is_poll(req)) { - if (pdu->bio) - blk_rq_unmap_user(pdu->bio); - io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); - } else { - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); - } - + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); return RQ_END_IO_FREE; } @@ -467,6 +448,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct request_queue *q = ns ? ns->queue : ctrl->admin_q; struct nvme_uring_data d; struct nvme_command c; + struct iov_iter iter; + struct iov_iter *map_iter = NULL; struct request *req; blk_opf_t rq_flags = REQ_ALLOC_CACHE; blk_mq_req_flags_t blk_flags = 0; @@ -502,6 +485,22 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, d.metadata_len = READ_ONCE(cmd->metadata_len); d.timeout_ms = READ_ONCE(cmd->timeout_ms); + if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) { + int ddir = nvme_is_write(&c) ? WRITE : READ; + + if (vec) + ret = io_uring_cmd_import_fixed_vec(ioucmd, + u64_to_user_ptr(d.addr), d.data_len, + ddir, &iter, issue_flags); + else + ret = io_uring_cmd_import_fixed(d.addr, d.data_len, + ddir, &iter, ioucmd, issue_flags); + if (ret < 0) + return ret; + + map_iter = &iter; + } + if (issue_flags & IO_URING_F_NONBLOCK) { rq_flags |= REQ_NOWAIT; blk_flags = BLK_MQ_REQ_NOWAIT; @@ -514,12 +513,12 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return PTR_ERR(req); req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; - if (d.addr && d.data_len) { - ret = nvme_map_user_request(req, d.addr, - d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, ioucmd, vec); + if (d.data_len) { + ret = nvme_map_user_request(req, d.addr, d.data_len, + nvme_to_user_ptr(d.metadata), d.metadata_len, + map_iter, vec ? NVME_IOCTL_VEC : 0); if (ret) - return ret; + goto out_free_req; } /* to free bio on completion, as req->bio will be null at that time */ @@ -529,6 +528,10 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, req->end_io = nvme_uring_cmd_end_io; blk_execute_rq_nowait(req, false); return -EIOCBQUEUED; + +out_free_req: + blk_mq_free_request(req); + return ret; } static bool is_ctrl_ioctl(unsigned int cmd) @@ -719,7 +722,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, /* * Handle ioctls that apply to the controller instead of the namespace - * seperately and drop the ns SRCU reference early. This avoids a + * separately and drop the ns SRCU reference early. This avoids a * deadlock when deleting namespaces using the passthrough interface. */ if (is_ctrl_ioctl(cmd)) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 2a7635565083..3da980dc60d9 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -10,10 +10,61 @@ #include "nvme.h" bool multipath = true; -module_param(multipath, bool, 0444); +static bool multipath_always_on; + +static int multipath_param_set(const char *val, const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret) + return ret; + + if (multipath_always_on && !*arg) { + pr_err("Can't disable multipath when multipath_always_on is configured.\n"); + *arg = true; + return -EINVAL; + } + + return 0; +} + +static const struct kernel_param_ops multipath_param_ops = { + .set = multipath_param_set, + .get = param_get_bool, +}; + +module_param_cb(multipath, &multipath_param_ops, &multipath, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +static int multipath_always_on_set(const char *val, + const struct kernel_param *kp) +{ + int ret; + bool *arg = kp->arg; + + ret = param_set_bool(val, kp); + if (ret < 0) + return ret; + + if (*arg) + multipath = true; + + return 0; +} + +static const struct kernel_param_ops multipath_always_on_ops = { + .set = multipath_always_on_set, + .get = param_get_bool, +}; + +module_param_cb(multipath_always_on, &multipath_always_on_ops, + &multipath_always_on, 0444); +MODULE_PARM_DESC(multipath_always_on, + "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support"); + static const char *nvme_iopolicy_names[] = { [NVME_IOPOLICY_NUMA] = "numa", [NVME_IOPOLICY_RR] = "round-robin", @@ -427,7 +478,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) struct nvme_ns *ns; if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) - return NULL; + return false; list_for_each_entry_srcu(ns, &head->list, siblings, srcu_read_lock_held(&head->srcu)) { @@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head) break; } } - return false; + + /* + * If "head->delayed_removal_secs" is configured (i.e., non-zero), do + * not immediately fail I/O. Instead, requeue the I/O for the configured + * duration, anticipating that if there's a transient link failure then + * it may recover within this time window. This parameter is exported to + * userspace via sysfs, and its default value is zero. It is internally + * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is + * non-zero, this flag is set to true. When zero, the flag is cleared. + */ + return nvme_mpath_queue_if_no_path(head); } static void nvme_ns_head_submit_bio(struct bio *bio) @@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work) } } +static void nvme_remove_head(struct nvme_ns_head *head) +{ + if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + /* + * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared + * to allow multipath to fail all I/O. + */ + kblockd_schedule_work(&head->requeue_work); + + nvme_cdev_del(&head->cdev, &head->cdev_device); + synchronize_srcu(&head->srcu); + del_gendisk(head->disk); + } + nvme_put_ns_head(head); +} + +static void nvme_remove_head_work(struct work_struct *work) +{ + struct nvme_ns_head *head = container_of(to_delayed_work(work), + struct nvme_ns_head, remove_work); + bool remove = false; + + mutex_lock(&head->subsys->lock); + if (list_empty(&head->list)) { + list_del_init(&head->entry); + remove = true; + } + mutex_unlock(&head->subsys->lock); + if (remove) + nvme_remove_head(head); + + module_put(THIS_MODULE); +} + int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { struct queue_limits lim; @@ -626,19 +721,31 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) spin_lock_init(&head->requeue_lock); INIT_WORK(&head->requeue_work, nvme_requeue_work); INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work); + INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work); + head->delayed_removal_secs = 0; /* - * Add a multipath node if the subsystems supports multiple controllers. - * We also do this for private namespaces as the namespace sharing flag - * could change after a rescan. + * If "multipath_always_on" is enabled, a multipath node is added + * regardless of whether the disk is single/multi ported, and whether + * the namespace is shared or private. If "multipath_always_on" is not + * enabled, a multipath node is added only if the subsystem supports + * multiple controllers and the "multipath" option is configured. In + * either case, for private namespaces, we ensure that the NSID is + * unique. */ - if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || - !nvme_is_unique_nsid(ctrl, head) || !multipath) + if (!multipath_always_on) { + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || + !multipath) + return 0; + } + + if (!nvme_is_unique_nsid(ctrl, head)) return 0; blk_set_stacking_limits(&lim); lim.dma_alignment = 3; - lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL; + lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | + BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES; if (head->ids.csi == NVME_CSI_ZNS) lim.features |= BLK_FEAT_ZONED; @@ -653,12 +760,13 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) * controller's scan_work context. If a path error occurs here, the IO * will wait until a path becomes available or all paths are torn down, * but that action also occurs within scan_work, so it would deadlock. - * Defer the partion scan to a different context that does not block + * Defer the partition scan to a different context that does not block * scan_work. */ set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state); sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); + nvme_tryget_ns_head(head); return 0; } @@ -686,6 +794,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) kblockd_schedule_work(&head->partition_scan_work); } + nvme_mpath_add_sysfs_link(ns->head); + mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { int node, srcu_idx; @@ -768,6 +878,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, if (nvme_state_is_live(ns->ana_state) && nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE) nvme_mpath_set_live(ns); + else { + /* + * Add sysfs link from multipath head gendisk node to path + * device gendisk node. + * If path's ana state is live (i.e. state is either optimized + * or non-optimized) while we alloc the ns then sysfs link would + * be created from nvme_mpath_set_live(). In that case we would + * not fallthrough this code path. However for the path's ana + * state other than live, we call nvme_mpath_set_live() only + * after ana state transitioned to the live state. But we still + * want to create the sysfs link from head node to a path device + * irrespctive of the path's ana state. + * If we reach through here then it means that path's ana state + * is not live but still create the sysfs link to this path from + * head node if head node of the path has already come alive. + */ + if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags)) + nvme_mpath_add_sysfs_link(ns->head); + } } static int nvme_update_ana_state(struct nvme_ctrl *ctrl, @@ -839,7 +968,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl) if (nr_change_groups) mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); else - del_timer_sync(&ctrl->anatt_timer); + timer_delete_sync(&ctrl->anatt_timer); out_unlock: mutex_unlock(&ctrl->ana_lock); return error; @@ -869,7 +998,7 @@ void nvme_mpath_update(struct nvme_ctrl *ctrl) static void nvme_anatt_timeout(struct timer_list *t) { - struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); + struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer); dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); nvme_reset_ctrl(ctrl); @@ -879,7 +1008,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl) { if (!nvme_ctrl_use_ana(ctrl)) return; - del_timer_sync(&ctrl->anatt_timer); + timer_delete_sync(&ctrl->anatt_timer); cancel_work_sync(&ctrl->ana_work); } @@ -955,6 +1084,88 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, } DEVICE_ATTR_RO(ana_state); +static ssize_t queue_depth_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD) + return 0; + + return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active)); +} +DEVICE_ATTR_RO(queue_depth); + +static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + int node, srcu_idx; + nodemask_t numa_nodes; + struct nvme_ns *current_ns; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_head *head = ns->head; + + if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA) + return 0; + + nodes_clear(numa_nodes); + + srcu_idx = srcu_read_lock(&head->srcu); + for_each_node(node) { + current_ns = srcu_dereference(head->current_path[node], + &head->srcu); + if (ns == current_ns) + node_set(node, numa_nodes); + } + srcu_read_unlock(&head->srcu, srcu_idx); + + return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes)); +} +DEVICE_ATTR_RO(numa_nodes); + +static ssize_t delayed_removal_secs_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + int ret; + + mutex_lock(&head->subsys->lock); + ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs); + mutex_unlock(&head->subsys->lock); + return ret; +} + +static ssize_t delayed_removal_secs_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + unsigned int sec; + int ret; + + ret = kstrtouint(buf, 0, &sec); + if (ret < 0) + return ret; + + mutex_lock(&head->subsys->lock); + head->delayed_removal_secs = sec; + if (sec) + set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + else + clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags); + mutex_unlock(&head->subsys->lock); + /* + * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen + * by its reader. + */ + synchronize_srcu(&head->srcu); + + return count; +} + +DEVICE_ATTR_RW(delayed_removal_secs); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { @@ -967,6 +1178,85 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, return -ENXIO; /* just break out of the loop */ } +void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head) +{ + struct device *target; + int rc, srcu_idx; + struct nvme_ns *ns; + struct kobject *kobj; + + /* + * Ensure head disk node is already added otherwise we may get invalid + * kobj for head disk node + */ + if (!test_bit(GD_ADDED, &head->disk->state)) + return; + + kobj = &disk_to_dev(head->disk)->kobj; + + /* + * loop through each ns chained through the head->list and create the + * sysfs link from head node to the ns path node + */ + srcu_idx = srcu_read_lock(&head->srcu); + + list_for_each_entry_srcu(ns, &head->list, siblings, + srcu_read_lock_held(&head->srcu)) { + /* + * Ensure that ns path disk node is already added otherwise we + * may get invalid kobj name for target + */ + if (!test_bit(GD_ADDED, &ns->disk->state)) + continue; + + /* + * Avoid creating link if it already exists for the given path. + * When path ana state transitions from optimized to non- + * optimized or vice-versa, the nvme_mpath_set_live() is + * invoked which in truns call this function. Now if the sysfs + * link already exists for the given path and we attempt to re- + * create the link then sysfs code would warn about it loudly. + * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure + * that we're not creating duplicate link. + * The test_and_set_bit() is used because it is protecting + * against multiple nvme paths being simultaneously added. + */ + if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) + continue; + + target = disk_to_dev(ns->disk); + /* + * Create sysfs link from head gendisk kobject @kobj to the + * ns path gendisk kobject @target->kobj. + */ + rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name, + &target->kobj, dev_name(target)); + if (unlikely(rc)) { + dev_err(disk_to_dev(ns->head->disk), + "failed to create link to %s\n", + dev_name(target)); + clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags); + } + } + + srcu_read_unlock(&head->srcu, srcu_idx); +} + +void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns) +{ + struct device *target; + struct kobject *kobj; + + if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) + return; + + target = disk_to_dev(ns->disk); + kobj = &disk_to_dev(ns->head->disk)->kobj; + sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name, + dev_name(target)); + clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags); +} + void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { if (nvme_ctrl_use_ana(ns->ctrl)) { @@ -998,23 +1288,46 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) #endif } -void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +void nvme_mpath_remove_disk(struct nvme_ns_head *head) { + bool remove = false; + if (!head->disk) return; - if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { - nvme_cdev_del(&head->cdev, &head->cdev_device); + + mutex_lock(&head->subsys->lock); + /* + * We are called when all paths have been removed, and at that point + * head->list is expected to be empty. However, nvme_remove_ns() and + * nvme_init_ns_head() can run concurrently and so if head->delayed_ + * removal_secs is configured, it is possible that by the time we reach + * this point, head->list may no longer be empty. Therefore, we recheck + * head->list here. If it is no longer empty then we skip enqueuing the + * delayed head removal work. + */ + if (!list_empty(&head->list)) + goto out; + + if (head->delayed_removal_secs) { /* - * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared - * to allow multipath to fail all I/O. + * Ensure that no one could remove this module while the head + * remove work is pending. */ - synchronize_srcu(&head->srcu); - kblockd_schedule_work(&head->requeue_work); - del_gendisk(head->disk); + if (!try_module_get(THIS_MODULE)) + goto out; + mod_delayed_work(nvme_wq, &head->remove_work, + head->delayed_removal_secs * HZ); + } else { + list_del_init(&head->entry); + remove = true; } +out: + mutex_unlock(&head->subsys->lock); + if (remove) + nvme_remove_head(head); } -void nvme_mpath_remove_disk(struct nvme_ns_head *head) +void nvme_mpath_put_disk(struct nvme_ns_head *head) { if (!head->disk) return; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7be92d07430e..7df2ea21851f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -442,7 +442,7 @@ struct nvme_subsystem { u8 cmic; enum nvme_subsys_type subtype; u16 vendor_id; - u16 awupf; /* 0's based awupf value. */ + u16 awupf; /* 0's based value. */ struct ida ns_ida; #ifdef CONFIG_NVME_MULTIPATH enum nvme_iopolicy iopolicy; @@ -496,6 +496,9 @@ struct nvme_ns_head { struct device cdev_device; struct gendisk *disk; + + u16 nr_plids; + u16 *plids; #ifdef CONFIG_NVME_MULTIPATH struct bio_list requeue_list; spinlock_t requeue_lock; @@ -503,7 +506,10 @@ struct nvme_ns_head { struct work_struct partition_scan_work; struct mutex lock; unsigned long flags; -#define NVME_NSHEAD_DISK_LIVE 0 + struct delayed_work remove_work; + unsigned int delayed_removal_secs; +#define NVME_NSHEAD_DISK_LIVE 0 +#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1 struct nvme_ns __rcu *current_path[]; #endif }; @@ -516,7 +522,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ - NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeores supported */ + NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeroes supported */ }; struct nvme_ns { @@ -534,10 +540,11 @@ struct nvme_ns { struct nvme_ns_head *head; unsigned long flags; -#define NVME_NS_REMOVING 0 -#define NVME_NS_ANA_PENDING 2 -#define NVME_NS_FORCE_RO 3 -#define NVME_NS_READY 4 +#define NVME_NS_REMOVING 0 +#define NVME_NS_ANA_PENDING 2 +#define NVME_NS_FORCE_RO 3 +#define NVME_NS_READY 4 +#define NVME_NS_SYSFS_ATTR_LINK 5 struct cdev cdev; struct device cdev_device; @@ -895,10 +902,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int qid, nvme_submit_flags_t flags); int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result); + void *result); int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, - u32 *result); + void *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); @@ -933,6 +940,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_attr_groups[]; +extern const struct attribute_group nvme_ns_mpath_attr_group; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; extern const struct attribute_group nvme_dev_attrs_group; @@ -955,8 +963,10 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns); +void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns); void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid); -void nvme_mpath_remove_disk(struct nvme_ns_head *head); +void nvme_mpath_put_disk(struct nvme_ns_head *head); int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); void nvme_mpath_update(struct nvme_ctrl *ctrl); @@ -965,7 +975,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); -void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); void nvme_mpath_start_request(struct request *rq); void nvme_mpath_end_request(struct request *rq); @@ -980,12 +990,21 @@ static inline void nvme_trace_bio_complete(struct request *req) extern bool multipath; extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; +extern struct device_attribute dev_attr_queue_depth; +extern struct device_attribute dev_attr_numa_nodes; +extern struct device_attribute dev_attr_delayed_removal_secs; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) { return disk->fops == &nvme_ns_head_ops; } +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head) +{ + if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags)) + return true; + return false; +} #else #define multipath false static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) @@ -1006,7 +1025,13 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { } -static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_put_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns) { } static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) @@ -1019,7 +1044,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns) static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { } -static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } static inline void nvme_trace_bio_complete(struct request *req) @@ -1067,6 +1092,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) { return false; } +static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head) +{ + return false; +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16], @@ -1147,6 +1176,7 @@ void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); void nvme_auth_free(struct nvme_ctrl *ctrl); +void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl); #else static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { @@ -1169,6 +1199,7 @@ static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) return -EPROTONOSUPPORT; } static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {}; +static inline void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) {}; #endif u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1dc12784efaf..3ef30c36bf10 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/nodemask.h> #include <linux/once.h> #include <linux/pci.h> #include <linux/suspend.h> @@ -34,16 +35,31 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +/* Optimisation for I/Os between 4k and 128k */ +#define NVME_SMALL_POOL_SIZE 256 /* * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data. */ #define NVME_MAX_KB_SZ 8192 -#define NVME_MAX_SEGS 128 -#define NVME_MAX_META_SEGS 15 -#define NVME_MAX_NR_ALLOCATIONS 5 +#define NVME_MAX_NR_DESCRIPTORS 5 + +/* + * For data SGLs we support a single descriptors worth of SGL entries, but for + * now we also limit it to avoid an allocation larger than PAGE_SIZE for the + * scatterlist. + */ +#define NVME_MAX_SEGS \ + min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \ + (PAGE_SIZE / sizeof(struct scatterlist))) + +/* + * For metadata SGLs, only the small descriptor is supported, and the first + * entry is the segment descriptor, which for the data pointer sits in the SQE. + */ +#define NVME_MAX_META_SEGS \ + ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1) static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -112,6 +128,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); static void nvme_update_attrs(struct nvme_dev *dev); +struct nvme_descriptor_pools { + struct dma_pool *large; + struct dma_pool *small; +}; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ @@ -121,8 +142,6 @@ struct nvme_dev { struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; unsigned online_queues; unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; @@ -162,6 +181,7 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + struct nvme_descriptor_pools descriptor_pools[]; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -191,6 +211,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) */ struct nvme_queue { struct nvme_dev *dev; + struct nvme_descriptor_pools descriptor_pools; spinlock_t sq_lock; void *sq_cmds; /* only used for poll queues: */ @@ -219,30 +240,30 @@ struct nvme_queue { struct completion delete_done; }; -union nvme_descriptor { - struct nvme_sgl_desc *sg_list; - __le64 *prp_list; +/* bits for iod->flags */ +enum nvme_iod_flags { + /* this command has been aborted by the timeout handler */ + IOD_ABORTED = 1U << 0, + + /* uses the small descriptor pool */ + IOD_SMALL_DESCRIPTOR = 1U << 1, }; /* * The nvme_iod describes the data in an I/O. - * - * The sg pointer contains the list of PRP/SGL chunk allocations in addition - * to the actual struct scatterlist. */ struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - bool aborted; - s8 nr_allocations; /* PRP list pool allocations. 0 means small - pool in use */ + u8 flags; + u8 nr_descriptors; unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; struct sg_table meta_sgt; - union nvme_descriptor meta_list; - union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; + struct nvme_sgl_desc *meta_descriptor; + void *descriptors[NVME_MAX_NR_DESCRIPTORS]; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -390,37 +411,85 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_pci_npages_prp(void) +static __always_inline int nvme_pci_npages_prp(void) { unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } -static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static struct nvme_descriptor_pools * +nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) { - struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[0]; + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node]; + size_t small_align = NVME_SMALL_POOL_SIZE; - WARN_ON(hctx_idx != 0); - WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + if (pools->small) + return pools; /* already initialized */ - hctx->driver_data = nvmeq; - return 0; + pools->large = dma_pool_create_node("nvme descriptor page", dev->dev, + NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node); + if (!pools->large) + return ERR_PTR(-ENOMEM); + + if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) + small_align = 512; + + pools->small = dma_pool_create_node("nvme descriptor small", dev->dev, + NVME_SMALL_POOL_SIZE, small_align, 0, numa_node); + if (!pools->small) { + dma_pool_destroy(pools->large); + pools->large = NULL; + return ERR_PTR(-ENOMEM); + } + + return pools; } -static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) +static void nvme_release_descriptor_pools(struct nvme_dev *dev) +{ + unsigned i; + + for (i = 0; i < nr_node_ids; i++) { + struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i]; + + dma_pool_destroy(pools->large); + dma_pool_destroy(pools->small); + } +} + +static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data, + unsigned qid) { struct nvme_dev *dev = to_nvme_dev(data); - struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; + struct nvme_queue *nvmeq = &dev->queues[qid]; + struct nvme_descriptor_pools *pools; + struct blk_mq_tags *tags; + + tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0]; + WARN_ON(tags != hctx->tags); + pools = nvme_setup_descriptor_pools(dev, hctx->numa_node); + if (IS_ERR(pools)) + return PTR_ERR(pools); - WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); + nvmeq->descriptor_pools = *pools; hctx->driver_data = nvmeq; return 0; } +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + WARN_ON(hctx_idx != 0); + return nvme_init_hctx_common(hctx, data, 0); +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + return nvme_init_hctx_common(hctx, data, hctx_idx + 1); +} + static int nvme_pci_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) @@ -537,23 +606,39 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, return true; } -static void nvme_free_prps(struct nvme_dev *dev, struct request *req) +static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, + struct nvme_iod *iod) +{ + if (iod->flags & IOD_SMALL_DESCRIPTOR) + return nvmeq->descriptor_pools.small; + return nvmeq->descriptor_pools.large; +} + +static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req) { const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); dma_addr_t dma_addr = iod->first_dma; int i; - for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = iod->list[i].prp_list; + if (iod->nr_descriptors == 1) { + dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0], + dma_addr); + return; + } + + for (i = 0; i < iod->nr_descriptors; i++) { + __le64 *prp_list = iod->descriptors[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); + dma_pool_free(nvmeq->descriptor_pools.large, prp_list, + dma_addr); dma_addr = next_dma_addr; } } -static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, + struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -566,15 +651,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) WARN_ON_ONCE(!iod->sgt.nents); dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - - if (iod->nr_allocations == 0) - dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, - iod->first_dma); - else if (iod->nr_allocations == 1) - dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, - iod->first_dma); - else - nvme_free_prps(dev, req); + nvme_free_descriptors(nvmeq, req); mempool_free(iod->sgt.sgl, dev->iod_mempool); } @@ -592,11 +669,10 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents) } } -static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, +static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; int length = blk_rq_payload_bytes(req); struct scatterlist *sg = iod->sgt.sgl; int dma_len = sg_dma_len(sg); @@ -604,7 +680,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; dma_addr_t prp_dma; - int nprps, i; + int i; length -= (NVME_CTRL_PAGE_SIZE - offset); if (length <= 0) { @@ -626,30 +702,26 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, goto done; } - nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } + if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <= + NVME_SMALL_POOL_SIZE / sizeof(__le64)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) { - iod->nr_allocations = -1; + prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &prp_dma); + if (!prp_list) return BLK_STS_RESOURCE; - } - iod->list[0].prp_list = prp_list; + iod->descriptors[iod->nr_descriptors++] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + + prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large, + GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; - iod->list[iod->nr_allocations++].prp_list = prp_list; + iod->descriptors[iod->nr_descriptors++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -673,7 +745,7 @@ done: cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); return BLK_STS_OK; free_prps: - nvme_free_prps(dev, req); + nvme_free_descriptors(nvmeq, req); return BLK_STS_RESOURCE; bad_sgl: WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), @@ -698,11 +770,10 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } -static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, +static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, struct request *req, struct nvme_rw_command *cmd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; struct nvme_sgl_desc *sg_list; struct scatterlist *sg = iod->sgt.sgl; unsigned int entries = iod->sgt.nents; @@ -717,21 +788,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, return BLK_STS_OK; } - if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } + if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) + iod->flags |= IOD_SMALL_DESCRIPTOR; - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) { - iod->nr_allocations = -1; + sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &sgl_dma); + if (!sg_list) return BLK_STS_RESOURCE; - } - - iod->list[0].sg_list = sg_list; + iod->descriptors[iod->nr_descriptors++] = sg_list; iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); @@ -785,12 +849,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret = BLK_STS_RESOURCE; int rc; if (blk_rq_nr_phys_segments(req) == 1) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { @@ -812,7 +876,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (!iod->sgt.sgl) return BLK_STS_RESOURCE; sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); + iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl); if (!iod->sgt.orig_nents) goto out_free_sg; @@ -825,9 +889,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, } if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw); else - ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw); if (ret != BLK_STS_OK) goto out_unmap_sg; return BLK_STS_OK; @@ -842,6 +906,7 @@ out_free_sg: static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_rw_command *cmnd = &iod->cmd.rw; struct nvme_sgl_desc *sg_list; @@ -865,12 +930,13 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, if (rc) goto out_free_sg; - sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma); + sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, + &sgl_dma); if (!sg_list) goto out_unmap_sg; entries = iod->meta_sgt.nents; - iod->meta_list.sg_list = sg_list; + iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; cmnd->flags = NVME_CMD_SGL_METASEG; @@ -912,7 +978,10 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) { - if (nvme_pci_metadata_use_sgls(dev, req)) + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) && + nvme_pci_metadata_use_sgls(dev, req)) return nvme_pci_setup_meta_sgls(dev, req); return nvme_pci_setup_meta_mptr(dev, req); } @@ -922,8 +991,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; - iod->aborted = false; - iod->nr_allocations = -1; + iod->flags = 0; + iod->nr_descriptors = 0; iod->sgt.nents = 0; iod->meta_sgt.nents = 0; @@ -947,15 +1016,12 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; out_unmap_data: if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); + nvme_unmap_data(dev, req->mq_hctx->driver_data, req); out_free_cmd: nvme_cleanup_cmd(req); return ret; } -/* - * NOTE: ns is NULL when called on the admin queue. - */ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1040,6 +1106,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist) } static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, + struct nvme_queue *nvmeq, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1051,8 +1118,8 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, return; } - dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list, - iod->meta_dma); + dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor, + iod->meta_dma); dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); } @@ -1063,10 +1130,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req) struct nvme_dev *dev = nvmeq->dev; if (blk_integrity_rq(req)) - nvme_unmap_metadata(dev, req); + nvme_unmap_metadata(dev, nvmeq, req); if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req); + nvme_unmap_data(dev, nvmeq, req); } static void nvme_pci_complete_rq(struct request *req) @@ -1205,7 +1272,9 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + spin_lock(&nvmeq->cq_poll_lock); nvme_poll_cq(nvmeq, NULL); + spin_unlock(&nvmeq->cq_poll_lock); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } @@ -1491,7 +1560,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * returned to the driver, or if this is the admin queue. */ opcode = nvme_req(req)->cmd->common.opcode; - if (!nvmeq->qid || iod->aborted) { + if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) { dev_warn(dev->ctrl.device, "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n", req->tag, nvme_cid(req), opcode, @@ -1504,7 +1573,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } - iod->aborted = true; + iod->flags |= IOD_ABORTED; cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = nvme_cid(req); @@ -1889,8 +1958,28 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) * might be pointing at! */ result = nvme_disable_ctrl(&dev->ctrl, false); - if (result < 0) - return result; + if (result < 0) { + struct pci_dev *pdev = to_pci_dev(dev->dev); + + /* + * The NVMe Controller Reset method did not get an expected + * CSTS.RDY transition, so something with the device appears to + * be stuck. Use the lower level and bigger hammer PCIe + * Function Level Reset to attempt restoring the device to its + * initial state, and try again. + */ + result = pcie_reset_flr(pdev, false); + if (result < 0) + return result; + + pci_restore_state(pdev); + result = nvme_disable_ctrl(&dev->ctrl, false); + if (result < 0) + return result; + + dev_info(dev->ctrl.device, + "controller reset completed after pcie flr\n"); + } result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); if (result) @@ -2032,8 +2121,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - nvme_update_attrs(dev); } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2843,35 +2930,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) return 0; } -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - size_t small_align = 256; - - dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512) - small_align = 512; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, - 256, small_align, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); @@ -2970,12 +3028,14 @@ static void nvme_reset_work(struct work_struct *work) if (result < 0) goto out; + nvme_update_attrs(dev); + result = nvme_setup_io_queues(dev); if (result) goto out; /* - * Freeze and update the number of I/O queues as thos might have + * Freeze and update the number of I/O queues as those might have * changed. If there are no I/O queues left after this reset, keep the * controller around but remove all namespaces. */ @@ -3146,7 +3206,7 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) /* * Exclude some Kingston NV1 and A2000 devices from * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a - * lot fo energy with s2idle sleep on some TUXEDO platforms. + * lot of energy with s2idle sleep on some TUXEDO platforms. */ if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") || dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") || @@ -3186,7 +3246,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids), + GFP_KERNEL, node); if (!dev) return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); @@ -3261,13 +3322,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto out_uninit_ctrl; - result = nvme_setup_prp_pools(dev); - if (result) - goto out_dev_unmap; - result = nvme_pci_alloc_iod_mempool(dev); if (result) - goto out_release_prp_pools; + goto out_dev_unmap; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); @@ -3306,6 +3363,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result < 0) goto out_disable; + nvme_update_attrs(dev); + result = nvme_setup_io_queues(dev); if (result) goto out_disable; @@ -3343,8 +3402,6 @@ out_disable: out_release_iod_mempool: mempool_destroy(dev->iod_mempool); mempool_destroy(dev->iod_meta_mempool); -out_release_prp_pools: - nvme_release_prp_pools(dev); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: @@ -3409,7 +3466,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); mempool_destroy(dev->iod_mempool); mempool_destroy(dev->iod_meta_mempool); - nvme_release_prp_pools(dev); + nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); } @@ -3578,7 +3635,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); - if (!nvme_try_sched_reset(&dev->ctrl)) + if (nvme_try_sched_reset(&dev->ctrl)) nvme_unquiesce_io_queues(&dev->ctrl); return PCI_ERS_RESULT_RECOVERED; } @@ -3626,6 +3683,9 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */ .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, }, + { PCI_DEVICE(0x126f, 0x1001), /* Silicon Motion generic */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_BOGUS_NID, }, @@ -3649,6 +3709,9 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x15b7, 0x5008), /* Sandisk SN530 */ .driver_data = NVME_QUIRK_BROKEN_MSI }, + { PCI_DEVICE(0x15b7, 0x5009), /* Sandisk SN550 */ + .driver_data = NVME_QUIRK_BROKEN_MSI | + NVME_QUIRK_NO_DEEPEST_PS }, { PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ @@ -3734,6 +3797,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x025e, 0xf1ac), /* SOLIDIGM P44 pro SSDPFKKW020X7 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ @@ -3802,9 +3867,7 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); - BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE); - BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); + BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index cf2d2c5039dd..ca6a74607b13 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -82,8 +82,6 @@ static int nvme_status_to_pr_err(int status) return PR_STS_SUCCESS; case NVME_SC_RESERVATION_CONFLICT: return PR_STS_RESERVATION_CONFLICT; - case NVME_SC_ONCS_NOT_SUPPORTED: - return -EOPNOTSUPP; case NVME_SC_BAD_ATTRIBUTES: case NVME_SC_INVALID_OPCODE: case NVME_SC_INVALID_FIELD: diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 86a2891d9bcc..9bd3646568d0 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -221,7 +221,7 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, /* * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue - * lifetime. It's safe, since any chage in the underlying RDMA device + * lifetime. It's safe, since any change in the underlying RDMA device * will issue error recovery and queue re-creation. */ for (i = 0; i < ib_queue_size; i++) { @@ -800,7 +800,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, /* * Bind the async event SQE DMA mapping to the admin queue lifetime. - * It's safe, since any chage in the underlying RDMA device will issue + * It's safe, since any change in the underlying RDMA device will issue * error recovery and queue re-creation. */ error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, @@ -1476,8 +1476,7 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, if (ret) return -ENOMEM; - req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, - req->data_sgl.sg_table.sgl); + req->data_sgl.nents = blk_rq_map_sg(rq, req->data_sgl.sg_table.sgl); *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 3a41b9ab0f13..29430949ce2f 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -258,6 +258,9 @@ static struct attribute *nvme_ns_attrs[] = { #ifdef CONFIG_NVME_MULTIPATH &dev_attr_ana_grpid.attr, &dev_attr_ana_state.attr, + &dev_attr_queue_depth.attr, + &dev_attr_numa_nodes.attr, + &dev_attr_delayed_removal_secs.attr, #endif &dev_attr_io_passthru_err_log_enabled.attr, NULL, @@ -290,6 +293,16 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; } + if (a == &dev_attr_queue_depth.attr || a == &dev_attr_numa_nodes.attr) { + if (nvme_disk_is_ns_head(dev_to_disk(dev))) + return 0; + } + if (a == &dev_attr_delayed_removal_secs.attr) { + struct gendisk *disk = dev_to_disk(dev); + + if (!nvme_disk_is_ns_head(disk)) + return 0; + } #endif return a->mode; } @@ -299,8 +312,50 @@ static const struct attribute_group nvme_ns_attr_group = { .is_visible = nvme_ns_attrs_are_visible, }; +#ifdef CONFIG_NVME_MULTIPATH +/* + * NOTE: The dummy attribute does not appear in sysfs. It exists solely to allow + * control over the visibility of the multipath sysfs node. Without at least one + * attribute defined in nvme_ns_mpath_attrs[], the sysfs implementation does not + * invoke the multipath_sysfs_group_visible() method. As a result, we would not + * be able to control the visibility of the multipath sysfs node. + */ +static struct attribute dummy_attr = { + .name = "dummy", +}; + +static struct attribute *nvme_ns_mpath_attrs[] = { + &dummy_attr, + NULL, +}; + +static bool multipath_sysfs_group_visible(struct kobject *kobj) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + return nvme_disk_is_ns_head(dev_to_disk(dev)); +} + +static bool multipath_sysfs_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + return false; +} + +DEFINE_SYSFS_GROUP_VISIBLE(multipath_sysfs) + +const struct attribute_group nvme_ns_mpath_attr_group = { + .name = "multipath", + .attrs = nvme_ns_mpath_attrs, + .is_visible = SYSFS_GROUP_VISIBLE(multipath_sysfs), +}; +#endif + const struct attribute_group *nvme_ns_attr_groups[] = { &nvme_ns_attr_group, +#ifdef CONFIG_NVME_MULTIPATH + &nvme_ns_mpath_attr_group, +#endif NULL, }; @@ -780,10 +835,10 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_tls_key.attr && - !ctrl->opts->tls) + !ctrl->opts->tls && !ctrl->opts->concat) return 0; if (a == &dev_attr_tls_configured_key.attr && - !ctrl->opts->tls_key) + (!ctrl->opts->tls_key || ctrl->opts->concat)) return 0; if (a == &dev_attr_tls_keyring.attr && !ctrl->opts->keyring) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 327f3f2f5399..9233f088fac8 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -8,7 +8,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/key.h> +#include <linux/crc32.h> #include <linux/nvme-tcp.h> #include <linux/nvme-keyring.h> #include <net/sock.h> @@ -17,7 +17,6 @@ #include <net/tls_prot.h> #include <net/handshake.h> #include <linux/blk-mq.h> -#include <crypto/hash.h> #include <net/busy_poll.h> #include <trace/events/sock.h> @@ -169,8 +168,8 @@ struct nvme_tcp_queue { bool hdr_digest; bool data_digest; bool tls_enabled; - struct ahash_request *rcv_hash; - struct ahash_request *snd_hash; + u32 rcv_crc; + u32 snd_crc; __le32 exp_ddgst; __le32 recv_ddgst; struct completion tls_complete; @@ -249,7 +248,7 @@ static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) return 0; - return ctrl->opts->tls; + return ctrl->opts->tls || ctrl->opts->concat; } static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) @@ -404,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync, bool last) + bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; @@ -418,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, * are on the same cpu, so we don't introduce contention. */ if (queue->io_cpu == raw_smp_processor_id() && - sync && empty && mutex_trylock(&queue->send_mutex)) { + empty && mutex_trylock(&queue->send_mutex)) { nvme_tcp_send_all(queue); mutex_unlock(&queue->send_mutex); } @@ -453,36 +452,43 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) return NULL; } - list_del(&req->entry); + list_del_init(&req->entry); + init_llist_node(&req->lentry); return req; } -static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, - __le32 *dgst) +#define NVME_TCP_CRC_SEED (~0) + +static inline void nvme_tcp_ddgst_update(u32 *crcp, + struct page *page, size_t off, size_t len) { - ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); - crypto_ahash_final(hash); + page += off / PAGE_SIZE; + off %= PAGE_SIZE; + while (len) { + const void *vaddr = kmap_local_page(page); + size_t n = min(len, (size_t)PAGE_SIZE - off); + + *crcp = crc32c(*crcp, vaddr + off, n); + kunmap_local(vaddr); + page++; + off = 0; + len -= n; + } } -static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, - struct page *page, off_t off, size_t len) +static inline __le32 nvme_tcp_ddgst_final(u32 crc) { - struct scatterlist sg; - - sg_init_table(&sg, 1); - sg_set_page(&sg, page, len, off); - ahash_request_set_crypt(hash, &sg, NULL, len); - crypto_ahash_update(hash); + return cpu_to_le32(~crc); } -static inline void nvme_tcp_hdgst(struct ahash_request *hash, - void *pdu, size_t len) +static inline __le32 nvme_tcp_hdgst(const void *pdu, size_t len) { - struct scatterlist sg; + return cpu_to_le32(~crc32c(NVME_TCP_CRC_SEED, pdu, len)); +} - sg_init_one(&sg, pdu, len); - ahash_request_set_crypt(hash, &sg, pdu + len, len); - crypto_ahash_digest(hash); +static inline void nvme_tcp_set_hdgst(void *pdu, size_t len) +{ + *(__le32 *)(pdu + len) = nvme_tcp_hdgst(pdu, len); } static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, @@ -500,8 +506,7 @@ static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, } recv_digest = *(__le32 *)(pdu + hdr->hlen); - nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); - exp_digest = *(__le32 *)(pdu + hdr->hlen); + exp_digest = nvme_tcp_hdgst(pdu, pdu_len); if (recv_digest != exp_digest) { dev_err(queue->ctrl->ctrl.device, "header digest error: recv %#x expected %#x\n", @@ -527,7 +532,7 @@ static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) nvme_tcp_queue_id(queue)); return -EPROTO; } - crypto_ahash_init(queue->rcv_hash); + queue->rcv_crc = NVME_TCP_CRC_SEED; return 0; } @@ -561,6 +566,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, req->queue = queue; nvme_req(rq)->ctrl = &ctrl->ctrl; nvme_req(rq)->cmd = &pdu->cmd; + init_llist_node(&req->lentry); + INIT_LIST_HEAD(&req->entry); return 0; } @@ -765,13 +772,23 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, return -EPROTO; } + if (llist_on_list(&req->lentry) || + !list_empty(&req->entry)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t while processing request\n", + rq->tag); + return -EPROTO; + } + req->pdu_len = 0; req->h2cdata_left = r2t_length; req->h2cdata_offset = r2t_offset; req->ttag = pdu->ttag; nvme_tcp_setup_h2c_data_pdu(req); - nvme_tcp_queue_request(req, false, true); + + llist_add(&req->lentry, &queue->req_list); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); return 0; } @@ -927,8 +944,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, iov_iter_count(&req->iter)); if (queue->data_digest) - ret = skb_copy_and_hash_datagram_iter(skb, *offset, - &req->iter, recv_len, queue->rcv_hash); + ret = skb_copy_and_crc32c_datagram_iter(skb, *offset, + &req->iter, recv_len, &queue->rcv_crc); else ret = skb_copy_datagram_iter(skb, *offset, &req->iter, recv_len); @@ -946,7 +963,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, if (!queue->data_remaining) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); + queue->exp_ddgst = nvme_tcp_ddgst_final(queue->rcv_crc); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { @@ -1148,7 +1165,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) return ret; if (queue->data_digest) - nvme_tcp_ddgst_update(queue->snd_hash, page, + nvme_tcp_ddgst_update(&queue->snd_crc, page, offset, ret); /* @@ -1162,8 +1179,8 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) /* fully successful last send in current PDU */ if (last && ret == len) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->snd_hash, - &req->ddgst); + req->ddgst = + nvme_tcp_ddgst_final(queue->snd_crc); req->state = NVME_TCP_SEND_DDGST; req->offset = 0; } else { @@ -1195,7 +1212,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) msg.msg_flags |= MSG_EOR; if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); bvec_set_virt(&bvec, (void *)pdu + req->offset, len); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); @@ -1208,7 +1225,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) if (inline_data) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); + queue->snd_crc = NVME_TCP_CRC_SEED; } else { nvme_tcp_done_send_req(queue); } @@ -1230,7 +1247,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) int ret; if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); if (!req->h2cdata_left) msg.msg_flags |= MSG_SPLICE_PAGES; @@ -1245,7 +1262,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) if (!len) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); + queue->snd_crc = NVME_TCP_CRC_SEED; return 1; } req->offset += ret; @@ -1349,7 +1366,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) queue->nr_cqe = 0; consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); release_sock(sk); - return consumed; + return consumed == -EAGAIN ? 0 : consumed; } static void nvme_tcp_io_work(struct work_struct *w) @@ -1377,6 +1394,11 @@ static void nvme_tcp_io_work(struct work_struct *w) else if (unlikely(result < 0)) return; + /* did we get some space after spending time in recv? */ + if (nvme_tcp_queue_has_pending(queue) && + sk_stream_is_writeable(queue->sock->sk)) + pending = true; + if (!pending || !queue->rd_enabled) return; @@ -1385,41 +1407,6 @@ static void nvme_tcp_io_work(struct work_struct *w) queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } -static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); - - ahash_request_free(queue->rcv_hash); - ahash_request_free(queue->snd_hash); - crypto_free_ahash(tfm); -} - -static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm; - - tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->snd_hash) - goto free_tfm; - ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); - - queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->rcv_hash) - goto free_snd_hash; - ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); - - return 0; -free_snd_hash: - ahash_request_free(queue->snd_hash); -free_tfm: - crypto_free_ahash(tfm); - return -ENOMEM; -} - static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) { struct nvme_tcp_request *async = &ctrl->async_req; @@ -1452,9 +1439,6 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); - page_frag_cache_drain(&queue->pf_cache); noreclaim_flag = memalloc_noreclaim_save(); @@ -1761,9 +1745,14 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, qid, ret); tls_handshake_cancel(queue->sock->sk); } else { - dev_dbg(nctrl->device, - "queue %d: TLS handshake complete, error %d\n", - qid, queue->tls_err); + if (queue->tls_err) { + dev_err(nctrl->device, + "queue %d: TLS handshake complete, error %d\n", + qid, queue->tls_err); + } else { + dev_dbg(nctrl->device, + "queue %d: TLS handshake complete\n", qid); + } ret = queue->tls_err; } return ret; @@ -1790,7 +1779,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; - ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + ret = sock_create_kern(current->nsproxy->net_ns, + ctrl->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &queue->sock); if (ret) { dev_err(nctrl->device, @@ -1803,6 +1793,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, ret = PTR_ERR(sock_file); goto err_destroy_mutex; } + + sk_net_refcnt_upgrade(queue->sock->sk); nvme_tcp_reclassify_socket(queue->sock); /* Single syn retry */ @@ -1865,21 +1857,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, queue->hdr_digest = nctrl->opts->hdr_digest; queue->data_digest = nctrl->opts->data_digest; - if (queue->hdr_digest || queue->data_digest) { - ret = nvme_tcp_alloc_crypto(queue); - if (ret) { - dev_err(nctrl->device, - "failed to allocate queue %d crypto\n", qid); - goto err_sock; - } - } rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + nvme_tcp_hdgst_len(queue); queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); if (!queue->pdu) { ret = -ENOMEM; - goto err_crypto; + goto err_sock; } dev_dbg(nctrl->device, "connecting queue %d\n", @@ -1912,9 +1896,6 @@ err_init_connect: kernel_sock_shutdown(queue->sock, SHUT_RDWR); err_rcv_pdu: kfree(queue->pdu); -err_crypto: - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); err_sock: /* ->sock will be released by fput() */ fput(queue->sock->file); @@ -1944,7 +1925,7 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) cancel_work_sync(&queue->io_work); } -static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; @@ -1963,6 +1944,31 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) mutex_unlock(&queue->queue_lock); } +static void nvme_tcp_wait_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + int timeout = 100; + + while (timeout > 0) { + if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags) || + !sk_wmem_alloc_get(queue->sock->sk)) + return; + msleep(2); + timeout -= 2; + } + dev_warn(nctrl->device, + "qid %d: timeout draining sock wmem allocation expired\n", + qid); +} + +static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +{ + nvme_tcp_stop_queue_nowait(nctrl, qid); + nvme_tcp_wait_queue(nctrl, qid); +} + + static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue) { write_lock_bh(&queue->sock->sk->sk_callback_lock); @@ -2030,7 +2036,9 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) int i; for (i = 1; i < ctrl->queue_count; i++) - nvme_tcp_stop_queue(ctrl, i); + nvme_tcp_stop_queue_nowait(ctrl, i); + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_wait_queue(ctrl, i); } static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl, @@ -2060,7 +2068,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) if (nvme_tcp_tls_configured(ctrl)) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); - else { + else if (ctrl->opts->tls) { pskid = nvme_tls_psk_default(ctrl->opts->keyring, ctrl->opts->host->nqn, ctrl->opts->subsysnqn); @@ -2090,9 +2098,25 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; - if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) { - dev_err(ctrl->device, "no PSK negotiated\n"); - return -ENOKEY; + if (nvme_tcp_tls_configured(ctrl)) { + if (ctrl->opts->concat) { + /* + * The generated PSK is stored in the + * fabric options + */ + if (!ctrl->opts->tls_key) { + dev_err(ctrl->device, "no PSK generated\n"); + return -ENOKEY; + } + if (ctrl->tls_pskid && + ctrl->tls_pskid != key_serial(ctrl->opts->tls_key)) { + dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } + } else if (!ctrl->tls_pskid) { + dev_err(ctrl->device, "no PSK negotiated\n"); + return -ENOKEY; + } } for (i = 1; i < ctrl->queue_count; i++) { @@ -2310,6 +2334,27 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, } } +/* + * The TLS key is set by secure concatenation after negotiation has been + * completed on the admin queue. We need to revoke the key when: + * - concatenation is enabled (otherwise it's a static key set by the user) + * and + * - the generated key is present in ctrl->tls_key (otherwise there's nothing + * to revoke) + * and + * - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS + * negotiation has not run). + * + * We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called + * twice during secure concatenation, once on a 'normal' connection to run the + * DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set), + * and once after the negotiation (which uses the key, so it _must_ be set). + */ +static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid; +} + static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; @@ -2319,6 +2364,16 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (ret) return ret; + if (ctrl->opts->concat && !ctrl->tls_pskid) { + /* See comments for nvme_tcp_key_revoke_needed() */ + dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n"); + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + ret = nvme_tcp_configure_admin_queue(ctrl, false); + if (ret) + goto destroy_admin; + } + if (ctrl->icdoff) { ret = -EOPNOTSUPP; dev_err(ctrl->device, "icdoff is not supported!\n"); @@ -2415,6 +2470,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); @@ -2455,6 +2512,8 @@ static void nvme_reset_ctrl_work(struct work_struct *work) container_of(work, struct nvme_ctrl, reset_work); int ret; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_ctrl(ctrl); nvme_tcp_teardown_ctrl(ctrl, false); @@ -2556,8 +2615,10 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.offset = 0; ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; + init_llist_node(&ctrl->async_req.lentry); + INIT_LIST_HEAD(&ctrl->async_req.entry); - nvme_tcp_queue_request(&ctrl->async_req, true, true); + nvme_tcp_queue_request(&ctrl->async_req, true); } static void nvme_tcp_complete_timed_out(struct request *rq) @@ -2709,7 +2770,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_start_request(rq); - nvme_tcp_queue_request(req, true, bd->last); + nvme_tcp_queue_request(req, bd->last); return BLK_STS_OK; } @@ -2951,7 +3012,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS | - NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY, + NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT, .create_ctrl = nvme_tcp_create_ctrl, }; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 382949e18c6a..cce4c5b55aa9 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -146,17 +146,16 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, return NULL; } -static int nvme_zone_parse_entry(struct nvme_ctrl *ctrl, - struct nvme_ns_head *head, +static int nvme_zone_parse_entry(struct nvme_ns *ns, struct nvme_zone_descriptor *entry, unsigned int idx, report_zones_cb cb, void *data) { + struct nvme_ns_head *head = ns->head; struct blk_zone zone = { }; if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { - dev_err(ctrl->device, "invalid zone type %#x\n", - entry->zt); + dev_err(ns->ctrl->device, "invalid zone type %#x\n", entry->zt); return -EINVAL; } @@ -213,8 +212,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, break; for (i = 0; i < nz && zone_idx < nr_zones; i++) { - ret = nvme_zone_parse_entry(ns->ctrl, ns->head, - &report->entries[i], + ret = nvme_zone_parse_entry(ns, &report->entries[i], zone_idx, cb, data); if (ret) goto out_free; diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index fb7446d6d682..4904097dfd49 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -3,7 +3,7 @@ config NVME_TARGET tristate "NVMe Target support" depends on BLOCK - depends on CONFIGFS_FS + select CONFIGFS_FS select NVME_KEYRING if NVME_TARGET_TCP_TLS select KEYS if NVME_TARGET_TCP_TLS select SGL_ALLOC @@ -98,6 +98,7 @@ config NVME_TARGET_TCP_TLS bool "NVMe over Fabrics TCP target TLS encryption support" depends on NVME_TARGET_TCP select NET_HANDSHAKE + select TLS help Enables TLS encryption for the NVMe TCP target using the netlink handshake API. diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index acc138bbf8f2..3e378153a781 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -63,14 +63,9 @@ static void nvmet_execute_create_sq(struct nvmet_req *req) if (status != NVME_SC_SUCCESS) goto complete; - /* - * Note: The NVMe specification allows multiple SQs to use the same CQ. - * However, the target code does not really support that. So for now, - * prevent this and fail the command if sqid and cqid are different. - */ - if (!cqid || cqid != sqid) { - pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid); - status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR; + status = nvmet_check_io_cqid(ctrl, cqid, false); + if (status != NVME_SC_SUCCESS) { + pr_err("SQ %u: Invalid CQID %u\n", sqid, cqid); goto complete; } @@ -79,7 +74,7 @@ static void nvmet_execute_create_sq(struct nvmet_req *req) goto complete; } - status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1); + status = ctrl->ops->create_sq(ctrl, sqid, cqid, sq_flags, qsize, prp1); complete: nvmet_req_complete(req, status); @@ -96,14 +91,15 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req) goto complete; } - if (!cqid) { - status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; + status = nvmet_check_io_cqid(ctrl, cqid, false); + if (status != NVME_SC_SUCCESS) goto complete; - } - status = nvmet_check_cqid(ctrl, cqid); - if (status != NVME_SC_SUCCESS) + if (!ctrl->cqs[cqid] || nvmet_cq_in_use(ctrl->cqs[cqid])) { + /* Some SQs are still using this CQ */ + status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; goto complete; + } status = ctrl->ops->delete_cq(ctrl, cqid); @@ -127,12 +123,7 @@ static void nvmet_execute_create_cq(struct nvmet_req *req) goto complete; } - if (!cqid) { - status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; - goto complete; - } - - status = nvmet_check_cqid(ctrl, cqid); + status = nvmet_check_io_cqid(ctrl, cqid, true); if (status != NVME_SC_SUCCESS) goto complete; @@ -1174,7 +1165,7 @@ static void nvmet_execute_identify(struct nvmet_req *req) * A "minimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. We couldn't really * do a useful abort, so don't bother even with waiting for the command - * to be exectuted and return immediately telling the command to abort + * to be executed and return immediately telling the command to abort * wasn't found. */ static void nvmet_execute_abort(struct nvmet_req *req) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index b47d675232d2..b340380f3892 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/random.h> #include <linux/nvme-auth.h> +#include <linux/nvme-keyring.h> #include <linux/unaligned.h> #include "nvmet.h" @@ -139,7 +140,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id) return ret; } -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { int ret = 0; struct nvmet_host_link *p; @@ -165,6 +166,11 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) goto out_unlock; } + if (nvmet_queue_tls_keyid(sq)) { + pr_debug("host %s tls enabled\n", ctrl->hostnqn); + goto out_unlock; + } + ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id); if (ret < 0) { pr_warn("Failed to setup DH group"); @@ -233,6 +239,9 @@ out_unlock: void nvmet_auth_sq_free(struct nvmet_sq *sq) { cancel_delayed_work(&sq->auth_expired_work); +#ifdef CONFIG_NVME_TARGET_TCP_TLS + sq->tls_key = NULL; +#endif kfree(sq->dhchap_c1); sq->dhchap_c1 = NULL; kfree(sq->dhchap_c2); @@ -261,13 +270,22 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) nvme_auth_free_key(ctrl->ctrl_key); ctrl->ctrl_key = NULL; } +#ifdef CONFIG_NVME_TARGET_TCP_TLS + if (ctrl->tls_key) { + key_put(ctrl->tls_key); + ctrl->tls_key = NULL; + } +#endif } bool nvmet_check_auth_status(struct nvmet_req *req) { - if (req->sq->ctrl->host_key && - !req->sq->authenticated) - return false; + if (req->sq->ctrl->host_key) { + if (req->sq->qid > 0) + return true; + if (!req->sq->authenticated) + return false; + } return true; } @@ -275,7 +293,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int shash_len) { struct crypto_shash *shash_tfm; - struct shash_desc *shash; + SHASH_DESC_ON_STACK(shash, shash_tfm); struct nvmet_ctrl *ctrl = req->sq->ctrl; const char *hash_name; u8 *challenge = req->sq->dhchap_c1; @@ -327,19 +345,13 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, req->sq->dhchap_c1, challenge, shash_len); if (ret) - goto out_free_challenge; + goto out; } pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, req->sq->dhchap_tid); - shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm), - GFP_KERNEL); - if (!shash) { - ret = -ENOMEM; - goto out_free_challenge; - } shash->tfm = shash_tfm; ret = crypto_shash_init(shash); if (ret) @@ -374,8 +386,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, goto out; ret = crypto_shash_final(shash, response); out: - kfree(shash); -out_free_challenge: if (challenge != req->sq->dhchap_c1) kfree(challenge); out_free_response: @@ -542,3 +552,57 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, return ret; } + +void nvmet_auth_insert_psk(struct nvmet_sq *sq) +{ + int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id); + u8 *psk, *digest, *tls_psk; + size_t psk_len; + int ret; +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key = NULL; +#endif + + ret = nvme_auth_generate_psk(sq->ctrl->shash_id, + sq->dhchap_skey, + sq->dhchap_skey_len, + sq->dhchap_c1, sq->dhchap_c2, + hash_len, &psk, &psk_len); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to generate PSK, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + return; + } + ret = nvme_auth_generate_digest(sq->ctrl->shash_id, psk, psk_len, + sq->ctrl->subsysnqn, + sq->ctrl->hostnqn, &digest); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to generate digest, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + goto out_free_psk; + } + ret = nvme_auth_derive_tls_psk(sq->ctrl->shash_id, psk, psk_len, + digest, &tls_psk); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to derive TLS PSK, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + goto out_free_digest; + } +#ifdef CONFIG_NVME_TARGET_TCP_TLS + tls_key = nvme_tls_psk_refresh(NULL, sq->ctrl->hostnqn, sq->ctrl->subsysnqn, + sq->ctrl->shash_id, tls_psk, psk_len, digest); + if (IS_ERR(tls_key)) { + pr_warn("%s: ctrl %d qid %d failed to refresh key, error %ld\n", + __func__, sq->ctrl->cntlid, sq->qid, PTR_ERR(tls_key)); + tls_key = NULL; + } + if (sq->ctrl->tls_key) + key_put(sq->ctrl->tls_key); + sq->ctrl->tls_key = tls_key; +#endif + kfree_sensitive(tls_psk); +out_free_digest: + kfree_sensitive(digest); +out_free_psk: + kfree_sensitive(psk); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2e741696f371..491df044635f 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -62,14 +62,7 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) return NVME_SC_LBA_RANGE | NVME_STATUS_DNR; case -EOPNOTSUPP: req->error_loc = offsetof(struct nvme_common_command, opcode); - switch (req->cmd->common.opcode) { - case nvme_cmd_dsm: - case nvme_cmd_write_zeroes: - return NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR; - default: - return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; - } - break; + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; case -ENODATA: req->error_loc = offsetof(struct nvme_rw_command, nsid); return NVME_SC_ACCESS_DENIED; @@ -324,6 +317,9 @@ int nvmet_enable_port(struct nvmet_port *port) lockdep_assert_held(&nvmet_config_sem); + if (port->disc_addr.trtype == NVMF_TRTYPE_MAX) + return -EINVAL; + ops = nvmet_transports[port->disc_addr.trtype]; if (!ops) { up_write(&nvmet_config_sem); @@ -648,7 +644,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) * Now that we removed the namespaces from the lookup list, we * can kill the per_cpu ref and wait for any remaining references * to be dropped, as well as a RCU grace period for anyone only - * using the namepace under rcu_read_lock(). Note that we can't + * using the namespace under rcu_read_lock(). Note that we can't * use call_rcu here as we need to ensure the namespaces have * been fully destroyed before unloading the module. */ @@ -810,11 +806,43 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status) } EXPORT_SYMBOL_GPL(nvmet_req_complete); +void nvmet_cq_init(struct nvmet_cq *cq) +{ + refcount_set(&cq->ref, 1); +} +EXPORT_SYMBOL_GPL(nvmet_cq_init); + +bool nvmet_cq_get(struct nvmet_cq *cq) +{ + return refcount_inc_not_zero(&cq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_cq_get); + +void nvmet_cq_put(struct nvmet_cq *cq) +{ + if (refcount_dec_and_test(&cq->ref)) + nvmet_cq_destroy(cq); +} +EXPORT_SYMBOL_GPL(nvmet_cq_put); + void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size) { cq->qid = qid; cq->size = size; + + ctrl->cqs[qid] = cq; +} + +void nvmet_cq_destroy(struct nvmet_cq *cq) +{ + struct nvmet_ctrl *ctrl = cq->ctrl; + + if (ctrl) { + ctrl->cqs[cq->qid] = NULL; + nvmet_ctrl_put(cq->ctrl); + cq->ctrl = NULL; + } } void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, @@ -834,37 +862,47 @@ static void nvmet_confirm_sq(struct percpu_ref *ref) complete(&sq->confirm_done); } -u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid) +u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create) { - if (!ctrl->sqs) + if (!ctrl->cqs) return NVME_SC_INTERNAL | NVME_STATUS_DNR; if (cqid > ctrl->subsys->max_qid) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; - /* - * Note: For PCI controllers, the NVMe specifications allows multiple - * SQs to share a single CQ. However, we do not support this yet, so - * check that there is no SQ defined for a CQ. If one exist, then the - * CQ ID is invalid for creation as well as when the CQ is being - * deleted (as that would mean that the SQ was not deleted before the - * CQ). - */ - if (ctrl->sqs[cqid]) + if ((create && ctrl->cqs[cqid]) || (!create && !ctrl->cqs[cqid])) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; return NVME_SC_SUCCESS; } +u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create) +{ + if (!cqid) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + return nvmet_check_cqid(ctrl, cqid, create); +} + +bool nvmet_cq_in_use(struct nvmet_cq *cq) +{ + return refcount_read(&cq->ref) > 1; +} +EXPORT_SYMBOL_GPL(nvmet_cq_in_use); + u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size) { u16 status; - status = nvmet_check_cqid(ctrl, qid); + status = nvmet_check_cqid(ctrl, qid, true); if (status != NVME_SC_SUCCESS) return status; + if (!kref_get_unless_zero(&ctrl->ref)) + return NVME_SC_INTERNAL | NVME_STATUS_DNR; + cq->ctrl = ctrl; + + nvmet_cq_init(cq); nvmet_cq_setup(ctrl, cq, qid, size); return NVME_SC_SUCCESS; @@ -888,7 +926,7 @@ u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, } u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, - u16 sqid, u16 size) + struct nvmet_cq *cq, u16 sqid, u16 size) { u16 status; int ret; @@ -900,7 +938,7 @@ u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, if (status != NVME_SC_SUCCESS) return status; - ret = nvmet_sq_init(sq); + ret = nvmet_sq_init(sq, cq); if (ret) { status = NVME_SC_INTERNAL | NVME_STATUS_DNR; goto ctrl_put; @@ -932,6 +970,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) wait_for_completion(&sq->free_done); percpu_ref_exit(&sq->ref); nvmet_auth_sq_free(sq); + nvmet_cq_put(sq->cq); /* * we must reference the ctrl again after waiting for inflight IO @@ -964,18 +1003,23 @@ static void nvmet_sq_free(struct percpu_ref *ref) complete(&sq->free_done); } -int nvmet_sq_init(struct nvmet_sq *sq) +int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq) { int ret; + if (!nvmet_cq_get(cq)) + return -EINVAL; + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); if (ret) { pr_err("percpu_ref init failed!\n"); + nvmet_cq_put(cq); return ret; } init_completion(&sq->free_done); init_completion(&sq->confirm_done); nvmet_auth_sq_init(sq); + sq->cq = cq; return 0; } @@ -1105,13 +1149,13 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) return ret; } -bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, - struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops) +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq, + const struct nvmet_fabrics_ops *ops) { u8 flags = req->cmd->common.flags; u16 status; - req->cq = cq; + req->cq = sq->cq; req->sq = sq; req->ops = ops; req->sg = NULL; @@ -1609,17 +1653,20 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) if (!ctrl->sqs) goto out_free_changed_ns_list; + ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *), + GFP_KERNEL); + if (!ctrl->cqs) + goto out_free_sqs; + ret = ida_alloc_range(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; - goto out_free_sqs; + goto out_free_cqs; } ctrl->cntlid = ret; - uuid_copy(&ctrl->hostid, args->hostid); - /* * Discovery controllers may use some arbitrary high value * in order to cleanup stale discovery sessions @@ -1647,7 +1694,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) if (args->hostid) uuid_copy(&ctrl->hostid, args->hostid); - dhchap_status = nvmet_setup_auth(ctrl); + dhchap_status = nvmet_setup_auth(ctrl, args->sq); if (dhchap_status) { pr_err("Failed to setup authentication, dhchap status %u\n", dhchap_status); @@ -1662,11 +1709,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) args->status = NVME_SC_SUCCESS; - pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s.\n", + pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s%s.\n", nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, ctrl->pi_support ? " T10-PI is enabled" : "", - nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : ""); + nvmet_has_auth(ctrl, args->sq) ? " with DH-HMAC-CHAP" : "", + nvmet_queue_tls_keyid(args->sq) ? ", TLS" : ""); return ctrl; @@ -1674,6 +1722,8 @@ init_pr_fail: mutex_unlock(&subsys->lock); nvmet_stop_keep_alive_timer(ctrl); ida_free(&cntlid_ida, ctrl->cntlid); +out_free_cqs: + kfree(ctrl->cqs); out_free_sqs: kfree(ctrl->sqs); out_free_changed_ns_list: @@ -1710,6 +1760,7 @@ static void nvmet_ctrl_free(struct kref *ref) nvmet_async_events_free(ctrl); kfree(ctrl->sqs); + kfree(ctrl->cqs); kfree(ctrl->changed_ns_list); kfree(ctrl); @@ -1911,24 +1962,24 @@ static int __init nvmet_init(void) if (!nvmet_wq) goto out_free_buffered_work_queue; - error = nvmet_init_discovery(); + error = nvmet_init_debugfs(); if (error) goto out_free_nvmet_work_queue; - error = nvmet_init_debugfs(); + error = nvmet_init_discovery(); if (error) - goto out_exit_discovery; + goto out_exit_debugfs; error = nvmet_init_configfs(); if (error) - goto out_exit_debugfs; + goto out_exit_discovery; return 0; -out_exit_debugfs: - nvmet_exit_debugfs(); out_exit_discovery: nvmet_exit_discovery(); +out_exit_debugfs: + nvmet_exit_debugfs(); out_free_nvmet_work_queue: destroy_workqueue(nvmet_wq); out_free_buffered_work_queue: @@ -1943,8 +1994,8 @@ out_destroy_bvec_cache: static void __exit nvmet_exit(void) { nvmet_exit_configfs(); - nvmet_exit_debugfs(); nvmet_exit_discovery(); + nvmet_exit_debugfs(); ida_destroy(&cntlid_ida); destroy_workqueue(nvmet_wq); destroy_workqueue(buffered_io_wq); diff --git a/drivers/nvme/target/debugfs.c b/drivers/nvme/target/debugfs.c index c6571fbd35e3..5dcbd5aa86e1 100644 --- a/drivers/nvme/target/debugfs.c +++ b/drivers/nvme/target/debugfs.c @@ -132,6 +132,27 @@ static int nvmet_ctrl_host_traddr_show(struct seq_file *m, void *p) } NVMET_DEBUGFS_ATTR(nvmet_ctrl_host_traddr); +#ifdef CONFIG_NVME_TARGET_TCP_TLS +static int nvmet_ctrl_tls_key_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + key_serial_t keyid = nvmet_queue_tls_keyid(ctrl->sqs[0]); + + seq_printf(m, "%08x\n", keyid); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_key); + +static int nvmet_ctrl_tls_concat_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + + seq_printf(m, "%d\n", ctrl->concat); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_concat); +#endif + int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) { char name[32]; @@ -157,6 +178,12 @@ int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) &nvmet_ctrl_state_fops); debugfs_create_file("host_traddr", S_IRUSR, ctrl->debugfs_dir, ctrl, &nvmet_ctrl_host_traddr_fops); +#ifdef CONFIG_NVME_TARGET_TCP_TLS + debugfs_create_file("tls_concat", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_tls_concat_fops); + debugfs_create_file("tls_key", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_tls_key_fops); +#endif return 0; } diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index df7207640506..c06f3e04296c 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -119,7 +119,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE); memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); - strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); + strscpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); } /* diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 2022757f08dc..bf01ec414c55 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -43,8 +43,26 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) data->auth_protocol[0].dhchap.halen, data->auth_protocol[0].dhchap.dhlen); req->sq->dhchap_tid = le16_to_cpu(data->t_id); - if (data->sc_c) - return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + if (data->sc_c != NVME_AUTH_SECP_NOSC) { + if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + /* Secure concatenation can only be enabled on the admin queue */ + if (req->sq->qid) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + switch (data->sc_c) { + case NVME_AUTH_SECP_NEWTLSPSK: + if (nvmet_queue_tls_keyid(req->sq)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + break; + case NVME_AUTH_SECP_REPLACETLSPSK: + if (!nvmet_queue_tls_keyid(req->sq)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + break; + default: + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + } + ctrl->concat = true; + } if (data->napd != 1) return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; @@ -103,6 +121,12 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) nvme_auth_dhgroup_name(fallback_dhgid)); ctrl->dh_gid = fallback_dhgid; } + if (ctrl->dh_gid == NVME_AUTH_DHGROUP_NULL && ctrl->concat) { + pr_debug("%s: ctrl %d qid %d: NULL DH group invalid " + "for secure channel concatenation\n", __func__, + ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + } pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n", __func__, ctrl->cntlid, req->sq->qid, nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid); @@ -148,12 +172,22 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d) if (memcmp(data->rval, response, data->hl)) { pr_info("ctrl %d qid %d host response mismatch\n", ctrl->cntlid, req->sq->qid); + pr_debug("ctrl %d qid %d rval %*ph\n", + ctrl->cntlid, req->sq->qid, data->hl, data->rval); + pr_debug("ctrl %d qid %d response %*ph\n", + ctrl->cntlid, req->sq->qid, data->hl, response); kfree(response); return NVME_AUTH_DHCHAP_FAILURE_FAILED; } kfree(response); pr_debug("%s: ctrl %d qid %d host authenticated\n", __func__, ctrl->cntlid, req->sq->qid); + if (!data->cvalid && ctrl->concat) { + pr_debug("%s: ctrl %d qid %d invalid challenge\n", + __func__, ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + } + req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); if (data->cvalid) { req->sq->dhchap_c2 = kmemdup(data->rval + data->hl, data->hl, GFP_KERNEL); @@ -163,11 +197,23 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d) pr_debug("%s: ctrl %d qid %d challenge %*ph\n", __func__, ctrl->cntlid, req->sq->qid, data->hl, req->sq->dhchap_c2); - } else { + } + /* + * NVMe Base Spec 2.2 section 8.3.4.5.4: DH-HMAC-CHAP_Reply message + * Sequence Number (SEQNUM): [ .. ] + * The value 0h is used to indicate that bidirectional authentication + * is not performed, but a challenge value C2 is carried in order to + * generate a pre-shared key (PSK) for subsequent establishment of a + * secure channel. + */ + if (req->sq->dhchap_s2 == 0) { + if (ctrl->concat) + nvmet_auth_insert_psk(req->sq); req->sq->authenticated = true; + kfree(req->sq->dhchap_c2); req->sq->dhchap_c2 = NULL; - } - req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); + } else if (!data->cvalid) + req->sq->authenticated = true; return 0; } @@ -246,7 +292,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req) pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__, ctrl->cntlid, req->sq->qid); if (!req->sq->qid) { - dhchap_status = nvmet_setup_auth(ctrl); + dhchap_status = nvmet_setup_auth(ctrl, req->sq); if (dhchap_status) { pr_err("ctrl %d qid 0 failed to setup re-authentication\n", ctrl->cntlid); @@ -303,6 +349,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req) } goto done_kfree; case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2: + if (ctrl->concat) + nvmet_auth_insert_psk(req->sq); req->sq->authenticated = true; pr_debug("%s: ctrl %d qid %d ctrl authenticated\n", __func__, ctrl->cntlid, req->sq->qid); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index eb406c90c167..7b8d8b397802 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -208,6 +208,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; } + kref_get(&ctrl->ref); + old = cmpxchg(&req->cq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; + } + /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); @@ -234,10 +242,26 @@ err: return ret; } -static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl) +static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { + bool needs_auth = nvmet_has_auth(ctrl, sq); + key_serial_t keyid = nvmet_queue_tls_keyid(sq); + + /* Do not authenticate I/O queues */ + if (sq->qid) + needs_auth = false; + + if (keyid) + pr_debug("%s: ctrl %d qid %d should %sauthenticate, tls psk %08x\n", + __func__, ctrl->cntlid, sq->qid, + needs_auth ? "" : "not ", keyid); + else + pr_debug("%s: ctrl %d qid %d should %sauthenticate%s\n", + __func__, ctrl->cntlid, sq->qid, + needs_auth ? "" : "not ", + ctrl->concat ? ", secure concatenation" : ""); return (u32)ctrl->cntlid | - (nvmet_has_auth(ctrl) ? NVME_CONNECT_AUTHREQ_ATR : 0); + (needs_auth ? NVME_CONNECT_AUTHREQ_ATR : 0); } static void nvmet_execute_admin_connect(struct nvmet_req *req) @@ -247,6 +271,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) struct nvmet_ctrl *ctrl = NULL; struct nvmet_alloc_ctrl_args args = { .port = req->port, + .sq = req->sq, .ops = req->ops, .p2p_client = req->p2p_client, .kato = le32_to_cpu(c->kato), @@ -299,7 +324,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) goto out; } - args.result = cpu_to_le32(nvmet_connect_result(ctrl)); + args.result = cpu_to_le32(nvmet_connect_result(ctrl, req->sq)); out: kfree(d); complete: @@ -357,7 +382,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out_ctrl_put; pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); - req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); + req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl, req->sq)); out: kfree(d); complete: diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 7318b736d414..25598a46bf0d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -816,7 +816,8 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue); - ret = nvmet_sq_init(&queue->nvme_sq); + nvmet_cq_init(&queue->nvme_cq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) goto out_fail_iodlist; @@ -826,6 +827,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, return queue; out_fail_iodlist: + nvmet_cq_put(&queue->nvme_cq); nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue); destroy_workqueue(queue->work_q); out_free_queue: @@ -934,6 +936,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) flush_workqueue(queue->work_q); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); nvmet_fc_tgt_q_put(queue); } @@ -995,16 +998,6 @@ nvmet_fc_hostport_get(struct nvmet_fc_hostport *hostport) return kref_get_unless_zero(&hostport->ref); } -static void -nvmet_fc_free_hostport(struct nvmet_fc_hostport *hostport) -{ - /* if LLDD not implemented, leave as NULL */ - if (!hostport || !hostport->hosthandle) - return; - - nvmet_fc_hostport_put(hostport); -} - static struct nvmet_fc_hostport * nvmet_fc_match_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) { @@ -1028,33 +1021,24 @@ nvmet_fc_alloc_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) struct nvmet_fc_hostport *newhost, *match = NULL; unsigned long flags; + /* + * Caller holds a reference on tgtport. + */ + /* if LLDD not implemented, leave as NULL */ if (!hosthandle) return NULL; - /* - * take reference for what will be the newly allocated hostport if - * we end up using a new allocation - */ - if (!nvmet_fc_tgtport_get(tgtport)) - return ERR_PTR(-EINVAL); - spin_lock_irqsave(&tgtport->lock, flags); match = nvmet_fc_match_hostport(tgtport, hosthandle); spin_unlock_irqrestore(&tgtport->lock, flags); - if (match) { - /* no new allocation - release reference */ - nvmet_fc_tgtport_put(tgtport); + if (match) return match; - } newhost = kzalloc(sizeof(*newhost), GFP_KERNEL); - if (!newhost) { - /* no new allocation - release reference */ - nvmet_fc_tgtport_put(tgtport); + if (!newhost) return ERR_PTR(-ENOMEM); - } spin_lock_irqsave(&tgtport->lock, flags); match = nvmet_fc_match_hostport(tgtport, hosthandle); @@ -1063,6 +1047,7 @@ nvmet_fc_alloc_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) kfree(newhost); newhost = match; } else { + nvmet_fc_tgtport_get(tgtport); newhost->tgtport = tgtport; newhost->hosthandle = hosthandle; INIT_LIST_HEAD(&newhost->host_list); @@ -1076,20 +1061,14 @@ nvmet_fc_alloc_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) } static void -nvmet_fc_delete_assoc(struct nvmet_fc_tgt_assoc *assoc) -{ - nvmet_fc_delete_target_assoc(assoc); - nvmet_fc_tgt_a_put(assoc); -} - -static void nvmet_fc_delete_assoc_work(struct work_struct *work) { struct nvmet_fc_tgt_assoc *assoc = container_of(work, struct nvmet_fc_tgt_assoc, del_work); struct nvmet_fc_tgtport *tgtport = assoc->tgtport; - nvmet_fc_delete_assoc(assoc); + nvmet_fc_delete_target_assoc(assoc); + nvmet_fc_tgt_a_put(assoc); nvmet_fc_tgtport_put(tgtport); } @@ -1097,7 +1076,8 @@ static void nvmet_fc_schedule_delete_assoc(struct nvmet_fc_tgt_assoc *assoc) { nvmet_fc_tgtport_get(assoc->tgtport); - queue_work(nvmet_wq, &assoc->del_work); + if (!queue_work(nvmet_wq, &assoc->del_work)) + nvmet_fc_tgtport_put(assoc->tgtport); } static bool @@ -1143,6 +1123,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) goto out_ida; assoc->tgtport = tgtport; + nvmet_fc_tgtport_get(tgtport); assoc->a_id = idx; INIT_LIST_HEAD(&assoc->a_list); kref_init(&assoc->ref); @@ -1190,7 +1171,7 @@ nvmet_fc_target_assoc_free(struct kref *ref) /* Send Disconnect now that all i/o has completed */ nvmet_fc_xmt_disconnect_assoc(assoc); - nvmet_fc_free_hostport(assoc->hostport); + nvmet_fc_hostport_put(assoc->hostport); spin_lock_irqsave(&tgtport->lock, flags); oldls = assoc->rcv_disconn; spin_unlock_irqrestore(&tgtport->lock, flags); @@ -1244,6 +1225,8 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) dev_info(tgtport->dev, "{%d:%d} Association deleted\n", tgtport->fc_target_port.port_num, assoc->a_id); + + nvmet_fc_tgtport_put(tgtport); } static struct nvmet_fc_tgt_assoc * @@ -1274,6 +1257,7 @@ nvmet_fc_portentry_bind(struct nvmet_fc_tgtport *tgtport, { lockdep_assert_held(&nvmet_fc_tgtlock); + nvmet_fc_tgtport_get(tgtport); pe->tgtport = tgtport; tgtport->pe = pe; @@ -1293,8 +1277,10 @@ nvmet_fc_portentry_unbind(struct nvmet_fc_port_entry *pe) unsigned long flags; spin_lock_irqsave(&nvmet_fc_tgtlock, flags); - if (pe->tgtport) + if (pe->tgtport) { + nvmet_fc_tgtport_put(pe->tgtport); pe->tgtport->pe = NULL; + } list_del(&pe->pe_list); spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); } @@ -1312,8 +1298,10 @@ nvmet_fc_portentry_unbind_tgt(struct nvmet_fc_tgtport *tgtport) spin_lock_irqsave(&nvmet_fc_tgtlock, flags); pe = tgtport->pe; - if (pe) + if (pe) { + nvmet_fc_tgtport_put(pe->tgtport); pe->tgtport = NULL; + } tgtport->pe = NULL; spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); } @@ -1336,6 +1324,9 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport) list_for_each_entry(pe, &nvmet_fc_portentry_list, pe_list) { if (tgtport->fc_target_port.node_name == pe->node_name && tgtport->fc_target_port.port_name == pe->port_name) { + if (!nvmet_fc_tgtport_get(tgtport)) + continue; + WARN_ON(pe->tgtport); tgtport->pe = pe; pe->tgtport = tgtport; @@ -1348,7 +1339,7 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport) /** * nvmet_fc_register_targetport - transport entry point called by an * LLDD to register the existence of a local - * NVME subystem FC port. + * NVME subsystem FC port. * @pinfo: pointer to information about the port to be registered * @template: LLDD entrypoints and operational parameters for the port * @dev: physical hardware device node port corresponds to. Will be @@ -1455,11 +1446,6 @@ nvmet_fc_free_tgtport(struct kref *ref) struct nvmet_fc_tgtport *tgtport = container_of(ref, struct nvmet_fc_tgtport, ref); struct device *dev = tgtport->dev; - unsigned long flags; - - spin_lock_irqsave(&nvmet_fc_tgtlock, flags); - list_del(&tgtport->tgt_list); - spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); nvmet_fc_free_ls_iodlist(tgtport); @@ -1605,6 +1591,39 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); } +static void +nvmet_fc_free_pending_reqs(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + struct nvmet_fc_ls_iod *iod; + int i; + + iod = tgtport->iod; + for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) + cancel_work(&iod->work); + + /* + * After this point the connection is lost and thus any pending + * request can't be processed by the normal completion path. This + * is likely a request from nvmet_fc_send_ls_req_async. + */ + while ((lsop = list_first_entry_or_null(&tgtport->ls_req_list, + struct nvmet_fc_ls_req_op, lsreq_list))) { + list_del(&lsop->lsreq_list); + + if (!lsop->req_queued) + continue; + + lsreq = &lsop->ls_req; + fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); + nvmet_fc_tgtport_put(tgtport); + kfree(lsop); + } +} + /** * nvmet_fc_unregister_targetport - transport entry point called by an * LLDD to deregister/remove a previously @@ -1620,6 +1639,11 @@ int nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port) { struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_del(&tgtport->tgt_list); + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); nvmet_fc_portentry_unbind_tgt(tgtport); @@ -1628,13 +1652,7 @@ nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port) flush_workqueue(nvmet_wq); - /* - * should terminate LS's as well. However, LS's will be generated - * at the tail end of association termination, so they likely don't - * exist yet. And even if they did, it's worthwhile to just let - * them finish and targetport ref counting will clean things up. - */ - + nvmet_fc_free_pending_reqs(tgtport); nvmet_fc_tgtport_put(tgtport); return 0; @@ -2551,10 +2569,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->data_sg = NULL; fod->data_sg_cnt = 0; - ret = nvmet_req_init(&fod->req, - &fod->queue->nvme_cq, - &fod->queue->nvme_sq, - &nvmet_fc_tgt_fcp_ops); + ret = nvmet_req_init(&fod->req, &fod->queue->nvme_sq, + &nvmet_fc_tgt_fcp_ops); if (!ret) { /* bad SQE content or invalid ctrl state */ /* nvmet layer has already called op done to send rsp. */ @@ -2880,12 +2896,17 @@ nvmet_fc_add_port(struct nvmet_port *port) list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { if ((tgtport->fc_target_port.node_name == traddr.nn) && (tgtport->fc_target_port.port_name == traddr.pn)) { + if (!nvmet_fc_tgtport_get(tgtport)) + continue; + /* a FC port can only be 1 nvmet port id */ if (!tgtport->pe) { nvmet_fc_portentry_bind(tgtport, pe, port); ret = 0; } else ret = -EALREADY; + + nvmet_fc_tgtport_put(tgtport); break; } } @@ -2901,11 +2922,21 @@ static void nvmet_fc_remove_port(struct nvmet_port *port) { struct nvmet_fc_port_entry *pe = port->priv; + struct nvmet_fc_tgtport *tgtport = NULL; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport)) + tgtport = pe->tgtport; + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); nvmet_fc_portentry_unbind(pe); - /* terminate any outstanding associations */ - __nvmet_fc_free_assocs(pe->tgtport); + if (tgtport) { + /* terminate any outstanding associations */ + __nvmet_fc_free_assocs(tgtport); + nvmet_fc_tgtport_put(tgtport); + } kfree(pe); } @@ -2914,10 +2945,21 @@ static void nvmet_fc_discovery_chg(struct nvmet_port *port) { struct nvmet_fc_port_entry *pe = port->priv; - struct nvmet_fc_tgtport *tgtport = pe->tgtport; + struct nvmet_fc_tgtport *tgtport = NULL; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport)) + tgtport = pe->tgtport; + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + if (!tgtport) + return; if (tgtport && tgtport->ops->discovery_event) tgtport->ops->discovery_event(&tgtport->fc_target_port); + + nvmet_fc_tgtport_put(tgtport); } static ssize_t diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index da195d61a966..257b497d515a 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -207,13 +207,16 @@ static LIST_HEAD(fcloop_nports); struct fcloop_lport { struct nvme_fc_local_port *localport; struct list_head lport_list; - struct completion unreg_done; + refcount_t ref; }; struct fcloop_lport_priv { struct fcloop_lport *lport; }; +/* The port is already being removed, avoid double free */ +#define PORT_DELETED 0 + struct fcloop_rport { struct nvme_fc_remote_port *remoteport; struct nvmet_fc_target_port *targetport; @@ -222,6 +225,7 @@ struct fcloop_rport { spinlock_t lock; struct list_head ls_list; struct work_struct ls_work; + unsigned long flags; }; struct fcloop_tport { @@ -232,6 +236,7 @@ struct fcloop_tport { spinlock_t lock; struct list_head ls_list; struct work_struct ls_work; + unsigned long flags; }; struct fcloop_nport { @@ -239,7 +244,7 @@ struct fcloop_nport { struct fcloop_tport *tport; struct fcloop_lport *lport; struct list_head nport_list; - struct kref ref; + refcount_t ref; u64 node_name; u64 port_name; u32 port_role; @@ -274,7 +279,7 @@ struct fcloop_fcpreq { u32 inistate; bool active; bool aborted; - struct kref ref; + refcount_t ref; struct work_struct fcp_rcv_work; struct work_struct abort_rcv_work; struct work_struct tio_done_work; @@ -287,6 +292,9 @@ struct fcloop_ini_fcpreq { spinlock_t inilock; }; +/* SLAB cache for fcloop_lsreq structures */ +static struct kmem_cache *lsreq_cache; + static inline struct fcloop_lsreq * ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp) { @@ -337,6 +345,7 @@ fcloop_rport_lsrqst_work(struct work_struct *work) * callee may free memory containing tls_req. * do not reference lsreq after this. */ + kmem_cache_free(lsreq_cache, tls_req); spin_lock(&rport->lock); } @@ -348,10 +357,13 @@ fcloop_h2t_ls_req(struct nvme_fc_local_port *localport, struct nvme_fc_remote_port *remoteport, struct nvmefc_ls_req *lsreq) { - struct fcloop_lsreq *tls_req = lsreq->private; struct fcloop_rport *rport = remoteport->private; + struct fcloop_lsreq *tls_req; int ret = 0; + tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL); + if (!tls_req) + return -ENOMEM; tls_req->lsreq = lsreq; INIT_LIST_HEAD(&tls_req->ls_list); @@ -388,14 +400,17 @@ fcloop_h2t_xmt_ls_rsp(struct nvmet_fc_target_port *targetport, lsrsp->done(lsrsp); - if (remoteport) { - rport = remoteport->private; - spin_lock(&rport->lock); - list_add_tail(&tls_req->ls_list, &rport->ls_list); - spin_unlock(&rport->lock); - queue_work(nvmet_wq, &rport->ls_work); + if (!remoteport) { + kmem_cache_free(lsreq_cache, tls_req); + return 0; } + rport = remoteport->private; + spin_lock(&rport->lock); + list_add_tail(&tls_req->ls_list, &rport->ls_list); + spin_unlock(&rport->lock); + queue_work(nvmet_wq, &rport->ls_work); + return 0; } @@ -421,6 +436,7 @@ fcloop_tport_lsrqst_work(struct work_struct *work) * callee may free memory containing tls_req. * do not reference lsreq after this. */ + kmem_cache_free(lsreq_cache, tls_req); spin_lock(&tport->lock); } @@ -431,8 +447,8 @@ static int fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle, struct nvmefc_ls_req *lsreq) { - struct fcloop_lsreq *tls_req = lsreq->private; struct fcloop_tport *tport = targetport->private; + struct fcloop_lsreq *tls_req; int ret = 0; /* @@ -440,6 +456,10 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle, * hosthandle ignored as fcloop currently is * 1:1 tgtport vs remoteport */ + + tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL); + if (!tls_req) + return -ENOMEM; tls_req->lsreq = lsreq; INIT_LIST_HEAD(&tls_req->ls_list); @@ -456,6 +476,9 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle, ret = nvme_fc_rcv_ls_req(tport->remoteport, &tls_req->ls_rsp, lsreq->rqstaddr, lsreq->rqstlen); + if (ret) + kmem_cache_free(lsreq_cache, tls_req); + return ret; } @@ -470,18 +493,30 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport, struct nvmet_fc_target_port *targetport = rport->targetport; struct fcloop_tport *tport; + if (!targetport) { + /* + * The target port is gone. The target doesn't expect any + * response anymore and the ->done call is not valid + * because the resources have been freed by + * nvmet_fc_free_pending_reqs. + * + * We end up here from delete association exchange: + * nvmet_fc_xmt_disconnect_assoc sends an async request. + */ + kmem_cache_free(lsreq_cache, tls_req); + return 0; + } + memcpy(lsreq->rspaddr, lsrsp->rspbuf, ((lsreq->rsplen < lsrsp->rsplen) ? lsreq->rsplen : lsrsp->rsplen)); lsrsp->done(lsrsp); - if (targetport) { - tport = targetport->private; - spin_lock(&tport->lock); - list_add_tail(&tls_req->ls_list, &tport->ls_list); - spin_unlock(&tport->lock); - queue_work(nvmet_wq, &tport->ls_work); - } + tport = targetport->private; + spin_lock(&tport->lock); + list_add_tail(&tls_req->ls_list, &tport->ls_list); + spin_unlock(&tport->lock); + queue_work(nvmet_wq, &tport->ls_work); return 0; } @@ -534,24 +569,18 @@ fcloop_tgt_discovery_evt(struct nvmet_fc_target_port *tgtport) } static void -fcloop_tfcp_req_free(struct kref *ref) +fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req) { - struct fcloop_fcpreq *tfcp_req = - container_of(ref, struct fcloop_fcpreq, ref); + if (!refcount_dec_and_test(&tfcp_req->ref)) + return; kfree(tfcp_req); } -static void -fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req) -{ - kref_put(&tfcp_req->ref, fcloop_tfcp_req_free); -} - static int fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req) { - return kref_get_unless_zero(&tfcp_req->ref); + return refcount_inc_not_zero(&tfcp_req->ref); } static void @@ -571,7 +600,8 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, } /* release original io reference on tgt struct */ - fcloop_tfcp_req_put(tfcp_req); + if (tfcp_req) + fcloop_tfcp_req_put(tfcp_req); } static bool drop_fabric_opcode; @@ -623,12 +653,13 @@ fcloop_fcp_recv_work(struct work_struct *work) { struct fcloop_fcpreq *tfcp_req = container_of(work, struct fcloop_fcpreq, fcp_rcv_work); - struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvmefc_fcp_req *fcpreq; unsigned long flags; int ret = 0; bool aborted = false; spin_lock_irqsave(&tfcp_req->reqlock, flags); + fcpreq = tfcp_req->fcpreq; switch (tfcp_req->inistate) { case INI_IO_START: tfcp_req->inistate = INI_IO_ACTIVE; @@ -643,16 +674,19 @@ fcloop_fcp_recv_work(struct work_struct *work) } spin_unlock_irqrestore(&tfcp_req->reqlock, flags); - if (unlikely(aborted)) - ret = -ECANCELED; - else { - if (likely(!check_for_drop(tfcp_req))) - ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, - &tfcp_req->tgt_fcp_req, - fcpreq->cmdaddr, fcpreq->cmdlen); - else - pr_info("%s: dropped command ********\n", __func__); + if (unlikely(aborted)) { + /* the abort handler will call fcloop_call_host_done */ + return; + } + + if (unlikely(check_for_drop(tfcp_req))) { + pr_info("%s: dropped command ********\n", __func__); + return; } + + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req, + fcpreq->cmdaddr, fcpreq->cmdlen); if (ret) fcloop_call_host_done(fcpreq, tfcp_req, ret); } @@ -667,15 +701,17 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&tfcp_req->reqlock, flags); - fcpreq = tfcp_req->fcpreq; switch (tfcp_req->inistate) { case INI_IO_ABORTED: + fcpreq = tfcp_req->fcpreq; + tfcp_req->fcpreq = NULL; break; case INI_IO_COMPLETED: completed = true; break; default: spin_unlock_irqrestore(&tfcp_req->reqlock, flags); + fcloop_tfcp_req_put(tfcp_req); WARN_ON(1); return; } @@ -691,10 +727,6 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, &tfcp_req->tgt_fcp_req); - spin_lock_irqsave(&tfcp_req->reqlock, flags); - tfcp_req->fcpreq = NULL; - spin_unlock_irqrestore(&tfcp_req->reqlock, flags); - fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); /* call_host_done releases reference for abort downcall */ } @@ -748,7 +780,7 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work); INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work); INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work); - kref_init(&tfcp_req->ref); + refcount_set(&tfcp_req->ref, 1); queue_work(nvmet_wq, &tfcp_req->fcp_rcv_work); @@ -963,13 +995,16 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, spin_lock(&inireq->inilock); tfcp_req = inireq->tfcp_req; - if (tfcp_req) - fcloop_tfcp_req_get(tfcp_req); + if (tfcp_req) { + if (!fcloop_tfcp_req_get(tfcp_req)) + tfcp_req = NULL; + } spin_unlock(&inireq->inilock); - if (!tfcp_req) + if (!tfcp_req) { /* abort has already been called */ - return; + goto out_host_done; + } /* break initiator/target relationship for io */ spin_lock_irqsave(&tfcp_req->reqlock, flags); @@ -984,7 +1019,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, default: spin_unlock_irqrestore(&tfcp_req->reqlock, flags); WARN_ON(1); - return; + goto out_host_done; } spin_unlock_irqrestore(&tfcp_req->reqlock, flags); @@ -998,27 +1033,56 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, */ fcloop_tfcp_req_put(tfcp_req); } + + return; + +out_host_done: + fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); } static void -fcloop_nport_free(struct kref *ref) +fcloop_lport_put(struct fcloop_lport *lport) { - struct fcloop_nport *nport = - container_of(ref, struct fcloop_nport, ref); + unsigned long flags; - kfree(nport); + if (!refcount_dec_and_test(&lport->ref)) + return; + + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&lport->lport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(lport); +} + +static int +fcloop_lport_get(struct fcloop_lport *lport) +{ + return refcount_inc_not_zero(&lport->ref); } static void fcloop_nport_put(struct fcloop_nport *nport) { - kref_put(&nport->ref, fcloop_nport_free); + unsigned long flags; + + if (!refcount_dec_and_test(&nport->ref)) + return; + + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&nport->nport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (nport->lport) + fcloop_lport_put(nport->lport); + + kfree(nport); } static int fcloop_nport_get(struct fcloop_nport *nport) { - return kref_get_unless_zero(&nport->ref); + return refcount_inc_not_zero(&nport->ref); } static void @@ -1027,26 +1091,45 @@ fcloop_localport_delete(struct nvme_fc_local_port *localport) struct fcloop_lport_priv *lport_priv = localport->private; struct fcloop_lport *lport = lport_priv->lport; - /* release any threads waiting for the unreg to complete */ - complete(&lport->unreg_done); + fcloop_lport_put(lport); } static void fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; + bool put_port = false; + unsigned long flags; flush_work(&rport->ls_work); - fcloop_nport_put(rport->nport); + + spin_lock_irqsave(&fcloop_lock, flags); + if (!test_and_set_bit(PORT_DELETED, &rport->flags)) + put_port = true; + rport->nport->rport = NULL; + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (put_port) + fcloop_nport_put(rport->nport); } static void fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) { struct fcloop_tport *tport = targetport->private; + bool put_port = false; + unsigned long flags; flush_work(&tport->ls_work); - fcloop_nport_put(tport->nport); + + spin_lock_irqsave(&fcloop_lock, flags); + if (!test_and_set_bit(PORT_DELETED, &tport->flags)) + put_port = true; + tport->nport->tport = NULL; + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (put_port) + fcloop_nport_put(tport->nport); } #define FCLOOP_HW_QUEUES 4 @@ -1070,7 +1153,6 @@ static struct nvme_fc_port_template fctemplate = { /* sizes of additional private data for data structures */ .local_priv_sz = sizeof(struct fcloop_lport_priv), .remote_priv_sz = sizeof(struct fcloop_rport), - .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), }; @@ -1093,7 +1175,6 @@ static struct nvmet_fc_target_template tgttemplate = { .target_features = 0, /* sizes of additional private data for data structures */ .target_priv_sz = sizeof(struct fcloop_tport), - .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), }; static ssize_t @@ -1140,6 +1221,7 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr, lport->localport = localport; INIT_LIST_HEAD(&lport->lport_list); + refcount_set(&lport->ref, 1); spin_lock_irqsave(&fcloop_lock, flags); list_add_tail(&lport->lport_list, &fcloop_lports); @@ -1156,60 +1238,94 @@ out_free_lport: return ret ? ret : count; } +static int +__localport_unreg(struct fcloop_lport *lport) +{ + return nvme_fc_unregister_localport(lport->localport); +} -static void -__unlink_local_port(struct fcloop_lport *lport) +static struct fcloop_nport * +__fcloop_nport_lookup(u64 node_name, u64 port_name) { - list_del(&lport->lport_list); + struct fcloop_nport *nport; + + list_for_each_entry(nport, &fcloop_nports, nport_list) { + if (nport->node_name != node_name || + nport->port_name != port_name) + continue; + + if (fcloop_nport_get(nport)) + return nport; + + break; + } + + return NULL; } -static int -__wait_localport_unreg(struct fcloop_lport *lport) +static struct fcloop_nport * +fcloop_nport_lookup(u64 node_name, u64 port_name) { - int ret; + struct fcloop_nport *nport; + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + nport = __fcloop_nport_lookup(node_name, port_name); + spin_unlock_irqrestore(&fcloop_lock, flags); + + return nport; +} - init_completion(&lport->unreg_done); +static struct fcloop_lport * +__fcloop_lport_lookup(u64 node_name, u64 port_name) +{ + struct fcloop_lport *lport; - ret = nvme_fc_unregister_localport(lport->localport); + list_for_each_entry(lport, &fcloop_lports, lport_list) { + if (lport->localport->node_name != node_name || + lport->localport->port_name != port_name) + continue; - if (!ret) - wait_for_completion(&lport->unreg_done); + if (fcloop_lport_get(lport)) + return lport; - kfree(lport); + break; + } - return ret; + return NULL; } +static struct fcloop_lport * +fcloop_lport_lookup(u64 node_name, u64 port_name) +{ + struct fcloop_lport *lport; + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + lport = __fcloop_lport_lookup(node_name, port_name); + spin_unlock_irqrestore(&fcloop_lock, flags); + + return lport; +} static ssize_t fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct fcloop_lport *tlport, *lport = NULL; + struct fcloop_lport *lport; u64 nodename, portname; - unsigned long flags; int ret; ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); if (ret) return ret; - spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tlport, &fcloop_lports, lport_list) { - if (tlport->localport->node_name == nodename && - tlport->localport->port_name == portname) { - lport = tlport; - __unlink_local_port(lport); - break; - } - } - spin_unlock_irqrestore(&fcloop_lock, flags); - + lport = fcloop_lport_lookup(nodename, portname); if (!lport) return -ENOENT; - ret = __wait_localport_unreg(lport); + ret = __localport_unreg(lport); + fcloop_lport_put(lport); return ret ? ret : count; } @@ -1217,8 +1333,8 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, static struct fcloop_nport * fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) { - struct fcloop_nport *newnport, *nport = NULL; - struct fcloop_lport *tmplport, *lport = NULL; + struct fcloop_nport *newnport, *nport; + struct fcloop_lport *lport; struct fcloop_ctrl_options *opts; unsigned long flags; u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS; @@ -1233,10 +1349,8 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) goto out_free_opts; /* everything there ? */ - if ((opts->mask & opts_mask) != opts_mask) { - ret = -EINVAL; + if ((opts->mask & opts_mask) != opts_mask) goto out_free_opts; - } newnport = kzalloc(sizeof(*newnport), GFP_KERNEL); if (!newnport) @@ -1249,63 +1363,64 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) newnport->port_role = opts->roles; if (opts->mask & NVMF_OPT_FCADDR) newnport->port_id = opts->fcaddr; - kref_init(&newnport->ref); + refcount_set(&newnport->ref, 1); spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tmplport, &fcloop_lports, lport_list) { - if (tmplport->localport->node_name == opts->wwnn && - tmplport->localport->port_name == opts->wwpn) - goto out_invalid_opts; - - if (tmplport->localport->node_name == opts->lpwwnn && - tmplport->localport->port_name == opts->lpwwpn) - lport = tmplport; + lport = __fcloop_lport_lookup(opts->wwnn, opts->wwpn); + if (lport) { + /* invalid configuration */ + fcloop_lport_put(lport); + goto out_free_newnport; } if (remoteport) { - if (!lport) - goto out_invalid_opts; - newnport->lport = lport; - } - - list_for_each_entry(nport, &fcloop_nports, nport_list) { - if (nport->node_name == opts->wwnn && - nport->port_name == opts->wwpn) { - if ((remoteport && nport->rport) || - (!remoteport && nport->tport)) { - nport = NULL; - goto out_invalid_opts; - } - - fcloop_nport_get(nport); - - spin_unlock_irqrestore(&fcloop_lock, flags); - - if (remoteport) - nport->lport = lport; - if (opts->mask & NVMF_OPT_ROLES) - nport->port_role = opts->roles; - if (opts->mask & NVMF_OPT_FCADDR) - nport->port_id = opts->fcaddr; + lport = __fcloop_lport_lookup(opts->lpwwnn, opts->lpwwpn); + if (!lport) { + /* invalid configuration */ goto out_free_newnport; } } - list_add_tail(&newnport->nport_list, &fcloop_nports); + nport = __fcloop_nport_lookup(opts->wwnn, opts->wwpn); + if (nport) { + if ((remoteport && nport->rport) || + (!remoteport && nport->tport)) { + /* invalid configuration */ + goto out_put_nport; + } + + /* found existing nport, discard the new nport */ + kfree(newnport); + } else { + list_add_tail(&newnport->nport_list, &fcloop_nports); + nport = newnport; + } + if (opts->mask & NVMF_OPT_ROLES) + nport->port_role = opts->roles; + if (opts->mask & NVMF_OPT_FCADDR) + nport->port_id = opts->fcaddr; + if (lport) { + if (!nport->lport) + nport->lport = lport; + else + fcloop_lport_put(lport); + } spin_unlock_irqrestore(&fcloop_lock, flags); kfree(opts); - return newnport; + return nport; -out_invalid_opts: - spin_unlock_irqrestore(&fcloop_lock, flags); +out_put_nport: + if (lport) + fcloop_lport_put(lport); + fcloop_nport_put(nport); out_free_newnport: + spin_unlock_irqrestore(&fcloop_lock, flags); kfree(newnport); out_free_opts: kfree(opts); - return nport; + return NULL; } static ssize_t @@ -1346,6 +1461,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, rport->nport = nport; rport->lport = nport->lport; nport->rport = rport; + rport->flags = 0; spin_lock_init(&rport->lock); INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work); INIT_LIST_HEAD(&rport->ls_list); @@ -1359,21 +1475,18 @@ __unlink_remote_port(struct fcloop_nport *nport) { struct fcloop_rport *rport = nport->rport; + lockdep_assert_held(&fcloop_lock); + if (rport && nport->tport) nport->tport->remoteport = NULL; nport->rport = NULL; - list_del(&nport->nport_list); - return rport; } static int __remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) { - if (!rport) - return -EALREADY; - return nvme_fc_unregister_remoteport(rport->remoteport); } @@ -1381,8 +1494,8 @@ static ssize_t fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct fcloop_nport *nport = NULL, *tmpport; - static struct fcloop_rport *rport; + struct fcloop_nport *nport; + struct fcloop_rport *rport; u64 nodename, portname; unsigned long flags; int ret; @@ -1391,24 +1504,24 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, if (ret) return ret; - spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tmpport, &fcloop_nports, nport_list) { - if (tmpport->node_name == nodename && - tmpport->port_name == portname && tmpport->rport) { - nport = tmpport; - rport = __unlink_remote_port(nport); - break; - } - } + nport = fcloop_nport_lookup(nodename, portname); + if (!nport) + return -ENOENT; + spin_lock_irqsave(&fcloop_lock, flags); + rport = __unlink_remote_port(nport); spin_unlock_irqrestore(&fcloop_lock, flags); - if (!nport) - return -ENOENT; + if (!rport) { + ret = -ENOENT; + goto out_nport_put; + } ret = __remoteport_unreg(nport, rport); +out_nport_put: + fcloop_nport_put(nport); + return ret ? ret : count; } @@ -1446,6 +1559,7 @@ fcloop_create_target_port(struct device *dev, struct device_attribute *attr, tport->nport = nport; tport->lport = nport->lport; nport->tport = tport; + tport->flags = 0; spin_lock_init(&tport->lock); INIT_WORK(&tport->ls_work, fcloop_tport_lsrqst_work); INIT_LIST_HEAD(&tport->ls_list); @@ -1459,6 +1573,8 @@ __unlink_target_port(struct fcloop_nport *nport) { struct fcloop_tport *tport = nport->tport; + lockdep_assert_held(&fcloop_lock); + if (tport && nport->rport) nport->rport->targetport = NULL; nport->tport = NULL; @@ -1469,9 +1585,6 @@ __unlink_target_port(struct fcloop_nport *nport) static int __targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) { - if (!tport) - return -EALREADY; - return nvmet_fc_unregister_targetport(tport->targetport); } @@ -1479,8 +1592,8 @@ static ssize_t fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct fcloop_nport *nport = NULL, *tmpport; - struct fcloop_tport *tport = NULL; + struct fcloop_nport *nport; + struct fcloop_tport *tport; u64 nodename, portname; unsigned long flags; int ret; @@ -1489,24 +1602,24 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, if (ret) return ret; - spin_lock_irqsave(&fcloop_lock, flags); - - list_for_each_entry(tmpport, &fcloop_nports, nport_list) { - if (tmpport->node_name == nodename && - tmpport->port_name == portname && tmpport->tport) { - nport = tmpport; - tport = __unlink_target_port(nport); - break; - } - } + nport = fcloop_nport_lookup(nodename, portname); + if (!nport) + return -ENOENT; + spin_lock_irqsave(&fcloop_lock, flags); + tport = __unlink_target_port(nport); spin_unlock_irqrestore(&fcloop_lock, flags); - if (!nport) - return -ENOENT; + if (!tport) { + ret = -ENOENT; + goto out_nport_put; + } ret = __targetport_unreg(nport, tport); +out_nport_put: + fcloop_nport_put(nport); + return ret ? ret : count; } @@ -1572,15 +1685,20 @@ static const struct class fcloop_class = { }; static struct device *fcloop_device; - static int __init fcloop_init(void) { int ret; + lsreq_cache = kmem_cache_create("lsreq_cache", + sizeof(struct fcloop_lsreq), 0, + 0, NULL); + if (!lsreq_cache) + return -ENOMEM; + ret = class_register(&fcloop_class); if (ret) { pr_err("couldn't register class fcloop\n"); - return ret; + goto out_destroy_cache; } fcloop_device = device_create_with_groups( @@ -1598,13 +1716,15 @@ static int __init fcloop_init(void) out_destroy_class: class_unregister(&fcloop_class); +out_destroy_cache: + kmem_cache_destroy(lsreq_cache); return ret; } static void __exit fcloop_exit(void) { - struct fcloop_lport *lport = NULL; - struct fcloop_nport *nport = NULL; + struct fcloop_lport *lport; + struct fcloop_nport *nport; struct fcloop_tport *tport; struct fcloop_rport *rport; unsigned long flags; @@ -1615,7 +1735,7 @@ static void __exit fcloop_exit(void) for (;;) { nport = list_first_entry_or_null(&fcloop_nports, typeof(*nport), nport_list); - if (!nport) + if (!nport || !fcloop_nport_get(nport)) break; tport = __unlink_target_port(nport); @@ -1623,13 +1743,21 @@ static void __exit fcloop_exit(void) spin_unlock_irqrestore(&fcloop_lock, flags); - ret = __targetport_unreg(nport, tport); - if (ret) - pr_warn("%s: Failed deleting target port\n", __func__); + if (tport) { + ret = __targetport_unreg(nport, tport); + if (ret) + pr_warn("%s: Failed deleting target port\n", + __func__); + } - ret = __remoteport_unreg(nport, rport); - if (ret) - pr_warn("%s: Failed deleting remote port\n", __func__); + if (rport) { + ret = __remoteport_unreg(nport, rport); + if (ret) + pr_warn("%s: Failed deleting remote port\n", + __func__); + } + + fcloop_nport_put(nport); spin_lock_irqsave(&fcloop_lock, flags); } @@ -1637,17 +1765,17 @@ static void __exit fcloop_exit(void) for (;;) { lport = list_first_entry_or_null(&fcloop_lports, typeof(*lport), lport_list); - if (!lport) + if (!lport || !fcloop_lport_get(lport)) break; - __unlink_local_port(lport); - spin_unlock_irqrestore(&fcloop_lock, flags); - ret = __wait_localport_unreg(lport); + ret = __localport_unreg(lport); if (ret) pr_warn("%s: Failed deleting local port\n", __func__); + fcloop_lport_put(lport); + spin_lock_irqsave(&fcloop_lock, flags); } @@ -1657,6 +1785,7 @@ static void __exit fcloop_exit(void) device_destroy(&fcloop_class, MKDEV(0, 0)); class_unregister(&fcloop_class); + kmem_cache_destroy(lsreq_cache); } module_init(fcloop_init); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 83be0657e6df..eba42df2f821 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -133,7 +133,7 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) * Right now there exists M : 1 mapping between block layer error * to the NVMe status code (see nvme_error_status()). For consistency, * when we reverse map we use most appropriate NVMe Status code from - * the group of the NVMe staus codes used in the nvme_error_status(). + * the group of the NVMe status codes used in the nvme_error_status(). */ switch (blk_sts) { case BLK_STS_NOSPC: @@ -145,15 +145,8 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) req->error_loc = offsetof(struct nvme_rw_command, slba); break; case BLK_STS_NOTSUPP: + status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; req->error_loc = offsetof(struct nvme_common_command, opcode); - switch (req->cmd->common.opcode) { - case nvme_cmd_dsm: - case nvme_cmd_write_zeroes: - status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR; - break; - default: - status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; - } break; case BLK_STS_MEDIUM: status = NVME_SC_ACCESS_DENIED; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a9d112d34d4f..f85a8441bcc6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -33,10 +33,12 @@ struct nvme_loop_ctrl { struct list_head list; struct blk_mq_tag_set tag_set; - struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; struct nvmet_port *port; + + /* Must be last --ends in a flexible-array member. */ + struct nvme_loop_iod async_event_iod; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -148,8 +150,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_start_request(req); iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; iod->req.port = queue->ctrl->port; - if (!nvmet_req_init(&iod->req, &queue->nvme_cq, - &queue->nvme_sq, &nvme_loop_ops)) + if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) return BLK_STS_OK; if (blk_rq_nr_phys_segments(req)) { @@ -162,7 +163,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, } iod->req.sg = iod->sg_table.sgl; - iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.sg_cnt = blk_rq_map_sg(req, iod->sg_table.sgl); iod->req.transfer_len = blk_rq_payload_bytes(req); } @@ -181,8 +182,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; - if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, - &nvme_loop_ops)) { + if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) { dev_err(ctrl->ctrl.device, "failed async event work\n"); return; } @@ -273,6 +273,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) nvme_unquiesce_admin_queue(&ctrl->ctrl); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + nvmet_cq_put(&ctrl->queues[0].nvme_cq); nvme_remove_admin_tag_set(&ctrl->ctrl); } @@ -302,6 +303,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) for (i = 1; i < ctrl->ctrl.queue_count; i++) { clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + nvmet_cq_put(&ctrl->queues[i].nvme_cq); } ctrl->ctrl.queue_count = 1; /* @@ -327,9 +329,13 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) for (i = 1; i <= nr_io_queues; i++) { ctrl->queues[i].ctrl = ctrl; - ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); - if (ret) + nvmet_cq_init(&ctrl->queues[i].nvme_cq); + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq, + &ctrl->queues[i].nvme_cq); + if (ret) { + nvmet_cq_put(&ctrl->queues[i].nvme_cq); goto out_destroy_queues; + } ctrl->ctrl.queue_count++; } @@ -360,9 +366,13 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) int error; ctrl->queues[0].ctrl = ctrl; - error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); - if (error) + nvmet_cq_init(&ctrl->queues[0].nvme_cq); + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq, + &ctrl->queues[0].nvme_cq); + if (error) { + nvmet_cq_put(&ctrl->queues[0].nvme_cq); return error; + } ctrl->ctrl.queue_count = 1; error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, @@ -401,6 +411,7 @@ out_cleanup_tagset: nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_sq: nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + nvmet_cq_put(&ctrl->queues[0].nvme_cq); return error; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index fcf4f460dc9a..51df72f5e89b 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -141,13 +141,16 @@ static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns) } struct nvmet_cq { + struct nvmet_ctrl *ctrl; u16 qid; u16 size; + refcount_t ref; }; struct nvmet_sq { struct nvmet_ctrl *ctrl; struct percpu_ref ref; + struct nvmet_cq *cq; u16 qid; u16 size; u32 sqhd; @@ -165,6 +168,9 @@ struct nvmet_sq { u8 *dhchap_skey; int dhchap_skey_len; #endif +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key; +#endif struct completion free_done; struct completion confirm_done; }; @@ -244,6 +250,7 @@ struct nvmet_pr_log_mgr { struct nvmet_ctrl { struct nvmet_subsys *subsys; struct nvmet_sq **sqs; + struct nvmet_cq **cqs; void *drvdata; @@ -289,6 +296,7 @@ struct nvmet_ctrl { u64 err_counter; struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; bool pi_support; + bool concat; #ifdef CONFIG_NVME_TARGET_AUTH struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; @@ -298,6 +306,9 @@ struct nvmet_ctrl { u8 *dh_key; size_t dh_keysize; #endif +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key; +#endif struct nvmet_pr_log_mgr pr_log_mgr; }; @@ -417,7 +428,7 @@ struct nvmet_fabrics_ops { u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl); /* Operations mandatory for PCI target controllers */ - u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 flags, + u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 prp1); u16 (*delete_sq)(struct nvmet_ctrl *ctrl, u16 sqid); u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags, @@ -550,8 +561,8 @@ u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req); u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req); u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req); -bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, - struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq, + const struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); size_t nvmet_req_transfer_len(struct nvmet_req *req); bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len); @@ -564,18 +575,24 @@ void nvmet_execute_set_features(struct nvmet_req *req); void nvmet_execute_get_features(struct nvmet_req *req); void nvmet_execute_keep_alive(struct nvmet_req *req); -u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid); +u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create); +u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create); +void nvmet_cq_init(struct nvmet_cq *cq); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); +void nvmet_cq_destroy(struct nvmet_cq *cq); +bool nvmet_cq_get(struct nvmet_cq *cq); +void nvmet_cq_put(struct nvmet_cq *cq); +bool nvmet_cq_in_use(struct nvmet_cq *cq); u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create); void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, u16 size); -u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, - u16 size); +u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + struct nvmet_cq *cq, u16 qid, u16 size); void nvmet_sq_destroy(struct nvmet_sq *sq); -int nvmet_sq_init(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq); void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); @@ -583,6 +600,7 @@ void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); struct nvmet_alloc_ctrl_args { struct nvmet_port *port; + struct nvmet_sq *sq; char *subsysnqn; char *hostnqn; uuid_t *hostid; @@ -819,7 +837,7 @@ static inline u8 nvmet_cc_iocqes(u32 cc) /* Convert a 32-bit number to a 16-bit 0's based number */ static inline __le16 to0based(u32 a) { - return cpu_to_le16(max(1U, min(1U << 16, a)) - 1); + return cpu_to_le16(clamp(a, 1U, 1U << 16) - 1); } static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns) @@ -849,8 +867,26 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio) { if (bio != &req->b.inline_bio) bio_put(bio); + else + bio_uninit(bio); } +#ifdef CONFIG_NVME_TARGET_TCP_TLS +static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq) +{ + return sq->tls_key ? key_serial(sq->tls_key) : 0; +} +static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq) +{ + if (sq->tls_key) { + key_put(sq->tls_key); + sq->tls_key = NULL; + } +} +#else +static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq) { return 0; } +static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq) {} +#endif #ifdef CONFIG_NVME_TARGET_AUTH u32 nvmet_auth_send_data_len(struct nvmet_req *req); void nvmet_execute_auth_send(struct nvmet_req *req); @@ -859,7 +895,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req); int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, bool set_ctrl); int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl); +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq); void nvmet_auth_sq_init(struct nvmet_sq *sq); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_auth_sq_free(struct nvmet_sq *sq); @@ -869,16 +905,18 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int hash_len); int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, unsigned int hash_len); -static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { - return ctrl->host_key != NULL; + return ctrl->host_key != NULL && !nvmet_queue_tls_keyid(sq); } int nvmet_auth_ctrl_exponential(struct nvmet_req *req, u8 *buf, int buf_size); int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, u8 *buf, int buf_size); +void nvmet_auth_insert_psk(struct nvmet_sq *sq); #else -static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) +static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, + struct nvmet_sq *sq) { return 0; } @@ -891,11 +929,13 @@ static inline bool nvmet_check_auth_status(struct nvmet_req *req) { return true; } -static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, + struct nvmet_sq *sq) { return false; } static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; } +static inline void nvmet_auth_insert_psk(struct nvmet_sq *sq) {}; #endif int nvmet_pr_init_ns(struct nvmet_ns *ns); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 26e2907ce8bb..b7515c53829b 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -99,7 +99,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) /* * The passthru NVMe driver may have a limit on the number of segments - * which depends on the host's memory fragementation. To solve this, + * which depends on the host's memory fragmentation. To solve this, * ensure mdts is limited to the pages equal to the number of segments. */ max_hw_sectors = min_not_zero(pctrl->max_segments << PAGE_SECTORS_SHIFT, diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index 5c4c4c1f535d..6f1651183e32 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -62,8 +62,7 @@ static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex); #define NVMET_PCI_EPF_CQ_RETRY_INTERVAL msecs_to_jiffies(1) enum nvmet_pci_epf_queue_flags { - NVMET_PCI_EPF_Q_IS_SQ = 0, /* The queue is a submission queue */ - NVMET_PCI_EPF_Q_LIVE, /* The queue is live */ + NVMET_PCI_EPF_Q_LIVE = 0, /* The queue is live */ NVMET_PCI_EPF_Q_IRQ_ENABLED, /* IRQ is enabled for this queue */ }; @@ -596,9 +595,6 @@ static bool nvmet_pci_epf_should_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, struct nvmet_pci_epf_irq_vector *iv = cq->iv; bool ret; - if (!test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) - return false; - /* IRQ coalescing for the admin queue is not allowed. */ if (!cq->qid) return true; @@ -625,7 +621,8 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, struct pci_epf *epf = nvme_epf->epf; int ret = 0; - if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) + if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) || + !test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) return; mutex_lock(&ctrl->irq_lock); @@ -636,14 +633,16 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, switch (nvme_epf->irq_type) { case PCI_IRQ_MSIX: case PCI_IRQ_MSI: + /* + * If we fail to raise an MSI or MSI-X interrupt, it is likely + * because the host is using legacy INTX IRQs (e.g. BIOS, + * grub), but we can fallback to the INTX type only if the + * endpoint controller supports this type. + */ ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no, nvme_epf->irq_type, cq->vector + 1); - if (!ret) + if (!ret || !nvme_epf->epc_features->intx_capable) break; - /* - * If we got an error, it is likely because the host is using - * legacy IRQs (e.g. BIOS, grub). - */ fallthrough; case PCI_IRQ_INTX: ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no, @@ -656,7 +655,9 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, } if (ret) - dev_err(ctrl->dev, "Failed to raise IRQ (err=%d)\n", ret); + dev_err_ratelimited(ctrl->dev, + "CQ[%u]: Failed to raise IRQ (err=%d)\n", + cq->qid, ret); unlock: mutex_unlock(&ctrl->irq_lock); @@ -1241,8 +1242,11 @@ static void nvmet_pci_epf_queue_response(struct nvmet_req *req) iod->status = le16_to_cpu(req->cqe->status) >> 1; - /* If we have no data to transfer, directly complete the command. */ - if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) { + /* + * If the command failed or we have no data to transfer, complete the + * command immediately. + */ + if (iod->status || !iod->data_len || iod->dma_dir != DMA_TO_DEVICE) { nvmet_pci_epf_complete_iod(iod); return; } @@ -1264,6 +1268,7 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid]; u16 status; + int ret; if (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; @@ -1298,13 +1303,41 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, if (status != NVME_SC_SUCCESS) goto err; + /* + * Map the CQ PCI address space and since PCI endpoint controllers may + * return a partial mapping, check that the mapping is large enough. + */ + ret = nvmet_pci_epf_mem_map(ctrl->nvme_epf, cq->pci_addr, cq->pci_size, + &cq->pci_map); + if (ret) { + dev_err(ctrl->dev, "Failed to map CQ %u (err=%d)\n", + cq->qid, ret); + goto err_internal; + } + + if (cq->pci_map.pci_size < cq->pci_size) { + dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n", + cq->qid); + goto err_unmap_queue; + } + set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags); - dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n", - cqid, qsize, cq->qes, cq->vector); + if (test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) + dev_dbg(ctrl->dev, + "CQ[%u]: %u entries of %zu B, IRQ vector %u\n", + cqid, qsize, cq->qes, cq->vector); + else + dev_dbg(ctrl->dev, + "CQ[%u]: %u entries of %zu B, IRQ disabled\n", + cqid, qsize, cq->qes); return NVME_SC_SUCCESS; +err_unmap_queue: + nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map); +err_internal: + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; err: if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); @@ -1321,16 +1354,20 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid) cancel_delayed_work_sync(&cq->work); nvmet_pci_epf_drain_queue(cq); - nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); + if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) + nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); + nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map); + nvmet_cq_put(&cq->nvme_cq); return NVME_SC_SUCCESS; } static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, - u16 sqid, u16 flags, u16 qsize, u64 pci_addr) + u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 pci_addr) { struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid]; + struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid]; u16 status; if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) @@ -1353,7 +1390,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, sq->qes = ctrl->io_sqes; sq->pci_size = sq->qes * sq->depth; - status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth); + status = nvmet_sq_create(tctrl, &sq->nvme_sq, &cq->nvme_cq, sqid, + sq->depth); if (status != NVME_SC_SUCCESS) return status; @@ -1385,7 +1423,6 @@ static u16 nvmet_pci_epf_delete_sq(struct nvmet_ctrl *tctrl, u16 sqid) if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) return NVME_SC_QID_INVALID | NVME_STATUS_DNR; - flush_workqueue(sq->iod_wq); destroy_workqueue(sq->iod_wq); sq->iod_wq = NULL; @@ -1510,7 +1547,6 @@ static void nvmet_pci_epf_init_queue(struct nvmet_pci_epf_ctrl *ctrl, if (sq) { queue = &ctrl->sq[qid]; - set_bit(NVMET_PCI_EPF_Q_IS_SQ, &queue->flags); } else { queue = &ctrl->cq[qid]; INIT_DELAYED_WORK(&queue->work, nvmet_pci_epf_cq_work); @@ -1554,36 +1590,6 @@ static void nvmet_pci_epf_free_queues(struct nvmet_pci_epf_ctrl *ctrl) ctrl->cq = NULL; } -static int nvmet_pci_epf_map_queue(struct nvmet_pci_epf_ctrl *ctrl, - struct nvmet_pci_epf_queue *queue) -{ - struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf; - int ret; - - ret = nvmet_pci_epf_mem_map(nvme_epf, queue->pci_addr, - queue->pci_size, &queue->pci_map); - if (ret) { - dev_err(ctrl->dev, "Failed to map queue %u (err=%d)\n", - queue->qid, ret); - return ret; - } - - if (queue->pci_map.pci_size < queue->pci_size) { - dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n", - queue->qid); - nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map); - return -ENOMEM; - } - - return 0; -} - -static inline void nvmet_pci_epf_unmap_queue(struct nvmet_pci_epf_ctrl *ctrl, - struct nvmet_pci_epf_queue *queue) -{ - nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &queue->pci_map); -} - static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) { struct nvmet_pci_epf_iod *iod = @@ -1601,9 +1607,13 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) goto complete; } - if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq, - &nvmet_pci_epf_fabrics_ops)) - goto complete; + /* + * If nvmet_req_init() fails (e.g., unsupported opcode) it will call + * __nvmet_req_complete() internally which will call + * nvmet_pci_epf_queue_response() and will complete the command directly. + */ + if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops)) + return; iod->data_len = nvmet_req_transfer_len(req); if (iod->data_len) { @@ -1641,10 +1651,11 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) wait_for_completion(&iod->done); - if (iod->status == NVME_SC_SUCCESS) { - WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE); - nvmet_pci_epf_transfer_iod_data(iod); - } + if (iod->status != NVME_SC_SUCCESS) + return; + + WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE); + nvmet_pci_epf_transfer_iod_data(iod); complete: nvmet_pci_epf_complete_iod(iod); @@ -1749,11 +1760,7 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work) struct nvme_completion *cqe; struct nvmet_pci_epf_iod *iod; unsigned long flags; - int ret, n = 0; - - ret = nvmet_pci_epf_map_queue(ctrl, cq); - if (ret) - goto again; + int ret = 0, n = 0; while (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) && ctrl->link_up) { @@ -1809,8 +1816,6 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work) n++; } - nvmet_pci_epf_unmap_queue(ctrl, cq); - /* * We do not support precise IRQ coalescing time (100ns units as per * NVMe specifications). So if we have posted completion entries without @@ -1819,7 +1824,6 @@ static void nvmet_pci_epf_cq_work(struct work_struct *work) if (n) nvmet_pci_epf_raise_irq(ctrl, cq, true); -again: if (ret < 0) queue_delayed_work(system_highpri_wq, &cq->work, NVMET_PCI_EPF_CQ_RETRY_INTERVAL); @@ -1886,8 +1890,8 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) qsize = aqa & 0x00000fff; pci_addr = asq & GENMASK_ULL(63, 12); - status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG, - qsize, pci_addr); + status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, 0, + NVME_QUEUE_PHYS_CONTIG, qsize, pci_addr); if (status != NVME_SC_SUCCESS) { dev_err(ctrl->dev, "Failed to create admin submission queue\n"); nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); @@ -2109,11 +2113,18 @@ out_mempool_exit: static void nvmet_pci_epf_start_ctrl(struct nvmet_pci_epf_ctrl *ctrl) { + + dev_info(ctrl->dev, "PCI link up\n"); + ctrl->link_up = true; + schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL); } static void nvmet_pci_epf_stop_ctrl(struct nvmet_pci_epf_ctrl *ctrl) { + dev_info(ctrl->dev, "PCI link down\n"); + ctrl->link_up = false; + cancel_delayed_work_sync(&ctrl->poll_cc); nvmet_pci_epf_disable_ctrl(ctrl, false); @@ -2340,10 +2351,8 @@ static int nvmet_pci_epf_epc_init(struct pci_epf *epf) if (ret) goto out_clear_bar; - if (!epc_features->linkup_notifier) { - ctrl->link_up = true; + if (!epc_features->linkup_notifier) nvmet_pci_epf_start_ctrl(&nvme_epf->ctrl); - } return 0; @@ -2359,7 +2368,6 @@ static void nvmet_pci_epf_epc_deinit(struct pci_epf *epf) struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; - ctrl->link_up = false; nvmet_pci_epf_destroy_ctrl(ctrl); nvmet_pci_epf_deinit_dma(nvme_epf); @@ -2371,7 +2379,6 @@ static int nvmet_pci_epf_link_up(struct pci_epf *epf) struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; - ctrl->link_up = true; nvmet_pci_epf_start_ctrl(ctrl); return 0; @@ -2382,7 +2389,6 @@ static int nvmet_pci_epf_link_down(struct pci_epf *epf) struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; - ctrl->link_up = false; nvmet_pci_epf_stop_ctrl(ctrl); return 0; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 2a4536ef6184..67f61c67c167 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -976,8 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, cmd->send_sge.addr, cmd->send_sge.length, DMA_TO_DEVICE); - if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, - &queue->nvme_sq, &nvmet_rdma_ops)) + if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops)) return; status = nvmet_rdma_map_sgl(cmd); @@ -1353,6 +1352,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) pr_debug("freeing queue %d\n", queue->idx); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); nvmet_rdma_destroy_queue_ib(queue); if (!queue->nsrq) { @@ -1436,7 +1436,8 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, goto out_reject; } - ret = nvmet_sq_init(&queue->nvme_sq); + nvmet_cq_init(&queue->nvme_cq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) { ret = NVME_RDMA_CM_NO_RSC; goto out_free_queue; @@ -1517,6 +1518,7 @@ out_ida_remove: out_destroy_sq: nvmet_sq_destroy(&queue->nvme_sq); out_free_queue: + nvmet_cq_put(&queue->nvme_cq); kfree(queue); out_reject: nvmet_rdma_cm_reject(cm_id, ret); @@ -1999,7 +2001,7 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, struct nvmet_rdma_port *port = nport->priv; struct rdma_cm_id *cm_id = port->cm_id; - if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { + if (inet_addr_is_any(&cm_id->route.addr.src_addr)) { struct nvmet_rdma_rsp *rsp = container_of(req, struct nvmet_rdma_rsp, req); struct rdma_cm_id *req_cm_id = rsp->queue->cm_id; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4f9cac8a5abe..470bf37e5a63 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -7,8 +7,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/crc32c.h> #include <linux/err.h> -#include <linux/key.h> #include <linux/nvme-tcp.h> #include <linux/nvme-keyring.h> #include <net/sock.h> @@ -18,7 +18,6 @@ #include <net/handshake.h> #include <linux/inet.h> #include <linux/llist.h> -#include <crypto/hash.h> #include <trace/events/sock.h> #include "nvmet.h" @@ -173,8 +172,6 @@ struct nvmet_tcp_queue { /* digest state */ bool hdr_digest; bool data_digest; - struct ahash_request *snd_hash; - struct ahash_request *rcv_hash; /* TLS state */ key_serial_t tls_pskid; @@ -295,14 +292,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; } -static inline void nvmet_tcp_hdgst(struct ahash_request *hash, - void *pdu, size_t len) +static inline void nvmet_tcp_hdgst(void *pdu, size_t len) { - struct scatterlist sg; - - sg_init_one(&sg, pdu, len); - ahash_request_set_crypt(hash, &sg, pdu + len, len); - crypto_ahash_digest(hash); + put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len); } static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, @@ -319,7 +311,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, } recv_digest = *(__le32 *)(pdu + hdr->hlen); - nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); + nvmet_tcp_hdgst(pdu, len); exp_digest = *(__le32 *)(pdu + hdr->hlen); if (recv_digest != exp_digest) { pr_err("queue %d: header digest error: recv %#x expected %#x\n", @@ -442,12 +434,24 @@ err: return NVME_SC_INTERNAL; } -static void nvmet_tcp_calc_ddgst(struct ahash_request *hash, - struct nvmet_tcp_cmd *cmd) +static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd) { - ahash_request_set_crypt(hash, cmd->req.sg, - (void *)&cmd->exp_ddgst, cmd->req.transfer_len); - crypto_ahash_digest(hash); + size_t total_len = cmd->req.transfer_len; + struct scatterlist *sg = cmd->req.sg; + u32 crc = ~0; + + while (total_len) { + size_t len = min_t(size_t, total_len, sg->length); + + /* + * Note that the scatterlist does not contain any highmem pages, + * as it was allocated by sgl_alloc() with GFP_KERNEL. + */ + crc = crc32c(crc, sg_virt(sg), len); + total_len -= len; + sg = sg_next(sg); + } + cmd->exp_ddgst = cpu_to_le32(~crc); } static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) @@ -474,19 +478,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) if (queue->data_digest) { pdu->hdr.flags |= NVME_TCP_F_DDGST; - nvmet_tcp_calc_ddgst(queue->snd_hash, cmd); + nvmet_tcp_calc_ddgst(cmd); } if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; - nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; - struct nvmet_tcp_queue *queue = cmd->queue; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); cmd->offset = 0; @@ -504,14 +507,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; - nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) { struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; - struct nvmet_tcp_queue *queue = cmd->queue; u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); cmd->offset = 0; @@ -524,7 +526,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); if (cmd->queue->hdr_digest) { pdu->hdr.flags |= NVME_TCP_F_HDGST; - nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvmet_tcp_hdgst(pdu, sizeof(*pdu)); } } @@ -858,42 +860,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU); } -static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); - - ahash_request_free(queue->rcv_hash); - ahash_request_free(queue->snd_hash); - crypto_free_ahash(tfm); -} - -static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) -{ - struct crypto_ahash *tfm; - - tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->snd_hash) - goto free_tfm; - ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); - - queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->rcv_hash) - goto free_snd_hash; - ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); - - return 0; -free_snd_hash: - ahash_request_free(queue->snd_hash); -free_tfm: - crypto_free_ahash(tfm); - return -ENOMEM; -} - - static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) { struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; @@ -922,11 +888,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); - if (queue->hdr_digest || queue->data_digest) { - ret = nvmet_tcp_alloc_crypto(queue); - if (ret) - return ret; - } memset(icresp, 0, sizeof(*icresp)); icresp->hdr.type = nvme_tcp_icresp; @@ -1078,12 +1039,12 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) req = &queue->cmd->req; memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); - if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, - &queue->nvme_sq, &nvmet_tcp_ops))) { - pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", + if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) { + pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n", req->cmd, req->cmd->common.command_id, req->cmd->common.opcode, - le32_to_cpu(req->cmd->common.dptr.sgl.length)); + le32_to_cpu(req->cmd->common.dptr.sgl.length), + le16_to_cpu(req->cqe->status)); nvmet_tcp_handle_req_failure(queue, queue->cmd, req); return 0; @@ -1247,7 +1208,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) { struct nvmet_tcp_queue *queue = cmd->queue; - nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd); + nvmet_tcp_calc_ddgst(cmd); queue->offset = 0; queue->left = NVME_TCP_DIGEST_LENGTH; queue->rcv_state = NVMET_TCP_RECV_DDGST; @@ -1560,6 +1521,9 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; + if (!queue->state_change) + return; + write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_data_ready = queue->data_ready; sock->sk->sk_state_change = queue->state_change; @@ -1609,15 +1573,15 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) /* stop accepting incoming data */ queue->rcv_state = NVMET_TCP_RECV_ERR; + nvmet_sq_put_tls_key(&queue->nvme_sq); nvmet_tcp_uninit_data_in_cmds(queue); nvmet_sq_destroy(&queue->nvme_sq); + nvmet_cq_put(&queue->nvme_cq); cancel_work_sync(&queue->io_work); nvmet_tcp_free_cmd_data_in_buffers(queue); /* ->sock will be released by fput() */ fput(queue->sock->file); nvmet_tcp_free_cmds(queue); - if (queue->hdr_digest || queue->data_digest) - nvmet_tcp_free_crypto(queue); ida_free(&nvmet_tcp_queue_ida, queue->idx); page_frag_cache_drain(&queue->pf_cache); kfree(queue); @@ -1794,6 +1758,27 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) return 0; } +static int nvmet_tcp_tls_key_lookup(struct nvmet_tcp_queue *queue, + key_serial_t peerid) +{ + struct key *tls_key = nvme_tls_key_lookup(peerid); + int status = 0; + + if (IS_ERR(tls_key)) { + pr_warn("%s: queue %d failed to lookup key %x\n", + __func__, queue->idx, peerid); + spin_lock_bh(&queue->state_lock); + queue->state = NVMET_TCP_Q_FAILED; + spin_unlock_bh(&queue->state_lock); + status = PTR_ERR(tls_key); + } else { + pr_debug("%s: queue %d using TLS PSK %x\n", + __func__, queue->idx, peerid); + queue->nvme_sq.tls_key = tls_key; + } + return status; +} + static void nvmet_tcp_tls_handshake_done(void *data, int status, key_serial_t peerid) { @@ -1814,6 +1799,10 @@ static void nvmet_tcp_tls_handshake_done(void *data, int status, spin_unlock_bh(&queue->state_lock); cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); + + if (!status) + status = nvmet_tcp_tls_key_lookup(queue, peerid); + if (status) nvmet_tcp_schedule_release_queue(queue); else @@ -1921,7 +1910,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, if (ret) goto out_ida_remove; - ret = nvmet_sq_init(&queue->nvme_sq); + nvmet_cq_init(&queue->nvme_cq); + ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq); if (ret) goto out_free_connect; @@ -1938,10 +1928,10 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, struct sock *sk = queue->sock->sk; /* Restore the default callbacks before starting upcall */ - read_lock_bh(&sk->sk_callback_lock); + write_lock_bh(&sk->sk_callback_lock); sk->sk_user_data = NULL; sk->sk_data_ready = port->data_ready; - read_unlock_bh(&sk->sk_callback_lock); + write_unlock_bh(&sk->sk_callback_lock); if (!nvmet_tcp_try_peek_pdu(queue)) { if (!nvmet_tcp_tls_handshake(queue)) return; @@ -1964,6 +1954,7 @@ out_destroy_sq: mutex_unlock(&nvmet_tcp_queue_mutex); nvmet_sq_destroy(&queue->nvme_sq); out_free_connect: + nvmet_cq_put(&queue->nvme_cq); nvmet_tcp_free_cmd(&queue->connect); out_ida_remove: ida_free(&nvmet_tcp_queue_ida, queue->idx); @@ -2165,7 +2156,7 @@ static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, { struct nvmet_tcp_port *port = nport->priv; - if (inet_addr_is_any((struct sockaddr *)&port->addr)) { + if (inet_addr_is_any(&port->addr)) { struct nvmet_tcp_cmd *cmd = container_of(req, struct nvmet_tcp_cmd, req); struct nvmet_tcp_queue *queue = cmd->queue; |