diff options
Diffstat (limited to 'fs')
84 files changed, 9916 insertions, 1123 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 5b4847bd2fbb..44b6cdd36dc1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -335,6 +335,7 @@ source "fs/omfs/Kconfig" source "fs/hpfs/Kconfig" source "fs/qnx4/Kconfig" source "fs/qnx6/Kconfig" +source "fs/resctrl/Kconfig" source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/ufs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 77fd7f7b5d02..79c08b914c47 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -128,3 +128,4 @@ obj-$(CONFIG_EROFS_FS) += erofs/ obj-$(CONFIG_VBOXSF_FS) += vboxsf/ obj-$(CONFIG_ZONEFS_FS) += zonefs/ obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o +obj-$(CONFIG_RESCTRL_FS) += resctrl/ diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index fc8ba9142f2f..682bd8ec2c10 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -5,6 +5,7 @@ config AFS_FS select AF_RXRPC select DNS_RESOLVER select NETFS_SUPPORT + select CRYPTO_KRB5 help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 5efd7e13b304..b49b8fe682f3 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -8,6 +8,7 @@ kafs-y := \ addr_prefs.o \ callback.o \ cell.o \ + cm_security.o \ cmservice.o \ dir.o \ dir_edit.o \ diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c new file mode 100644 index 000000000000..edcbd249d202 --- /dev/null +++ b/fs/afs/cm_security.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache manager security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <crypto/krb5.h> +#include "internal.h" +#include "afs_cm.h" +#include "afs_fs.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> + +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32))) + +#ifdef CONFIG_RXGK +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server); +#endif + +/* + * Respond to an RxGK challenge, adding appdata. + */ +static int afs_respond_to_challenge(struct sk_buff *challenge) +{ +#ifdef CONFIG_RXGK + struct krb5_buffer appdata = {}; + struct afs_server *server; +#endif + struct rxrpc_peer *peer; + unsigned long peer_data; + u16 service_id; + u8 security_index; + + rxrpc_kernel_query_challenge(challenge, &peer, &peer_data, + &service_id, &security_index); + + _enter("%u,%u", service_id, security_index); + + switch (service_id) { + /* We don't send CM_SERVICE RPCs, so don't expect a challenge + * therefrom. 
+ */ + case FS_SERVICE: + case VL_SERVICE: + case YFS_FS_SERVICE: + case YFS_VL_SERVICE: + break; + default: + pr_warn("Can't respond to unknown challenge %u:%u", + service_id, security_index); + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } + + switch (security_index) { +#ifdef CONFIG_RXKAD + case RXRPC_SECURITY_RXKAD: + return rxkad_kernel_respond_to_challenge(challenge); +#endif + +#ifdef CONFIG_RXGK + case RXRPC_SECURITY_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); + + case RXRPC_SECURITY_YFS_RXGK: + switch (service_id) { + case FS_SERVICE: + case YFS_FS_SERVICE: + server = (struct afs_server *)peer_data; + if (!server->cm_rxgk_appdata.data) { + mutex_lock(&server->cm_token_lock); + if (!server->cm_rxgk_appdata.data) + afs_create_yfs_cm_token(challenge, server); + mutex_unlock(&server->cm_token_lock); + } + if (server->cm_rxgk_appdata.data) + appdata = server->cm_rxgk_appdata; + break; + } + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +#endif + + default: + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } +} + +/* + * Process the OOB message queue, processing challenge packets. + */ +void afs_process_oob_queue(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, rx_oob_work); + struct sk_buff *oob; + enum rxrpc_oob_type type; + + while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) { + switch (type) { + case RXRPC_OOB_CHALLENGE: + afs_respond_to_challenge(oob); + break; + } + rxrpc_kernel_free_oob(oob); + } +} + +#ifdef CONFIG_RXGK +/* + * Create a securities keyring for the cache manager and attach a key to it for + * the RxGK tokens we want to use to secure the callback connection back from + * the fileserver. 
+ */ +int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + const struct krb5_enctype *krb5; + struct key *ring; + key_ref_t key; + char K0[32], *desc; + int ret; + + ring = keyring_alloc("kafs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + ret = rxrpc_sock_set_security_keyring(socket->sk, ring); + if (ret < 0) + goto out; + + ret = -ENOPKG; + krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (!krb5) + goto out; + + if (WARN_ON_ONCE(krb5->key_len > sizeof(K0))) + goto out; + + ret = -ENOMEM; + desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u", + YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype); + if (!desc) + goto out; + + wait_for_random_bytes(); + get_random_bytes(K0, krb5->key_len); + + key = key_create(make_key_ref(ring, true), + "rxrpc_s", desc, + K0, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + kfree(desc); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + + net->fs_cm_token_key = key_ref_to_ptr(key); + ret = 0; +out: + key_put(ring); + return ret; +} + +/* + * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver. 
+ */ +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server) +{ + const struct krb5_enctype *conn_krb5, *token_krb5; + const struct krb5_buffer *token_key; + struct crypto_aead *aead; + struct scatterlist sg; + struct afs_net *net = server->cell->net; + const struct key *key = net->fs_cm_token_key; + size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset; + __be32 caps[1] = { + [0] = htonl(AFS_CAP_ERROR_TRANSLATION), + }; + __be32 *xdr; + void *appdata, *K0, *encbase; + u32 enctype; + int ret; + + if (!key) + return -ENOKEY; + + /* Assume that the fileserver is happy to use the same encoding type as + * we were told to use by the token obtained by the user. + */ + enctype = rxgk_kernel_query_challenge(challenge); + + conn_krb5 = crypto_krb5_find_enctype(enctype); + if (!conn_krb5) + return -ENOPKG; + token_krb5 = key->payload.data[0]; + token_key = (const struct krb5_buffer *)&key->payload.data[2]; + + /* struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + */ + keysize = 4 + xdr_len_object(conn_krb5->key_len); + + /* struct RXGK_AuthName { + * afs_int32 kind; + * opaque data<AUTHDATAMAX>; + * opaque display<AUTHPRINTABLEMAX>; + * }; + */ + uuidsize = sizeof(server->uuid); + authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0); + + /* struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ + toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize); + + offset = 0; + encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset); + + /* struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + */ + contsize = 4 + 4 + xdr_len_object(encsize); + + /* struct YFSAppData { + * opr_uuid initiatorUuid; + * opr_uuid acceptorUuid; + * Capabilities caps; + * 
afs_int32 enctype; + * opaque callbackKey<>; + * opaque callbackToken<>; + * }; + */ + adatasize = 16 + 16 + + xdr_len_object(sizeof(caps)) + + 4 + + xdr_len_object(conn_krb5->key_len) + + xdr_len_object(contsize); + + ret = -ENOMEM; + appdata = kzalloc(adatasize, GFP_KERNEL); + if (!appdata) + goto out; + xdr = appdata; + + memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */ + xdr += 16 / 4; + memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */ + xdr += 16 / 4; + *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */ + memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */ + xdr += ARRAY_SIZE(caps); + *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */ + + *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */ + K0 = xdr; + get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(contsize); /* appdata.callbackToken.len */ + *xdr++ = htonl(1); /* cont.kvno */ + *xdr++ = htonl(token_krb5->etype); /* cont.enctype */ + *xdr++ = htonl(encsize); /* cont.encrypted_token.len */ + + encbase = xdr; + xdr += offset / 4; + *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */ + *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */ + memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */ + *xdr++ = htonl(0); /* token.starttime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(0); /* token.lifetime */ + *xdr++ = htonl(0); /* token.bytelife */ + *xdr++ = htonl(0); /* token.expirationtime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(1); /* token.identities.count */ + *xdr++ = htonl(0); /* token.identities[0].kind */ + *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */ + memcpy(xdr, &server->uuid, uuidsize); + xdr += xdr_round_up(uuidsize) / 4; + *xdr++ = htonl(0); /* token.identities[0].display.len */ + + xdr = encbase + 
xdr_round_up(encsize); + + if ((unsigned long)xdr - (unsigned long)appdata != adatasize) + pr_err("Appdata size incorrect %lx != %zx\n", + (unsigned long)xdr - (unsigned long)appdata, adatasize); + + aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN, + GFP_KERNEL); + if (IS_ERR(aead)) { + ret = PTR_ERR(aead); + goto out_token; + } + + sg_init_one(&sg, encbase, encsize); + ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false); + if (ret < 0) + goto out_aead; + + server->cm_rxgk_appdata.len = adatasize; + server->cm_rxgk_appdata.data = appdata; + appdata = NULL; + +out_aead: + crypto_free_aead(aead); +out_token: + kfree(appdata); +out: + return ret; +} +#endif /* CONFIG_RXGK */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 440b0e731093..1124ea4000cb 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -20,6 +20,7 @@ #include <linux/uuid.h> #include <linux/mm_types.h> #include <linux/dns_resolver.h> +#include <crypto/krb5.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> @@ -176,8 +177,10 @@ struct afs_call { bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ @@ -281,6 +284,7 @@ struct afs_net { struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; @@ -305,6 +309,7 @@ struct afs_net { struct list_head fs_probe_slow; /* List of afs_server to probe 
at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ + struct key *fs_cm_token_key; /* Key for creating CM tokens */ struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -540,6 +545,8 @@ struct afs_server { struct list_head volumes; /* RCU list of afs_server_entry objects */ struct work_struct destroyer; /* Work item to try and destroy a server */ struct timer_list timer; /* Management timer */ + struct mutex cm_token_lock; /* Lock governing creation of appdata */ + struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ @@ -1059,6 +1066,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *); extern bool afs_cm_incoming_call(struct afs_call *); /* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); +#ifdef CONFIG_RXGK +int afs_create_token_key(struct afs_net *net, struct socket *socket); +#else +static inline int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + return 0; +} +#endif + +/* * dir.c */ extern const struct file_operations afs_dir_file_operations; diff --git a/fs/afs/main.c b/fs/afs/main.c index c845c5daaeba..02475d415d88 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -73,6 +73,7 @@ static int __net_init afs_net_init(struct net *net_ns) generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index b8180bf2281f..8f2b3a177690 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> +#include <crypto/krb5.h> #include "internal.h" #include "afs_fs.h" #include "protocol_uae.h" @@ 
-103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_BADETYPE + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + case RXGEN_OPCODE: return -ENOTSUPP; default: return -EREMOTEIO; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5e480a33859..c1cadf8fb346 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -24,8 +24,17 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, +}; + /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name 
= "CB.xxxx", @@ -49,6 +58,7 @@ int afs_open_socket(struct afs_net *net) goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); @@ -64,6 +74,14 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + + ret = afs_create_token_key(net, socket); + if (ret < 0) + pr_err("Couldn't create RxGK CM key: %d\n", ret); + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -84,8 +102,7 @@ int afs_open_socket(struct afs_net *net) * it sends back to us. */ - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, - afs_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -125,7 +142,9 @@ void afs_close_socket(struct afs_net *net) kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; sock_release(net->socket); + key_put(net->fs_cm_token_key); _debug("dework"); _leave(""); @@ -738,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, - afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -800,10 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + trace_afs_cb_call(call); call->work.func = call->type->work; - /* pass responsibility for the remainer of this message off to the + /* pass responsibility for the remainder of this message off to the * cache manager op */ return call->type->deliver(call); } @@ -952,3 +974,13 @@ noinline int 
afs_protocol_error(struct afs_call *call, call->unmarshalling_error = true; return -EBADMSG; } + +/* + * Wake up OOB notification processing. + */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); +} diff --git a/fs/afs/server.c b/fs/afs/server.c index 8755f2703815..a97562f831eb 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -131,6 +131,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t * timer_setup(&server->timer, afs_server_timer, 0); INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); + mutex_init(&server->cm_token_lock); INIT_LIST_HEAD(&server->probe_link); INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); @@ -396,6 +397,7 @@ static void afs_server_rcu(struct rcu_head *rcu) afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), afs_estate_trace_put_server); afs_put_cell(server->cell, afs_cell_trace_put_server); + kfree(server->cm_rxgk_appdata.data); kfree(server); } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index d6dd12d74d4f..f57f9f4774e6 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -122,7 +122,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, __set_current_state(TASK_RUNNING); timer_delete_sync(&wait.cpu_timer); - destroy_timer_on_stack(&wait.cpu_timer); + timer_destroy_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e43f6280f954..849199768664 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2189,7 +2189,6 @@ retry: done = 1; break; } - free_extent_buffer(eb); continue; } diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..b46165df5a91 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -3,7 +3,6 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" depends on INET depends on SYSFS && CONFIGFS_FS && 
(IPV6 || IPV6=n) - select IP_SCTP help A general purpose distributed lock manager for kernel or userspace applications. diff --git a/fs/dlm/config.c b/fs/dlm/config.c index cf9ba6fd7a28..a23fd524a6ee 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -197,6 +197,9 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) break; case 1: /* SCTP */ + if (!IS_ENABLED(CONFIG_IP_SCTP)) + return -EOPNOTSUPP; + break; default: return -EINVAL; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 70abd4da17a6..e4373bce1bc2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -160,6 +160,7 @@ struct dlm_proto_ops { bool try_new_addr; const char *name; int proto; + int how; void (*sockopts)(struct socket *sock); int (*bind)(struct socket *sock); @@ -533,7 +534,7 @@ static void lowcomms_state_change(struct sock *sk) /* SCTP layer is not calling sk_data_ready when the connection * is done, so we catch the signal through here. */ - if (sk->sk_shutdown == RCV_SHUTDOWN) + if (sk->sk_shutdown & RCV_SHUTDOWN) lowcomms_data_ready(sk); } @@ -810,7 +811,7 @@ static void shutdown_connection(struct connection *con, bool and_other) return; } - ret = kernel_sock_shutdown(con->sock, SHUT_WR); + ret = kernel_sock_shutdown(con->sock, dlm_proto_ops->how); up_read(&con->sock_lock); if (ret) { log_print("Connection %p failed to shutdown: %d will force close", @@ -1858,6 +1859,7 @@ static int dlm_tcp_listen_bind(struct socket *sock) static const struct dlm_proto_ops dlm_tcp_ops = { .name = "TCP", .proto = IPPROTO_TCP, + .how = SHUT_WR, .sockopts = dlm_tcp_sockopts, .bind = dlm_tcp_bind, .listen_validate = dlm_tcp_listen_validate, @@ -1896,6 +1898,7 @@ static void dlm_sctp_sockopts(struct socket *sock) static const struct dlm_proto_ops dlm_sctp_ops = { .name = "SCTP", .proto = IPPROTO_SCTP, + .how = SHUT_RDWR, .try_new_addr = true, .sockopts = dlm_sctp_sockopts, .bind = dlm_sctp_bind, diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index d47896a89596..1729bf42eb51 100644 --- 
a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -801,4 +801,5 @@ load_default: void exfat_free_upcase_table(struct exfat_sb_info *sbi) { kvfree(sbi->vol_utbl); + sbi->vol_utbl = NULL; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8465033a6cf0..7ed858937d45 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -36,31 +36,12 @@ static void exfat_put_super(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); mutex_lock(&sbi->s_lock); + exfat_clear_volume_dirty(sb); exfat_free_bitmap(sbi); brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); } -static int exfat_sync_fs(struct super_block *sb, int wait) -{ - struct exfat_sb_info *sbi = EXFAT_SB(sb); - int err = 0; - - if (unlikely(exfat_forced_shutdown(sb))) - return 0; - - if (!wait) - return 0; - - /* If there are some dirty buffers in the bdev inode */ - mutex_lock(&sbi->s_lock); - sync_blockdev(sb->s_bdev); - if (exfat_clear_volume_dirty(sb)) - err = -EIO; - mutex_unlock(&sbi->s_lock); - return err; -} - static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -219,7 +200,6 @@ static const struct super_operations exfat_sops = { .write_inode = exfat_write_inode, .evict_inode = exfat_evict_inode, .put_super = exfat_put_super, - .sync_fs = exfat_sync_fs, .statfs = exfat_statfs, .show_options = exfat_show_options, .shutdown = exfat_shutdown, @@ -751,10 +731,14 @@ static void exfat_free(struct fs_context *fc) static int exfat_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; fc->sb_flags |= SB_NODIRATIME; - /* volume flag will be updated in exfat_sync_fs */ - sync_filesystem(fc->root->d_sb); + sync_filesystem(sb); + mutex_lock(&EXFAT_SB(sb)->s_lock); + exfat_clear_volume_dirty(sb); + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return 0; } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index a4dbaccee6e7..87760fabdd2e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -30,7 +30,7 @@ int 
ext4_inode_bitmap_csum_verify(struct super_block *sb, sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); @@ -52,7 +52,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); @@ -71,7 +71,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); @@ -92,7 +92,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, if (!ext4_has_feature_metadata_csum(sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5a20e9cd7184..18373de980f2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -256,9 +256,19 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED 
BIT(BH_Delay) +/* + * This is for use in ext4_map_query_blocks() for a special case where we can + * have a physically and logically contiguous blocks split across two leaf + * nodes instead of a single extent. This is required in case of atomic writes + * to know whether the returned extent is last in leaf. If yes, then lookup for + * next in leaf block in ext4_map_query_blocks_next_in_leaf(). + * - This is never going to be added to any buffer head state. + * - We use the next available bit after BH_BITMAP_UPTODATE. + */ +#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_DELAYED) + EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -706,9 +716,6 @@ enum { #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) - /* Convert extent to initialized after IO complete */ -#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -720,11 +727,23 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) - /* Caller will submit data before dropping transaction handle. This - * allows jbd2 to avoid submitting data before commit. */ + /* Caller is in the context of data submission, such as writeback, + * fsync, etc. Especially, in the generic writeback path, caller will + * submit data before dropping transaction handle. This allows jbd2 + * to avoid submitting data before commit. 
*/ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ + EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 +/* + * Atomic write caller needs this to query in the slow path of mixed mapping + * case, when a contiguous extent can be split across two adjacent leaf nodes. + * Look EXT4_MAP_QUERY_LAST_IN_LEAF. + */ +#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the @@ -738,6 +757,13 @@ enum { #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 +/* + * ext4_map_query_blocks() uses this filter mask to filter the flags needed to + * pass while lookup/querying of on disk extent tree. + */ +#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ + EXT4_EX_NOFAIL |\ + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks @@ -1061,16 +1087,16 @@ struct ext4_inode_info { /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; - /* Number of ongoing updates on this inode */ - atomic_t i_fc_updates; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; - /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ - struct mutex i_fc_lock; + /* + * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len + * and inode's EXT4_FC_STATE_COMMITTING state bit. + */ + spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not @@ -1754,7 +1780,7 @@ struct ext4_sb_info { * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. 
*/ - spinlock_t s_fc_lock; + struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; @@ -1913,6 +1939,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; @@ -2295,10 +2322,12 @@ static inline int ext4_emergency_state(struct super_block *sb) #define EXT4_DEFM_NODELALLOC 0x0800 /* - * Default journal batch times + * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + /* * Default values for superblock update @@ -2487,8 +2516,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH -static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } @@ -2922,8 +2950,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); -void ext4_fc_start_update(struct inode *inode); -void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); @@ -2973,6 +2999,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr) void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, 
struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); +void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, @@ -2993,6 +3020,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +bool ext4_should_enable_large_folio(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -3039,6 +3067,8 @@ extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); @@ -3050,6 +3080,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +static inline bool is_special_ino(struct super_block *sb, unsigned long ino) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum); +} + /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3119,8 +3160,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, 
bool wa extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); -extern __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es); +extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); @@ -3378,6 +3418,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) return 1 << sbi->s_log_groups_per_flex; } +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -3710,6 +3757,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, + struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, @@ -3847,7 +3896,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) static inline bool ext4_inode_can_atomic_write(struct inode *inode) { - return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; + return S_ISREG(inode->i_mode) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 135e278c832e..b3e9b7bd7978 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,7 
+16,8 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { + !test_opt(inode->i_sb, DELALLOC) && + !mapping_large_folio_support(inode->i_mapping))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 3221714d9901..63d17c5201b5 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -319,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, revoke_creds, 0); } -static inline int ext4_journal_blocks_per_page(struct inode *inode) +static inline int ext4_journal_blocks_per_folio(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) - return jbd2_journal_blocks_per_page(inode); + return jbd2_journal_blocks_per_folio(inode); return 0; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c616a16a9f36..b543a46fc809 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } @@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ + ext4_check_map_extents_env(inode); + down_read(&ei->i_data_sem); depth = ext_depth(inode); @@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode, static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, 
ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent *ret_ex) + struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1604,7 +1605,8 @@ got_index: ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, + flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -1612,7 +1614,7 @@ got_index: put_bh(bh); } - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; - depth = ext_depth(inode); - + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. 
+ */ if (extents <= 1) - index = depth * 2; + index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else - index = depth * 3; + index = (EXT4_MAX_EXTENT_DEPTH * 3) + + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } @@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; @@ -2851,8 +2856,7 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, - EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2918,7 +2922,7 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - NULL); + NULL, flags); if (err < 0) goto out; if (pblk) { @@ -2994,8 +2998,7 @@ again: i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, - depth - i - 1, - EXT4_EX_NOCACHE); + depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? 
*/ err = PTR_ERR(bh); @@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, + &ex2, flags); if (err < 0) goto out; @@ -4433,6 +4437,20 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: + /* + * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. + * So we know that the depth used here is correct, since there was no + * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. + * If tomorrow we start using this QUERY flag with CREATE, then we will + * need to re-calculate the depth as it might have changed due to block + * allocation. + */ + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { + WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); + if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) + map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + } + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, @@ -4781,6 +4799,93 @@ out_inode_lock: } /* + * This function converts a range of blocks to written extents. The caller of + * this function will pass the start offset and the size. all unwritten extents + * within this range will be converted to written extents. + * + * This function is called from the direct IO end io call back function for + * atomic writes, to convert the unwritten extents after IO is completed. 
+ * + * Note that the requirement for atomic writes is that all conversion should + * happen atomically in a single fs journal transaction. We mainly only allocate + * unwritten extents either on a hole or on a pre-existing unwritten extent range in + * ext4_map_blocks_atomic_write(). The only case where we can have multiple + * unwritten extents in a range [offset, offset+len) is when there is a split + * unwritten extent between two leaf nodes which was cached in extent status + * cache during ext4_iomap_alloc() time. That will allow + * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going + * into the slow path. That means we might need a loop for conversion of this + * unwritten extent split across leaf block within a single journal transaction. + * Split extents across leaf nodes is a rare case, but let's still handle that + * to meet the requirements of multi-fsblock atomic writes. + * + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0, ret2 = 0, ret3 = 0; + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; + int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; + + map.m_lblk = offset >> blkbits; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + + if (!handle) { + /* + * TODO: An optimization can be added later by having an extent + * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that + * it can tell if the extent in the cache is a split extent. + * But for now let's assume pextents as 2 always. 
+ */ + credits = ext4_meta_trans_blocks(inode, max_blocks, 2); + } + + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } + + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret != max_blocks) + ext4_msg(inode->i_sb, KERN_INFO, + "inode #%lu: block %u: len %u: " + "split block mapping found for atomic write, " + "ret = %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + if (ret <= 0) + break; + } + + ret2 = ext4_mark_inode_dirty(handle, inode); + + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } + + if (ret <= 0 || ret2) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "returned %d or %d", + inode->i_ino, map.m_lblk, + map.m_len, ret, ret2); + + return ret > 0 ? ret2 : ret; +} + +/* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to @@ -4819,8 +4924,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, break; } } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. 
+ */ ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " @@ -4931,12 +5042,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { - u64 maxbytes; - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - maxbytes = inode->i_sb->s_maxbytes; - else - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; @@ -4956,10 +5062,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int error = 0; + inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) - return error; + goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } @@ -4970,15 +5077,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) - return error; + goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - return iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } - - return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); +unlock: + inode_unlock_shared(inode); + return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -4999,7 +5110,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + inode_lock_shared(inode); error = ext4_ext_precache(inode); + inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; @@ -5328,6 +5441,8 @@ static int 
ext4_collapse_range(struct file *file, loff_t offset, loff_t len) start_lblk = offset >> inode->i_blkbits; end_lblk = (offset + len) >> inode->i_blkbits; + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); @@ -5429,6 +5544,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) start_lblk = offset >> inode->i_blkbits; len_lblk = len >> inode->i_blkbits; + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d1401d4a5513..31dc0496f8d0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -120,9 +120,40 @@ * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * + * ========================================================================== + * 3. Assurance of Ext4 extent status tree consistency + * + * When mapping blocks, Ext4 queries the extent status tree first and should + * always trust that the extent status tree is consistent and up to date. + * Therefore, it is important to adhere to the following rules when creating, + * modifying and removing extents. + * + * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, + * the extent information should always be processed through the extent + * status tree instead of being organized manually through the on-disk + * extent tree. + * + * 2. When updating the extent tree, Ext4 should acquire the i_data_sem + * exclusively and update the extent status tree atomically. If the extents + * to be modified are large enough to exceed the range that a single + * i_data_sem can process (as ext4_datasem_ensure_credits() may drop + * i_data_sem to restart a transaction), it must (e.g. 
as ext4_punch_hole() + * does): + * + * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against page faults, as well as reads and writes that may + * concurrently modify the extent status tree. + * b) Evict all page cache in the affected range and recommend rebuilding + * or dropping the extent status tree after modifying the on-disk + * extent tree. This ensures exclusion against concurrent writebacks + * that do not hold those locks but only hold a folio lock. + * + * 3. Based on the rules above, when querying block mappings, Ext4 should at + * least hold the i_rwsem or invalidate_lock or folio lock(s) for the + * specified querying range. * * ========================================================================== - * 3. Performance analysis + * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are @@ -134,7 +165,7 @@ * * * ========================================================================== - * 4. TODO list + * 5. TODO list * * -- Refactor delayed space reservation * diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index da4263a14a20..42bee1d4f9f9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -12,6 +12,7 @@ #include "ext4_extents.h" #include "mballoc.h" +#include <linux/lockdep.h> /* * Ext4 Fast Commits * ----------------- @@ -49,19 +50,27 @@ * that need to be committed during a fast commit in another in memory queue of * inodes. 
During the commit operation, we commit in the following order: * - * [1] Lock inodes for any further data updates by setting COMMITTING state - * [2] Submit data buffers of all the inodes - * [3] Wait for [2] to complete - * [4] Commit all the directory entry updates in the fast commit space - * [5] Commit all the changed inode structures - * [6] Write tail tag (this tag ensures the atomicity, please read the following + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be + * deleted while it is being flushed. + * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" + * state. + * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that + * all the existing handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit + * by setting "EXT4_STATE_FC_COMMITTING" state. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows + * starting of new handles. If new handles try to start an update on + * any of the inodes that are being committed, ext4_fc_track_inode() + * will block until those inodes have finished the fast commit. + * [6] Commit all the directory entry updates in the fast commit space. + * [7] Commit all the changed inodes in the fast commit space and clear + * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). - * [7] Wait for [4], [5] and [6] to complete. * - * All the inode updates must call ext4_fc_start_update() before starting an - * update. If such an ongoing update is present, fast commit waits for it to - * complete. The completion of such an update is marked by - * ext4_fc_stop_update(). + * All the inode updates must be enclosed within jbd2_journal_start() + * and jbd2_journal_stop() similar to JBD2 journaling. 
* * Fast Commit Ineligibility * ------------------------- @@ -142,6 +151,13 @@ * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * + * Locking + * ------- + * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit + * dentry queue. ei->i_fc_lock protects the fast commit related info in a given + * inode. Most of the code avoids acquiring both the locks, but if one must do + * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. + * * TODOs * ----- * @@ -156,13 +172,12 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Fast commit's commit path locks the entire file system during fast - * commit. This has significant performance penalty. Instead of that, we - * should use ext4_fc_start/stop_update functions to start inode level - * updates from ext4_journal_start/stop. Once we do that we can drop file - * system locking during commit path. + * 1) Handle more ineligible cases. * - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent + * status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory. */ #include <trace/events/ext4.h> @@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); -} - -/* This function must be called with sbi->s_fc_lock held. 
*/ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) -{ - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) @@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb) } /* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); -} - -/* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. 
*/ @@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; -restart: - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. 
+ */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + mutex_unlock(&sbi->s_fc_lock); + schedule(); + mutex_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. 
*/ if (list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -320,12 +298,10 @@ restart: list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -385,7 +361,7 @@ static int ext4_fc_track_template( int ret; tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { @@ -393,19 +369,18 @@ static int ext4_fc_track_template( ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 
&sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); - spin_lock(&sbi->s_fc_lock); + INIT_LIST_HEAD(&node->fcd_list); + mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + mutex_unlock(&sbi->s_fc_lock); + spin_lock(&ei->i_fc_lock); return 0; } @@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + /* + * If we come here, we may sleep while waiting 
for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. + */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. + */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } @@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); - *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); - crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); @@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + 
spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", @@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; @@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. 
*/ +static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; - spin_unlock(&sbi->s_fc_lock); - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if 
(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - spin_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the - * corresponding inode pointer + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. */ - WARN_ON(list_empty(&fc_dentry->fcd_dilist)); + if (list_empty(&fc_dentry->fcd_dilist)) + continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - spin_unlock(&sbi->s_fc_lock); - /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first @@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - spin_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - spin_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) @@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. 
+ */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + mutex_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + mutex_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. 
+ */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + mutex_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal) } } - spin_lock(&sbi->s_fc_lock); + /* Step 6.2: Now write all the dentry updates. */ + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - spin_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. 
*/ ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } @@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); @@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); + old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1228,6 +1241,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. + */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1242,6 +1264,7 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time @@ -1251,6 +1274,7 @@ restart_fc: return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; @@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) @@ -1273,14 
+1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, iter->i_sync_tid)) { - ext4_fc_reset_inode(&iter->vfs_inode); + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been @@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ - list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, + list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
+ */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } @@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - spin_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } @@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); @@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: diff --git a/fs/ext4/file.c b/fs/ext4/file.c index beb078ee4811..21df81347147 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - if 
(!error && size && flags & IOMAP_DIO_UNWRITTEN) + + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; @@ -929,12 +934,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; + loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e7ecc7c8a729..79aa3df8d019 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1288,10 +1288,9 @@ got: __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -1336,6 +1335,9 @@ got: } } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2c9b762925c7..a1bbcdf40824 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -397,7 +397,7 @@ out: } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) + loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); @@ -601,6 +601,7 @@ retry: 
goto out; } + ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cdf01e60fa6d..be9a4cba35fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -58,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -142,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. 
@@ -416,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. + */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -462,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map) + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + unsigned int orig_mlen = map->m_len; + flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; @@ -484,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, WARN_ON(1); } - status = map->m_flags & 
EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); - return retval; + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -602,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -622,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. 
+ */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { @@ -653,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -667,7 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - retval = ext4_map_query_blocks(handle, inode, map); + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -696,6 +801,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; + + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -1009,7 +1116,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } @@ -1027,7 +1139,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; 
unsigned block_start, block_end; @@ -1041,8 +1153,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); @@ -1152,6 +1263,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; + fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -1164,8 +1276,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, @@ -1184,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. 
@@ -1765,6 +1883,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, @@ -1805,7 +1925,7 @@ found: if (ext4_has_inline_data(inode)) retval = 0; else - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; @@ -1828,7 +1948,7 @@ add_delayed: goto found; } } else if (!ext4_has_inline_data(inode)) { - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; @@ -1936,7 +2056,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2159,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2170,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2212,11 +2332,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. 
So use reserved blocks to allocate metadata if - * possible. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -2355,7 +2479,7 @@ update_disksize: */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); @@ -2391,7 +2515,7 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) @@ -2433,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -2920,6 +3044,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; + fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -2945,11 +3070,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = 
__filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -3038,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, unsigned long end; i_size_write(inode, new_i_size); - end = (new_i_size - 1) & (PAGE_SIZE - 1); + end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; @@ -3340,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, } } +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. We use + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapped mapping. This will ensure any unwritten or hole + * regions within the requested range is zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. + */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. 
+ */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in above loop, so we need to query the + * start of the physical extent, based on the origin m_lblk and m_len. + * Let's also ensure we were able to allocate the required range for + * mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying + * physical extent for the requested range does not have a single contiguous + * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. + * In that case we will loop over the requested range to allocate and zero out + * the unwritten / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). 
Note that this is only possible because we know + * this can be called only with bigalloc enabled filesystem where the underlying + * cluster is already allocated. This avoids allocating discontiguous extents + * in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly non-performance critical path, so it should be ok to + * loop using ext4_map_blocks() with appropriate flags to allocate & zero the + * underlying short holes/unwritten extents within the requested range. + */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. + */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at @@ -3353,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + + /* + * journal credits estimation for atomic writes. 
We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3384,7 +3673,11 @@ retry: else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3398,6 +3691,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. 
+ */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3408,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3421,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3431,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3449,6 +3772,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. 
+ */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3690,9 +4023,7 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3707,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -4006,7 +4338,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; - loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; @@ -4015,14 +4347,20 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); + /* + * For indirect-block based inodes, make sure that the hole within + * one block before last range. 
+ */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) + if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after - * the page that contains i_size, and also make sure that the hole - * within one block before last range. + * the page that contains i_size. */ if (end > inode->i_size) end = round_up(inode->i_size, PAGE_SIZE); @@ -4072,6 +4410,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -4224,8 +4564,10 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; - down_write(&EXT4_I(inode)->i_data_sem); + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4760,10 +5102,27 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, return 0; error: - ext4_error_inode(inode, function, line, 0, err_str); + ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } +bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -4781,12 
+5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) @@ -4845,10 +5199,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || @@ -4916,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -5086,6 +5440,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, @@ -5564,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = 
EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5577,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; @@ -5773,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5782,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and lead blocks need to touch to map @lblocks + * logical blocks to @pextents physical extents? 
*/ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks + pextents; + groups = idxblocks; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -5801,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); @@ -5821,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_writepage_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); @@ -5895,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d17207386ead..5668a17458ae 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -143,7 +143,7 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); if (ext4_has_feature_metadata_csum(sb) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); @@ -151,7 +151,7 @@ static int ext4_update_backup_sb(struct super_block *sb, } func(es, arg); if (ext4_has_feature_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -354,8 +354,8 @@ void 
ext4_reset_inode_seed(struct inode *inode) if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* @@ -1505,8 +1505,14 @@ resizefs_out: return 0; } case EXT4_IOC_PRECACHE_EXTENTS: - return ext4_ext_precache(inode); + { + int ret; + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 3e26464b1425..51661570cf3b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -14,7 +14,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 48649be64d6a..1f8493a56e8f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; - int from = data_offset_in_page << orig_inode->i_blkbits; + int from; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; @@ -323,11 +323,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. 
*/ - - VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); - VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); - VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); - if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -360,6 +355,8 @@ again: goto unlock_folios; } data_copy: + from = offset_in_folio(folio[0], + orig_blk_offset << orig_inode->i_blkbits); *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; @@ -390,7 +387,7 @@ data_copy: if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); - for (i = 0; i < data_offset_in_page; i++) + for (i = 0; i < from >> orig_inode->i_blkbits; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e9712e64ec8f..a178ac229489 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -346,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } @@ -442,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; @@ -450,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * 
sizeof(struct dx_entry)); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index c66e0cb29bd4..7c7f792ad6ab 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -541,9 +541,9 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, return 1; ot = ext4_orphan_block_tail(sb, bh); - calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } @@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, - inodes_per_ob * sizeof(__u32)); + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5d3a9dc9a32d..f329daf6e5c7 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode, int length; unsigned relative_block = 0; struct ext4_map_blocks map; 
- unsigned int nr_pages = rac ? readahead_count(rac) : 1; + unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - for (; nr_pages; nr_pages--) { + nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio); + for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; - unsigned first_hole = blocks_per_page; + unsigned int first_hole; + unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; + blocks_per_folio = folio_size(folio) >> blkbits; + first_hole = blocks_per_folio; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; @@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode, map.m_flags &= ~EXT4_MAP_MAPPED; break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode, * Then do more ext4_map_blocks() calls until we are * done with this folio. */ - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; @@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode, } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ @@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode, /* needed? 
*/ map.m_flags &= ~EXT4_MAP_MAPPED; break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } - if (first_hole != blocks_per_page) { + if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { @@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || - (first_hole != blocks_per_page)) { + (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else - last_block_in_bio = first_block + blocks_per_page - 1; + last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b7ff0d955f0d..050f26168d97 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1119,7 +1119,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, es->s_block_group_nr = cpu_to_le16(group); if (ext4_has_feature_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 181934499624..a7f80ca01174 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -286,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -__le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct ext4_super_block *es) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; - csum = ext4_chksum(sbi, ~0, (char *)es, offset); + csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } @@ -304,7 +302,7 @@ static int ext4_superblock_csum_verify(struct super_block *sb, if (!ext4_has_feature_metadata_csum(sb)) return 1; - return es->s_checksum == ext4_superblock_csum(sb, es); 
+ return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) @@ -314,7 +312,7 @@ void ext4_superblock_csum_set(struct super_block *sb) if (!ext4_has_feature_metadata_csum(sb)) return; - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -508,21 +506,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) ext4_maybe_update_superblock(sb); } -/* - * This writepage callback for write_cache_pages() - * takes care of a few cases after page cleaning. - * - * write_cache_pages() already checks for dirty pages - * and calls clear_page_dirty_for_io(), which we want, - * to write protect the pages. - * - * However, we may have to redirty a page (see below.) - */ -static int ext4_journalled_writepage_callback(struct folio *folio, - struct writeback_control *wbc, - void *data) +static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, + struct folio *folio) { - transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; @@ -543,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != transaction || - jh->b_next_transaction))) { - folio_redirty_for_writepage(wbc, folio); - goto out; - } + (jh && (jh->b_transaction != jinode->i_transaction || + jh->b_next_transaction))) + return true; } while ((bh = bh->b_this_page) != head); -out: - return AOP_WRITEPAGE_ACTIVATE; + return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -563,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; + struct folio *folio = NULL; + int error; - return write_cache_pages(mapping, &wbc, - 
ext4_journalled_writepage_callback, - jinode->i_transaction); + /* + * writeback_iter() already checks for dirty pages and calls + * folio_clear_dirty_for_io(), which we want to write protect the + * folios. + * + * However, we may have to redirty a folio sometimes. + */ + while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { + if (ext4_journalled_writepage_needs_redirty(jinode, folio)) + folio_redirty_for_writepage(&wbc, folio); + folio_unlock(folio); + } + + return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -1415,7 +1411,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); - mutex_init(&ei->i_fc_lock); + spin_lock_init(&ei->i_fc_lock); return &ei->vfs_inode; } @@ -1809,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { {} }; -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 @@ -3209,14 +3204,14 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __u32 csum32; __u16 dummy_csum = 0; - csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; @@ -4441,13 +4436,16 @@ static int ext4_handle_clustersize(struct super_block *sb) /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. 
+ * With non-bigalloc filesystem awu will be based upon filesystem blocksize + * & bdev awu units. + * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. * @sb: super block - * TODO: Later add support for bigalloc */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; + unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); if (!bdev_can_atomic_write(bdev)) return; @@ -4457,7 +4455,7 @@ static void ext4_atomic_write_init(struct super_block *sb) sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); - sbi->s_awu_max = min(sb->s_blocksize, + sbi->s_awu_max = min(clustersize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { @@ -4482,7 +4480,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); + mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; @@ -4644,7 +4642,7 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_feature_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } @@ -5255,7 +5253,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -5916,7 
+5914,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; @@ -6495,7 +6493,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7ab8f2e8e815..8d15acbacc20 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(csum, (__u8 *)hdr, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); @@ -348,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { - return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); + return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1c7c49356878..7203d2d2624d 100644 --- a/fs/jbd2/commit.c +++ 
b/fs/jbd2/commit.c @@ -99,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) h->h_chksum_type = 0; h->h_chksum_size = 0; h->h_chksum[0] = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); h->h_chksum[0] = cpu_to_be32(csum); } @@ -330,8 +330,8 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, seq = cpu_to_be32(sequence); addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, addr, bh->b_size); kunmap_local(addr); if (jbd2_has_feature_csum3(j)) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 743a1d7633cd..6d5e76848733 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(jbd2_log_wait_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); -EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_blocks_per_folio); EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); @@ -115,14 +115,14 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +static __be32 jbd2_superblock_csum(journal_superblock_t *sb) { __u32 csum; __be32 old_csum; old_csum = sb->s_checksum; sb->s_checksum = 0; - csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t)); sb->s_checksum = old_csum; return cpu_to_be32(csum); @@ -728,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= 
JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); - jbd2_journal_lock_updates(journal); return 0; } @@ -742,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); - jbd2_journal_unlock_updates(journal); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) @@ -1002,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } @@ -1386,7 +1384,7 @@ static int journal_check_superblock(journal_t *journal) } /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + if (sb->s_checksum != jbd2_superblock_csum(sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; return err; @@ -1492,7 +1490,7 @@ static int journal_load_superblock(journal_t *journal) journal->j_total_len = be32_to_cpu(sb->s_maxlen); /* Precompute checksum seed for all metadata */ if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); /* After journal features are set, we can compute transaction limits */ jbd2_journal_init_transaction_limits(journal); @@ -1821,7 +1819,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) set_buffer_uptodate(bh); } if (jbd2_journal_has_csum_v2or3(journal)) - sb->s_checksum = jbd2_superblock_csum(journal, sb); + sb->s_checksum = jbd2_superblock_csum(sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); @@ -2338,7 +2336,7 
@@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); } @@ -2657,9 +2655,10 @@ void jbd2_journal_ack_err(journal_t *journal) write_unlock(&journal->j_state_lock); } -int jbd2_journal_blocks_per_page(struct inode *inode) +int jbd2_journal_blocks_per_folio(struct inode *inode) { - return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) - + inode->i_sb->s_blocksize_bits); } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index c271a050b7e6..cac8c2cd4a92 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -185,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; return provided == cpu_to_be32(calculated); @@ -440,7 +440,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) h = buf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; return provided == cpu_to_be32(calculated); @@ -461,7 +461,7 @@ static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf) h = tmpbuf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize); kfree(tmpbuf); return provided == cpu_to_be32(calculated); @@ -478,8 +478,8 @@ static int 
jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return 1; seq = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, buf, j->j_blocksize); if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cbc4785462f5..c7867139af69 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1509,7 +1509,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction == transaction); spin_unlock(&jh->b_state_lock); } - if (jh->b_modified == 1) { + if (data_race(jh->b_modified == 1)) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { @@ -1528,7 +1528,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out; } - journal = transaction->t_journal; spin_lock(&jh->b_state_lock); if (is_handle_aborted(handle)) { @@ -1543,6 +1542,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out_unlock_bh; } + journal = transaction->t_journal; + if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c index 5f4b305030ad..4b660296caf3 100644 --- a/fs/jfs/jfs_discard.c +++ b/fs/jfs/jfs_discard.c @@ -86,7 +86,8 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) down_read(&sb->s_umount); bmp = JFS_SBI(ip->i_sb)->bmap; - if (minlen > bmp->db_agsize || + if (bmp == NULL || + minlen > bmp->db_agsize || start >= bmp->db_mapsize || range->len < sb->s_blocksize) { up_read(&sb->s_umount); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 26e89d0c69b6..35e063c9f3a4 100644 --- a/fs/jfs/jfs_dmap.c +++ 
b/fs/jfs/jfs_dmap.c @@ -194,7 +194,11 @@ int dbMount(struct inode *ipbmap) !bmp->db_numag || (bmp->db_numag > MAXAG) || (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) || (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) || - !bmp->db_agwidth || + (bmp->db_agheight < 0) || (bmp->db_agheight > (L2LPERCTL >> 1)) || + (bmp->db_agwidth < 1) || (bmp->db_agwidth > (LPERCTL / MAXAG)) || + (bmp->db_agwidth > (1 << (L2LPERCTL - (bmp->db_agheight << 1)))) || + (bmp->db_agstart < 0) || + (bmp->db_agstart > (CTLTREESIZE - 1 - bmp->db_agwidth * (MAXAG - 1))) || (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) || (bmp->db_agl2size < 0) || ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 93db6eec4465..ab11849cf9cc 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -2613,7 +2613,7 @@ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. 
*/ -static void add_missing_indices(struct inode *inode, s64 bn) +static int add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; @@ -2622,7 +2622,7 @@ static void add_missing_indices(struct inode *inode, s64 bn) struct lv *lv; struct metapage *mp; dtpage_t *p; - int rc; + int rc = 0; s8 *stbl; tid_t tid; struct tlock *tlck; @@ -2647,6 +2647,16 @@ static void add_missing_indices(struct inode *inode, s64 bn) stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { + if (stbl[i] < 0) { + jfs_err("jfs: add_missing_indices: Invalid stbl[%d] = %d for inode %ld, block = %lld", + i, stbl[i], (long)inode->i_ino, (long long)bn); + rc = -EIO; + + DT_PUTPAGE(mp); + txAbort(tid, 0); + goto end; + } + d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { @@ -2664,6 +2674,7 @@ static void add_missing_indices(struct inode *inode, s64 bn) (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); + return rc; } /* @@ -3017,7 +3028,8 @@ skip_one: } if (fix_page) { - add_missing_indices(ip, bn); + if ((rc = add_missing_indices(ip, bn))) + goto out; page_fixed = 1; } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 731a88f6313e..879e0b104d1c 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -77,8 +77,8 @@ config NFSD_V4 select FS_POSIX_ACL select RPCSEC_GSS_KRB5 select CRYPTO + select CRYPTO_LIB_SHA256 select CRYPTO_MD5 - select CRYPTO_SHA256 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 help diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2f687619f65b..55744bb786c9 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -24,6 +24,7 @@ nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o nfsd-$(CONFIG_NFS_LOCALIO) += localio.o +nfsd-$(CONFIG_DEBUG_FS) += debugfs.o .PHONY: xdrgen diff 
--git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c new file mode 100644 index 000000000000..84b0c8b559dc --- /dev/null +++ b/fs/nfsd/debugfs.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debugfs.h> + +#include "nfsd.h" + +static struct dentry *nfsd_top_dir __read_mostly; + +/* + * /sys/kernel/debug/nfsd/disable-splice-read + * + * Contents: + * %0: NFS READ is allowed to use page splicing + * %1: NFS READ uses only iov iter read + * + * The default value of this setting is zero (page splicing is + * allowed). This setting takes immediate effect for all NFS + * versions, all exports, and in all NFSD net namespaces. + */ + +static int nfsd_dsr_get(void *data, u64 *val) +{ + *val = nfsd_disable_splice_read ? 1 : 0; + return 0; +} + +static int nfsd_dsr_set(void *data, u64 val) +{ + nfsd_disable_splice_read = (val > 0) ? true : false; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); + +void nfsd_debugfs_exit(void) +{ + debugfs_remove_recursive(nfsd_top_dir); + nfsd_top_dir = NULL; +} + +void nfsd_debugfs_init(void) +{ + nfsd_top_dir = debugfs_create_dir("nfsd", NULL); + + debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, + nfsd_top_dir, NULL, &nfsd_dsr_fops); +} diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 0363720280d4..88ae410b4113 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1124,7 +1124,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } - goto denied; + if (!may_bypass_gss) + goto denied; ok: /* legacy gss-only clients are always OK: */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index ac1731eb34ab..a817d8485d21 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -14,6 +14,7 @@ #include "xdr3.h" #include "vfs.h" #include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -69,8 +70,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp) struct 
nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: GETATTR(3) %s\n", - SVCFH_fmt(&argp->fh)); + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); resp->status = fh_verify(rqstp, &resp->fh, 0, @@ -220,7 +220,6 @@ nfsd3_proc_write(struct svc_rqst *rqstp) struct nfsd3_writeargs *argp = rqstp->rq_argp; struct nfsd3_writeres *resp = rqstp->rq_resp; unsigned long cnt = argp->len; - unsigned int nvecs; dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), @@ -235,10 +234,8 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nvecs = svc_fill_write_vector(rqstp, &argp->payload); - resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, nvecs, &cnt, + &argp->payload, &cnt, resp->committed, resp->verf); resp->count = cnt; resp->status = nfsd3_map_status(resp->status); @@ -266,6 +263,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 status; int host_err; + trace_nfsd_vfs_create(rqstp, fhp, S_IFREG, argp->name, argp->len); + if (isdotent(argp->name, argp->len)) return nfserr_exist; if (!(iap->ia_valid & ATTR_MODE)) @@ -382,11 +381,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp) struct nfsd3_diropres *resp = rqstp->rq_resp; svc_fh *dirfhp, *newfhp; - dprintk("nfsd: CREATE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - dirfhp = fh_copy(&resp->dirfh, &argp->fh); newfhp = fh_init(&resp->fh, NFS3_FHSIZE); @@ -407,11 +401,6 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp) .na_iattr = &argp->attrs, }; - dprintk("nfsd: MKDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - argp->attrs.ia_valid &= ~ATTR_SIZE; fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); @@ -447,11 +436,6 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) goto out; } - dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), - argp->flen, argp->fname, - argp->tlen, 
argp->tname); - fh_copy(&resp->dirfh, &argp->ffh); fh_init(&resp->fh, NFS3_FHSIZE); resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, @@ -476,11 +460,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) int type; dev_t rdev = 0; - dprintk("nfsd: MKNOD(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); @@ -513,11 +492,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp) struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: REMOVE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - /* Unlink. -S_IFDIR means file must not be a directory */ fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, @@ -535,11 +509,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp) struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: RMDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); @@ -553,15 +522,6 @@ nfsd3_proc_rename(struct svc_rqst *rqstp) struct nfsd3_renameargs *argp = rqstp->rq_argp; struct nfsd3_renameres *resp = rqstp->rq_resp; - dprintk("nfsd: RENAME(3) %s %.*s ->\n", - SVCFH_fmt(&argp->ffh), - argp->flen, - argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - fh_copy(&resp->ffh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, @@ -576,13 +536,6 @@ nfsd3_proc_link(struct svc_rqst *rqstp) struct nfsd3_linkargs *argp = rqstp->rq_argp; struct nfsd3_linkres *resp = rqstp->rq_resp; - dprintk("nfsd: LINK(3) %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - fh_copy(&resp->fh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); 
resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, @@ -621,9 +574,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd3_init_dirlist_pages(rqstp, resp, argp->count); @@ -655,9 +606,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) struct nfsd3_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd3_init_dirlist_pages(rqstp, resp, argp->count); @@ -698,9 +647,6 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_fsstatres *resp = rqstp->rq_resp; - dprintk("nfsd: FSSTAT(3) %s\n", - SVCFH_fmt(&argp->fh)); - resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); resp->status = nfsd3_map_status(resp->status); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ec6539cec0fe..ccb00aa93be0 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -417,6 +417,29 @@ static u32 highest_slotid(struct nfsd4_session *ses) return idx; } +static void +encode_referring_call4(struct xdr_stream *xdr, + const struct nfsd4_referring_call *rc) +{ + encode_uint32(xdr, rc->rc_sequenceid); + encode_uint32(xdr, rc->rc_slotid); +} + +static void +encode_referring_call_list4(struct xdr_stream *xdr, + const struct nfsd4_referring_call_list *rcl) +{ + struct nfsd4_referring_call *rc; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, rcl->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + encode_uint32(xdr, rcl->__nr_referring_calls); + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) + 
encode_referring_call4(xdr, rc); +} + /* * CB_SEQUENCE4args * @@ -434,6 +457,7 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + struct nfsd4_referring_call_list *rcl; __be32 *p; if (hdr->minorversion == 0) @@ -442,12 +466,16 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); encode_sessionid4(xdr, session); - p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + p = xdr_reserve_space(xdr, XDR_UNIT * 4); *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */ *p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */ *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */ *p++ = xdr_zero; /* csa_cachethis */ - xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + /* csa_referring_call_lists */ + encode_uint32(xdr, cb->cb_nr_referring_call_list); + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) + encode_referring_call_list4(xdr, rcl); hdr->nops++; } @@ -1320,10 +1348,102 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) nfsd41_cb_inflight_end(clp); } -/* - * TODO: cb_sequence should support referring call lists, cachethis, - * and mark callback channel down on communication errors. +/** + * nfsd41_cb_referring_call - add a referring call to a callback operation + * @cb: context of callback to add the rc to + * @sessionid: referring call's session ID + * @slotid: referring call's session slot index + * @seqno: referring call's slot sequence number + * + * Caller serializes access to @cb. + * + * NB: If memory allocation fails, the referring call is not added. 
*/ +void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + bool found; + + might_sleep(); + + found = false; + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) { + if (!memcmp(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + found = true; + break; + } + } + if (!found) { + rcl = kmalloc(sizeof(*rcl), GFP_KERNEL); + if (!rcl) + return; + memcpy(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN); + rcl->__nr_referring_calls = 0; + INIT_LIST_HEAD(&rcl->rcl_referring_calls); + list_add(&rcl->__list, &cb->cb_referring_call_list); + cb->cb_nr_referring_call_list++; + } + + found = false; + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) { + if (rc->rc_sequenceid == seqno && rc->rc_slotid == slotid) { + found = true; + break; + } + } + if (!found) { + rc = kmalloc(sizeof(*rc), GFP_KERNEL); + if (!rc) + goto out; + rc->rc_sequenceid = seqno; + rc->rc_slotid = slotid; + rcl->__nr_referring_calls++; + list_add(&rc->__list, &rcl->rcl_referring_calls); + } + +out: + if (!rcl->__nr_referring_calls) { + cb->cb_nr_referring_call_list--; + kfree(rcl); + } +} + +/** + * nfsd41_cb_destroy_referring_call_list - release referring call info + * @cb: context of a callback that has completed + * + * Callers who allocate referring calls using nfsd41_cb_referring_call() must + * release those resources by calling nfsd41_cb_destroy_referring_call_list. + * + * Caller serializes access to @cb. 
+ */ +void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + + while (!list_empty(&cb->cb_referring_call_list)) { + rcl = list_first_entry(&cb->cb_referring_call_list, + struct nfsd4_referring_call_list, + __list); + + while (!list_empty(&rcl->rcl_referring_calls)) { + rc = list_first_entry(&rcl->rcl_referring_calls, + struct nfsd4_referring_call, + __list); + list_del(&rc->__list); + kfree(rc); + } + list_del(&rcl->__list); + kfree(rcl); + } +} + static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; @@ -1643,6 +1763,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; cb->cb_held_slot = -1; + cb->cb_nr_referring_call_list = 0; + INIT_LIST_HEAD(&cb->cb_referring_call_list); } /** diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index fd560dcf6059..f13abbb13b38 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -878,6 +878,8 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_getattr *getattr = &u->getattr; __be32 status; + trace_nfsd_vfs_getattr(rqstp, &cstate->current_fh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -1000,6 +1002,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u64 cookie = readdir->rd_cookie; static const nfs4_verifier zeroverf; + trace_nfsd_vfs_readdir(rqstp, &cstate->current_fh, + readdir->rd_maxcount, readdir->rd_cookie); + /* no need to check permission - this will be done in nfsd_readdir() */ if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) @@ -1213,7 +1218,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; - int nvecs; if (write->wr_offset > (u64)OFFSET_MAX || 
write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) @@ -1228,13 +1232,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; write->wr_how_written = write->wr_stable_how; - - nvecs = svc_fill_write_vector(rqstp, &write->wr_payload); - WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, - write->wr_offset, rqstp->rq_vec, nvecs, &cnt, - write->wr_how_written, + write->wr_offset, &write->wr_payload, + &cnt, write->wr_how_written, (__be32 *)write->wr_verifier.data); nfsd_file_put(nf); @@ -1381,8 +1381,11 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) static void nfsd4_stop_copy(struct nfsd4_copy *copy) { trace_nfsd_copy_async_cancel(copy); - if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags)) + if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags)) { kthread_stop(copy->copy_task); + copy->nfserr = nfs_ok; + set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags); + } nfs4_put_copy(copy); } @@ -1711,10 +1714,11 @@ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, switch (task->tk_status) { case -NFS4ERR_DELAY: if (cbo->co_retries--) { - rpc_delay(task, 1 * HZ); + rpc_delay(task, HZ / 5); return 0; } } + nfsd41_cb_destroy_referring_call_list(cb); return 1; } @@ -1847,6 +1851,9 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid, + cbo->co_referring_slotid, + cbo->co_referring_seqno); trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid, &cbo->co_fh, copy->cp_count, copy->nfserr); nfsd4_try_run_cb(&cbo->co_cb); @@ -1963,6 +1970,11 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); + memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data, + 
cstate->session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + async_copy->cp_cb_offload.co_referring_slotid = cstate->slot->sl_index; + async_copy->cp_cb_offload.co_referring_seqno = cstate->slot->sl_seqid; async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) @@ -3768,7 +3780,8 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; u32 opiter; - if (!cstate->minorversion) + if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] || + cstate->minorversion == 0) return false; if (cstate->spo_must_allowed) @@ -3834,7 +3847,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_ressize = sizeof(struct nfsd4_compoundres), .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = NFSD_BUFSIZE/4, + .pc_xdrressize = 3+NFSSVC_MAXBLKSIZE/4, .pc_name = "COMPOUND", }, }; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index acde3edab733..82785db730d9 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -33,6 +33,7 @@ */ #include <crypto/hash.h> +#include <crypto/sha2.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -737,7 +738,6 @@ struct cld_net { spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; - struct crypto_shash *cn_tfm; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING bool cn_has_legacy; #endif @@ -1063,8 +1063,6 @@ nfsd4_remove_cld_pipe(struct net *net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); - if (cn->cn_tfm) - crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } @@ -1158,8 +1156,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct cld_msg_v2 *cmsg; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; char *principal = NULL; /* Don't upcall if 
it's already stored */ @@ -1182,22 +1178,9 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) { - ret = -ENOMEM; - goto out; - } - ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), - cksum.data); - if (ret) { - kfree(cksum.data); - goto out; - } - cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; - memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, - cksum.data, cksum.len); - kfree(cksum.data); + sha256(principal, strlen(principal), + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data); + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = SHA256_DIGEST_SIZE; } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; @@ -1207,7 +1190,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } -out: free_cld_upcall(cup); out_err: if (ret) @@ -1346,12 +1328,11 @@ found: static int nfsd4_cld_check_v2(struct nfs4_client *clp) { - struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING struct cld_net *cn = nn->cld_net; - int status; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; +#endif + struct nfs4_client_reclaim *crp; char *principal = NULL; /* did we already find that this client is stable? 
*/ @@ -1367,6 +1348,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) if (cn->cn_has_legacy) { struct xdr_netobj name; char dname[HEXDIR_LEN]; + int status; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) @@ -1389,28 +1371,18 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) return -ENOENT; found: if (crp->cr_princhash.len) { + u8 digest[SHA256_DIGEST_SIZE]; + if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) - return -ENOENT; - status = crypto_shash_tfm_digest(tfm, principal, - strlen(principal), cksum.data); - if (status) { - kfree(cksum.data); + sha256(principal, strlen(principal), digest); + if (memcmp(crp->cr_princhash.data, digest, + crp->cr_princhash.len)) return -ENOENT; - } - if (memcmp(crp->cr_princhash.data, cksum.data, - crp->cr_princhash.len)) { - kfree(cksum.data); - return -ENOENT; - } - kfree(cksum.data); } crp->cr_clp = clp; return 0; @@ -1590,7 +1562,6 @@ nfsd4_cld_tracking_init(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; - struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) @@ -1615,12 +1586,6 @@ nfsd4_cld_tracking_init(struct net *net) status = -ETIMEDOUT; goto err_remove; } - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - status = PTR_ERR(tfm); - goto err_remove; - } - nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 59a693f22452..d5694987f86f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1987,26 +1987,30 @@ reduce_session_slots(struct nfsd4_session *ses, int dec) return ret; } -/* - * We don't actually need to cache the rpc and session headers, so we - * can 
allocate a little less for each slot: - */ -static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) +static struct nfsd4_slot *nfsd4_alloc_slot(struct nfsd4_channel_attrs *fattrs, + int index, gfp_t gfp) { - u32 size; + struct nfsd4_slot *slot; + size_t size; - if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) - size = 0; - else - size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; - return size + sizeof(struct nfsd4_slot); + /* + * The RPC and NFS session headers are never saved in + * the slot reply cache buffer. + */ + size = fattrs->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ ? + 0 : fattrs->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + + slot = kzalloc(struct_size(slot, sl_data, size), gfp); + if (!slot) + return NULL; + slot->sl_index = index; + return slot; } static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_channel_attrs *battrs) { int numslots = fattrs->maxreqs; - int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; struct nfsd4_slot *slot; int i; @@ -2015,14 +2019,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, if (!new) return NULL; xa_init(&new->se_slots); - /* allocate each struct nfsd4_slot and data cache in one piece */ - slot = kzalloc(slotsize, GFP_KERNEL); + + slot = nfsd4_alloc_slot(fattrs, 0, GFP_KERNEL); if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL))) goto out_free; for (i = 1; i < numslots; i++) { const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; - slot = kzalloc(slotsize, gfp); + slot = nfsd4_alloc_slot(fattrs, i, gfp); if (!slot) break; if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) { @@ -4402,7 +4406,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfserr_rep_too_big; if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) goto out_put_session; - svc_reserve(rqstp, buflen); + svc_reserve_auth(rqstp, buflen); status = nfs_ok; /* Success! 
accept new slot seqid */ @@ -4438,8 +4442,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * spinlock, and only succeeds if there is * plenty of memory. */ - slot = kzalloc(slot_bytes(&session->se_fchannel), - GFP_NOWAIT); + slot = nfsd4_alloc_slot(&session->se_fchannel, s, + GFP_NOWAIT); prev_slot = xa_load(&session->se_slots, s); if (xa_is_value(prev_slot) && slot) { slot->sl_seqid = xa_to_value(prev_slot); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fe876395985a..3afcdbed6e14 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2564,7 +2564,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) /* Sessions make the DRC unnecessary: */ if (argp->minorversion) cachethis = false; - svc_reserve(argp->rqstp, max_reply + readbytes); + svc_reserve_auth(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); @@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); } +/* + * Copied from generic_remap_checks/generic_remap_file_range_prep. + * + * These generic functions use the file system's s_blocksize, but + * individual file systems aren't required to use + * generic_remap_file_range_prep. Until there is a mechanism for + * determining a particular file system's (or file's) clone block + * size, this is the best NFSD can do. 
+ */ +static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct inode *inode = d_inode(args->dentry); + + return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize); +} + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) @@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop, [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat, [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop, - [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize, [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop, [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ac265d6fde35..3f3e9f6c4250 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2281,6 +2281,8 @@ static int __init init_nfsd(void) { int retval; + nfsd_debugfs_init(); + retval = nfsd4_init_slabs(); if (retval) return retval; @@ -2291,12 +2293,9 @@ static int __init init_nfsd(void) if (retval) goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ - retval = create_proc_exports_entry(); - if (retval) - goto out_free_lockd; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_free_exports; + goto out_free_lockd; retval = register_cld_notifier(); if (retval) goto out_free_subsys; @@ -2305,22 +2304,26 @@ static int __init init_nfsd(void) goto out_free_cld; retval = register_filesystem(&nfsd_fs_type); if (retval) - goto out_free_all; + goto out_free_nfsd4; retval = genl_register_family(&nfsd_nl_family); if (retval) + goto out_free_filesystem; + retval = create_proc_exports_entry(); + if (retval) goto out_free_all; nfsd_localio_ops_init(); return 0; out_free_all: + genl_unregister_family(&nfsd_nl_family); +out_free_filesystem: + 
unregister_filesystem(&nfsd_fs_type); +out_free_nfsd4: nfsd4_destroy_laundry_wq(); out_free_cld: unregister_cld_notifier(); out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); -out_free_exports: - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); @@ -2328,22 +2331,24 @@ out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); + nfsd_debugfs_exit(); return retval; } static void __exit exit_nfsd(void) { + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); + nfsd_debugfs_exit(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index e2997f0ffbc5..1bfd0b4e9af7 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -44,24 +44,14 @@ bool nfsd_support_version(int vers); #include "stats.h" /* - * Maximum blocksizes supported by daemon under various circumstances. + * Default and maximum payload size (NFS READ or WRITE), in bytes. + * The default is historical, and the maximum is an implementation + * limit. */ -#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD -/* NFSv2 is limited by the protocol specification, see RFC 1094 */ -#define NFSSVC_MAXBLKSIZE_V2 (8*1024) - - -/* - * Largest number of bytes we need to allocate for an NFS - * call or reply. Used to control buffer sizes. We use - * the length of v3 WRITE, READDIR and READDIR replies - * which are an RPC header, up to 26 XDR units of reply - * data, and some page data. - * - * Note that accuracy here doesn't matter too much as the - * size is rounded up to a page size when allocating space. 
- */ -#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) +enum { + NFSSVC_DEFBLKSIZE = 1 * 1024 * 1024, + NFSSVC_MAXBLKSIZE = RPCSVC_MAXPAYLOAD, +}; struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ @@ -156,6 +146,16 @@ void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); void nfsd_destroy_serv(struct net *net); +#ifdef CONFIG_DEBUG_FS +void nfsd_debugfs_init(void); +void nfsd_debugfs_exit(void); +#else +static inline void nfsd_debugfs_init(void) {} +static inline void nfsd_debugfs_exit(void) {} +#endif + +extern bool nfsd_disable_splice_read __read_mostly; + extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6370ac0a85fd..c10fa8128a8a 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -10,6 +10,7 @@ #include "cache.h" #include "xdr.h" #include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -54,7 +55,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); resp->status = fh_verify(rqstp, &resp->fh, 0, @@ -211,7 +212,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, argp->offset); - argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + argp->count = min_t(u32, argp->count, NFS_MAXDATA); argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); resp->pages = rqstp->rq_next_page; @@ -250,17 +251,14 @@ nfsd_proc_write(struct svc_rqst *rqstp) struct nfsd_writeargs *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; unsigned long cnt = argp->len; - unsigned int nvecs; dprintk("nfsd: WRITE %s %u bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nvecs = svc_fill_write_vector(rqstp, &argp->payload); - - 
resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, rqstp->rq_vec, nvecs, - &cnt, NFS_DATA_SYNC, NULL); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, + &argp->payload, &cnt, NFS_DATA_SYNC, NULL); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) @@ -292,9 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp) int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); - dprintk("nfsd: CREATE %s %.*s\n", - SVCFH_fmt(dirfhp), argp->len, argp->name); - /* First verify the parent file handle */ resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); if (resp->status != nfs_ok) @@ -446,9 +441,6 @@ nfsd_proc_remove(struct svc_rqst *rqstp) struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), - argp->len, argp->name); - /* Unlink. -SIFDIR means file must not be a directory */ resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); @@ -463,11 +455,6 @@ nfsd_proc_rename(struct svc_rqst *rqstp) struct nfsd_renameargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RENAME %s %.*s -> \n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); - resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, &argp->tfh, argp->tname, argp->tlen); fh_put(&argp->ffh); @@ -482,13 +469,6 @@ nfsd_proc_link(struct svc_rqst *rqstp) struct nfsd_linkargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: LINK %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, &argp->ffh); fh_put(&argp->ffh); @@ -520,10 +500,6 @@ 
nfsd_proc_symlink(struct svc_rqst *rqstp) goto out; } - dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, - argp->tlen, argp->tname); - fh_init(&newfh, NFS_FHSIZE); resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, argp->tname, &attrs, &newfh); @@ -549,8 +525,6 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp) .na_iattr = &argp->attrs, }; - dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - if (resp->fh.fh_dentry) { printk(KERN_WARNING "nfsd_proc_mkdir: response already verified??\n"); @@ -579,8 +553,6 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp) struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); fh_put(&argp->fh); @@ -616,9 +588,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) struct nfsd_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd_init_dirlist_pages(rqstp, resp, argp->count); @@ -643,8 +613,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_statfsres *resp = rqstp->rq_resp; - dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); @@ -740,7 +708,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_argzero = sizeof(struct nfsd_readargs), .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + .pc_xdrressize = ST+AT+1+NFS_MAXDATA/4, .pc_name = "READ", }, [NFSPROC_WRITECACHE] = { diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9b3d6cff0e1e..82b0111ac469 100644 --- 
a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) if (ret) goto out_filecache; +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_init_umount_work(nn); +#endif ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; -#ifdef CONFIG_NFSD_V4_2_INTER_SSC - nfsd4_ssc_init_umount_work(nn); -#endif nn->nfsd_net_up = true; return 0; @@ -582,7 +582,7 @@ static int nfsd_get_default_max_blksize(void) */ target >>= 12; - ret = NFSSVC_MAXBLKSIZE; + ret = NFSSVC_DEFBLKSIZE; while (ret > target && ret >= 8*1024*2) ret /= 2; return ret; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 5777f40c7353..fc262ceafca9 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -336,7 +336,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) /* opaque data */ if (xdr_stream_decode_u32(xdr, &args->len) < 0) return false; - if (args->len > NFSSVC_MAXBLKSIZE_V2) + if (args->len > NFS_MAXDATA) return false; return xdr_stream_subsegment(xdr, &args->payload, args->len); @@ -540,7 +540,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr) p = xdr_reserve_space(xdr, XDR_UNIT * 5); if (!p) return false; - *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2); + *p++ = cpu_to_be32(NFS_MAXDATA); *p++ = cpu_to_be32(stat->f_bsize); *p++ = cpu_to_be32(stat->f_blocks); *p++ = cpu_to_be32(stat->f_bfree); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 290e29dd43eb..1995bca158b8 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -64,6 +64,21 @@ typedef struct { refcount_t cs_count; } copy_stateid_t; +struct nfsd4_referring_call { + struct list_head __list; + + u32 rc_sequenceid; + u32 rc_slotid; +}; + +struct nfsd4_referring_call_list { + struct list_head __list; + + struct nfs4_sessionid rcl_sessionid; + int __nr_referring_calls; + struct list_head rcl_referring_calls; +}; + struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; @@ -76,6 +91,9 @@ 
struct nfsd4_callback { int cb_seq_status; int cb_status; int cb_held_slot; + + int cb_nr_referring_call_list; + struct list_head cb_referring_call_list; }; struct nfsd4_callback_ops { @@ -260,6 +278,7 @@ struct nfsd4_slot { u32 sl_seqid; __be32 sl_status; struct svc_cred sl_cred; + u32 sl_index; u32 sl_datalen; u16 sl_opcnt; u16 sl_generation; @@ -774,6 +793,10 @@ extern __be32 nfs4_check_open_reclaim(struct nfs4_client *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno); +extern void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern bool nfsd4_run_cb(struct nfsd4_callback *cb); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a7630e9f6577..3c5505ef5e3a 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -11,6 +11,7 @@ #include <linux/tracepoint.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprt.h> +#include <trace/misc/fs.h> #include <trace/misc/nfs.h> #include <trace/misc/sunrpc.h> @@ -18,22 +19,40 @@ #include "nfsfh.h" #include "xdr4.h" -#define NFSD_TRACE_PROC_RES_FIELDS \ +#define NFSD_TRACE_PROC_CALL_FIELDS(r) \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) + +#define NFSD_TRACE_PROC_CALL_ASSIGNMENTS(r) \ + do { \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ + } 
while (0) + +#define NFSD_TRACE_PROC_RES_FIELDS(r) \ __field(unsigned int, netns_ino) \ __field(u32, xid) \ __field(unsigned long, status) \ - __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ - __array(unsigned char, client, sizeof(struct sockaddr_in6)) + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) -#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \ +#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(r, error) \ do { \ - __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ - __entry->xid = be32_to_cpu(rqstp->rq_xid); \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ __entry->status = be32_to_cpu(error); \ - memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ - rqstp->rq_xprt->xpt_locallen); \ - memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ - rqstp->rq_xprt->xpt_remotelen); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ } while (0); DECLARE_EVENT_CLASS(nfsd_xdr_err_class, @@ -145,14 +164,14 @@ TRACE_EVENT(nfsd_compound_decode_err, ), TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status), TP_STRUCT__entry( - NFSD_TRACE_PROC_RES_FIELDS + NFSD_TRACE_PROC_RES_FIELDS(rqstp) __field(u32, args_opcnt) __field(u32, resp_opcnt) __field(u32, opnum) ), TP_fast_assign( - NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) __entry->args_opcnt = args_opcnt; __entry->resp_opcnt = resp_opcnt; @@ -171,12 +190,12 @@ DECLARE_EVENT_CLASS(nfsd_compound_err_class, ), TP_ARGS(rqstp, opnum, status), TP_STRUCT__entry( - NFSD_TRACE_PROC_RES_FIELDS + NFSD_TRACE_PROC_RES_FIELDS(rqstp) __field(u32, opnum) ), TP_fast_assign( - NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) __entry->opnum = opnum; ), @@ -451,6 +470,8 @@ DEFINE_NFSD_IO_EVENT(write_start); 
DEFINE_NFSD_IO_EVENT(write_opened); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); +DEFINE_NFSD_IO_EVENT(commit_start); +DEFINE_NFSD_IO_EVENT(commit_done); DECLARE_EVENT_CLASS(nfsd_err_class, TP_PROTO(struct svc_rqst *rqstp, @@ -2335,6 +2356,259 @@ DEFINE_EVENT(nfsd_copy_async_done_class, \ DEFINE_COPY_ASYNC_DONE_EVENT(done); DEFINE_COPY_ASYNC_DONE_EVENT(cancel); +TRACE_EVENT(nfsd_vfs_setattr, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const struct iattr *iap, + const struct timespec64 *guardtime + ), + TP_ARGS(rqstp, fhp, iap, guardtime), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(s64, gtime_tv_sec) + __field(u32, gtime_tv_nsec) + __field(unsigned int, ia_valid) + __field(loff_t, ia_size) + __field(uid_t, ia_uid) + __field(gid_t, ia_gid) + __field(umode_t, ia_mode) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->gtime_tv_sec = guardtime ? guardtime->tv_sec : 0; + __entry->gtime_tv_nsec = guardtime ? 
guardtime->tv_nsec : 0; + __entry->ia_valid = iap->ia_valid; + __entry->ia_size = iap->ia_size; + __entry->ia_uid = __kuid_val(iap->ia_uid); + __entry->ia_gid = __kgid_val(iap->ia_gid); + __entry->ia_mode = iap->ia_mode; + ), + TP_printk( + "xid=0x%08x fh_hash=0x%08x ia_valid=%s ia_size=%llu ia_mode=0%o ia_uid=%u ia_gid=%u guard_time=%lld.%u", + __entry->xid, __entry->fh_hash, show_ia_valid_flags(__entry->ia_valid), + __entry->ia_size, __entry->ia_mode, __entry->ia_uid, __entry->ia_gid, + __entry->gtime_tv_sec, __entry->gtime_tv_nsec + ) +) + +TRACE_EVENT(nfsd_vfs_lookup, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_create, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, type, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(umode_t, type) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->type = type; + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s name=%s", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_symlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int namelen, + const char *target + ), + TP_ARGS(rqstp, fhp, name, namelen, target), + TP_STRUCT__entry( 
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, namelen) + __string(target, target) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + __assign_str(target); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s target=%s", + __entry->xid, __entry->fh_hash, + __get_str(name), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_link, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *name, + unsigned int namelen + ), + TP_ARGS(rqstp, sfhp, tfhp, name, namelen), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(name, name, namelen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x src_fh=0x%08x tgt_fh=0x%08x name=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_unlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_rename, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *source, + unsigned int sourcelen, + const char *target, + unsigned int targetlen + ), + TP_ARGS(rqstp, sfhp, tfhp, source, sourcelen, target, targetlen), + 
TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(source, source, sourcelen) + __string_len(target, target, targetlen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(source); + __assign_str(target); + ), + TP_printk("xid=0x%08x sfh_hash=0x%08x tfh_hash=0x%08x source=%s target=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(source), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_readdir, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + u32 count, + u64 offset + ), + TP_ARGS(rqstp, fhp, count, offset), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(u32, count) + __field(u64, offset) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->count = count; + __entry->offset = offset; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu count=%u", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->count + ) +); + +DECLARE_EVENT_CLASS(nfsd_vfs_getattr_class, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp + ), + TP_ARGS(rqstp, fhp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x", + __entry->xid, __entry->fh_hash + ) +); + +#define DEFINE_NFSD_VFS_GETATTR_EVENT(__name) \ +DEFINE_EVENT(nfsd_vfs_getattr_class, __name, \ + TP_PROTO( \ + const struct svc_rqst *rqstp, \ + const struct svc_fh *fhp \ + ), \ + TP_ARGS(rqstp, fhp)) + +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_getattr); +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_statfs); + #endif /* _NFSD_TRACE_H */ #undef 
TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 160a839af405..cd689df2ca5d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -31,6 +31,7 @@ #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/security.h> +#include <linux/sunrpc/xdr.h> #include "xdr3.h" @@ -47,6 +48,8 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +bool nfsd_disable_splice_read __read_mostly; + /** * nfserrno - Map Linux errnos to NFS errnos * @errno: POSIX(-ish) error code to be mapped @@ -244,7 +247,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry; int host_err; - dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + trace_nfsd_vfs_lookup(rqstp, fhp, name, len); dparent = fhp->fh_dentry; exp = exp_get(fhp->fh_export); @@ -500,6 +503,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, bool size_change = (iap->ia_valid & ATTR_SIZE); int retries; + trace_nfsd_vfs_setattr(rqstp, fhp, iap, guardtime); + if (iap->ia_valid & ATTR_SIZE) { accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; ftype = S_IFREG; @@ -1082,23 +1087,23 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long v, total; struct iov_iter iter; loff_t ppos = offset; - struct page *page; ssize_t host_err; + size_t len; v = 0; total = *count; while (total) { - page = *(rqstp->rq_next_page++); - rqstp->rq_vec[v].iov_base = page_address(page) + base; - rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base); - total -= rqstp->rq_vec[v].iov_len; + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++), + len, base); + total -= len; ++v; base = 0; } - WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec)); + WARN_ON_ONCE(v > rqstp->rq_maxpages); trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count); host_err = vfs_iter_read(file, &iter, &ppos, 
0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1140,11 +1145,27 @@ static int wait_for_concurrent_writes(struct file *file) return err; } +/** + * nfsd_vfs_write - write data to an already-open file + * @rqstp: RPC execution context + * @fhp: File handle of file to write into + * @nf: An open file matching @fhp + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. + */ __be32 -nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, - loff_t offset, struct kvec *vec, int vlen, - unsigned long *cnt, int stable, - __be32 *verf) +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, + const struct xdr_buf *payload, unsigned long *cnt, + int stable, __be32 *verf) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file *file = nf->nf_file; @@ -1159,6 +1180,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, unsigned int pflags = current->flags; rwf_t flags = 0; bool restore_flags = false; + unsigned int nvecs; trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); @@ -1186,7 +1208,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && !fhp->fh_use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt); + nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); @@ -1237,6 +1260,8 @@ out_nfserr: */ bool nfsd_read_splice_ok(struct svc_rqst *rqstp) { + if (nfsd_disable_splice_read) + return false; 
switch (svc_auth_flavor(rqstp)) { case RPC_AUTH_GSS_KRB5I: case RPC_AUTH_GSS_KRB5P: @@ -1284,14 +1309,24 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; } -/* - * Write data to a file. - * The stable flag requests synchronous writes. - * N.B. After this call fhp needs an fh_put +/** + * nfsd_write - open a file and write data to it + * @rqstp: RPC execution context + * @fhp: File handle of file to write into; nfsd_write() may modify it + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, int stable, + const struct xdr_buf *payload, unsigned long *cnt, int stable, __be32 *verf) { struct nfsd_file *nf; @@ -1303,8 +1338,8 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, nf, offset, vec, - vlen, cnt, stable, verf); + err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt, + stable, verf); nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); @@ -1340,6 +1375,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, loff_t start, end; struct nfsd_net *nn; + trace_nfsd_commit_start(rqstp, fhp, offset, count); + /* * Convert the client-provided (offset, count) range to a * (start, end) range. 
If the client-provided range falls @@ -1378,6 +1415,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, } else nfsd_copy_write_verifier(verf, nn); + trace_nfsd_commit_done(rqstp, fhp, offset, count); return err; } @@ -1541,6 +1579,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err; int host_err; + trace_nfsd_vfs_create(rqstp, fhp, type, fname, flen); + if (isdotent(fname, flen)) return nfserr_exist; @@ -1641,6 +1681,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err, cerr; int host_err; + trace_nfsd_vfs_symlink(rqstp, fhp, fname, flen, path); + err = nfserr_noent; if (!flen || path[0] == '\0') goto out; @@ -1709,6 +1751,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, __be32 err; int host_err; + trace_nfsd_vfs_link(rqstp, ffhp, tfhp, name, len); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1826,6 +1870,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, int host_err; bool close_cached = false; + trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; @@ -1984,6 +2030,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + trace_nfsd_vfs_unlink(rqstp, fhp, fname, flen); + err = nfserr_acces; if (!flen || isdotent(fname, flen)) goto out; @@ -2272,6 +2320,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in { __be32 err; + trace_nfsd_vfs_statfs(rqstp, fhp); + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err) { struct path path = { diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index f9b09b842856..eff04959606f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -128,13 +128,13 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp); __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long *count, u32 *eof); -__be32 
nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, - struct kvec *, int, unsigned long *, - int stable, __be32 *verf); +__be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, - int stable, __be32 *verf); + const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c26ba86dbdfd..aa2a356da784 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -676,6 +676,10 @@ struct nfsd4_cb_offload { __be32 co_nfserr; unsigned int co_retries; struct knfsd_fh co_fh; + + struct nfs4_sessionid co_referring_sessionid; + u32 co_referring_slotid; + u32 co_referring_seqno; }; struct nfsd4_copy { diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index f1a315cd31b7..f4e29c0c701c 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -6,8 +6,11 @@ #define cb_compound_enc_hdr_sz 4 #define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) #define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define enc_referring_call4_sz (1 + 1) +#define enc_referring_call_list4_sz (sessionid_sz + 1 + \ + enc_referring_call4_sz) #define cb_sequence_enc_sz (sessionid_sz + 4 + \ - 1 /* no referring calls list yet */) + enc_referring_call_list4_sz) #define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) #define op_enc_sz 1 diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index e946f75eb540..eced9013a881 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -2605,75 +2605,3 @@ int attr_force_nonresident(struct ntfs_inode *ni) return err; } - -/* - * Change the compression of data attribute - */ -int attr_set_compress(struct ntfs_inode *ni, bool compr) 
-{ - struct ATTRIB *attr; - struct mft_inode *mi; - - attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); - if (!attr) - return -ENOENT; - - if (is_attr_compressed(attr) == !!compr) { - /* Already required compressed state. */ - return 0; - } - - if (attr->non_res) { - u16 run_off; - u32 run_size; - char *run; - - if (attr->nres.data_size) { - /* - * There are rare cases when it possible to change - * compress state without big changes. - * TODO: Process these cases. - */ - return -EOPNOTSUPP; - } - - run_off = le16_to_cpu(attr->nres.run_off); - run_size = le32_to_cpu(attr->size) - run_off; - run = Add2Ptr(attr, run_off); - - if (!compr) { - /* remove field 'attr->nres.total_size'. */ - memmove(run - 8, run, run_size); - run_off -= 8; - } - - if (!mi_resize_attr(mi, attr, compr ? +8 : -8)) { - /* - * Ignore rare case when there are no 8 bytes in record with attr. - * TODO: split attribute. - */ - return -EOPNOTSUPP; - } - - if (compr) { - /* Make a gap for 'attr->nres.total_size'. */ - memmove(run + 8, run, run_size); - run_off += 8; - attr->nres.total_size = attr->nres.alloc_size; - } - attr->nres.run_off = cpu_to_le16(run_off); - } - - /* Update attribute flags. 
*/ - if (compr) { - attr->flags &= ~ATTR_FLAG_SPARSED; - attr->flags |= ATTR_FLAG_COMPRESSED; - attr->nres.c_unit = NTFS_LZNT_CUNIT; - } else { - attr->flags &= ~ATTR_FLAG_COMPRESSED; - attr->nres.c_unit = 0; - } - mi->dirty = true; - - return 0; -} diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 9b6a3f8d2e7c..34ed242e1063 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -50,90 +50,6 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) } /* - * ntfs_fileattr_get - inode_operations::fileattr_get - */ -int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct ntfs_inode *ni = ntfs_i(inode); - u32 flags = 0; - - if (inode->i_flags & S_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - - if (inode->i_flags & S_APPEND) - flags |= FS_APPEND_FL; - - if (is_compressed(ni)) - flags |= FS_COMPR_FL; - - if (is_encrypted(ni)) - flags |= FS_ENCRYPT_FL; - - fileattr_fill_flags(fa, flags); - - return 0; -} - -/* - * ntfs_fileattr_set - inode_operations::fileattr_set - */ -int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct ntfs_inode *ni = ntfs_i(inode); - u32 flags = fa->flags; - unsigned int new_fl = 0; - - if (fileattr_has_fsx(fa)) - return -EOPNOTSUPP; - - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_COMPR_FL)) - return -EOPNOTSUPP; - - if (flags & FS_IMMUTABLE_FL) - new_fl |= S_IMMUTABLE; - - if (flags & FS_APPEND_FL) - new_fl |= S_APPEND; - - /* Allowed to change compression for empty files and for directories only. */ - if (!is_dedup(ni) && !is_encrypted(ni) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - int err = 0; - struct address_space *mapping = inode->i_mapping; - - /* write out all data and wait. */ - filemap_invalidate_lock(mapping); - err = filemap_write_and_wait(mapping); - - if (err >= 0) { - /* Change compress state. 
*/ - bool compr = flags & FS_COMPR_FL; - err = ni_set_compress(inode, compr); - - /* For files change a_ops too. */ - if (!err) - mapping->a_ops = compr ? &ntfs_aops_cmpr : - &ntfs_aops; - } - - filemap_invalidate_unlock(mapping); - - if (err) - return err; - } - - inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); - - inode_set_ctime_current(inode); - mark_inode_dirty(inode); - - return 0; -} - -/* * ntfs_ioctl - file_operations::unlocked_ioctl */ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) @@ -430,7 +346,6 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, } if (extend_init && !is_compressed(ni)) { - WARN_ON(ni->i_valid >= pos); err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos); if (err) goto out; @@ -1409,8 +1324,6 @@ const struct inode_operations ntfs_file_inode_operations = { .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, - .fileattr_get = ntfs_fileattr_get, - .fileattr_set = ntfs_fileattr_set, }; const struct file_operations ntfs_file_operations = { diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index b7a83200f2cc..756e1306fe6c 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3327,77 +3327,3 @@ out: return 0; } - -/* - * ni_set_compress - * - * Helper for 'ntfs_fileattr_set'. - * Changes compression for empty files and directories only. - */ -int ni_set_compress(struct inode *inode, bool compr) -{ - int err; - struct ntfs_inode *ni = ntfs_i(inode); - struct ATTR_STD_INFO *std; - const char *bad_inode; - - if (is_compressed(ni) == !!compr) - return 0; - - if (is_sparsed(ni)) { - /* sparse and compress not compatible. */ - return -EOPNOTSUPP; - } - - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) { - /*Skip other inodes. (symlink,fifo,...) 
*/ - return -EOPNOTSUPP; - } - - bad_inode = NULL; - - ni_lock(ni); - - std = ni_std(ni); - if (!std) { - bad_inode = "no std"; - goto out; - } - - if (S_ISREG(inode->i_mode)) { - err = attr_set_compress(ni, compr); - if (err) { - if (err == -ENOENT) { - /* Fix on the fly? */ - /* Each file must contain data attribute. */ - bad_inode = "no data attribute"; - } - goto out; - } - } - - ni->std_fa = std->fa; - if (compr) { - std->fa &= ~FILE_ATTRIBUTE_SPARSE_FILE; - std->fa |= FILE_ATTRIBUTE_COMPRESSED; - } else { - std->fa &= ~FILE_ATTRIBUTE_COMPRESSED; - } - - if (ni->std_fa != std->fa) { - ni->std_fa = std->fa; - ni->mi.dirty = true; - } - /* update duplicate information and directory entries in ni_write_inode.*/ - ni->ni_flags |= NI_FLAG_UPDATE_PARENT; - err = 0; - -out: - ni_unlock(ni); - if (bad_inode) { - ntfs_bad_inode(inode, bad_inode); - err = -EINVAL; - } - - return err; -} diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index d0d530f4e2b9..38934e6978ec 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3091,16 +3091,16 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, inode = ilookup(sbi->sb, rno); if (inode) { mi = &ntfs_i(inode)->mi; - } else if (op == InitializeFileRecordSegment) { - mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS); - if (!mi) - return -ENOMEM; - err = mi_format_new(mi, sbi, rno, 0, false); - if (err) - goto out; } else { /* Read from disk. 
*/ err = mi_get(sbi, rno, &mi); + if (err && op == InitializeFileRecordSegment) { + mi = kzalloc(sizeof(struct mft_inode), + GFP_NOFS); + if (!mi) + return -ENOMEM; + err = mi_format_new(mi, sbi, rno, 0, false); + } if (err) return err; } @@ -3109,15 +3109,13 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, if (op == DeallocateFileRecordSegment) goto skip_load_parent; - if (InitializeFileRecordSegment != op) { - if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) - goto dirty_vol; - if (!check_lsn(&rec->rhdr, rlsn)) - goto out; - if (!check_file_record(rec, NULL, sbi)) - goto dirty_vol; - attr = Add2Ptr(rec, roff); - } + if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) + goto dirty_vol; + if (!check_lsn(&rec->rhdr, rlsn)) + goto out; + if (!check_file_record(rec, NULL, sbi)) + goto dirty_vol; + attr = Add2Ptr(rec, roff); if (is_rec_base(rec) || InitializeFileRecordSegment == op) { rno_base = rno; @@ -3143,7 +3141,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, if (inode) iput(inode); - else if (mi) + else mi_put(mi); inode = inode_parent; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 78d20e4baa2c..1bf2a6593dec 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -2182,6 +2182,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); + if (!e) { + err = -EINVAL; + goto out; + } if (!de_is_last(e)) { /* @@ -2203,6 +2207,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, n = fnd->nodes[level]; te = hdr_first_de(&n->index->ihdr); + if (!te) { + err = -EINVAL; + goto out; + } /* Copy the candidate entry into the replacement entry buffer. 
*/ re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); if (!re) { diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 3e2957a1e360..0f0d27d4644a 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -805,6 +805,10 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ret = 0; goto out; } + if (is_compressed(ni)) { + ret = 0; + goto out; + } ret = blockdev_direct_IO(iocb, inode, iter, wr ? ntfs_get_block_direct_IO_W : @@ -2068,5 +2072,6 @@ const struct address_space_operations ntfs_aops_cmpr = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, .dirty_folio = block_dirty_folio, + .direct_IO = ntfs_direct_IO, }; // clang-format on diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 652735a0b0c4..b807744fc6a9 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -507,8 +507,6 @@ const struct inode_operations ntfs_dir_inode_operations = { .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, .fiemap = ntfs_fiemap, - .fileattr_get = ntfs_fileattr_get, - .fileattr_set = ntfs_fileattr_set, }; const struct inode_operations ntfs_special_inode_operations = { diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index d628977e2556..36b8052660d5 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -454,7 +454,6 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); int attr_force_nonresident(struct ntfs_inode *ni); -int attr_set_compress(struct ntfs_inode *ni, bool compr); /* Functions from attrlist.c */ void al_destroy(struct ntfs_inode *ni); @@ -497,9 +496,6 @@ extern const struct file_operations ntfs_dir_operations; extern const struct file_operations ntfs_legacy_dir_operations; /* Globals from file.c */ -int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr 
*fa); int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, @@ -585,7 +581,6 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, bool *is_bad); bool ni_is_dirty(struct inode *inode); -int ni_set_compress(struct inode *inode, bool compr); /* Globals from fslog.c */ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3d4b883a7660..3e153c2f6b82 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -32,6 +32,8 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/vmalloc.h> #include <linux/aio.h> @@ -328,11 +330,9 @@ void purge_waiting_ops(void); * defined in super.c */ extern uint64_t orangefs_features; +extern const struct fs_parameter_spec orangefs_fs_param_spec[]; -struct dentry *orangefs_mount(struct file_system_type *fst, - int flags, - const char *devname, - void *data); +int orangefs_init_fs_context(struct fs_context *fc); void orangefs_kill_sb(struct super_block *sb); int orangefs_remount(struct orangefs_sb_info_s *); diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 5ab741c60b7e..7ac16a4d2dc6 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -46,7 +46,8 @@ MODULE_PARM_DESC(hash_table_size, static struct file_system_type orangefs_fs_type = { .name = "pvfs2", - .mount = orangefs_mount, + .init_fs_context = orangefs_init_fs_context, + .parameters = orangefs_fs_param_spec, .kill_sb = orangefs_kill_sb, .owner = THIS_MODULE, }; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index eba3e357192e..64ca9498f550 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -9,7 +9,6 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" 
-#include <linux/parser.h> #include <linux/hashtable.h> #include <linux/seq_file.h> @@ -22,18 +21,16 @@ LIST_HEAD(orangefs_superblocks); DEFINE_SPINLOCK(orangefs_superblocks_lock); enum { - Opt_intr, Opt_acl, + Opt_intr, Opt_local_lock, - - Opt_err }; -static const match_table_t tokens = { - { Opt_acl, "acl" }, - { Opt_intr, "intr" }, - { Opt_local_lock, "local_lock" }, - { Opt_err, NULL } +const struct fs_parameter_spec orangefs_fs_param_spec[] = { + fsparam_flag ("acl", Opt_acl), + fsparam_flag ("intr", Opt_intr), + fsparam_flag ("local_lock", Opt_local_lock), + {} }; uint64_t orangefs_features; @@ -51,48 +48,30 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static int parse_mount_options(struct super_block *sb, char *options, - int silent) +static int orangefs_parse_param(struct fs_context *fc, + struct fs_parameter *param) { - struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); - substring_t args[MAX_OPT_ARGS]; - char *p; - - /* - * Force any potential flags that might be set from the mount - * to zero, ie, initialize to unset. 
- */ - sb->s_flags &= ~SB_POSIXACL; - orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; - orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_acl: - sb->s_flags |= SB_POSIXACL; - break; - case Opt_intr: - orangefs_sb->flags |= ORANGEFS_OPT_INTR; - break; - case Opt_local_lock: - orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; - break; - default: - goto fail; - } + struct orangefs_sb_info_s *orangefs_sb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, orangefs_fs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_acl: + fc->sb_flags |= SB_POSIXACL; + break; + case Opt_intr: + orangefs_sb->flags |= ORANGEFS_OPT_INTR; + break; + case Opt_local_lock: + orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; + break; } return 0; -fail: - if (!silent) - gossip_err("Error: mount option [%s] is not supported.\n", p); - return -EINVAL; } static void orangefs_inode_cache_ctor(void *req) @@ -223,10 +202,20 @@ out_op_release: * Remount as initiated by VFS layer. We just need to reparse the mount * options, no need to signal pvfs2-client-core about it. 
*/ -static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data) +static int orangefs_reconfigure(struct fs_context *fc) { - gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n"); - return parse_mount_options(sb, data, 1); + struct super_block *sb = fc->root->d_sb; + struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); + struct orangefs_sb_info_s *revised = fc->s_fs_info; + unsigned int flags; + + flags = orangefs_sb->flags; + flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK); + flags |= revised->flags; + WRITE_ONCE(orangefs_sb->flags, flags); + + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_reconfigure: called\n"); + return 0; } /* @@ -319,7 +308,6 @@ static const struct super_operations orangefs_s_ops = { .write_inode = orangefs_write_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, - .remount_fs = orangefs_remount_fs, .show_options = orangefs_show_options, }; @@ -410,8 +398,8 @@ static int orangefs_unmount(int id, __s32 fs_id, const char *devname) } static int orangefs_fill_sb(struct super_block *sb, - struct orangefs_fs_mount_response *fs_mount, - void *data, int silent) + struct fs_context *fc, + struct orangefs_fs_mount_response *fs_mount) { int ret; struct inode *root; @@ -424,12 +412,6 @@ static int orangefs_fill_sb(struct super_block *sb, ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id; ORANGEFS_SB(sb)->id = fs_mount->id; - if (data) { - ret = parse_mount_options(sb, data, silent); - if (ret) - return ret; - } - /* Hang the xattr handlers off the superblock */ sb->s_xattr = orangefs_xattr_handlers; sb->s_magic = ORANGEFS_SUPER_MAGIC; @@ -470,30 +452,24 @@ static int orangefs_fill_sb(struct super_block *sb, return 0; } -struct dentry *orangefs_mount(struct file_system_type *fst, - int flags, - const char *devname, - void *data) +static int orangefs_get_tree(struct fs_context *fc) { int ret; struct super_block *sb = ERR_PTR(-EINVAL); struct orangefs_kernel_op_s *new_op; - struct dentry *d = 
ERR_PTR(-EINVAL); + + if (!fc->source) + return invalf(fc, "Device name not specified.\n"); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: called with devname %s\n", - devname); - - if (!devname) { - gossip_err("ERROR: device name not specified.\n"); - return ERR_PTR(-EINVAL); - } + fc->source); new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, devname); + strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, fc->source); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Mount via host %s\n", @@ -511,37 +487,27 @@ struct dentry *orangefs_mount(struct file_system_type *fst, goto free_op; } - sb = sget(fst, NULL, set_anon_super, flags, NULL); + sb = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { - d = ERR_CAST(sb); + ret = PTR_ERR(sb); orangefs_unmount(new_op->downcall.resp.fs_mount.id, - new_op->downcall.resp.fs_mount.fs_id, devname); - goto free_op; - } - - /* alloc and init our private orangefs sb info */ - sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); - if (!ORANGEFS_SB(sb)) { - d = ERR_PTR(-ENOMEM); + new_op->downcall.resp.fs_mount.fs_id, + fc->source); goto free_op; } - ret = orangefs_fill_sb(sb, - &new_op->downcall.resp.fs_mount, data, - flags & SB_SILENT ? 
1 : 0); + /* init our private orangefs sb info */ + ret = orangefs_fill_sb(sb, fc, &new_op->downcall.resp.fs_mount); - if (ret) { - d = ERR_PTR(ret); + if (ret) goto free_sb_and_op; - } /* * on successful mount, store the devname and data * used */ - strscpy(ORANGEFS_SB(sb)->devname, devname); - + strscpy(ORANGEFS_SB(sb)->devname, fc->source); /* mount_pending must be cleared */ ORANGEFS_SB(sb)->mount_pending = 0; @@ -564,7 +530,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) - return ERR_PTR(-ENOMEM); + return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", 0); orangefs_features = new_op->downcall.resp.features.features; @@ -573,7 +539,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst, orangefs_features = 0; } - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ @@ -589,7 +556,43 @@ free_op: op_release(new_op); - return d; + return ret; +} + +static void orangefs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations orangefs_context_ops = { + .free = orangefs_free_fc, + .parse_param = orangefs_parse_param, + .get_tree = orangefs_get_tree, + .reconfigure = orangefs_reconfigure, +}; + +/* + * Set up the filesystem mount context. + */ +int orangefs_init_fs_context(struct fs_context *fc) +{ + struct orangefs_sb_info_s *osi; + + osi = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); + if (!osi) + return -ENOMEM; + + /* + * Force any potential flags that might be set from the mount + * to zero, ie, initialize to unset. 
+ */ + fc->sb_flags_mask &= ~SB_POSIXACL; + osi->flags &= ~ORANGEFS_OPT_INTR; + osi->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; + + fc->s_fs_info = osi; + fc->ops = &orangefs_context_ops; + return 0; } void orangefs_kill_sb(struct super_block *sb) diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig new file mode 100644 index 000000000000..21671301bd8a --- /dev/null +++ b/fs/resctrl/Kconfig @@ -0,0 +1,39 @@ +config RESCTRL_FS + bool "CPU Resource Control Filesystem (resctrl)" + depends on ARCH_HAS_CPU_RESCTRL + select KERNFS + select PROC_CPU_RESCTRL if PROC_FS + help + Some architectures provide hardware facilities to group tasks and + monitor and control their usage of memory system resources such as + caches and memory bandwidth. Examples of such facilities include + Intel's Resource Director Technology (Intel(R) RDT) and AMD's + Platform Quality of Service (AMD QoS). + + If your system has the necessary support and you want to be able to + assign tasks to groups and manipulate the associated resource + monitors and controls from userspace, say Y here to get a mountable + 'resctrl' filesystem that lets you do just that. + + If nothing mounts or prods the 'resctrl' filesystem, resource + controls and monitors are left in a quiescent, permissive state. + + On architectures where this can be disabled independently, it is + safe to say N. + + See <file:Documentation/filesystems/resctrl.rst> for more information. + +config RESCTRL_FS_PSEUDO_LOCK + bool + depends on RESCTRL_FS + help + Software mechanism to pin data in a cache portion using + micro-architecture specific knowledge. + +config RESCTRL_RMID_DEPENDS_ON_CLOSID + bool + depends on RESCTRL_FS + help + Enabled by the architecture when the RMID values depend on the CLOSID. + This causes the CLOSID allocator to search for CLOSID with clean + RMID. 
diff --git a/fs/resctrl/Makefile b/fs/resctrl/Makefile new file mode 100644 index 000000000000..e67f34d2236a --- /dev/null +++ b/fs/resctrl/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o +obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: +CFLAGS_monitor.o = -I$(src) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c new file mode 100644 index 000000000000..6ed2dfd4dbbd --- /dev/null +++ b/fs/resctrl/ctrlmondata.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/tick.h> + +#include "internal.h" + +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + +typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, + struct resctrl_schema *s, + struct rdt_ctrl_domain *d); + +/* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + int ret; + u32 bw; + + /* + * Only linear delay values is supported for current Intel SKUs. 
+ */ + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return false; + } + + ret = kstrtou32(buf, 10, &bw); + if (ret) { + rdt_last_cmd_printf("Invalid MB value %s\n", buf); + return false; + } + + /* Nothing else to do if software controller is enabled. */ + if (is_mba_sc(r)) { + *data = bw; + return true; + } + + if (bw < r->membw.min_bw || bw > r->membw.max_bw) { + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", + bw, r->membw.min_bw, r->membw.max_bw); + return false; + } + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; + struct rdt_resource *r = s->res; + u32 bw_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + + if (!bw_validate(data->buf, &bw_val, r)) + return -EINVAL; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + + cfg->new_ctrl = bw_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Check whether a cache bit mask is valid. + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 + * + * Haswell does not support a non-contiguous 1s value and additionally + * requires at least two bits set. + * AMD allows non-contiguous bitmasks. 
+ */ +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit, val; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) { + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); + return false; + } + + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { + rdt_last_cmd_puts("Mask out of range\n"); + return false; + } + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Are non-contiguous bitmasks allowed? */ + if (!r->cache.arch_has_sparse_bitmasks && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { + rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); + return false; + } + + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in the mask\n", + r->cache.min_cbm_bits); + return false; + } + + *data = val; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct rdtgroup *rdtgrp = data->rdtgrp; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 cbm_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + + /* + * Cannot set up more than one pseudo-locked region in a cache + * hierarchy. 
+ */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + rdtgroup_pseudo_locked_in_hierarchy(d)) { + rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); + return -EINVAL; + } + + if (!cbm_validate(data->buf, &cbm_val, r)) + return -EINVAL; + + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_SHAREABLE) && + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { + rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); + return -EINVAL; + } + + /* + * The CBM may not overlap with the CBM of another closid if + * either is exclusive. + */ + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { + rdt_last_cmd_puts("Overlaps with exclusive group\n"); + return -EINVAL; + } + + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + rdt_last_cmd_puts("Overlaps with other group\n"); + return -EINVAL; + } + } + + cfg->new_ctrl = cbm_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. 
+ */ +static int parse_line(char *line, struct resctrl_schema *s, + struct rdtgroup *rdtgrp) +{ + enum resctrl_conf_type t = s->conf_type; + ctrlval_parser_t *parse_ctrlval = NULL; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + struct rdt_parse_data data; + struct rdt_ctrl_domain *d; + char *dom = NULL, *id; + unsigned long dom_id; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + parse_ctrlval = &parse_cbm; + break; + case RESCTRL_SCHEMA_RANGE: + parse_ctrlval = &parse_bw; + break; + } + + if (WARN_ON_ONCE(!parse_ctrlval)) + return -EINVAL; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + dom = strim(dom); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (d->hdr.id == dom_id) { + data.buf = dom; + data.rdtgrp = rdtgrp; + if (parse_ctrlval(&data, s, d)) + return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + cfg = &d->staged_config[t]; + /* + * In pseudo-locking setup mode and just + * parsed a valid CBM that should be + * pseudo-locked. Only one locked region per + * resource group and domain so just do + * the required initialization for single + * region and return. 
+ */ + rdtgrp->plr->s = s; + rdtgrp->plr->d = d; + rdtgrp->plr->cbm = cfg->new_ctrl; + d->plr = rdtgrp->plr; + return 0; + } + goto next; + } + } + return -EINVAL; +} + +static int rdtgroup_parse_resource(char *resname, char *tok, + struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + + list_for_each_entry(s, &resctrl_schema_all, list) { + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) + return parse_line(tok, s, rdtgrp); + } + rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); + return -EINVAL; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct resctrl_schema *s; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + /* + * No changes to pseudo-locked region allowed. It has to be removed + * and re-created instead. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = -EINVAL; + rdt_last_cmd_puts("Resource group is pseudo-locked\n"); + goto out; + } + + rdt_staged_configs_clear(); + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strim(strsep(&tok, ":")); + if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); + ret = -EINVAL; + goto out; + } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); + ret = -EINVAL; + goto out; + } + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); + if (ret) + goto out; + } + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. 
+ */ + if (is_mba_sc(r)) + continue; + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret) + goto out; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * If pseudo-locking fails we keep the resource group in + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service + * active and updated for just the domain the pseudo-locked + * region was requested for. + */ + ret = rdtgroup_pseudo_lock_create(rdtgrp); + } + +out: + rdt_staged_configs_clear(); + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +{ + struct rdt_resource *r = schema->res; + struct rdt_ctrl_domain *dom; + bool sep = false; + u32 ctrl_val; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { + if (sep) + seq_puts(s, ";"); + + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + + seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + struct rdtgroup *rdtgrp; + int ret = 0; + u32 closid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + list_for_each_entry(schema, &resctrl_schema_all, list) { + seq_printf(s, "%s:uninitialized\n", schema->name); + } + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->s->res->name, + rdtgrp->plr->d->hdr.id, + rdtgrp->plr->cbm); + } + } else { + closid = rdtgrp->closid; + list_for_each_entry(schema, 
&resctrl_schema_all, list) { + if (closid < schema->num_closid) + show_doms(s, schema, closid); + } + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} + +static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!strcmp(buf, "mbm_local_bytes")) { + if (resctrl_arch_is_mbm_local_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else + ret = -EINVAL; + } else if (!strcmp(buf, "mbm_total_bytes")) { + if (resctrl_arch_is_mbm_total_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (ret) + rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + switch (rdtgrp->mba_mbps_event) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + seq_puts(s, "mbm_local_bytes\n"); + break; + case QOS_L3_MBM_TOTAL_EVENT_ID: + seq_puts(s, "mbm_total_bytes\n"); + break; + default: + pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); + ret = -EINVAL; + break; + } + } else { + ret = -ENOENT; + } + + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, + struct list_head **pos) +{ + struct rdt_domain_hdr *d; + struct list_head *l; + + list_for_each(l, h) { + d = list_entry(l, struct 
rdt_domain_hdr, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first) +{ + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + /* + * Setup the parameters to pass to mon_event_count() to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->r = r; + rr->d = d; + rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } + + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. 
+ */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(cpumask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); + + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); +} +/* + * seq_file show handler for a monitoring event file. Reads the event + * described by the struct mon_data in the kernfs node's priv field and prints + * the counter value, "Error" when the read reports -EIO, or "Unavailable" + * when it reports -EINVAL. Returns -ENOENT if the group or the requested + * domain cannot be found, -EIO if the node has no mon_data, otherwise 0. + */ +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + enum resctrl_res_level resid; + enum resctrl_event_id evtid; + struct rdt_domain_hdr *hdr; + struct rmid_read rr = {0}; + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + struct mon_data *md; + int domid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } + + md = of->kn->priv; + if (WARN_ON_ONCE(!md)) { + ret = -EIO; + goto out; + } + + resid = md->rid; + domid = md->domid; + evtid = md->evtid; + r = resctrl_arch_get_resource(resid); + + if (md->sum) { + /* + * This file requires summing across all domains that share + * the L3 cache id that was provided in the "domid" field of the + * struct mon_data. Search all domains in the resource for + * one that matches this cache id. + */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->ci->id == domid) { + rr.ci = d->ci; + mon_event_read(&rr, r, NULL, rdtgrp, + &d->ci->shared_cpu_map, evtid, false); + goto checkresult; + } + } + ret = -ENOENT; + goto out; + } else { + /* + * This file provides data from a single domain. Search + * the resource to find the domain with "domid". 
+ */ + hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + ret = -ENOENT; + goto out; + } + d = container_of(hdr, struct rdt_mon_domain, hdr); + mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + } + +checkresult: + + if (rr.err == -EIO) + seq_puts(m, "Error\n"); + else if (rr.err == -EINVAL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h new file mode 100644 index 000000000000..9a8cf6f11151 --- /dev/null +++ b/fs/resctrl/internal.h @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_RESCTRL_INTERNAL_H +#define _FS_RESCTRL_INTERNAL_H + +#include <linux/resctrl.h> +#include <linux/kernfs.h> +#include <linux/fs_context.h> +#include <linux/tick.h> + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. + * + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. 
+ */ +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) +{ + unsigned int cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ + if (tick_nohz_full_enabled()) { + cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu); + if (cpu < nr_cpu_ids) + return cpu; + } + + return cpumask_any_but(mask, exclude_cpu); +} + +struct rdt_fs_context { + struct kernfs_fs_context kfc; + bool enable_cdpl2; + bool enable_cdpl3; + bool enable_mba_mbps; + bool enable_debug; +}; + +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct rdt_fs_context, kfc); +} + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + * @configurable: true if the event is configurable + * @list: entry in &rdt_resource->evt_list + */ +struct mon_evt { + enum resctrl_event_id evtid; + char *name; + bool configurable; + struct list_head list; +}; + +/** + * struct mon_data - Monitoring details for each event file. + * @list: Member of the global @mon_data_kn_priv_list list. + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file. + * @sum: Set when event must be summed across multiple + * domains. + * @domid: When @sum is zero this is the domain to which + * the event file belongs. When @sum is one this + * is the id of the L3 cache that all domains to be + * summed share. + * + * Pointed to by the kernfs kn->priv field of monitoring event files. + * Readers and writers must hold rdtgroup_mutex. + */ +struct mon_data { + struct list_head list; + enum resctrl_res_level rid; + enum resctrl_event_id evtid; + int domid; + bool sum; +}; + +/** + * struct rmid_read - Data passed across smp_call*() to read event count. + * @rgrp: Resource group for which the counter is being read. 
If it is a parent + * resource group then its event count is summed with the count from all + * its child resource groups. + * @r: Resource describing the properties of the event being read. + * @d: Domain that the counter should be read from. If NULL then sum all + * domains in @r sharing L3 @ci.id + * @evtid: Which monitor event to read. + * @first: Initialize MBM counter when true. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @err: Error encountered when reading counter. + * @val: Returned value of event counter. If @rgrp is a parent resource group, + * @val includes the sum of event counts from its child resource groups. + * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, + * (summed across child resource groups if @rgrp is a parent resource group). + * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). + */ +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_resource *r; + struct rdt_mon_domain *d; + enum resctrl_event_id evtid; + bool first; + struct cacheinfo *ci; + int err; + u64 val; + void *arch_mon_ctx; +}; + +extern struct list_head resctrl_schema_all; + +extern bool resctrl_mounted; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * enum rdtgrp_mode - Mode of a RDT resource group + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations + * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes + * + * The mode of a resource group enables control over the allowed overlap + * between allocations associated with different resource groups (classes + * of service). 
User is able to modify the mode of a resource group by + * writing to the "mode" resctrl file associated with the resource group. + * + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by + * writing the appropriate text to the "mode" file. A resource group enters + * "pseudo-locked" mode after the schemata is written while the resource + * group is in "pseudo-locksetup" mode. + */ +enum rdtgrp_mode { + RDT_MODE_SHAREABLE = 0, + RDT_MODE_EXCLUSIVE, + RDT_MODE_PSEUDO_LOCKSETUP, + RDT_MODE_PSEUDO_LOCKED, + + /* Must be last */ + RDT_NUM_MODES, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn: kernfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. 
+ * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + * @mode: mode of resource group + * @mba_mbps_event: input monitoring event id when mba_sc is enabled + * @plr: pseudo-locked region + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; + enum rdtgrp_mode mode; + enum resctrl_event_id mba_mbps_event; + struct pseudo_lock_region *plr; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. 
+ */ +#define RFTYPE_INFO BIT(0) + +#define RFTYPE_BASE BIT(1) + +#define RFTYPE_CTRL BIT(4) + +#define RFTYPE_MON BIT(5) + +#define RFTYPE_TOP BIT(6) + +#define RFTYPE_RES_CACHE BIT(8) + +#define RFTYPE_RES_MB BIT(9) + +#define RFTYPE_DEBUG BIT(10) + +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) + +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) + +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) + +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width; + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + const struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. 
+ */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation + * @prev_bw: The most recent bandwidth in MBps + */ +struct mbm_state { + u64 prev_bw_bytes; + u32 prev_bw; +}; + +extern struct mutex rdtgroup_mutex; + +static inline const char *rdt_kn_name(const struct kernfs_node *kn) +{ + return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); +} + +extern struct rdtgroup rdtgroup_default; + +extern struct dentry *debugfs_resctrl; + +extern enum resctrl_event_id mba_mbps_default_event; + +void rdt_last_cmd_clear(void); + +void rdt_last_cmd_puts(const char *s); + +__printf(1, 2) +void rdt_last_cmd_printf(const char *fmt, ...); + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); + +void rdtgroup_kn_unlock(struct kernfs_node *kn); + +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); + +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask); + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, bool exclusive); + +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, + unsigned long cbm); + +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); + +int rdtgroup_tasks_assigned(struct rdtgroup *r); + +int closids_supported(void); + +void closid_free(int closid); + +int alloc_rmid(u32 closid); + +void free_rmid(u32 closid, u32 
rmid); + +void resctrl_mon_resource_exit(void); + +void mon_event_count(void *info); + +int rdtgroup_mondata_show(struct seq_file *m, void *arg); + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first); + +int resctrl_mon_resource_init(void); + +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, + unsigned long delay_ms, + int exclude_cpu); + +void mbm_handle_overflow(struct work_struct *work); + +bool is_mba_sc(struct rdt_resource *r); + +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu); + +void cqm_handle_limbo(struct work_struct *work); + +bool has_busy_rmid(struct rdt_mon_domain *d); + +void __check_limbo(struct rdt_mon_domain *d, bool force_free); + +void resctrl_file_fflags_init(const char *config, unsigned long fflags); + +void rdt_staged_configs_clear(void); + +bool closid_allocated(unsigned int closid); + +int resctrl_find_cleanest_closid(void); + +#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); + +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); + +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); + +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); + +int rdt_pseudo_lock_init(void); + +void rdt_pseudo_lock_release(void); + +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); + +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); + +#else +static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) +{ + return false; +} + +static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) +{ + return false; +} + +static 
inline int rdt_pseudo_lock_init(void) { return 0; } +static inline void rdt_pseudo_lock_release(void) { } +static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } +#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ + +#endif /* _FS_RESCTRL_INTERNAL_H */ diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c new file mode 100644 index 000000000000..bde2801289d3 --- /dev/null +++ b/fs/resctrl/monitor.c @@ -0,0 +1,929 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology (RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and data structures originally from Peter Zijlstra and Matt Fleming. + * + * More information about RDT can be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include <linux/cpu.h> +#include <linux/resctrl.h> +#include <linux/sizes.h> +#include <linux/slab.h> + +#include "internal.h" + +#define CREATE_TRACE_POINTS + +#include "monitor_trace.h" + +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. 
+ */ +struct rmid_entry { + u32 closid; + u32 rmid; + int busy; + struct list_head list; +}; + +/* + * @rmid_free_lru - A least recently used list of free RMIDs. + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy. + */ +static LIST_HEAD(rmid_free_lru); + +/* + * @closid_num_dirty_rmid - The number of dirty RMIDs each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* + * @rmid_limbo_count - count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have an occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. + */ +static unsigned int rmid_limbo_count; + +/* + * @rmid_ptrs - Array of struct rmid_entry, indexed by the RMID index. + * Holds the entries used by the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * This is the threshold cache occupancy in bytes at which we will consider an + * RMID available for re-allocation. + */ +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. + */ +unsigned int resctrl_rmid_realloc_limit; + +/* + * x86 and arm64 differ in their handling of monitoring. + * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. 
+ */ + +/* + * Look up the entry for @idx in rmid_ptrs[] and warn once if its stored + * CLOSID or RMID disagrees with what the arch code decodes from @idx. + */ +static inline struct rmid_entry *__rmid_entry(u32 idx) +{ + struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); + + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); + + return entry; +} + +/* + * Move an RMID that has left limbo onto the tail of the free list and update + * the limbo accounting. Caller must hold rdtgroup_mutex. + */ +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID. + * + * When @force_free is true the busy bit is cleared regardless of the + * occupancy that was read. If no monitor context can be allocated a + * rate-limited warning is logged and no RMID is checked. + */ +void __check_limbo(struct rdt_mon_domain *d, bool force_free) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + struct rmid_entry *entry; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; + bool rmid_dirty; + u64 val = 0; + + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + /* Terminate the message with a newline so it is logged as a complete line. */ + pr_warn_ratelimited("Failed to allocate monitor context: %ld\n", + PTR_ERR(arch_mon_ctx)); + return; + } + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. 
+ */ + for (;;) { + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) + break; + + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + + /* + * x86's CLOSID and RMID are independent numbers, so the entry's + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't + * used to select the configuration. It is thus necessary to track both + * CLOSID and RMID because there may be dependencies between them + * on some architectures. + */ + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); + } + + if (force_free || !rmid_dirty) { + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); + } + cur_idx = idx + 1; + } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); +} +/* + * Return true if any bit is set in the rmid_busy_llc bitmap of @d, i.e. at + * least one RMID index is still marked busy for this domain. + */ +bool has_busy_rmid(struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} +/* + * Find an entry on rmid_free_lru that can be used with @closid: the first + * entry whose index is unchanged when re-encoded with @closid. The entry is + * left on the list. Returns ERR_PTR(-EBUSY) if the list is empty while RMIDs + * are in limbo, ERR_PTR(-ENOSPC) if it is empty otherwise or if no entry + * matches. + */ +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. 
+ */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. + */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; +} + +/* + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. 
+ */ +int alloc_rmid(u32 closid) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + list_del(&entry->list); + return entry->rmid; +} +/* + * Mark @entry busy in every monitor domain of the L3 resource and count it + * as being in limbo. entry->busy ends up as the number of domains marked. + * The limbo handler is scheduled for any domain that had no busy RMID. + */ +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + u32 idx; + + lockdep_assert_held(&rdtgroup_mutex); + + /* Walking r->mon_domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); + + entry->busy = 0; + list_for_each_entry(d, &r->mon_domains, hdr.list) { + /* + * For the first limbo RMID in the domain, + * set up the limbo worker. + */ + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); + entry->busy++; + } + + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; +} +/* + * Release the RMID identified by (@closid, @rmid). Nothing is done when + * monitoring is not supported or for the reserved default index. The entry + * goes through limbo when LLC occupancy monitoring is enabled, otherwise it + * is added straight to the tail of the free list. + */ +void free_rmid(u32 closid, u32 rmid) +{ + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. 
+ */ + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); + + if (resctrl_arch_is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} +/* + * Return the cached mbm_state for (@closid, @rmid) in @d: an element of + * d->mbm_total[] or d->mbm_local[] selected by @evtid, or NULL for any + * other event. + */ +static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) +{ + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &d->mbm_total[idx]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &d->mbm_local[idx]; + default: + return NULL; + } +} +/* + * Read one (@closid, @rmid) counter for the event in @rr and add it to + * rr->val. When rr->first is set, the counter and its cached mbm_state are + * reset instead and 0 is returned. With rr->d set a single domain is read; + * with rr->d NULL every monitor domain sharing the cache id of rr->ci is + * read and the call succeeds if any of those reads succeeds. Returns 0 on + * success or a negative error. Note that the CPU-mask checks return -EINVAL + * without updating rr->err. + */ +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +{ + int cpu = smp_processor_id(); + struct rdt_mon_domain *d; + struct mbm_state *m; + int err, ret; + u64 tval = 0; + + if (rr->first) { + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (m) + memset(m, 0, sizeof(struct mbm_state)); + return 0; + } + + if (rr->d) { + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) + return -EINVAL; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; + } + + /* Summing domains that share a cache, must be on a CPU for that cache. */ + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + return -EINVAL; + + /* + * Legacy files must report the sum of an event across all + * domains that share the same L3 cache instance. + * Report success if a read from any domain succeeds, -EINVAL + * (translated to "Unavailable" for user space) if reading from + * all domains fail for any reason. 
+ */ + ret = -EINVAL; + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { + if (d->ci->id != rr->ci->id) + continue; + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (!err) { + rr->val += tval; + ret = 0; + } + } + + if (ret) + rr->err = ret; + + return ret; +} + +/* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * + * Supporting function to calculate the memory bandwidth + * and delta bandwidth in MBps. The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. + */ +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +{ + u64 cur_bw, bytes, cur_bytes; + struct mbm_state *m; + + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (WARN_ON_ONCE(!m)) + return; + + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; + + cur_bw = bytes / SZ_1M; + + m->prev_bw = cur_bw; +} + +/* + * This is scheduled by mon_event_read() to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + int ret; + + rdtgrp = rr->rgrp; + + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + + /* + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) + ret = 0; + } + } + + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. + */ + if (ret == 0) + rr->err = 0; +} + +static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +/* + * Feedback loop for MBA software controller (mba_sc) + * + * mba_sc is a feedback loop where we periodically read MBM counters and + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so + * that: + * + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) + * + * This uses the MBM counters to measure the bandwidth and MBA throttle + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the + * fact that resctrl rdtgroups have both monitoring and control. + * + * The frequency of the checks is 1s and we just tag along the MBM overflow + * timer. Having 1s interval makes the calculation of bandwidth simpler. + * + * Although MBA's goal is to restrict the bandwidth to a maximum, there may + * be a need to increase the bandwidth to avoid unnecessarily restricting + * the L2 <-> L3 traffic. + * + * Since MBA controls the L2 external bandwidth where as MBM measures the + * L3 external bandwidth the following sequence could lead to such a + * situation. 
+ * + * Consider an rdtgroup which had high L3 <-> memory traffic in initial + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but + * after some time rdtgroup has mostly L2 <-> L3 traffic. + * + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its + * throttle MSRs already have low percentage values. To avoid + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. + */ +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) +{ + u32 closid, rmid, cur_msr_val, new_msr_val; + struct mbm_state *pmbm_data, *cmbm_data; + struct rdt_ctrl_domain *dom_mba; + enum resctrl_event_id evt_id; + struct rdt_resource *r_mba; + struct list_head *head; + struct rdtgroup *entry; + u32 cur_bw, user_bw; + + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + evt_id = rgrp->mba_mbps_event; + + closid = rgrp->closid; + rmid = rgrp->mon.rmid; + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); + if (WARN_ON_ONCE(!pmbm_data)) + return; + + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); + if (!dom_mba) { + pr_warn_once("Failure to get domain for MBA update\n"); + return; + } + + cur_bw = pmbm_data->prev_bw; + user_bw = dom_mba->mbps_val[closid]; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rgrp->mon.crdtgrp_list; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); + if (WARN_ON_ONCE(!cmbm_data)) + return; + cur_bw += cmbm_data->prev_bw; + } + + /* + * Scale up/down the bandwidth linearly for the ctrl group. The + * bandwidth step is the bandwidth granularity specified by the + * hardware. + * Always increase throttling if current bandwidth is above the + * target set by user. 
+ * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". + */ + /* + * NOTE(review): the projection below adds membw.min_bw, while the step + * actually applied is membw.bw_gran; presumably the two are equal on + * hardware that reaches this path - confirm. + */ + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; + } else if (cur_msr_val < MAX_MBA_BW && + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; + } else { + return; + } + + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); +} + +/* + * Read @evtid for (@closid, @rmid) in domain @d into a local struct + * rmid_read and, when the software controller is enabled, feed the result + * to mbm_bw_count(). The return value of __mon_event_count() is ignored. + * If no monitor context can be allocated a rate-limited warning is logged + * and nothing is read. + */ +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id evtid) +{ + struct rmid_read rr = {0}; + + rr.r = r; + rr.d = d; + rr.evtid = evtid; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + /* Terminate the message with a newline so it is logged as a complete line. */ + pr_warn_ratelimited("Failed to allocate monitor context: %ld\n", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); + + /* + * If the software controller is enabled, compute the + * bandwidth for this event id. + */ + if (is_mba_sc(NULL)) + mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); +} + +/* + * Run mbm_update_one_event() for each of the total and local MBM events + * that the arch code reports as enabled. + */ +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid) +{ + /* + * This is protected from concurrent reads from user as both + * the user and overflow handler hold the global mutex. 
+	 */
+	if (resctrl_arch_is_mbm_total_enabled())
+		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+	if (resctrl_arch_is_mbm_local_enabled())
+		mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/*
+ * Work handler to scan the limbo list and move the RMIDs whose
+ * occupancy < threshold_occupancy to the free list.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+	struct rdt_mon_domain *d;
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
+
+	__check_limbo(d, false);
+
+	if (has_busy_rmid(d)) {	/* still busy RMIDs: run again after 'delay' */
+		d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+							   RESCTRL_PICK_ANY_CPU);
+		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
+					 delay);
+	}
+
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+}
+
+/**
+ * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
+ *                             domain.
+ * @dom:           The domain the limbo handler should run for.
+ * @delay_ms:      How far in the future the handler should run, in ms.
+ * @exclude_cpu:   Which CPU the handler should not run on,
+ *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+			     int exclude_cpu)
+{
+	unsigned long delay = msecs_to_jiffies(delay_ms);
+	int cpu;
+
+	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
+	dom->cqm_work_cpu = cpu;
+
+	if (cpu < nr_cpu_ids)	/* only schedule if a valid CPU was picked */
+		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+void mbm_handle_overflow(struct work_struct *work)
+{
+	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+	struct rdtgroup *prgrp, *crgrp;
+	struct rdt_mon_domain *d;
+	struct list_head *head;
+	struct rdt_resource *r;
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	/*
+	 * If the filesystem has been unmounted this work no longer needs to
+	 * run.
+	 */
+	if (!resctrl_mounted || !resctrl_arch_mon_capable())
+		goto out_unlock;
+
+	r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+	d = container_of(work, struct rdt_mon_domain, mbm_over.work);
+
+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
+
+		head = &prgrp->mon.crdtgrp_list;
+		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
+			mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
+	}
+
+	/*
+	 * Re-check for housekeeping CPUs on every run. This allows the
+	 * overflow handler to move off a nohz_full CPU quickly.
+	 */
+	d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+						   RESCTRL_PICK_ANY_CPU);
+	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+}
+
+/**
+ * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
+ *                                domain.
+ * @dom:           The domain the overflow handler should run for.
+ * @delay_ms:      How far in the future the handler should run, in ms.
+ * @exclude_cpu:   Which CPU the handler should not run on,
+ *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+				int exclude_cpu)
+{
+	unsigned long delay = msecs_to_jiffies(delay_ms);
+	int cpu;
+
+	/*
+	 * When a domain comes online there is no guarantee the filesystem is
+	 * mounted. If not, there is no need to catch counter overflow.
+	 */
+	if (!resctrl_mounted || !resctrl_arch_mon_capable())
+		return;
+	cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
+	dom->mbm_work_cpu = cpu;
+
+	if (cpu < nr_cpu_ids)	/* only schedule if a valid CPU was picked */
+		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+	u32 num_closid = resctrl_arch_get_num_closid(r);
+	struct rmid_entry *entry = NULL;
+	int err = 0, i;
+	u32 idx;
+
+	mutex_lock(&rdtgroup_mutex);
+	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+		u32 *tmp;
+
+		/*
+		 * If the architecture hasn't provided a sanitised num_closid
+		 * here, the array may be larger than necessary. Resctrl will
+		 * use a smaller system wide value based on the resources in
+		 * use.
+		 */
+		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
+		if (!tmp) {
+			err = -ENOMEM;
+			goto out_unlock;
+		}
+
+		closid_num_dirty_rmid = tmp;
+	}
+
+	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
+	if (!rmid_ptrs) {
+		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+			kfree(closid_num_dirty_rmid);	/* undo the allocation above */
+			closid_num_dirty_rmid = NULL;
+		}
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	for (i = 0; i < idx_limit; i++) {
+		entry = &rmid_ptrs[i];
+		INIT_LIST_HEAD(&entry->list);
+
+		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
+		list_add_tail(&entry->list, &rmid_free_lru);	/* every index starts out free */
+	}
+
+	/*
+	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
+	 * are always allocated. These are used for the rdtgroup_default
+	 * control group, which will be setup later in resctrl_init().
+	 */
+	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+					   RESCTRL_RESERVED_RMID);
+	entry = __rmid_entry(idx);
+	list_del(&entry->list);	/* take the reserved entry off the free list */
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+
+	return err;
+}
+
+static void dom_data_exit(struct rdt_resource *r)
+{
+	mutex_lock(&rdtgroup_mutex);
+
+	if (!r->mon_capable)
+		goto out_unlock;
+
+	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+		kfree(closid_num_dirty_rmid);
+		closid_num_dirty_rmid = NULL;
+	}
+
+	kfree(rmid_ptrs);
+	rmid_ptrs = NULL;
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct mon_evt llc_occupancy_event = {
+	.name		= "llc_occupancy",
+	.evtid		= QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = {
+	.name		= "mbm_total_bytes",
+	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = {
+	.name		= "mbm_local_bytes",
+	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource with the enabled L3 events.
+ *
+ * Note that MBM events are also part of the RDT_RESOURCE_L3 resource
+ * because, as per the SDM, the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+	INIT_LIST_HEAD(&r->evt_list);
+
+	if (resctrl_arch_is_llc_occupancy_enabled())
+		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+	if (resctrl_arch_is_mbm_total_enabled())
+		list_add_tail(&mbm_total_event.list, &r->evt_list);
+	if (resctrl_arch_is_mbm_local_enabled())
+		list_add_tail(&mbm_local_event.list, &r->evt_list);
+}
+
+/**
+ * resctrl_mon_resource_init() - Initialise global monitoring structures.
+ *
+ * Allocate and initialise global monitor resources that do not belong to a
+ * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
+ * Called once during boot after the struct rdt_resource's have been configured
+ * but before the filesystem is mounted.
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain
+ * online.
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+int resctrl_mon_resource_init(void)
+{
+	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+	int ret;
+
+	if (!r->mon_capable)
+		return 0;	/* nothing to set up without L3 monitoring */
+
+	ret = dom_data_init(r);
+	if (ret)
+		return ret;
+
+	l3_mon_evt_init(r);
+
+	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+		mbm_total_event.configurable = true;
+		resctrl_file_fflags_init("mbm_total_bytes_config",
+					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+	}
+	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
+		mbm_local_event.configurable = true;
+		resctrl_file_fflags_init("mbm_local_bytes_config",
+					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+	}
+
+	if (resctrl_arch_is_mbm_local_enabled())	/* prefer local over total */
+		mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+	else if (resctrl_arch_is_mbm_total_enabled())
+		mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+	return 0;
+}
+
+void resctrl_mon_resource_exit(void)
+{
+	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+	dom_data_exit(r);	/* frees what dom_data_init() allocated */
+}
diff --git a/fs/resctrl/monitor_trace.h b/fs/resctrl/monitor_trace.h
new file mode 100644
index 000000000000..fdf49f22576a
--- /dev/null
+++ b/fs/resctrl/monitor_trace.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _FS_RESCTRL_MONITOR_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mon_llc_occupancy_limbo,
+	    TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+	    TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+	    TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+			     __field(u32, mon_hw_id)
+			     __field(int, domain_id)
+			     __field(u64, llc_occupancy_bytes)),
+	    TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+			   __entry->mon_hw_id = mon_hw_id;
+			   __entry->domain_id = domain_id;
+			   __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+
TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", + __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, + __entry->llc_occupancy_bytes) + ); + +#endif /* _FS_RESCTRL_MONITOR_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . + +#define TRACE_INCLUDE_FILE monitor_trace + +#include <trace/define_trace.h> diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c new file mode 100644 index 000000000000..ccc2f9213b4b --- /dev/null +++ b/fs/resctrl/pseudo_lock.c @@ -0,0 +1,1105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resource Director Technology (RDT) + * + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Reinette Chatre <reinette.chatre@intel.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cacheinfo.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/kthread.h> +#include <linux/mman.h> +#include <linux/pm_qos.h> +#include <linux/resctrl.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include "internal.h" + +/* + * Major number assigned to and shared by all devices exposing + * pseudo-locked regions. + */ +static unsigned int pseudo_lock_major; + +static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); /* a set bit marks a free minor number */ + +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) +{ + const struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + guard(mutex)(&rdtgroup_mutex); + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); +} + +static const struct class pseudo_lock_class = { + .name = "pseudo_lock", + .devnode = pseudo_lock_devnode, +}; + +/** + * pseudo_lock_minor_get - Obtain available minor number + * @minor: Pointer to where new minor number will be stored + * + * A bitmask is used to track available minor numbers.
Here the next free
+ * minor number is marked as unavailable and returned.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int pseudo_lock_minor_get(unsigned int *minor)
+{
+	unsigned long first_bit;
+
+	first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
+
+	if (first_bit == MINORBITS)
+		return -ENOSPC;	/* no set (free) bit left */
+
+	__clear_bit(first_bit, &pseudo_lock_minor_avail);
+	*minor = first_bit;
+
+	return 0;
+}
+
+/**
+ * pseudo_lock_minor_release - Return minor number to the available set
+ * @minor: The minor number made available again
+ */
+static void pseudo_lock_minor_release(unsigned int minor)
+{
+	__set_bit(minor, &pseudo_lock_minor_avail);
+}
+
+/**
+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
+ * @minor: The minor number of the device representing pseudo-locked region
+ *
+ * When the character device is accessed we need to determine which
+ * pseudo-locked region it belongs to. This is done by matching the minor
+ * number of the device to the pseudo-locked region it belongs to.
+ *
+ * Minor numbers are assigned at the time a pseudo-locked region is associated
+ * with a cache instance.
+ *
+ * Return: On success return pointer to resource group owning the pseudo-locked
+ * region, NULL on failure.
+ */
+static struct rdtgroup *region_find_by_minor(unsigned int minor)
+{
+	struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
+
+	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+		if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
+			rdtgrp_match = rdtgrp;
+			break;
+		}
+	}
+	return rdtgrp_match;
+}
+
+/**
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
+ * @list:	Entry within the @pm_reqs list of a pseudo-locked region
+ * @req:	PM QoS request
+ */
+struct pseudo_lock_pm_req {
+	struct list_head list;
+	struct dev_pm_qos_request req;
+};
+
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+	struct pseudo_lock_pm_req *pm_req, *next;
+
+	list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
+		dev_pm_qos_remove_request(&pm_req->req);
+		list_del(&pm_req->list);
+		kfree(pm_req);
+	}
+}
+
+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ * @plr: Pseudo-locked region
+ *
+ * To prevent the cache from being affected by power management entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states. In this case the latency
+ * requirement needs to prevent entering C2 also.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
+{
+	struct pseudo_lock_pm_req *pm_req;
+	int cpu;
+	int ret;
+
+	for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
+		pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
+		if (!pm_req) {
+			rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
+			ret = -ENOMEM;
+			goto out_err;
+		}
+		ret = dev_pm_qos_add_request(get_cpu_device(cpu),
+					     &pm_req->req,
+					     DEV_PM_QOS_RESUME_LATENCY,
+					     30);
+		if (ret < 0) {
+			rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
+					    cpu);
+			kfree(pm_req);	/* never added to plr->pm_reqs */
+			ret = -1;
+			goto out_err;
+		}
+		list_add(&pm_req->list, &plr->pm_reqs);
+	}
+
+	return 0;
+
+out_err:
+	pseudo_lock_cstates_relax(plr);	/* drop the requests added so far */
+	return ret;
+}
+
+/**
+ * pseudo_lock_region_clear - Reset pseudo-lock region data
+ * @plr: pseudo-lock region
+ *
+ * All content of the pseudo-locked region is reset - any memory allocated
+ * is freed.
+ *
+ * Return: void
+ */
+static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
+{
+	plr->size = 0;
+	plr->line_size = 0;
+	kfree(plr->kmem);
+	plr->kmem = NULL;
+	plr->s = NULL;
+	if (plr->d)
+		plr->d->plr = NULL;	/* detach the region from its domain */
+	plr->d = NULL;
+	plr->cbm = 0;
+	plr->debugfs_dir = NULL;
+}
+
+/**
+ * pseudo_lock_region_init - Initialize pseudo-lock region information
+ * @plr: pseudo-lock region
+ *
+ * Called after user provided a schemata to be pseudo-locked. From the
+ * schemata the &struct pseudo_lock_region is on entry already initialized
+ * with the resource, domain, and capacity bitmask. Here the information
+ * required for pseudo-locking is deduced from this data and &struct
+ * pseudo_lock_region initialized further.
This information includes:
+ * - size in bytes of the region to be pseudo-locked
+ * - cache line size to know the stride with which data needs to be accessed
+ *   to be pseudo-locked
+ * - a cpu associated with the cache instance on which the pseudo-locking
+ *   flow can be executed
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
+{
+	enum resctrl_scope scope = plr->s->res->ctrl_scope;
+	struct cacheinfo *ci;
+	int ret;
+
+	if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
+		return -ENODEV;
+
+	/* Pick the first CPU in the cache domain's mask; it must be online. */
+	plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
+
+	if (!cpu_online(plr->cpu)) {
+		rdt_last_cmd_printf("CPU %u associated with cache not online\n",
+				    plr->cpu);
+		ret = -ENODEV;
+		goto out_region;
+	}
+
+	ci = get_cpu_cacheinfo_level(plr->cpu, scope);
+	if (ci) {
+		plr->line_size = ci->coherency_line_size;
+		plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
+		return 0;
+	}
+
+	ret = -1;	/* no cacheinfo for this CPU at the requested scope */
+	rdt_last_cmd_puts("Unable to determine cache line size\n");
+out_region:
+	pseudo_lock_region_clear(plr);
+	return ret;
+}
+
+/**
+ * pseudo_lock_init - Initialize a pseudo-lock region
+ * @rdtgrp: resource group to which new pseudo-locked region will belong
+ *
+ * A pseudo-locked region is associated with a resource group. When this
+ * association is created the pseudo-locked region is initialized. The
+ * details of the pseudo-locked region are not known at this time so only
+ * allocation is done and association established.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_init(struct rdtgroup *rdtgrp)
+{
+	struct pseudo_lock_region *plr;
+
+	plr = kzalloc(sizeof(*plr), GFP_KERNEL);
+	if (!plr)
+		return -ENOMEM;
+
+	init_waitqueue_head(&plr->lock_thread_wq);
+	INIT_LIST_HEAD(&plr->pm_reqs);
+	rdtgrp->plr = plr;
+	return 0;
+}
+
+/**
+ * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
+ * @plr: pseudo-lock region
+ *
+ * Initialize the details required to set up the pseudo-locked region and
+ * allocate the contiguous memory that will be pseudo-locked to the cache.
+ *
+ * Return: 0 on success, <0 on failure (e.g. -E2BIG, -ENOMEM). A descriptive
+ * error is written to the last_cmd_status buffer.
+ */
+static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
+{
+	int ret;
+
+	ret = pseudo_lock_region_init(plr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * We do not yet support contiguous regions larger than
+	 * KMALLOC_MAX_SIZE, so reject those before allocating.
+	 */
+	if (plr->size > KMALLOC_MAX_SIZE) {
+		rdt_last_cmd_puts("Requested region exceeds maximum size\n");
+		ret = -E2BIG;
+		goto out_region;
+	}
+
+	plr->kmem = kzalloc(plr->size, GFP_KERNEL);
+	if (!plr->kmem) {
+		rdt_last_cmd_puts("Unable to allocate memory\n");
+		ret = -ENOMEM;
+		goto out_region;
+	}
+
+	ret = 0;
+	goto out;
+out_region:
+	pseudo_lock_region_clear(plr);
+out:
+	return ret;
+}
+
+/**
+ * pseudo_lock_free - Free a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-locked region belonged
+ *
+ * The pseudo-locked region's resources have already been released, or not
+ * yet created at this point. Now it can be freed and disassociated from the
+ * resource group.
+ *
+ * Return: void
+ */
+static void pseudo_lock_free(struct rdtgroup *rdtgrp)
+{
+	pseudo_lock_region_clear(rdtgrp->plr);
+	kfree(rdtgrp->plr);
+	rdtgrp->plr = NULL;
+}
+
+/**
+ * rdtgroup_monitor_in_progress - Test if monitoring is in progress
+ * @rdtgrp: resource group being queried
+ *
+ * Return: 1 if the resource group's list of child monitor groups is
+ * non-empty, 0 otherwise.
+ */
+static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
+{
+	return !list_empty(&rdtgrp->mon.crdtgrp_list);
+}
+
+/**
+ * rdtgroup_locksetup_user_restrict - Restrict user access to group
+ * @rdtgrp: resource group needing access restricted
+ *
+ * A resource group used for cache pseudo-locking cannot have cpus or tasks
+ * assigned to it. This is communicated to the user by restricting access
+ * to all the files that can be used to make such changes.
+ *
+ * Permissions restored with rdtgroup_locksetup_user_restore()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restriction of access an attempt will be made to restore permissions but
+ * the state of the mode of these files will be uncertain when a failure
+ * occurs.
+ */
+static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
+{
+	int ret;
+
+	ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+	if (ret)
+		return ret;	/* nothing restricted yet, nothing to undo */
+
+	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+	if (ret)
+		goto err_tasks;
+
+	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+	if (ret)
+		goto err_cpus;
+
+	if (resctrl_arch_mon_capable()) {
+		ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
+		if (ret)
+			goto err_cpus_list;
+	}
+
+	ret = 0;
+	goto out;
+
+err_cpus_list:	/* unwind in reverse order of restriction */
+	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+err_cpus:
+	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+err_tasks:
+	rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+out:
+	return ret;
+}
+
+/**
+ * rdtgroup_locksetup_user_restore - Restore user access to group
+ * @rdtgrp: resource group needing access restored
+ *
+ * Restore all file access previously removed using
+ * rdtgroup_locksetup_user_restrict()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restoration of access an attempt will be made to restrict permissions
+ * again but the state of the mode of these files will be uncertain when
+ * a failure occurs.
+ */
+static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
+{
+	int ret;
+
+	ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+	if (ret)
+		return ret;	/* nothing restored yet, nothing to undo */
+
+	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+	if (ret)
+		goto err_tasks;
+
+	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+	if (ret)
+		goto err_cpus;
+
+	if (resctrl_arch_mon_capable()) {
+		ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
+		if (ret)
+			goto err_cpus_list;
+	}
+
+	ret = 0;
+	goto out;
+
+err_cpus_list:	/* unwind in reverse order of restoration */
+	rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+err_cpus:
+	rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+err_tasks:
+	rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+out:
+	return ret;
+}
+
+/**
+ * rdtgroup_locksetup_enter - Resource group enters locksetup mode
+ * @rdtgrp: resource group requested to enter locksetup mode
+ *
+ * A resource group enters locksetup mode to reflect that it would be used
+ * to represent a pseudo-locked region and is in the process of being set
+ * up to do so. A resource group used for a pseudo-locked region would
+ * lose the closid associated with it so we cannot allow it to have any
+ * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
+ * future. Monitoring of a pseudo-locked region is not allowed either.
+ *
+ * The above and more restrictions on a pseudo-locked region are checked
+ * for and enforced before the resource group enters the locksetup mode.
+ *
+ * Return: 0 if the resource group successfully entered locksetup mode, <0
+ * on failure. On failure the last_cmd_status buffer is updated with text to
+ * communicate details of the failure to the user.
+ */
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+	int ret;
+
+	/*
+	 * The default resource group can neither be removed nor lose the
+	 * default closid associated with it.
+	 */
+	if (rdtgrp == &rdtgroup_default) {
+		rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Cache Pseudo-locking is not supported when CDP is enabled.
+	 *
+	 * Some things to consider if you would like to enable this
+	 * support (using L3 CDP as an example):
+	 * - When CDP is enabled two separate resources are exposed,
+	 *   L3DATA and L3CODE, but they are actually on the same cache.
+	 *   The implication for pseudo-locking is that if a
+	 *   pseudo-locked region is created on a domain of one
+	 *   resource (e.g. L3CODE), then a pseudo-locked region cannot
+	 *   be created on that same domain of the other resource
+	 *   (e.g. L3DATA). This is because the creation of a
+	 *   pseudo-locked region involves a call to wbinvd that will
+	 *   affect all cache allocations on that particular domain.
+	 * - Considering the previous, it may be possible to only
+	 *   expose one of the CDP resources to pseudo-locking and
+	 *   hide the other. For example, we could consider to only
+	 *   expose L3DATA and since the L3 cache is unified it is
+	 *   still possible to place instructions there and execute them.
+	 * - If only one region is exposed to pseudo-locking we should
+	 *   still keep in mind that availability of a portion of cache
+	 *   for pseudo-locking should take into account both resources.
+	 *   Similarly, if a pseudo-locked region is created in one
+	 *   resource, the portion of cache used by it should be made
+	 *   unavailable to all future allocations from both resources.
+	 */
+	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
+	    resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
+		rdt_last_cmd_puts("CDP enabled\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Not knowing the bits to disable prefetching implies that this
+	 * platform does not support Cache Pseudo-Locking.
+	 */
+	if (resctrl_arch_get_prefetch_disable_bits() == 0) {
+		rdt_last_cmd_puts("Pseudo-locking not supported\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_monitor_in_progress(rdtgrp)) {
+		rdt_last_cmd_puts("Monitoring in progress\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_tasks_assigned(rdtgrp)) {
+		rdt_last_cmd_puts("Tasks assigned to resource group\n");
+		return -EINVAL;
+	}
+
+	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
+		rdt_last_cmd_puts("CPUs assigned to resource group\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
+		rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
+		return -EIO;
+	}
+
+	ret = pseudo_lock_init(rdtgrp);
+	if (ret) {
+		rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
+		goto out_release;
+	}
+
+	/*
+	 * If this system is capable of monitoring, an RMID would have been
+	 * allocated when the control group was created. This is not needed
+	 * anymore when this group would be used for pseudo-locking. This
+	 * is safe to call on platforms not capable of monitoring.
+	 */
+	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+	ret = 0;
+	goto out;
+
+out_release:
+	rdtgroup_locksetup_user_restore(rdtgrp);
+out:
+	return ret;
+}
+
+/**
+ * rdtgroup_locksetup_exit - Resource group exits locksetup mode
+ * @rdtgrp: resource group
+ *
+ * When a resource group exits locksetup mode the earlier restrictions are
+ * lifted.
+ *
+ * Return: 0 on success, <0 on failure (e.g. when no RMID can be allocated)
+ */
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+	int ret;
+
+	if (resctrl_arch_mon_capable()) {
+		ret = alloc_rmid(rdtgrp->closid);
+		if (ret < 0) {
+			rdt_last_cmd_puts("Out of RMIDs\n");
+			return ret;
+		}
+		rdtgrp->mon.rmid = ret;
+	}
+
+	ret = rdtgroup_locksetup_user_restore(rdtgrp);
+	if (ret) {
+		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+		return ret;
+	}
+
+	pseudo_lock_free(rdtgrp);
+	return 0;
+}
+
+/**
+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or a portion of it is pseudo-locked
+ * @d: RDT domain
+ * @cbm: CBM to test
+ *
+ * @d represents a cache instance and @cbm a capacity bitmask that is
+ * considered for it. Determine if @cbm overlaps with any existing
+ * pseudo-locked region on @d.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: true if @cbm overlaps with the pseudo-locked region on @d, false
+ * otherwise (including when @d has no pseudo-locked region).
+ */
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+	unsigned int cbm_len;
+	unsigned long cbm_b;
+
+	if (d->plr) {
+		cbm_len = d->plr->s->res->cache.cbm_len;
+		cbm_b = d->plr->cbm;
+		if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
+			return true;
+	}
+	return false;
+}
+
+/**
+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
+ * @d: RDT domain under test
+ *
+ * The setup of a pseudo-locked region affects all cache instances within
+ * the hierarchy of the region. It is thus essential to know if any
+ * pseudo-locked regions exist within a cache hierarchy to prevent any
+ * attempts to create new pseudo-locked regions in the same hierarchy.
+ *
+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
+ *         if it is not possible to test due to memory allocation issue,
+ *         false otherwise.
+ */
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+	struct rdt_ctrl_domain *d_i;
+	cpumask_var_t cpu_with_psl;
+	struct rdt_resource *r;
+	bool ret = false;
+
+	/* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+	lockdep_assert_cpus_held();
+
+	if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
+		return true;	/* cannot test: report a region as present */
+
+	/*
+	 * First determine which cpus have pseudo-locked regions
+	 * associated with them, across all alloc capable resources.
+	 */
+	for_each_alloc_capable_rdt_resource(r) {
+		list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
+			if (d_i->plr)
+				cpumask_or(cpu_with_psl, cpu_with_psl,
+					   &d_i->hdr.cpu_mask);
+		}
+	}
+
+	/*
+	 * Next test if the new pseudo-locked region would share any cpu
+	 * with an existing region.
+	 */
+	if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
+		ret = true;
+
+	free_cpumask_var(cpu_with_psl);
+	return ret;
+}
+
+/**
+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: Resource group to which the pseudo-locked region belongs.
+ * @sel: Selector of which measurement to perform on a pseudo-locked region.
+ *
+ * The measurement of latency to access a pseudo-locked region should be
+ * done from a cpu that is associated with that pseudo-locked region.
+ * Determine which cpu is associated with this region and start a thread on
+ * that cpu to perform the measurement, wait for that thread to complete.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
+{
+	struct pseudo_lock_region *plr = rdtgrp->plr;
+	struct task_struct *thread;
+	unsigned int cpu;
+	int ret = -1;
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	if (rdtgrp->flags & RDT_DELETED) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!plr->d) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	plr->thread_done = 0;
+	cpu = cpumask_first(&plr->d->hdr.cpu_mask);
+	if (!cpu_online(cpu)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	plr->cpu = cpu;
+
+	if (sel == 1)
+		thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
+					    plr, cpu, "pseudo_lock_measure/%u");
+	else if (sel == 2)
+		thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
+					    plr, cpu, "pseudo_lock_measure/%u");
+	else if (sel == 3)
+		thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
+					    plr, cpu, "pseudo_lock_measure/%u");
+	else
+		goto out;	/* unknown selector: ret is still -1 */
+
+	if (IS_ERR(thread)) {
+		ret = PTR_ERR(thread);
+		goto out;
+	}
+
+	ret = wait_event_interruptible(plr->lock_thread_wq,
+				       plr->thread_done == 1);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+out:
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+	return ret;
+}
+
+static ssize_t pseudo_lock_measure_trigger(struct file *file,
+					   const char __user *user_buf,
+					   size_t count, loff_t *ppos)
+{
+	struct rdtgroup *rdtgrp = file->private_data;
+	size_t buf_size;
+	char buf[32];
+	int ret;
+	int sel;
+
+	buf_size = min(count, (sizeof(buf) - 1));	/* leave room for the NUL */
+	if (copy_from_user(buf, user_buf, buf_size))
+		return -EFAULT;
+
+	buf[buf_size] = '\0';
+	ret = kstrtoint(buf, 10, &sel);
+	if (ret == 0) {
+		if (sel != 1 && sel != 2 && sel != 3)
+			return -EINVAL;
+		ret = debugfs_file_get(file->f_path.dentry);
+		if (ret)
+			return ret;
+		ret = pseudo_lock_measure_cycles(rdtgrp, sel);
+		if (ret == 0)
+			ret = count;	/* report the whole write as consumed */
+		debugfs_file_put(file->f_path.dentry);
+	}
+
+	return ret;
+}
+
+static const struct file_operations pseudo_measure_fops = {
+	.write =
pseudo_lock_measure_trigger,
+	.open = simple_open,
+	.llseek = default_llseek,
+};
+
+/**
+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * Called when a resource group in the pseudo-locksetup mode receives a
+ * valid schemata that should be pseudo-locked. Since the resource group is
+ * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
+ * allocated and initialized with the essential information. If a failure
+ * occurs the resource group remains in the pseudo-locksetup mode with the
+ * &struct pseudo_lock_region associated with it, but cleared of all
+ * information and ready for the user to re-attempt pseudo-locking by
+ * writing the schemata again.
+ *
+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
+ * on failure. Descriptive error will be written to last_cmd_status buffer.
+ */
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+	struct pseudo_lock_region *plr = rdtgrp->plr;
+	struct task_struct *thread;
+	unsigned int new_minor;
+	struct device *dev;
+	char *kn_name __free(kfree) = NULL;
+	int ret;
+
+	ret = pseudo_lock_region_alloc(plr);
+	if (ret < 0)
+		return ret;
+
+	ret = pseudo_lock_cstates_constrain(plr);
+	if (ret < 0) {
+		ret = -EINVAL;
+		goto out_region;
+	}
+	kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
+	if (!kn_name) {
+		ret = -ENOMEM;
+		goto out_cstates;
+	}
+
+	plr->thread_done = 0;
+
+	thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
+				    plr->cpu, "pseudo_lock/%u");
+	if (IS_ERR(thread)) {
+		ret = PTR_ERR(thread);
+		rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
+		goto out_cstates;
+	}
+
+	ret = wait_event_interruptible(plr->lock_thread_wq,
+				       plr->thread_done == 1);
+	if (ret < 0) {
+		/*
+		 * If the thread does not get on the CPU for whatever
+		 * reason and the process which sets up the region is
+		 * interrupted then this will leave the thread in
runnable + * state and once it gets on the CPU it will dereference + * the cleared, but not freed, plr struct resulting in an + * empty pseudo-locking loop. + */ + rdt_last_cmd_puts("Locking thread interrupted\n"); + goto out_cstates; + } + + ret = pseudo_lock_minor_get(&new_minor); + if (ret < 0) { + rdt_last_cmd_puts("Unable to obtain a new minor number\n"); + goto out_cstates; + } + + /* + * Unlock access but do not release the reference. The + * pseudo-locked region will still be here on return. + * + * The mutex has to be released temporarily to avoid a potential + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. + */ + mutex_unlock(&rdtgroup_mutex); + + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + + dev = device_create(&pseudo_lock_class, NULL, + MKDEV(pseudo_lock_major, new_minor), + rdtgrp, "%s", kn_name); + + mutex_lock(&rdtgroup_mutex); + + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + rdt_last_cmd_printf("Failed to create character device: %d\n", + ret); + goto out_debugfs; + } + + /* We released the mutex - check if group was removed while we did so */ + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out_device; + } + + plr->minor = new_minor; + + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; + closid_free(rdtgrp->closid); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); + + ret = 0; + goto out; + +out_device: + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); +out_debugfs: + debugfs_remove_recursive(plr->debugfs_dir); + pseudo_lock_minor_release(new_minor); +out_cstates: + pseudo_lock_cstates_relax(plr); +out_region: + 
pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region + * @rdtgrp: resource group to which the pseudo-locked region belongs + * + * The removal of a pseudo-locked region can be initiated when the resource + * group is removed from user space via a "rmdir" from userspace or the + * unmount of the resctrl filesystem. On removal the resource group does + * not go back to pseudo-locksetup mode before it is removed, instead it is + * removed directly. There is thus asymmetry with the creation where the + * &struct pseudo_lock_region is removed here while it was not created in + * rdtgroup_pseudo_lock_create(). + * + * Return: void + */ +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * Default group cannot be a pseudo-locked region so we can + * free closid here. + */ + closid_free(rdtgrp->closid); + goto free; + } + + pseudo_lock_cstates_relax(plr); + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + pseudo_lock_minor_release(plr->minor); + +free: + pseudo_lock_free(rdtgrp); +} + +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = region_find_by_minor(iminor(inode)); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + filp->private_data = rdtgrp; + atomic_inc(&rdtgrp->waitcount); + /* Perform a non-seekable open - llseek is not supported */ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + 
mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + filp->private_data = NULL; + atomic_dec(&rdtgrp->waitcount); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +{ + /* Not supported */ + return -EINVAL; +} + +static const struct vm_operations_struct pseudo_mmap_ops = { + .mremap = pseudo_lock_dev_mremap, +}; + +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long vsize = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct pseudo_lock_region *plr; + struct rdtgroup *rdtgrp; + unsigned long physical; + unsigned long psize; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + plr = rdtgrp->plr; + + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + /* + * Task is required to run with affinity to the cpus associated + * with the pseudo-locked region. If this is not the case the task + * may be scheduled elsewhere and invalidate entries in the + * pseudo-locked region. + */ + if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + physical = __pa(plr->kmem) >> PAGE_SHIFT; + psize = plr->size - off; + + if (off > plr->size) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + /* + * Ensure changes are carried directly to the memory being mapped, + * do not allow copy-on-write mapping. 
+ */ + if (!(vma->vm_flags & VM_SHARED)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + if (vsize > psize) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + memset(plr->kmem + off, 0, vsize); + + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, + vsize, vma->vm_page_prot)) { + mutex_unlock(&rdtgroup_mutex); + return -EAGAIN; + } + vma->vm_ops = &pseudo_mmap_ops; + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static const struct file_operations pseudo_lock_dev_fops = { + .owner = THIS_MODULE, + .read = NULL, + .write = NULL, + .open = pseudo_lock_dev_open, + .release = pseudo_lock_dev_release, + .mmap = pseudo_lock_dev_mmap, +}; + +int rdt_pseudo_lock_init(void) +{ + int ret; + + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); + if (ret < 0) + return ret; + + pseudo_lock_major = ret; + + ret = class_register(&pseudo_lock_class); + if (ret) { + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + return ret; + } + + return 0; +} + +void rdt_pseudo_lock_release(void) +{ + class_unregister(&pseudo_lock_class); + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + pseudo_lock_major = 0; +} diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c new file mode 100644 index 000000000000..cc37f58b47dd --- /dev/null +++ b/fs/resctrl/rdtgroup.c @@ -0,0 +1,4353 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * User interface for Resource Allocation in Resource Director Technology(RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu <fenghua.yu@intel.com> + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/fs_parser.h> +#include <linux/sysfs.h> +#include <linux/kernfs.h> +#include <linux/resctrl.h> +#include <linux/seq_buf.h> +#include <linux/seq_file.h> +#include <linux/sched/task.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +#include <uapi/linux/magic.h> + +#include "internal.h" + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + +static struct kernfs_root *rdt_root; + +struct rdtgroup rdtgroup_default; + +LIST_HEAD(rdt_all_groups); + +/* list of entries for the schemata file */ +LIST_HEAD(resctrl_schema_all); + +/* + * List of struct mon_data containing private data of event files for use by + * rdtgroup_mondata_show(). Protected by rdtgroup_mutex. + */ +static LIST_HEAD(mon_data_kn_priv_list); + +/* The filesystem can only be mounted once. */ +bool resctrl_mounted; + +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + +/* + * Used to store the max resource name width to display the schemata names in + * a tabular format. + */ +int max_name_width; + +static struct seq_buf last_cmd_status; + +static char last_cmd_status_buf[512]; + +static int rdtgroup_setup_root(struct rdt_fs_context *ctx); + +static void rdtgroup_destroy_root(void); + +struct dentry *debugfs_resctrl; + +/* + * Memory bandwidth monitoring event to use for the default CTRL_MON group + * and each new CTRL_MON group created by the user. Only relevant when + * the filesystem is mounted with the "mba_MBps" option so it does not + * matter that it remains uninitialized on systems that do not support + * the "mba_MBps" option. 
+ */ +enum resctrl_event_id mba_mbps_default_event; + +static bool resctrl_debug; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + +void rdt_staged_configs_clear(void) +{ + struct rdt_ctrl_domain *dom; + struct rdt_resource *r; + + lockdep_assert_held(&rdtgroup_mutex); + + for_each_alloc_capable_rdt_resource(r) { + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) + memset(dom->staged_config, 0, sizeof(dom->staged_config)); + } +} + +static bool resctrl_is_mbm_enabled(void) +{ + return (resctrl_arch_is_mbm_total_enabled() || + resctrl_arch_is_mbm_local_enabled()); +} + +static bool resctrl_is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* + * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap + * of free CLOSIDs. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set current's closid to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. 
+ */ +static unsigned long *closid_free_map; + +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} + +static int closid_init(void) +{ + struct resctrl_schema *s; + u32 rdt_min_closid = ~0; + + /* Monitor only platforms still call closid_init() */ + if (list_empty(&resctrl_schema_all)) + return 0; + + /* Compute rdt_min_closid across all resources */ + list_for_each_entry(s, &resctrl_schema_all, list) + rdt_min_closid = min(rdt_min_closid, s->num_closid); + + closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL); + if (!closid_free_map) + return -ENOMEM; + bitmap_fill(closid_free_map, rdt_min_closid); + + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ + __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map); + closid_free_map_len = rdt_min_closid; + + return 0; +} + +static void closid_exit(void) +{ + bitmap_free(closid_free_map); + closid_free_map = NULL; +} + +static int closid_alloc(void) +{ + int cleanest_closid; + u32 closid; + + lockdep_assert_held(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && + resctrl_arch_is_llc_occupancy_enabled()) { + cleanest_closid = resctrl_find_cleanest_closid(); + if (cleanest_closid < 0) + return cleanest_closid; + closid = cleanest_closid; + } else { + closid = find_first_bit(closid_free_map, closid_free_map_len); + if (closid == closid_free_map_len) + return -ENOSPC; + } + __clear_bit(closid, closid_free_map); + + return closid; +} + +void closid_free(int closid) +{ + lockdep_assert_held(&rdtgroup_mutex); + + __set_bit(closid, closid_free_map); +} + +/** + * closid_allocated - test if provided closid is in use + * @closid: closid to be tested + * + * Return: true if @closid is currently associated with a resource group, + * false if @closid is free + */ +bool closid_allocated(unsigned int closid) +{ + lockdep_assert_held(&rdtgroup_mutex); + + return !test_bit(closid, closid_free_map); +} + +/** + * rdtgroup_mode_by_closid - 
Return mode of resource group with closid + * @closid: closid if the resource group + * + * Each resource group is associated with a @closid. Here the mode + * of a resource group can be queried by searching for it using its closid. + * + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid + */ +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdtgrp->mode; + } + + return RDT_NUM_MODES; +} + +static const char * const rdt_mode_str[] = { + [RDT_MODE_SHAREABLE] = "shareable", + [RDT_MODE_EXCLUSIVE] = "exclusive", + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", +}; + +/** + * rdtgroup_mode_str - Return the string representation of mode + * @mode: the resource group mode as &enum rdtgroup_mode + * + * Return: string representation of valid mode, "unknown" otherwise + */ +static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) +{ + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) + return "unknown"; + + return rdt_mode_str[mode]; +} + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int 
rdtgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static const struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static const struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct cpumask *mask; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + mask = &rdtgrp->plr->d->hdr.cpu_mask; + seq_printf(s, is_cpu_list(of) ? + "%*pbl\n" : "%*pb\n", + cpumask_pr_args(mask)); + } + } else { + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, + * + * Per task closids/rmids must have been set up before calling this function. + * @r may be NULL. 
+ */ +static void +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) +{ + struct resctrl_cpu_defaults defaults, *p = NULL; + + if (r) { + defaults.closid = r->closid; + defaults.rmid = r->mon.rmid; + p = &defaults; + } + + on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); +} + +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); + return -EINVAL; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, 
cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); + return -EINVAL; + } + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (!cpumask_empty(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + + if (ret) { + rdt_last_cmd_puts("Bad CPU list/mask\n"); + goto unlock; + } + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (!cpumask_empty(tmpmask)) { + ret = -EINVAL; + rdt_last_cmd_puts("Can only assign online CPUs\n"); + goto unlock; + } + + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); + + return ret ?: nbytes; +} + +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation 
via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} + +static void _update_task_closid_rmid(void *task) +{ + /* + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. + */ + if (task == current) + resctrl_arch_sched_in(task); +} + +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); +} + +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + /* If the task is already in rdtgrp, no need to move the task. */ + if (task_in_rdtgroup(tsk, rdtgrp)) + return 0; + + /* + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. + * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. 
+ */ + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; + } + + if (rdtgrp->type == RDTMON_GROUP) + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, + rdtgrp->mon.rmid); + else + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, + rdtgrp->mon.rmid); + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + * This pairs with the full barrier between the rq->curr update and + * resctrl_arch_sched_in() during context switch. + */ + smp_mb(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. + */ + update_task_closid_rmid(tsk); + + return 0; +} + +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && + resctrl_arch_match_closid(t, r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && + resctrl_arch_match_rmid(t, r->mon.parent->closid, + r->mon.rmid)); +} + +/** + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group + * @r: Resource group + * + * Return: 1 if tasks have been assigned to @r, 0 otherwise + */ +int rdtgroup_tasks_assigned(struct rdtgroup *r) +{ + struct task_struct *p, *t; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct 
kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); + ret = -EPERM; + } + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + char *pid_str; + int ret = 0; + pid_t pid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + 
break; + } + } + +unlock: + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + pid_t pid; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +#ifdef CONFIG_PROC_CPU_RESCTRL +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. + * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. 
+ * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!resctrl_mounted) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdt_kn_name(rdtg->kn)); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) + continue; + seq_printf(s, "%s", rdt_kn_name(crg->kn)); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. + */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static void *rdt_kn_parent_priv(struct kernfs_node *kn) +{ + /* + * The parent pointer is only valid within RCU section since it can be + * replaced. 
+ */ + guard(rcu)(); + return rcu_dereference(kn->__parent)->priv; +} + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + + seq_printf(seq, "%u\n", s->num_closid); + return 0; +} + +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + +/* + * rdt_bit_usage_show - Display current usage of resources + * + * A domain is a shared resource that can now be allocated differently. Here + * we display the current regions of the domain as an annotated bitmask. 
+ * For each domain of this resource its allocation bitmask + * is annotated as below to indicate the current usage of the corresponding bit: + * 0 - currently unused + * X - currently available for sharing and used by software and hardware + * H - currently used by hardware only but available for software use + * S - currently used and shareable by software only + * E - currently used exclusively by one resource group + * P - currently pseudo-locked by one resource group + */ +static int rdt_bit_usage_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + /* + * Use unsigned long even though only 32 bits are used to ensure + * test_bit() is used safely. + */ + unsigned long sw_shareable = 0, hw_shareable = 0; + unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_resource *r = s->res; + struct rdt_ctrl_domain *dom; + int i, hwb, swb, excl, psl; + enum rdtgrp_mode mode; + bool sep = false; + u32 ctrl_val; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + hw_shareable = r->cache.shareable_bits; + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { + if (sep) + seq_putc(seq, ';'); + sw_shareable = 0; + exclusive = 0; + seq_printf(seq, "%d=", dom->hdr.id); + for (i = 0; i < closids_supported(); i++) { + if (!closid_allocated(i)) + continue; + ctrl_val = resctrl_arch_get_config(r, dom, i, + s->conf_type); + mode = rdtgroup_mode_by_closid(i); + switch (mode) { + case RDT_MODE_SHAREABLE: + sw_shareable |= ctrl_val; + break; + case RDT_MODE_EXCLUSIVE: + exclusive |= ctrl_val; + break; + case RDT_MODE_PSEUDO_LOCKSETUP: + /* + * RDT_MODE_PSEUDO_LOCKSETUP is possible + * here but not included since the CBM + * associated with this CLOSID in this mode + * is not initialized and no task or cpu can be + * assigned this CLOSID. 
+ */ + break; + case RDT_MODE_PSEUDO_LOCKED: + case RDT_NUM_MODES: + WARN(1, + "invalid mode for closid %d\n", i); + break; + } + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { + pseudo_locked = dom->plr ? dom->plr->cbm : 0; + hwb = test_bit(i, &hw_shareable); + swb = test_bit(i, &sw_shareable); + excl = test_bit(i, &exclusive); + psl = test_bit(i, &pseudo_locked); + if (hwb && swb) + seq_putc(seq, 'X'); + else if (hwb && !swb) + seq_putc(seq, 'H'); + else if (!hwb && swb) + seq_putc(seq, 'S'); + else if (excl) + seq_putc(seq, 'E'); + else if (psl) + seq_putc(seq, 'P'); + else /* Unused bits remain */ + seq_putc(seq, '0'); + } + sep = true; + } + seq_putc(seq, '\n'); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.min_bw); + return 0; +} + +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) { + seq_printf(seq, "%s\n", mevt->name); + if (mevt->configurable) + seq_printf(seq, "%s_config\n", mevt->name); + } + + return 0; +} + +static int rdt_bw_gran_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} + +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct 
rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.delay_linear); + return 0; +} + +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); + + return 0; +} + +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + switch (r->membw.throttle_mode) { + case THREAD_THROTTLE_PER_THREAD: + seq_puts(seq, "per-thread\n"); + return 0; + case THREAD_THROTTLE_MAX: + seq_puts(seq, "max\n"); + return 0; + case THREAD_THROTTLE_UNDEFINED: + seq_puts(seq, "undefined\n"); + return 0; + } + + WARN_ON_ONCE(1); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > resctrl_rmid_realloc_limit) + return -EINVAL; + + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); + + return nbytes; +} + +/* + * rdtgroup_mode_show - Display mode of this resource group + */ +static int rdtgroup_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) +{ + switch (my_type) { + case CDP_CODE: + return CDP_DATA; + case CDP_DATA: + return CDP_CODE; + default: + case CDP_NONE: + return CDP_NONE; + } +} + +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + 
seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); + + return 0; +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @type: CDP type of @r. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Checks if provided @cbm intended to be used for @closid on domain + * @d overlaps with any other closids or other hardware usage associated + * with this domain. If @exclusive is true then only overlaps with + * resource groups in exclusive mode will be considered. If @exclusive + * is false then overlaps with any resource group or hardware entities + * will be considered. + * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: false if CBM does not overlap, true if it does. 
+ */ +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, + enum resctrl_conf_type type, bool exclusive) +{ + enum rdtgrp_mode mode; + unsigned long ctrl_b; + int i; + + /* Check for any overlap with regions used by hardware directly */ + if (!exclusive) { + ctrl_b = r->cache.shareable_bits; + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) + return true; + } + + /* Check for overlap with other resource groups */ + for (i = 0; i < closids_supported(); i++) { + ctrl_b = resctrl_arch_get_config(r, d, i, type); + mode = rdtgroup_mode_by_closid(i); + if (closid_allocated(i) && i != closid && + mode != RDT_MODE_PSEUDO_LOCKSETUP) { + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { + if (exclusive) { + if (mode == RDT_MODE_EXCLUSIVE) + return true; + continue; + } + return true; + } + } + } + + return false; +} + +/** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @s: Schema for the resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test + * for overlap. Overlap test is not limited to the specific resource for + * which the CBM is intended though - when dealing with CDP resources that + * share the underlying hardware the overlap check should be performed on + * the CDP resource sharing the hardware also. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test. 
+ * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + struct rdt_resource *r = s->res; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, + exclusive)) + return true; + + if (!resctrl_arch_get_cdp_enabled(r->rid)) + return false; + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); +} + +/** + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. + * + * An exclusive resource group implies that there should be no sharing of + * its allocated resources. At the time this group is considered to be + * exclusive this test can determine if its current schemata supports this + * setting by testing for overlap with all other resource groups. + * + * Return: true if resource group can be exclusive, false if there is overlap + * with allocations of other resource groups and thus this resource group + * cannot be exclusive. 
+ */ +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) +{ + int closid = rdtgrp->closid; + struct rdt_ctrl_domain *d; + struct resctrl_schema *s; + struct rdt_resource *r; + bool has_cache = false; + u32 ctrl; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) + continue; + has_cache = true; + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + ctrl = resctrl_arch_get_config(r, d, closid, + s->conf_type); + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { + rdt_last_cmd_puts("Schemata overlaps\n"); + return false; + } + } + } + + if (!has_cache) { + rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); + return false; + } + + return true; +} + +/* + * rdtgroup_mode_write - Modify the resource group's mode + */ +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + enum rdtgrp_mode mode; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + rdt_last_cmd_clear(); + + mode = rdtgrp->mode; + + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || + (!strcmp(buf, "pseudo-locksetup") && + mode == RDT_MODE_PSEUDO_LOCKSETUP) || + (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) + goto out; + + if (mode == RDT_MODE_PSEUDO_LOCKED) { + rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); + ret = -EINVAL; + goto out; + } + + if (!strcmp(buf, "shareable")) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + 
rdtgrp->mode = RDT_MODE_SHAREABLE; + } else if (!strcmp(buf, "exclusive")) { + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { + ret = -EINVAL; + goto out; + } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_EXCLUSIVE; + } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && + !strcmp(buf, "pseudo-locksetup")) { + ret = rdtgroup_locksetup_enter(rdtgrp); + if (ret) + goto out; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; + } else { + rdt_last_cmd_puts("Unknown or unsupported mode\n"); + ret = -EINVAL; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +/** + * rdtgroup_cbm_to_size - Translate CBM to size in bytes + * @r: RDT resource to which @d belongs. + * @d: RDT domain instance. + * @cbm: bitmask for which the size should be computed. + * + * The bitmask provided associated with the RDT domain instance @d will be + * translated into how many bytes it represents. The size in bytes is + * computed by first dividing the total cache size by the CBM length to + * determine how many bytes each bit in the bitmask represents. The result + * is multiplied with the number of bits set in the bitmask. + * + * @cbm is unsigned long, even if only 32 bits are used to make the + * bitmap functions work correctly. 
+ */ +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, + struct rdt_ctrl_domain *d, unsigned long cbm) +{ + unsigned int size = 0; + struct cacheinfo *ci; + int num_b; + + if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) + return size; + + num_b = bitmap_weight(&cbm, r->cache.cbm_len); + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); + if (ci) + size = ci->size / r->cache.cbm_len * num_b; + + return size; +} + +bool is_mba_sc(struct rdt_resource *r) +{ + if (!r) + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + /* + * The software controller support is only applicable to MBA resource. + * Make sure to check for resource type. + */ + if (r->rid != RDT_RESOURCE_MBA) + return false; + + return r->membw.mba_sc; +} + +/* + * rdtgroup_size_show - Display size in bytes of allocated regions + * + * The "size" file mirrors the layout of the "schemata" file, printing the + * size in bytes of each region instead of the capacity bitmask. 
+ */ +static int rdtgroup_size_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + enum resctrl_conf_type type; + struct rdt_ctrl_domain *d; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + unsigned int size; + int ret = 0; + u32 closid; + bool sep; + u32 ctrl; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->s->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); + } + goto out; + } + + closid = rdtgrp->closid; + + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; + type = schema->conf_type; + sep = false; + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + size = 0; + } else { + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); + } + seq_printf(s, "%d=%u", d->hdr.id, size); + sep = true; + } + seq_putc(s, '\n'); + } + +out: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static void mondata_config_read(struct resctrl_mon_config_info *mon_info) +{ + smp_call_function_any(&mon_info->d->hdr.cpu_mask, + resctrl_arch_mon_event_config_read, mon_info, 1); +} + +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) +{ + struct resctrl_mon_config_info mon_info; + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + 
mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_puts(s, ";"); + + memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); + mon_info.r = r; + mon_info.d = dom; + mon_info.evtid = evtid; + mondata_config_read(&mon_info); + + seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); + sep = true; + } + seq_puts(s, "\n"); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return 0; +} + +static int mbm_total_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); + + return 0; +} + +static int mbm_local_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); + + return 0; +} + +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_mon_domain *d, u32 evtid, u32 val) +{ + struct resctrl_mon_config_info mon_info = {0}; + + /* + * Read the current config value first. If both are the same then + * no need to write it again. + */ + mon_info.r = r; + mon_info.d = d; + mon_info.evtid = evtid; + mondata_config_read(&mon_info); + if (mon_info.mon_config == val) + return; + + mon_info.mon_config = val; + + /* + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE + * are scoped at the domain level. Writing any of these MSRs + * on one CPU is observed by all the CPUs in the domain. + */ + smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, + &mon_info, 1); + + /* + * When an Event Configuration is changed, the bandwidth counters + * for all RMIDs and Events will be cleared by the hardware. The + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for + * every RMID on the next read to any event for every RMID. 
+ * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) + * cleared while it is tracked by the hardware. Clear the + * mbm_local and mbm_total counts for all the RMIDs. + */ + resctrl_arch_reset_rmid_all(r, d); +} + +static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) +{ + char *dom_str = NULL, *id_str; + unsigned long dom_id, val; + struct rdt_mon_domain *d; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + id_str = strsep(&dom_str, "="); + + if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); + return -EINVAL; + } + + if (!dom_str || kstrtoul(dom_str, 16, &val)) { + rdt_last_cmd_puts("Non-numeric event configuration value\n"); + return -EINVAL; + } + + /* Value from user cannot be more than the supported set of events */ + if ((val & r->mbm_cfg_mask) != val) { + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", + r->mbm_cfg_mask); + return -EINVAL; + } + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + mbm_config_write_domain(r, d, evtid, val); + goto next; + } + } + + return -EINVAL; +} + +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, + char *buf, 
size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +/* rdtgroup information files for one cache resource. */ +static struct rftype res_common_files[] = { + { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RFTYPE_TOP_INFO, + }, + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + .fflags = RFTYPE_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RFTYPE_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RFTYPE_MON_INFO, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "shareable_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_shareable_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "bit_usage", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bit_usage_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + 
.fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. + */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "mbm_total_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_total_bytes_config_show, + .write = mbm_total_bytes_config_write, + }, + { + .name = "mbm_local_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_local_bytes_config_show, + .write = mbm_local_bytes_config_write, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, + { + 
.name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "mba_MBps_event", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mba_mbps_event_write, + .seq_show = rdtgroup_mba_mbps_event_show, + }, + { + .name = "mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mode_write, + .seq_show = rdtgroup_mode_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "size", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_size_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "sparse_masks", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_has_sparse_bitmasks_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, + }, +}; + +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) +{ + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + + for (rft = rfts; rft < rfts + len; rft++) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; +} + +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return 
NULL; +} + +static void thread_throttle_mode_init(void) +{ + enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED; + struct rdt_resource *r_mba, *r_smba; + + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r_mba->alloc_capable && + r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_mba->membw.throttle_mode; + + r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); + if (r_smba->alloc_capable && + r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_smba->membw.throttle_mode; + + if (throttle_mode == THREAD_THROTTLE_UNDEFINED) + return; + + resctrl_file_fflags_init("thread_throttle_mode", + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); +} + +void resctrl_file_fflags_init(const char *config, unsigned long fflags) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name(config); + if (rft) + rft->fflags = fflags; +} + +/** + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * The permissions of named resctrl file, directory, or link are modified + * to not allow read, write, or execute by any user. + * + * WARNING: This function is intended to communicate to the user that the + * resctrl file has been locked down - that it is not relevant to the + * particular state the system finds itself in. It should not be relied + * on to protect from user access because after the file's permissions + * are restricted the user can still change the permissions using chmod + * from the command line. + * + * Return: 0 on success, <0 on failure. 
+ */ +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn; + int ret = 0; + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + iattr.ia_mode = S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode = S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode = S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +/** + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * @mask: Mask of permissions that should be restored + * + * Restore the permissions of the named file. If @name is a directory the + * permissions of its parent will be used. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn, *parent; + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + iattr.ia_mode = rft->mode & mask; + } + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + parent = kernfs_get_parent(kn); + if (parent) { + iattr.ia_mode |= parent->mode; + kernfs_put(parent); + } + iattr.ia_mode |= S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode |= S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode |= S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +static int rdtgroup_mkdir_info_resdir(void *priv, char *name, + unsigned long fflags) +{ + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, 
priv); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; +} + +static unsigned long fflags_from_resource(struct rdt_resource *r) +{ + switch (r->rid) { + case RDT_RESOURCE_L3: + case RDT_RESOURCE_L2: + return RFTYPE_RES_CACHE; + case RDT_RESOURCE_MBA: + case RDT_RESOURCE_SMBA: + return RFTYPE_RES_MB; + } + + return WARN_ON_ONCE(1); +} + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + unsigned long fflags; + char name[32]; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); + if (ret) + goto out_destroy; + + /* loop over enabled controls, these are all alloc_capable */ + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); + if (ret) + goto out_destroy; + } + + for_each_mon_capable_rdt_resource(r) { + fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); + if (ret) + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + ret = rdtgroup_kn_set_ugid(kn); + if 
(ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static inline bool is_mba_linear(void) +{ + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; +} + +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->hdr.cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_ctrl_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + +/* + * MBA software controller is supported only if + * MBM is supported and MBA is in linear scale, + * and the MBM monitor scope is the same as MBA + * control scope. + */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + return (resctrl_is_mbm_enabled() && + r->alloc_capable && is_mba_linear() && + r->ctrl_scope == rmbm->mon_scope); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + */ +static int set_mba_sc(bool mba_sc) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + u32 num_closid = resctrl_arch_get_num_closid(r); + struct rdt_ctrl_domain *d; + unsigned long fflags; + int i; + + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) + return -EINVAL; + + r->membw.mba_sc = mba_sc; + + rdtgroup_default.mba_mbps_event = mba_mbps_default_event; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + } + + fflags = mba_sc ? 
RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; + resctrl_file_fflags_init("mba_MBps_event", fflags); + + return 0; +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || + rcu_access_pointer(kn->__parent) == kn_info) + return NULL; + else + return kn->priv; + } else { + return rdt_kn_parent_priv(kn); + } +} + +static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); +} + +static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + kernfs_unbreak_active_protection(kn); + rdtgroup_remove(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + rdtgroup_kn_get(rdtgrp, kn); + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? 
*/ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + rdtgroup_kn_put(rdtgrp, kn); +} + +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); + + resctrl_debug = false; +} + +static int rdt_enable_ctx(struct rdt_fs_context *ctx) +{ + int ret = 0; + + if (ctx->enable_cdpl2) { + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } + + if (ctx->enable_cdpl3) { + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } + + if (ctx->enable_mba_mbps) { + ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + if (ctx->enable_debug) + resctrl_debug = true; + + return 0; + +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: + return ret; +} + +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) +{ + struct resctrl_schema *s; + const char *suffix = ""; + int ret, cl; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->res = r; + s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; + + s->conf_type = type; + switch (type) { + case CDP_CODE: + suffix = "CODE"; + break; + case CDP_DATA: + suffix = "DATA"; + break; + case CDP_NONE: + suffix = ""; + break; + } + + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; + } + + cl = strlen(s->name); + + /* + * If CDP is 
supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + s->fmt_str = "%d=%x"; + break; + case RESCTRL_SCHEMA_RANGE: + s->fmt_str = "%d=%u"; + break; + } + + if (WARN_ON_ONCE(!s->fmt_str)) { + kfree(s); + return -EINVAL; + } + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + + return 0; +} + +static int schemata_list_create(void) +{ + struct rdt_resource *r; + int ret = 0; + + for_each_alloc_capable_rdt_resource(r) { + if (resctrl_arch_get_cdp_enabled(r->rid)) { + ret = schemata_list_add(r, CDP_CODE); + if (ret) + break; + + ret = schemata_list_add(r, CDP_DATA); + } else { + ret = schemata_list_add(r, CDP_NONE); + } + + if (ret) + break; + } + + return ret; +} + +static void schemata_list_destroy(void) +{ + struct resctrl_schema *s, *tmp; + + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { + list_del(&s->list); + kfree(s); + } +} + +static int rdt_get_tree(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; + struct rdt_mon_domain *dom; + struct rdt_resource *r; + int ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. 
+ */ + if (resctrl_mounted) { + ret = -EBUSY; + goto out; + } + + ret = rdtgroup_setup_root(ctx); + if (ret) + goto out; + + ret = rdt_enable_ctx(ctx); + if (ret) + goto out_root; + + ret = schemata_list_create(); + if (ret) { + schemata_list_destroy(); + goto out_ctx; + } + + ret = closid_init(); + if (ret) + goto out_schemata_free; + + if (resctrl_arch_mon_capable()) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); + if (ret) + goto out_closid_exit; + + kernfs_activate(rdtgroup_default.kn); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret < 0) + goto out_closid_exit; + + if (resctrl_arch_mon_capable()) { + ret = mongroup_create_dir(rdtgroup_default.kn, + &rdtgroup_default, "mon_groups", + &kn_mongrp); + if (ret < 0) + goto out_info; + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret < 0) + goto out_mongrp; + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + + ret = rdt_pseudo_lock_init(); + if (ret) + goto out_mondata; + + ret = kernfs_get_tree(fc); + if (ret < 0) + goto out_psl; + + if (resctrl_arch_alloc_capable()) + resctrl_arch_enable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_enable_mon(); + + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) + resctrl_mounted = true; + + if (resctrl_is_mbm_enabled()) { + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + list_for_each_entry(dom, &r->mon_domains, hdr.list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); + } + + goto out; + +out_psl: + rdt_pseudo_lock_release(); +out_mondata: + if (resctrl_arch_mon_capable()) + kernfs_remove(kn_mondata); +out_mongrp: + if (resctrl_arch_mon_capable()) + kernfs_remove(kn_mongrp); +out_info: + kernfs_remove(kn_info); +out_closid_exit: + closid_exit(); +out_schemata_free: + schemata_list_destroy(); +out_ctx: + rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); +out: + rdt_last_cmd_clear(); + 
mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +enum rdt_param { + Opt_cdp, + Opt_cdpl2, + Opt_mba_mbps, + Opt_debug, + nr__rdt_params +}; + +static const struct fs_parameter_spec rdt_fs_parameters[] = { + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), + {} +}; + +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct fs_parse_result result; + const char *msg; + int opt; + + opt = fs_parse(fc, rdt_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cdp: + ctx->enable_cdpl3 = true; + return 0; + case Opt_cdpl2: + ctx->enable_cdpl2 = true; + return 0; + case Opt_mba_mbps: + msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; + if (!supports_mba_mbps()) + return invalfc(fc, msg); + ctx->enable_mba_mbps = true; + return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; + } + + return -EINVAL; +} + +static void rdt_fs_context_free(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static const struct fs_context_operations rdt_fs_context_ops = { + .free = rdt_fs_context_free, + .parse_param = rdt_parse_param, + .get_tree = rdt_get_tree, +}; + +static int rdt_init_fs_context(struct fs_context *fc) +{ + struct rdt_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; + fc->fs_private = &ctx->kfc; + fc->ops = &rdt_fs_context_ops; + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(&init_user_ns); + fc->global = true; + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). 
+ * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); + + /* + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and + * resctrl_arch_sched_in() during context switch. + */ + smp_mb(); + + /* + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but + * there is no other side effect. + */ + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) + cpumask_set_cpu(task_cpu(t), mask); + } + } + read_unlock(&tasklist_lock); +} + +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->closid, sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + + if (atomic_read(&sentry->waitcount) != 0) + sentry->flags = RDT_DELETED; + else + rdtgroup_remove(sentry); + } +} + +/* + * Forcibly remove all of subdirectories under root. 
+ */
static void rmdir_all_sub(void)
{
	struct rdtgroup *grp, *next;

	/* Move all tasks to the default resource group */
	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);

	list_for_each_entry_safe(grp, next, &rdt_all_groups, rdtgroup_list) {
		/* Free any child rmids */
		free_all_child_rdtgrp(grp);

		/* Remove each rdtgroup other than root */
		if (grp == &rdtgroup_default)
			continue;

		/* Groups in either pseudo-lock mode drop that state first. */
		switch (grp->mode) {
		case RDT_MODE_PSEUDO_LOCKSETUP:
		case RDT_MODE_PSEUDO_LOCKED:
			rdtgroup_pseudo_lock_remove(grp);
			break;
		default:
			break;
		}

		/*
		 * Give any CPUs back to the default group. We cannot copy
		 * cpu_online_mask because a CPU might have executed the
		 * offline callback already, but is still marked online.
		 */
		cpumask_or(&rdtgroup_default.cpu_mask,
			   &rdtgroup_default.cpu_mask, &grp->cpu_mask);

		free_rmid(grp->closid, grp->mon.rmid);

		kernfs_remove(grp->kn);
		list_del(&grp->rdtgroup_list);

		/*
		 * A group that still has waiters is only flagged as deleted;
		 * anything else is removed immediately.
		 */
		if (atomic_read(&grp->waitcount)) {
			grp->flags = RDT_DELETED;
			continue;
		}

		rdtgroup_remove(grp);
	}

	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
	update_closid_rmid(cpu_online_mask, &rdtgroup_default);

	kernfs_remove(kn_info);
	kernfs_remove(kn_mongrp);
	kernfs_remove(kn_mondata);
}

/**
 * mon_get_kn_priv() - Get the mon_data priv data for this event.
 *
 * The same values are used across the mon_data directories of all control and
 * monitor groups for the same event in the same domain. Keep a list of
 * allocated structures and re-use an existing one with the same values for
 * @rid, @domid, etc.
 *
 * @rid:    The resource id for the event file being created.
 * @domid:  The domain id for the event file being created.
 * @mevt:   The type of event file being created.
 * @do_sum: Whether SNC summing monitors are being created.
+ */ +static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, + struct mon_evt *mevt, + bool do_sum) +{ + struct mon_data *priv; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry(priv, &mon_data_kn_priv_list, list) { + if (priv->rid == rid && priv->domid == domid && + priv->sum == do_sum && priv->evtid == mevt->evtid) + return priv; + } + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + priv->rid = rid; + priv->domid = domid; + priv->sum = do_sum; + priv->evtid = mevt->evtid; + list_add_tail(&priv->list, &mon_data_kn_priv_list); + + return priv; +} + +/** + * mon_put_kn_priv() - Free all allocated mon_data structures. + * + * Called when resctrl file system is unmounted. + */ +static void mon_put_kn_priv(void) +{ + struct mon_data *priv, *tmp; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) { + list_del(&priv->list); + kfree(priv); + } +} + +static void resctrl_fs_teardown(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + /* Cleared by rdtgroup_destroy_root() */ + if (!rdtgroup_default.kn) + return; + + rmdir_all_sub(); + mon_put_kn_priv(); + rdt_pseudo_lock_release(); + rdtgroup_default.mode = RDT_MODE_SHAREABLE; + closid_exit(); + schemata_list_destroy(); + rdtgroup_destroy_root(); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_disable_ctx(); + + /* Put everything back to default values. 
*/ + for_each_alloc_capable_rdt_resource(r) + resctrl_arch_reset_all_ctrls(r); + + resctrl_fs_teardown(); + if (resctrl_arch_alloc_capable()) + resctrl_arch_disable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_disable_mon(); + resctrl_mounted = false; + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .init_fs_context = rdt_init_fs_context, + .parameters = rdt_fs_parameters, + .kill_sb = rdt_kill_sb, +}; + +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) +{ + struct kernfs_node *kn; + int ret = 0; + + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return ret; +} + +static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get(pkn, name); + if (!kn) + return; + kernfs_put(kn); + + if (kn->dir.subdirs <= 1) + kernfs_remove(kn); + else + kernfs_remove_by_name(kn, subname); +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups for the given domain. + * Remove files and directories containing "sum" of domain data + * when last domain being summed is removed. + */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_mon_domain *d) +{ + struct rdtgroup *prgrp, *crgrp; + char subname[32]; + bool snc_mode; + char name[32]; + + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci->id : d->hdr.id); + if (snc_mode) + sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); + } +} + +static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp, + bool do_sum) +{ + struct rmid_read rr = {0}; + struct mon_data *priv; + struct mon_evt *mevt; + int ret, domid; + + if (WARN_ON(list_empty(&r->evt_list))) + return -EPERM; + + list_for_each_entry(mevt, &r->evt_list, list) { + domid = do_sum ? d->ci->id : d->hdr.id; + priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); + if (WARN_ON_ONCE(!priv)) + return -EINVAL; + + ret = mon_addfile(kn, mevt->name, priv); + if (ret) + return ret; + + if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + } + + return 0; +} + +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn, *ckn; + char name[32]; + bool snc_mode; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + kn = kernfs_find_and_get(parent_kn, name); + if (kn) { + /* + * rdtgroup_mutex will prevent this directory from being + * removed. No need to keep this hold. 
+ */ + kernfs_put(kn); + } else { + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); + if (ret) + goto out_destroy; + } + + if (snc_mode) { + sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); + ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); + if (IS_ERR(ckn)) { + ret = -EINVAL; + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(ckn); + if (ret) + goto out_destroy; + + ret = mon_add_all_files(ckn, d, r, prgrp, false); + if (ret) + goto out_destroy; + } + + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_mon_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_mon_domain *dom; + int ret; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. 
+ *
 * mon_data has one directory for each domain which are named
 * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
 * with L3 domain looks as below:
 * ./mon_data:
 * mon_L3_00
 * mon_L3_01
 * mon_L3_02
 * ...
 *
 * Each domain directory has one file per event:
 * ./mon_L3_00/:
 * llc_occupancy
 *
 */
static int mkdir_mondata_all(struct kernfs_node *parent_kn,
			     struct rdtgroup *prgrp,
			     struct kernfs_node **dest_kn)
{
	struct kernfs_node *mon_data_kn;
	struct rdt_resource *res;
	int err;

	/* The mon_data directory itself has to exist first. */
	err = mongroup_create_dir(parent_kn, prgrp, "mon_data", &mon_data_kn);
	if (err)
		return err;

	/* The optional out-parameter is filled before the domains are added. */
	if (dest_kn)
		*dest_kn = mon_data_kn;

	/*
	 * Create the subdirectories for each domain. Note that all events
	 * in a domain like L3 are grouped into a resource whose domain is L3
	 */
	for_each_mon_capable_rdt_resource(res) {
		err = mkdir_mondata_subdir_alldom(mon_data_kn, res, prgrp);
		if (err) {
			/* Take the whole mon_data directory down again. */
			kernfs_remove(mon_data_kn);
			return err;
		}
	}

	return 0;
}

/**
 * cbm_ensure_valid - Enforce validity on provided CBM
 * @_val:	Candidate CBM
 * @r:		RDT resource to which the CBM belongs
 *
 * The provided CBM represents all cache portions available for use. This
 * may be represented by a bitmap that does not consist of contiguous ones
 * and thus be an invalid CBM.
 * Here the provided CBM is forced to be a valid CBM by only considering
 * the first set of contiguous bits as valid and clearing all remaining bits.
 * The intention here is to provide a valid default CBM with which a new
 * resource group is initialized. The user can follow this with a
 * modification to the CBM if the default does not satisfy the
 * requirements.
+ */ +static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) +{ + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit; + unsigned long val = _val; + + if (!val) + return 0; + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Clear any remaining bits to ensure contiguous region */ + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); + return (u32)val; +} + +/* + * Initialize cache resources per RDT domain + * + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. + */ +static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, + u32 closid) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 used_b = 0, unused_b = 0; + unsigned long tmp_cbm; + enum rdtgrp_mode mode; + u32 peer_ctl, ctrl_val; + int i; + + cfg = &d->staged_config[t]; + cfg->have_new_ctrl = false; + cfg->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + for (i = 0; i < closids_supported(); i++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + /* + * ctrl values for locksetup aren't relevant + * until the schemata is written, and the mode + * becomes RDT_MODE_PSEUDO_LOCKED. + */ + continue; + /* + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. 
+ */ + if (resctrl_arch_get_cdp_enabled(r->rid)) + peer_ctl = resctrl_arch_get_config(r, d, i, + peer_type); + else + peer_ctl = 0; + ctrl_val = resctrl_arch_get_config(r, d, i, + s->conf_type); + used_b |= ctrl_val | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + cfg->new_ctrl |= ctrl_val | peer_ctl; + } + } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + cfg->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = cfg->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); + return -ENOSPC; + } + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) +{ + struct rdt_ctrl_domain *d; + int ret; + + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { + ret = __init_one_rdt_domain(d, s, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. 
*/ +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) +{ + struct resctrl_staged_config *cfg; + struct rdt_ctrl_domain *d; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + + cfg = &d->staged_config[CDP_NONE]; + cfg->new_ctrl = resctrl_get_default_ctrl(r); + cfg->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + int ret = 0; + + rdt_staged_configs_clear(); + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) { + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; + } else { + ret = rdtgroup_init_cat(s, rdtgrp->closid); + if (ret < 0) + goto out; + } + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Failed to initialize allocations\n"); + goto out; + } + } + + rdtgrp->mode = RDT_MODE_SHAREABLE; + +out: + rdt_staged_configs_clear(); + return ret; +} + +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!resctrl_arch_mon_capable()) + return 0; + + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (resctrl_arch_mon_capable()) + free_rmid(rgrp->closid, rgrp->mon.rmid); +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. 
The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(rdt_kn_name(kn), "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; + struct kernfs_node *kn; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(parent_kn); + if (!prdtgrp) { + ret = -ENODEV; + goto out_unlock; + } + + /* + * Check that the parent directory for a monitor group is a "mon_groups" + * directory. + */ + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { + ret = -EPERM; + goto out_unlock; + } + + if (rtype == RDTMON_GROUP && + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto out_unlock; + } + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + rdt_last_cmd_puts("Kernel out of memory\n"); + goto out_unlock; + } + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); + goto out_free_rgrp; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, + * which will be dropped by kernfs_put() in rdtgroup_remove(). 
+ */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); + goto out_destroy; + } + + if (rtype == RDTCTRL_GROUP) { + files = RFTYPE_BASE | RFTYPE_CTRL; + if (resctrl_arch_mon_capable()) + files |= RFTYPE_MON; + } else { + files = RFTYPE_BASE | RFTYPE_MON; + } + + ret = rdtgroup_add_files(kn, files); + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); + goto out_destroy; + } + + /* + * The caller unlocks the parent_kn upon success. + */ + return 0; + +out_destroy: + kernfs_put(rdtgrp->kn); + kernfs_remove(rdtgrp->kn); +out_free_rgrp: + kfree(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + rdtgroup_remove(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. 
+ */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp; + struct kernfs_node *kn; + u32 closid; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); + if (ret) + return ret; + + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) { + rdt_last_cmd_puts("Out of CLOSIDs\n"); + goto out_common_fail; + } + closid = ret; + ret = 0; + + rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + + ret = rdtgroup_init_alloc(rdtgrp); + if (ret < 0) + goto out_rmid_free; + + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (resctrl_arch_mon_capable()) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_del_list; + } + if (is_mba_sc(NULL)) + rdtgrp->mba_mbps_event = mba_mbps_default_event; + } + + goto out_unlock; + +out_del_list: + list_del(&rdtgrp->rdtgroup_list); +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: + closid_free(closid); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); + + /* Else, attempt to add a monitoring subdirectory. 
*/ + if (resctrl_arch_mon_capable()) + return rdtgroup_mkdir_mon(parent_kn, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + u32 closid, rmid; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* + * Update per cpu closid/rmid of the moved CPUs first. + * Note: the closid will not change, but the arch code still needs it. + */ + closid = prdtgrp->closid; + rmid = prdtgrp->mon.rmid; + for_each_cpu(cpu, &rdtgrp->cpu_mask) + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); + + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) +{ + rdtgrp->flags = RDT_DELETED; + list_del(&rdtgrp->rdtgroup_list); + + kernfs_remove(rdtgrp->kn); + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + u32 closid, rmid; + int cpu; + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid and rmid of the moved CPUs first */ + closid = rdtgroup_default.closid; + rmid = rdtgroup_default.mon.rmid; + for_each_cpu(cpu, &rdtgrp->cpu_mask) + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); + + /* + * Update the MSR on moved CPUs and CPUs which have moved + 
* task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + closid_free(rdtgrp->closid); + + rdtgroup_ctrl_remove(rdtgrp); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + + return 0; +} + +static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) +{ + /* + * Valid within the RCU section it was obtained or while rdtgroup_mutex + * is held. + */ + return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + parent_kn = rdt_kn_parent(kn); + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && + rdtgrp != &rdtgroup_default) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = rdtgroup_ctrl_remove(rdtgrp); + } else { + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); + } + } else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, rdt_kn_name(kn))) { + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); + } else { + ret = -EPERM; + } + +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +/** + * mongrp_reparent() - replace parent CTRL_MON group of a MON group + * @rdtgrp: the MON group whose parent should be replaced + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp + * @cpus: cpumask provided by the caller for use during this call + * + * Replaces the parent CTRL_MON group for a MON group, resulting in all member + * tasks' CLOSID immediately changing to that of the new parent group. + * Monitoring data for the group is unaffected by this operation. + */ +static void mongrp_reparent(struct rdtgroup *rdtgrp, + struct rdtgroup *new_prdtgrp, + cpumask_var_t cpus) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + + WARN_ON(rdtgrp->type != RDTMON_GROUP); + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); + + /* Nothing to do when simply renaming a MON group. */ + if (prdtgrp == new_prdtgrp) + return; + + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_move_tail(&rdtgrp->mon.crdtgrp_list, + &new_prdtgrp->mon.crdtgrp_list); + + rdtgrp->mon.parent = new_prdtgrp; + rdtgrp->closid = new_prdtgrp->closid; + + /* Propagate updated closid to all tasks in this group. 
*/ + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); + + update_closid_rmid(cpus, NULL); +} + +static int rdtgroup_rename(struct kernfs_node *kn, + struct kernfs_node *new_parent, const char *new_name) +{ + struct kernfs_node *kn_parent; + struct rdtgroup *new_prdtgrp; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret; + + rdtgrp = kernfs_to_rdtgroup(kn); + new_prdtgrp = kernfs_to_rdtgroup(new_parent); + if (!rdtgrp || !new_prdtgrp) + return -ENOENT; + + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ + rdtgroup_kn_get(rdtgrp, kn); + rdtgroup_kn_get(new_prdtgrp, new_parent); + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + /* + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if + * either kernfs_node is a file. + */ + if (kernfs_type(kn) != KERNFS_DIR || + kernfs_type(new_parent) != KERNFS_DIR) { + rdt_last_cmd_puts("Source and destination must be directories"); + ret = -EPERM; + goto out; + } + + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { + ret = -ENOENT; + goto out; + } + + kn_parent = rdt_kn_parent(kn); + if (rdtgrp->type != RDTMON_GROUP || !kn_parent || + !is_mon_groups(kn_parent, rdt_kn_name(kn))) { + rdt_last_cmd_puts("Source must be a MON group\n"); + ret = -EPERM; + goto out; + } + + if (!is_mon_groups(new_parent, new_name)) { + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); + ret = -EPERM; + goto out; + } + + /* + * If the MON group is monitoring CPUs, the CPUs must be assigned to the + * current parent CTRL_MON group and therefore cannot be assigned to + * the new parent, making the move illegal. + */ + if (!cpumask_empty(&rdtgrp->cpu_mask) && + rdtgrp->mon.parent != new_prdtgrp) { + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); + ret = -EPERM; + goto out; + } + + /* + * Allocate the cpumask for use in mongrp_reparent() to avoid the + * possibility of failing to allocate it after kernfs_rename() has + * succeeded. 
+ */ + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out; + } + + /* + * Perform all input validation and allocations needed to ensure + * mongrp_reparent() will succeed before calling kernfs_rename(), + * otherwise it would be necessary to revert this call if + * mongrp_reparent() failed. + */ + ret = kernfs_rename(kn, new_parent, new_name); + if (!ret) + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); + + free_cpumask_var(tmpmask); + +out: + mutex_unlock(&rdtgroup_mutex); + rdtgroup_kn_put(rdtgrp, kn); + rdtgroup_kn_put(new_prdtgrp, new_parent); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + seq_puts(seq, ",cdp"); + + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) + seq_puts(seq, ",mba_MBps"); + + if (resctrl_debug) + seq_puts(seq, ",debug"); + + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .rename = rdtgroup_rename, + .show_options = rdtgroup_show_options, +}; + +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) +{ + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void rdtgroup_setup_default(void) +{ + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; + rdtgroup_default.type = RDTCTRL_GROUP; + 
INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + mutex_unlock(&rdtgroup_mutex); +} + +static void domain_destroy_mon_state(struct rdt_mon_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + mutex_lock(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + mutex_unlock(&rdtgroup_mutex); +} + +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + mutex_lock(&rdtgroup_mutex); + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) + rmdir_mondata_subdir_allrdtgrp(r, d); + + if (resctrl_is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); + + mutex_unlock(&rdtgroup_mutex); +} + +/** + * domain_setup_mon_state() - Initialise domain monitoring structures. + * @r: The resource for the newly online domain. + * @d: The newly online domain. + * + * Allocate monitor resources that belong to this domain. + * Called when the first CPU of a domain comes online, regardless of whether + * the filesystem is mounted. + * During boot this may be called before global allocations have been made by + * resctrl_mon_resource_init(). + * + * Returns 0 for success, or -ENOMEM. 
+ */ +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + size_t tsize; + + if (resctrl_arch_is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (resctrl_arch_is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (resctrl_arch_is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + int err = 0; + + mutex_lock(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { + /* RDT_RESOURCE_MBA is never mon_capable */ + err = mba_sc_domain_allocate(r, d); + } + + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + int err; + + mutex_lock(&rdtgroup_mutex); + + err = domain_setup_mon_state(r, d); + if (err) + goto out_unlock; + + if (resctrl_is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); + } + + if (resctrl_arch_is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. 
+ */ + if (resctrl_mounted && resctrl_arch_mon_capable()) + mkdir_mondata_subdir_allrdtgrp(r, d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +void resctrl_online_cpu(unsigned int cpu) +{ + mutex_lock(&rdtgroup_mutex); + /* The CPU is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + + if (!l3->mon_capable) + goto out_unlock; + + d = get_mon_domain_from_cpu(cpu, l3); + if (d) { + if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (resctrl_arch_is_llc_occupancy_enabled() && + cpu == d->cqm_work_cpu && has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +/* + * resctrl_init - resctrl filesystem initialization + * + * Setup resctrl file system including set up root, create mount point, + * register resctrl filesystem, and 
initialize files under root directory. + * + * Return: 0 on success or -errno + */ +int resctrl_init(void) +{ + int ret = 0; + + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + + rdtgroup_setup_default(); + + thread_throttle_mode_init(); + + ret = resctrl_mon_resource_init(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) { + resctrl_mon_resource_exit(); + return ret; + } + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + /* + * Adding the resctrl debugfs directory here may not be ideal since + * it would let the resctrl debugfs directory appear on the debugfs + * filesystem before the resctrl filesystem is mounted. + * It may also be ok since that would enable debugging of RDT before + * resctrl is mounted. + * The reason why the debugfs directory is created here and not in + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and + * during the debugfs directory creation also &sb->s_type->i_mutex_key + * (the lockdep class of inode->i_rwsem). Other filesystem + * interactions (eg. SyS_getdents) have the lock ordering: + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex + * is taken, thus creating dependency: + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause + * issues considering the other two lock dependencies. + * By creating the debugfs directory here we avoid a dependency + * that may cause deadlock (even though file operations cannot + * occur until the filesystem is mounted, but I do not know how to + * tell lockdep that). 
+ */ + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); + resctrl_mon_resource_exit(); + + return ret; +} + +static bool resctrl_online_domains_exist(void) +{ + struct rdt_resource *r; + + /* + * Only walk capable resources to allow resctrl_arch_get_resource() + * to return dummy 'not capable' resources. + */ + for_each_alloc_capable_rdt_resource(r) { + if (!list_empty(&r->ctrl_domains)) + return true; + } + + for_each_mon_capable_rdt_resource(r) { + if (!list_empty(&r->mon_domains)) + return true; + } + + return false; +} + +/** + * resctrl_exit() - Remove the resctrl filesystem and free resources. + * + * Called by the architecture code in response to a fatal error. + * Removes resctrl files and structures from kernfs to prevent further + * configuration. + * + * When called by the architecture code, all CPUs and resctrl domains must be + * offline. This ensures the limbo and overflow handlers are not scheduled to + * run, meaning the data structures they access can be freed by + * resctrl_mon_resource_exit(). + * + * After resctrl_exit() returns, the architecture code should return an + * error from all resctrl_arch_ functions that can do this. + * resctrl_arch_get_resource() must continue to return struct rdt_resources + * with the correct rid field to ensure the filesystem can be unmounted. + */ +void resctrl_exit(void) +{ + cpus_read_lock(); + WARN_ON_ONCE(resctrl_online_domains_exist()); + + mutex_lock(&rdtgroup_mutex); + resctrl_fs_teardown(); + mutex_unlock(&rdtgroup_mutex); + + cpus_read_unlock(); + + debugfs_remove_recursive(debugfs_resctrl); + debugfs_resctrl = NULL; + unregister_filesystem(&rdt_fs_type); + + /* + * Do not remove the sysfs mount point added by resctrl_init() so that + * it can be used to umount resctrl. + */ + + resctrl_mon_resource_exit(); +} |