From 44c6cb9fe9888b371e31165b2854bd0f4e2787d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:58 -0700 Subject: KVM: guest_memfd: Allow mmap() on guest_memfd for x86 VMs with private memory Allow mmap() on guest_memfd instances for x86 VMs with private memory as the need to track private vs. shared state in the guest_memfd instance is only pertinent to INIT_SHARED. Doing mmap() on private memory isn't terrible useful (yet!), but it's now possible, and will be desirable when guest_memfd gains support for other VMA-based syscalls, e.g. mbind() to set NUMA policy. Lift the restriction now, before MMAP support is officially released, so that KVM doesn't need to add another capability to enumerate support for mmap() on private memory. Fixes: 3d3a04fad25a ("KVM: Allow and advertise support for host mmap() on guest_memfd files") Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-6-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 19b8c4bebb9c..680ca838f018 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -729,7 +729,17 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) #endif #ifdef CONFIG_KVM_GUEST_MEMFD -bool kvm_arch_supports_gmem_mmap(struct kvm *kvm); +bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm); + +static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm) +{ + u64 flags = GUEST_MEMFD_FLAG_MMAP; + + if (!kvm || kvm_arch_supports_gmem_init_shared(kvm)) + flags |= GUEST_MEMFD_FLAG_INIT_SHARED; + + return flags; +} #endif #ifndef kvm_arch_has_readonly_mem -- cgit v1.2.3 From 12d724f2852d094d68dccaf5101e0ef89a971cde Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 9 Oct 2025 19:46:00 +0900 Subject: ata: libata-core: relax checks in ata_read_log_directory() Commit 6d4405b16d37 ("ata: libata-core: Cache the general purpose log directory") introduced caching of a device general purpose log directory to avoid repeated access to this log page during device scan. This change also added a check on this log page to verify that the log page version is 0x0001 as mandated by the ACS specifications. And it turns out that some devices do not bother reporting this version, instead reporting a version 0, resulting in error messages such as: ata6.00: Invalid log directory version 0x0000 and to the device being marked as not supporting the general purpose log directory log page. Since before commit 6d4405b16d37 the log page version check did not exist and things were still working correctly for these devices, relax ata_read_log_directory() version check and only warn about the invalid log page version number without disabling access to the log directory page. Fixes: 6d4405b16d37 ("ata: libata-core: Cache the general purpose log directory") Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220635 Signed-off-by: Damien Le Moal Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 11 ++++------- include/linux/libata.h | 6 ++++++ 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ff53f5f029b4..2a210719c4ce 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2174,13 +2174,10 @@ static int ata_read_log_directory(struct ata_device *dev) } version = get_unaligned_le16(&dev->gp_log_dir[0]); - if (version != 0x0001) { - ata_dev_err(dev, "Invalid log directory version 0x%04x\n", - version); - ata_clear_log_directory(dev); - dev->quirks |= ATA_QUIRK_NO_LOG_DIR; - return -EINVAL; - } + if (version != 0x0001) + ata_dev_warn_once(dev, + "Invalid log directory version 0x%04x\n", + version); return 0; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 21de0935775d..7a98de1cc995 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1594,6 +1594,12 @@ do { \ #define ata_dev_dbg(dev, fmt, ...) \ ata_dev_printk(debug, dev, fmt, ##__VA_ARGS__) +#define ata_dev_warn_once(dev, fmt, ...) \ + pr_warn_once("ata%u.%02u: " fmt, \ + (dev)->link->ap->print_id, \ + (dev)->link->pmp + (dev)->devno, \ + ##__VA_ARGS__) + static inline void ata_print_version_once(const struct device *dev, const char *version) { -- cgit v1.2.3 From 7e8242405b94ceac6db820de7d4fd9318cbc1219 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Wed, 1 Oct 2025 08:08:03 +0200 Subject: rpmb: move rpmb_frame struct and constants to common header Move struct rpmb_frame and RPMB operation constants from MMC block driver to include/linux/rpmb.h for reuse across different RPMB implementations (UFS, NVMe, etc.). Signed-off-by: Bean Huo Reviewed-by: Avri Altman Acked-by: Jens Wiklander Reviewed-by: Bart Van Assche Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 42 ------------------------------------------ include/linux/rpmb.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 9399bf6c766a..c0ffe0817fd4 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -79,48 +79,6 @@ MODULE_ALIAS("mmc:block"); #define MMC_EXTRACT_INDEX_FROM_ARG(x) ((x & 0x00FF0000) >> 16) #define MMC_EXTRACT_VALUE_FROM_ARG(x) ((x & 0x0000FF00) >> 8) -/** - * struct rpmb_frame - rpmb frame as defined by eMMC 5.1 (JESD84-B51) - * - * @stuff : stuff bytes - * @key_mac : The authentication key or the message authentication - * code (MAC) depending on the request/response type. - * The MAC will be delivered in the last (or the only) - * block of data. - * @data : Data to be written or read by signed access. - * @nonce : Random number generated by the host for the requests - * and copied to the response by the RPMB engine. - * @write_counter: Counter value for the total amount of the successful - * authenticated data write requests made by the host. - * @addr : Address of the data to be programmed to or read - * from the RPMB. Address is the serial number of - * the accessed block (half sector 256B). - * @block_count : Number of blocks (half sectors, 256B) requested to be - * read/programmed. - * @result : Includes information about the status of the write counter - * (valid, expired) and result of the access made to the RPMB. - * @req_resp : Defines the type of request and response to/from the memory. - * - * The stuff bytes and big-endian properties are modeled to fit to the spec. - */ -struct rpmb_frame { - u8 stuff[196]; - u8 key_mac[32]; - u8 data[256]; - u8 nonce[16]; - __be32 write_counter; - __be16 addr; - __be16 block_count; - __be16 result; - __be16 req_resp; -} __packed; - -#define RPMB_PROGRAM_KEY 0x1 /* Program RPMB Authentication Key */ -#define RPMB_GET_WRITE_COUNTER 0x2 /* Read RPMB write counter */ -#define RPMB_WRITE_DATA 0x3 /* Write data to RPMB partition */ -#define RPMB_READ_DATA 0x4 /* Read data from RPMB partition */ -#define RPMB_RESULT_READ 0x5 /* Read result request (Internal) */ - #define RPMB_FRAME_SIZE sizeof(struct rpmb_frame) #define CHECK_SIZE_NEQ(val) ((val) != sizeof(struct rpmb_frame)) #define CHECK_SIZE_ALIGNED(val) IS_ALIGNED((val), sizeof(struct rpmb_frame)) diff --git a/include/linux/rpmb.h b/include/linux/rpmb.h index cccda73eea4d..ed3f8e431eff 100644 --- a/include/linux/rpmb.h +++ b/include/linux/rpmb.h @@ -61,6 +61,50 @@ struct rpmb_dev { #define to_rpmb_dev(x) container_of((x), struct rpmb_dev, dev) +/** + * struct rpmb_frame - RPMB frame structure for authenticated access + * + * @stuff : stuff bytes, a padding/reserved area of 196 bytes at the + * beginning of the RPMB frame. They don’t carry meaningful + * data but are required to make the frame exactly 512 bytes. + * @key_mac : The authentication key or the message authentication + * code (MAC) depending on the request/response type. + * The MAC will be delivered in the last (or the only) + * block of data. + * @data : Data to be written or read by signed access. + * @nonce : Random number generated by the host for the requests + * and copied to the response by the RPMB engine. + * @write_counter: Counter value for the total amount of the successful + * authenticated data write requests made by the host. + * @addr : Address of the data to be programmed to or read + * from the RPMB. Address is the serial number of + * the accessed block (half sector 256B). + * @block_count : Number of blocks (half sectors, 256B) requested to be + * read/programmed. + * @result : Includes information about the status of the write counter + * (valid, expired) and result of the access made to the RPMB. + * @req_resp : Defines the type of request and response to/from the memory. + * + * The stuff bytes and big-endian properties are modeled to fit to the spec. + */ +struct rpmb_frame { + u8 stuff[196]; + u8 key_mac[32]; + u8 data[256]; + u8 nonce[16]; + __be32 write_counter; + __be16 addr; + __be16 block_count; + __be16 result; + __be16 req_resp; +}; + +#define RPMB_PROGRAM_KEY 0x1 /* Program RPMB Authentication Key */ +#define RPMB_GET_WRITE_COUNTER 0x2 /* Read RPMB write counter */ +#define RPMB_WRITE_DATA 0x3 /* Write data to RPMB partition */ +#define RPMB_READ_DATA 0x4 /* Read data from RPMB partition */ +#define RPMB_RESULT_READ 0x5 /* Read result request (Internal) */ + #if IS_ENABLED(CONFIG_RPMB) struct rpmb_dev *rpmb_dev_get(struct rpmb_dev *rdev); void rpmb_dev_put(struct rpmb_dev *rdev); -- cgit v1.2.3 From 7a84394f02ab1985ebbe0a8d6f6d69bd040de4b3 Mon Sep 17 00:00:00 2001 From: Joshua Watt Date: Tue, 7 Oct 2025 15:22:58 -0600 Subject: NFS4: Apply delay_retrans to async operations The setting of delay_retrans is applied to synchronous RPC operations because the retransmit count is stored in same struct nfs4_exception that is passed each time an error is checked. However, for asynchronous operations (READ, WRITE, LOCKU, CLOSE, DELEGRETURN), a new struct nfs4_exception is made on the stack each time the task callback is invoked. This means that the retransmit count is always zero and thus delay_retrans never takes effect. Apply delay_retrans to these operations by tracking and updating their retransmit count. Change-Id: Ieb33e046c2b277cb979caa3faca7f52faf0568c9 Signed-off-by: Joshua Watt Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 13 +++++++++++++ include/linux/nfs_xdr.h | 1 + 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f58098417142..411776718494 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3636,6 +3636,7 @@ struct nfs4_closedata { } lr; struct nfs_fattr fattr; unsigned long timestamp; + unsigned short retrans; }; static void nfs4_free_closedata(void *data) @@ -3664,6 +3665,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) .state = state, .inode = calldata->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3711,6 +3713,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -5593,9 +5596,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -5709,10 +5714,12 @@ static int nfs4_write_done_cb(struct rpc_task *task, .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, NFS_SERVER(inode), task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -6726,6 +6733,7 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + unsigned short retrans; struct { struct nfs4_layoutreturn_args arg; struct nfs4_layoutreturn_res res; @@ -6746,6 +6754,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) .inode = data->inode, .stateid = &data->stateid, .task_is_privileged = data->args.seq_args.sa_privileged, + .retrans = data->retrans, }; if (!nfs4_sequence_done(task, &data->res.seq_res)) @@ -6817,6 +6826,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) task->tk_status = nfs4_async_handle_exception(task, data->res.server, task->tk_status, &exception); + data->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -7093,6 +7103,7 @@ struct nfs4_unlockdata { struct file_lock fl; struct nfs_server *server; unsigned long timestamp; + unsigned short retrans; }; static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, @@ -7147,6 +7158,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) struct nfs4_exception exception = { .inode = calldata->lsp->ls_state->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -7180,6 +7192,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_status = nfs4_async_handle_exception(task, calldata->server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) rpc_restart_call_prepare(task); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d56583572c98..31463286402f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1659,6 +1659,7 @@ struct nfs_pgio_header { void *netfs; #endif + unsigned short retrans; int pnfs_error; int error; /* merge with pnfs_error */ unsigned int good_bytes; /* boundary of good data */ -- cgit v1.2.3 From e4d0c909bf8328d986bf3aadba0c33a72b5ae30d Mon Sep 17 00:00:00 2001 From: Kamil Horák - 2N Date: Thu, 9 Oct 2025 15:06:56 +0200 Subject: net: phy: bcm54811: Fix GMII/MII/MII-Lite selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Broadcom bcm54811 is hardware-strapped to select among RGMII and GMII/MII/MII-Lite modes. However, the corresponding bit, RGMII Enable in Miscellaneous Control Register must be also set to select desired RGMII or MII(-lite)/GMII mode. Fixes: 3117a11fff5af9e7 ("net: phy: bcm54811: PHY initialization") Signed-off-by: Kamil Horák - 2N Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20251009130656.1308237-2-kamilh@axis.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/broadcom.c | 20 +++++++++++++++++++- include/linux/brcmphy.h | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 3459a0e9d8b9..cb306f9e80cc 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -405,7 +405,7 @@ static int bcm5481x_set_brrmode(struct phy_device *phydev, bool on) static int bcm54811_config_init(struct phy_device *phydev) { struct bcm54xx_phy_priv *priv = phydev->priv; - int err, reg, exp_sync_ethernet; + int err, reg, exp_sync_ethernet, aux_rgmii_en; /* Enable CLK125 MUX on LED4 if ref clock is enabled. */ if (!(phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED)) { @@ -434,6 +434,24 @@ static int bcm54811_config_init(struct phy_device *phydev) if (err < 0) return err; + /* Enable RGMII if configured */ + if (phy_interface_is_rgmii(phydev)) + aux_rgmii_en = MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN | + MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN; + else + aux_rgmii_en = 0; + + /* Also writing Reserved bits 6:5 because the documentation requires + * them to be written to 0b11 + */ + err = bcm54xx_auxctl_write(phydev, + MII_BCM54XX_AUXCTL_SHDWSEL_MISC, + MII_BCM54XX_AUXCTL_MISC_WREN | + aux_rgmii_en | + MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RSVD); + if (err < 0) + return err; + return bcm5481x_set_brrmode(phydev, priv->brr_mode); } diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 15c35655f482..115a964f3006 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -137,6 +137,7 @@ #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC 0x07 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN 0x0010 +#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RSVD 0x0060 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_EN 0x0080 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN 0x0100 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX 0x0200 -- cgit v1.2.3 From 1d64624243af8329b4b219d8c39e28ea448f9929 Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Mon, 6 Oct 2025 18:05:31 -0700 Subject: HID: core: Add printk_ratelimited variants to hid_warn() etc hid_warn_ratelimited() is needed. Add the others as part of the block. Signed-off-by: Vicki Pfau Signed-off-by: Jiri Kosina --- include/linux/hid.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index e1b673ad7457..a4ddb94e3ee5 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1292,4 +1292,15 @@ void hid_quirks_exit(__u16 bus); #define hid_dbg_once(hid, fmt, ...) \ dev_dbg_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_err_ratelimited(hid, fmt, ...) \ + dev_err_ratelimited(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_notice_ratelimited(hid, fmt, ...) \ + dev_notice_ratelimited(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_warn_ratelimited(hid, fmt, ...) \ + dev_warn_ratelimited(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_info_ratelimited(hid, fmt, ...) \ + dev_info_ratelimited(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_dbg_ratelimited(hid, fmt, ...) \ + dev_dbg_ratelimited(&(hid)->dev, fmt, ##__VA_ARGS__) + #endif -- cgit v1.2.3 From 5fb750e8a9ae123b2034771b864b8a21dbef65cd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 14 Oct 2025 17:07:00 -0700 Subject: bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures. The following kmemleak splat: [ 8.105530] kmemleak: Trying to color unknown object at 0xff11000100e918c0 as Black [ 8.106521] Call Trace: [ 8.106521] [ 8.106521] dump_stack_lvl+0x4b/0x70 [ 8.106521] kvfree_call_rcu+0xcb/0x3b0 [ 8.106521] ? hrtimer_cancel+0x21/0x40 [ 8.106521] bpf_obj_free_fields+0x193/0x200 [ 8.106521] htab_map_update_elem+0x29c/0x410 [ 8.106521] bpf_prog_cfc8cd0f42c04044_overwrite_cb+0x47/0x4b [ 8.106521] bpf_prog_8c30cd7c4db2e963_overwrite_timer+0x65/0x86 [ 8.106521] bpf_prog_test_run_syscall+0xe1/0x2a0 happens due to the combination of features and fixes, but mainly due to commit 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()") It's using __GFP_HIGH, which instructs slub/kmemleak internals to skip kmemleak_alloc_recursive() on allocation, so subsequent kfree_rcu()-> kvfree_call_rcu()->kmemleak_ignore() complains with the above splat. To fix this imbalance, replace bpf_map_kmalloc_node() with kmalloc_nolock() and kfree_rcu() with call_rcu() + kfree_nolock() to make sure that the objects allocated with kmalloc_nolock() are freed with kfree_nolock() rather than the implicit kfree() that kfree_rcu() uses internally. Note, the kmalloc_nolock() happens under bpf_spin_lock_irqsave(), so it will always fail in PREEMPT_RT. This is not an issue at the moment, since bpf_timers are disabled in PREEMPT_RT. In the future bpf_spin_lock will be replaced with state machine similar to bpf_task_work. Fixes: 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Shakeel Butt Acked-by: Harry Yoo Acked-by: Vlastimil Babka Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20251015000700.28988-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 4 ++++ kernel/bpf/helpers.c | 25 ++++++++++++++----------- kernel/bpf/syscall.c | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a98c83346134..d808253f2e94 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2499,6 +2499,8 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, #ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); +void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, + int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags); @@ -2511,6 +2513,8 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, */ #define bpf_map_kmalloc_node(_map, _size, _flags, _node) \ kmalloc_node(_size, _flags, _node) +#define bpf_map_kmalloc_nolock(_map, _size, _flags, _node) \ + kmalloc_nolock(_size, _flags, _node) #define bpf_map_kzalloc(_map, _size, _flags) \ kzalloc(_size, _flags) #define bpf_map_kvcalloc(_map, _n, _size, _flags) \ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c9fab9a356df..8eb117c52817 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work) rcu_read_unlock_trace(); } +static void bpf_async_cb_rcu_free(struct rcu_head *rcu) +{ + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + + kfree_nolock(cb); +} + static void bpf_wq_delete_work(struct work_struct *work) { struct bpf_work *w = container_of(work, struct bpf_work, delete_work); cancel_work_sync(&w->work); - kfree_rcu(w, cb.rcu); + call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); } static void bpf_timer_delete_work(struct work_struct *work) @@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work) /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call - * kfree_rcu(t) right after for both preallocated and non-preallocated + * call_rcu() right after for both preallocated and non-preallocated * maps. The async->cb = NULL was already done and no code path can see * address 't' anymore. Timer if armed for existing bpf_hrtimer before * bpf_timer_cancel_and_free will have been cancelled. */ hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); } static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, @@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until - * kmalloc_nolock() is available, avoid locking issues by using - * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). - */ - cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); + cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u * or pinned in bpffs. */ WRITE_ONCE(async->cb, NULL); - kfree(cb); + kfree_nolock(cb); ret = -EPERM; } out: @@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val) * timer _before_ calling us, such that failing to cancel it here will * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. * Therefore, we _need_ to cancel any outstanding timers before we do - * kfree_rcu, even though no more timers can be armed. + * call_rcu, even though no more timers can be armed. * * Moreover, we need to schedule work even if timer does not belong to * the calling callback_fn, as on two different CPUs, we can end up in a @@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val) * completion. */ if (hrtimer_try_to_cancel(&t->timer) >= 0) - kfree_rcu(t, cb.rcu); + call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); else queue_work(system_dfl_wq, &t->cb.delete_work); } else { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2a9456a3e730..8a129746bd6c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, return ptr; } +void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *memcg, *old_memcg; + void *ptr; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); + ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + + return ptr; +} + void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; -- cgit v1.2.3 From 16dad7801aad73138a2dff5ea950130646914d1f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:15 -1000 Subject: cgroup: Rename cgroup lifecycle hooks to cgroup_task_*() The current names cgroup_exit(), cgroup_release(), and cgroup_free() are confusing because they look like they're operating on cgroups themselves when they're actually task lifecycle hooks. For example, cgroup_init() initializes the cgroup subsystem while cgroup_exit() is a task exit notification to cgroup. Rename them to cgroup_task_exit(), cgroup_task_release(), and cgroup_task_free() to make it clear that these operate on tasks. Cc: Dan Schatzberg Cc: Peter Zijlstra Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 12 ++++++------ kernel/cgroup/cgroup.c | 11 ++++++----- kernel/exit.c | 4 ++-- kernel/fork.c | 2 +- kernel/sched/autogroup.c | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6ed477338b16..4068035176c4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -137,9 +137,9 @@ extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); -void cgroup_exit(struct task_struct *p); -void cgroup_release(struct task_struct *p); -void cgroup_free(struct task_struct *p); +void cgroup_task_exit(struct task_struct *p); +void cgroup_task_release(struct task_struct *p); +void cgroup_task_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); @@ -680,9 +680,9 @@ static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} -static inline void cgroup_exit(struct task_struct *p) {} -static inline void cgroup_release(struct task_struct *p) {} -static inline void cgroup_free(struct task_struct *p) {} +static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_release(struct task_struct *p) {} +static inline void cgroup_task_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6ae5f48cf64e..826b7fd2f85d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -944,7 +944,8 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit()/cgroup_free() dropping the css_set. + * against cgroup_task_exit()/cgroup_task_free() dropping + * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -6972,13 +6973,13 @@ void cgroup_post_fork(struct task_struct *child, } /** - * cgroup_exit - detach cgroup from exiting task + * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ -void cgroup_exit(struct task_struct *tsk) +void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; @@ -7010,7 +7011,7 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_release(struct task_struct *task) +void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; @@ -7027,7 +7028,7 @@ void cgroup_release(struct task_struct *task) } } -void cgroup_free(struct task_struct *task) +void cgroup_task_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); put_css_set(cset); diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..46173461e8de 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -257,7 +257,7 @@ repeat: rcu_read_unlock(); pidfs_exit(p); - cgroup_release(p); + cgroup_task_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ thread_pid = task_pid(p); @@ -967,7 +967,7 @@ void __noreturn do_exit(long code) exit_thread(tsk); sched_autogroup_exit_task(tsk); - cgroup_exit(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..960c39c9c264 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -738,7 +738,7 @@ void __put_task_struct(struct task_struct *tsk) unwind_task_free(tsk); sched_ext_free(tsk); io_uring_free(tsk); - cgroup_free(tsk); + cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index cdea931aae30..954137775f38 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * this process can already run with task_group() == prev->tg or we can * race with cgroup code which can read autogroup = prev under rq->lock. * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. + * cpu_cgroup_attach() must not be possible after cgroup_task_exit() + * and it can't be removed from thread list, we hold ->siglock. * * If an exiting thread was already removed from thread list we rely on * sched_autogroup_exit_task(). -- cgit v1.2.3 From d245698d727ab8f5420b3e28d1243f96a5234851 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:17 -1000 Subject: cgroup: Defer task cgroup unlink until after the task is done switching out When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task from its cgroup. From the cgroup's perspective, the task is now gone. If this makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks that notify controllers the cgroup is going offline resource-wise. However, the exiting task can still run, perform memory operations, and schedule until the final context switch in finish_task_switch(). This creates a confusing situation where controllers are told a cgroup is offline while resource activities are still happening in it. While this hasn't broken existing controllers, it has caused direct confusion for sched_ext schedulers. Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls the subsystem exit callbacks and continues to be called from do_exit(). The css_set cleanup is moved to the new cgroup_task_dead() which is called from finish_task_switch() after the final context switch, so that the cgroup only appears empty after the task is truly done running. This also reorders operations so that subsys->exit() is now called before unlinking from the cgroup, which shouldn't break anything. Cc: Dan Schatzberg Cc: Peter Zijlstra Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ kernel/cgroup/cgroup.c | 23 ++++++++++++++--------- kernel/sched/core.c | 2 ++ 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4068035176c4..bc892e3b37ee 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p, extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_task_exit(struct task_struct *p); +void cgroup_task_dead(struct task_struct *p); void cgroup_task_release(struct task_struct *p); void cgroup_task_free(struct task_struct *p); @@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p, static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_dead(struct task_struct *p) {} static inline void cgroup_task_release(struct task_struct *p) {} static inline void cgroup_task_free(struct task_struct *p) {} diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b3c27900c5d2..aae180d56c8c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -944,7 +944,7 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_task_exit()/cgroup_task_free() dropping + * against cgroup_task_dead()/cgroup_task_free() dropping * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -6982,10 +6982,20 @@ void cgroup_post_fork(struct task_struct *child, void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; - struct css_set *cset; int i; - spin_lock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ + do_each_subsys_mask(ss, i, have_exit_callback) { + ss->exit(tsk); + } while_each_subsys_mask(); +} + +void cgroup_task_dead(struct task_struct *tsk) +{ + struct css_set *cset; + unsigned long flags; + + spin_lock_irqsave(&css_set_lock, flags); WARN_ON_ONCE(list_empty(&tsk->cg_list)); cset = task_css_set(tsk); @@ -7003,12 +7013,7 @@ void cgroup_task_exit(struct task_struct *tsk) test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); - - /* see cgroup_post_fork() for details */ - do_each_subsys_mask(ss, i, have_exit_callback) { - ss->exit(tsk); - } while_each_subsys_mask(); + spin_unlock_irqrestore(&css_set_lock, flags); } void cgroup_task_release(struct task_struct *task) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ebf67b48e2..40f12e37f60f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5222,6 +5222,8 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + cgroup_task_dead(prev); + /* Task is done with its stack. */ put_task_stack(prev); -- cgit v1.2.3